In [None]:
# Fetch the project repository; the relative datasets/ paths used later presumably live inside it
!git clone https://github.com/AidaLog/admob-forecast.git

In [None]:
!cd ./admob-forecast

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('datasets/Train.csv')

# Call-tracking columns are not used as features
df = df.drop(columns=['call_type', 'call_status', 'start_time', 'duration', 'end_time', 'display_location'])

# Interpolate gaps in conversions_calls once; values that are still missing
# afterwards fall back to the mean of the interpolated series
calls_interpolated = df['conversions_calls'].interpolate()
df['conversions_calls'] = calls_interpolated.fillna(calls_interpolated.mean())


# Mean-impute the remaining float64 columns
numeric_columns = df.select_dtypes(include=['float64']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Drop rows that still contain missing values and renumber the index, so that
# later index-aligned operations (concat, column assignment) line up row by row
df = df.dropna().reset_index(drop=True)


In [None]:
# Column dtypes and non-null counts after cleaning
df.info()

In [None]:
# Distinct currency codes present, and how many there are
df['currency'].unique(), df['currency'].nunique()

In [None]:
# Example conversion factors: the USD value of one unit of each currency
exchange_rates = {'USD': 1.0, 'ZAR': 0.071}  # 1 ZAR = 0.071 USD

def convert_to_usd(currency, cost):
    """Return ``cost`` expressed in USD.

    The amount is multiplied by the ``exchange_rates`` entry for ``currency``.
    A currency with no entry is passed through unchanged (factor 1.0).
    """
    if currency in exchange_rates:
        factor = exchange_rates[currency]
    else:
        factor = 1.0  # unknown currency: amount is left as it is
    return cost * factor

# Rewrite 'cost' in USD, converting each row with its own currency code
df['cost'] = [
    convert_to_usd(currency_code, amount)
    for currency_code, amount in zip(df['currency'], df['cost'])
]

# leaving currency column to determine locality


In [None]:
# Parse the date column once, then derive the calendar features from it
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates
df['year'], df['month'], df['day'] = (
    parsed_dates.dt.year,
    parsed_dates.dt.month,
    parsed_dates.dt.day,
)

In [None]:
# Distinct ad types recorded for one example ID

is_example_client = df['ID'] == 'ID_5e43c29e6279884e2827d894'
df.loc[is_example_client, 'ad_type'].unique()

In [None]:
# encode currency, ad_type
from sklearn.preprocessing import LabelEncoder

# One shared encoder, refit per column; after the loop it holds the classes of
# the last column encoded (currency)
le = LabelEncoder()

for categorical_column in ('ad_type', 'currency'):
    df[categorical_column] = le.fit_transform(df[categorical_column])

df.head()

In [None]:
# Target is clicks; features are every column except the identifier, the
# target itself and the raw date
y = df['clicks']
X = df.drop(columns=['ID', 'clicks', 'date'])

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Keep X's original index. Without it the frame is renumbered 0..n-1 while y
# and df keep the labels left by dropna(), so every later index-aligned step
# (pd.concat with y, copying df['date'] / df['ID'] into XX) would pair up the
# wrong rows or introduce NaNs.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [None]:
# Scaled features and the target side by side (concat along columns aligns on index)
_prepd_df = pd.concat([X, y], axis=1)

In [None]:
# plot correlation matrix
import seaborn as sns

# Pairwise correlation between every scaled feature and the target
corr = _prepd_df.corr()

# annot=True writes each coefficient inside its cell
sns.heatmap(corr, annot=True)

In [None]:
# Correlation of each column with clicks, strongest positive first

corr['clicks'].sort_values(ascending=False)


In [None]:
# Keep the features whose correlation with clicks is positive (> 0); clicks itself is removed.
# NOTE(review): this comment previously said "> 0.1" while the code filters on > 0 — confirm the intended threshold.
corr_features = corr['clicks'][corr['clicks'] > 0].index.drop('clicks')

In [None]:
%%capture
# Take an explicit copy: assigning new columns onto a bare selection of X is
# what raises pandas' SettingWithCopyWarning.
x = X[corr_features].copy()
# y = df['clicks']

# Always carry the calendar features, whether or not they were selected above
for date_part in ('year', 'month', 'day'):
    x[date_part] = X[date_part]

In [None]:
# First rows of the final feature matrix
x.head()

In [None]:
# Length check: fail the run if features and target ever get out of step,
# instead of only displaying a boolean that is easy to overlook.
assert len(x) == len(y), f"feature/target length mismatch: {len(x)} != {len(y)}"

In [None]:
# train test split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable
split_parts = train_test_split(x, y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

# models

In [None]:
# import xgboost as xgb
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import LinearRegression, ElasticNet, BayesianRidge
# # from sklearn.svm import SVR
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn.neural_network import MLPRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import mean_squared_error
# from math import sqrt

# # Initialize and train models
# models = {
#     # "XGBoost": xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, learning_rate=0.1, max_depth=3),
#     "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
#     # "LinearRegression": LinearRegression(),
#     # "ElasticNet": ElasticNet(),
#     # "BayesianRidge": BayesianRidge(),
#     # "KNeighborsRegressor": KNeighborsRegressor(),
#     # "DecisionTreeRegressor": DecisionTreeRegressor(),
#     # "MLPRegressor": MLPRegressor()
# }

# # Train and evaluate models
# results = {}
# for name, model in models.items():
#     print("Trainig ", name)
#     try:
#         model.fit(x_train, y_train)
#         predictions = model.predict(x_test)
#         mse = mean_squared_error(y_test, predictions)
#         results[name] = mse
#     except Exception as e:
#         print(f"Error occurred while training {name}: {e}")
#         results[name] = None  # Set MSE to None if an error occurs



# # Print results
# for name, mse in results.items():
#     if mse is not None:
#         rmse = sqrt(mse)
#         print(f"{name} |--| Mean Squared Error: {mse}, |--| Root Mean Squared Error: {rmse}")
#     else:
#         print(f"{name} failed to train.")



# neural Net

In [None]:
# %%capture
# !pip install tensorflow

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Normalization
from tensorflow.keras.layers import Dropout

In [None]:
# Fully connected regressor: 64 -> 32 -> 1 units, ReLU on the hidden layers
n_features = x_train.shape[1]

model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(n_features,)))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Train with Adam on mean squared error
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=0.001))


In [None]:
# Import from tensorflow.keras like every other Keras import in this notebook;
# mixing in the standalone `keras` package can pull in a different Keras build.
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once val_loss has gone 10 epochs without improving
early_stop = EarlyStopping(monitor='val_loss', patience=10)

# `callbacks` is documented as a list of Callback instances, so pass a list.
# 20% of the training rows are held back as the validation set.
history = model.fit(x_train, y_train, batch_size=32, epochs=50, validation_split=0.2, verbose=1, callbacks=[early_stop])

In [None]:
import matplotlib.pyplot as plt

# Training and validation loss per epoch on one set of axes
fig, ax = plt.subplots()
for curve_name in ('loss', 'val_loss'):
    ax.plot(history.history[curve_name])
ax.set_title('Model loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()


# Submission file generation / inference


In [None]:
# model_choice = "RandomForest"
# model = models[model_choice]

In [None]:
y_pred.shape

In [None]:
# model evaluation
from sklearn.metrics import mean_squared_error

# Evaluate model
y_pred = model.predict(x_test)
y_pred = y_pred.flatten()

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')
print(f'MSE: {mse}')

In [None]:
def extract_id_and_date(string):
    """Split a submission key into its ID and its date.

    The key is cut on underscores: the second field becomes the ID (always
    re-prefixed with ``ID_``) and everything after it is parsed as a
    ``%Y_%m_%d`` date.

    Returns:
    tuple: ``("ID_<second field>", parsed date)``.
    """
    pieces = string.split('_', 2)
    # Everything after the second underscore; empty when there is no such part
    date_text = pieces[2] if len(pieces) > 2 else ''
    return f"ID_{pieces[1]}", pd.to_datetime(date_text, format='%Y_%m_%d')


In [None]:
# Submission template; its ID column is parsed and its clicks column is filled in below
test_df = pd.read_csv('datasets/SampleSubmission.csv')

In [None]:
# Split every submission key into its ID and date, then add a window end
# 20 days after that date
parsed_ids, parsed_dates = zip(*(extract_id_and_date(raw_key) for raw_key in test_df['ID']))
test_df['id'] = parsed_ids
test_df['date'] = parsed_dates
window_length = pd.Timedelta(days=20)
test_df['end_date'] = test_df['date'] + window_length

In [None]:
# Submission rows with the parsed id / date / end_date columns
test_df.head()

In [None]:
# Peek at the first submission row and the id parsed from it
test_row = test_df.iloc[0]
test_row['id']

In [None]:

# Independent copy of the scaled features; ID and date columns are attached to it in the next cells
XX = X.copy()

In [None]:
# Attach the dates from df (column assignment aligns on index, so XX and df must share the same index)
XX['date'] = pd.to_datetime(df['date'])

In [None]:
# Attach the IDs from df (again aligned on index)
XX['ID'] = df['ID']

In [None]:
# Scaled features together with their date and ID
XX.head(3)


In [None]:
def filter_df_by_id_or_date_range(df, id, start_date, end_date, sample=False):
    """
    Select the rows of ``df`` that belong to ``id`` and satisfy a date bound.

    A row is kept when its ``ID`` contains ``id`` and its ``date`` is on or
    after ``start_date``, or when its ``ID`` equals ``id`` and its ``date`` is
    on or before ``end_date``.

    Args:
    df (DataFrame): Frame with a string ``ID`` column and a ``date`` column.
    id (str): ID to filter.
    start_date (str or Timestamp): Lower date bound (inclusive).
    end_date (str or Timestamp): Upper date bound (inclusive).
    sample (bool): When True, return every column of the matching rows;
        otherwise return only the model feature columns (``x.columns``, read
        from the notebook's global ``x``).

    Returns:
    DataFrame: The matching rows.
    """
    # Fix: the original searched for the literal text 'id' instead of the `id`
    # argument. regex=False matches the ID verbatim; na=False treats a missing
    # ID as "no match" instead of propagating NaN into the mask.
    id_matches = df['ID'].str.contains(id, regex=False, na=False)

    # NOTE(review): the two date bounds are OR-ed, so a matching row only has to
    # satisfy one of them. Confirm whether a closed range
    # (start_date <= date <= end_date) was intended before tightening this.
    on_or_after_start = id_matches & (df['date'] >= start_date)
    on_or_before_end = (df['ID'] == id) & (df['date'] <= end_date)
    filtered = df[on_or_after_start | on_or_before_end]

    if sample:
        return filtered
    return filtered[x.columns]

In [None]:
# Score every submission row from the matching rows of the feature frame
for index, test_row in test_df.iterrows():
    # Feature rows selected for this row's ID and date window
    filtered_df = filter_df_by_id_or_date_range(XX, test_row['id'], test_row['date'], test_row['end_date'])

    # No matching rows: there is nothing to score, and the mean of zero
    # predictions is NaN, which int() rejects. Leave this row's clicks as is.
    if filtered_df.empty:
        continue

    # Predict clicks for the filtered data
    clicks_predict = model.predict(filtered_df)
    # NOTE(review): every prediction is shifted up by the hold-out RMSE — confirm this offset is intentional
    clicks_predict = clicks_predict.flatten() + rmse
    # Store the mean (not the sum) of the shifted predictions, truncated to an integer
    test_df.at[index, 'clicks'] = int(np.mean(clicks_predict))



In [None]:
# Feature rows selected for the last submission row processed by the loop
filtered_df.head()

In [None]:
# Number of submission rows whose clicks value is exactly 0
test_df[test_df['clicks'] == 0].shape[0]

In [None]:
# Submission rows after the clicks column has been filled in
test_df.head()

In [None]:
# Write only the submission columns. The helper columns added for filtering
# (id, date, end_date) are not part of the submission format.
# NOTE(review): assumes the expected columns are exactly ID and clicks — confirm against SampleSubmission.csv
test_df[['ID', 'clicks']].to_csv("100ep_nn_attempt.csv", index=False)