In [None]:
import numpy as np
import pandas as pd
from ydata_profiling import ProfileReport

In [None]:
# Untouched copy of the training data, kept for display and later length checks.
_df = pd.read_csv(filepath_or_buffer='datasets/Train.csv')

In [None]:
# Preview the first 15 rows of the raw frame.
_df.iloc[:15]

In [None]:
# Working frame for cleaning and modelling. Reuse the data already loaded into
# `_df` rather than reading the same CSV from disk a second time; the copy
# keeps `df` independent of `_df`, so later edits to `df` never touch `_df`.
df = _df.copy()

In [None]:
# Build the profiling report object for the working frame.
profile = ProfileReport(df=df, title="Profiling Report")

In [None]:
# Write the profiling report to an HTML file in the working directory.
profile.to_file(output_file="profiling_report.html")

In [None]:
# Show the column labels of the working frame.
print(df.columns)

In [None]:
# Count the missing values in each column of df.
print(df.isna().sum())

In [None]:
# Exclude these six columns from everything that follows:
# call_type, call_status, start_time, duration, end_time, display_location.
df = df.drop(
    ['call_type', 'call_status', 'start_time', 'duration', 'end_time', 'display_location'],
    axis='columns',
)

In [None]:
# Fill gaps in conversions_calls by interpolation; whatever interpolation
# leaves missing is replaced with the mean of the interpolated series.
df['conversions_calls'] = (
    df['conversions_calls']
    .interpolate()
    .pipe(lambda series: series.fillna(series.mean()))
)

In [None]:
# Replace every remaining missing value, in any column, with 0.
df = df.fillna(value=0)

In [None]:
# Re-check the per-column missing-value counts after filling.
df.isna().sum()

In [None]:
# Summarise the cleaned frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Features are every column except the row ID and the target; the target is 'clicks'.
X = df.drop(['ID', 'clicks'], axis='columns')
y = df['clicks']

In [None]:
# Expand 'date' into numeric year / month / day features, then drop the
# original column. The callables are evaluated in order, so the later ones see
# the already-parsed 'date'.
X = (
    X.assign(
        date=lambda frame: pd.to_datetime(frame['date']),
        year=lambda frame: frame['date'].dt.year,
        month=lambda frame: frame['date'].dt.month,
        day=lambda frame: frame['date'].dt.day,
    )
    .drop(columns=['date'])
)

In [None]:
# Cast both categorical columns to strings so every value has a single type
# before encoding.
X = X.astype({'currency': str, 'ad_type': str})

In [None]:
# Encode currency and ad_type as integer labels.
from sklearn.preprocessing import LabelEncoder

# Use one fitted encoder per column, kept by column name. Refitting a single
# shared encoder would overwrite the currency mapping as soon as ad_type is
# fitted, leaving no way to decode currency or encode new data consistently.
# The encoded values written into X are the same either way.
label_encoders = {}
for column in ('currency', 'ad_type'):
    label_encoders[column] = LabelEncoder()
    X[column] = label_encoders[column].fit_transform(X[column])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Seed the forest so that re-running the notebook reproduces the same model and
# metric. An unseeded forest gives different trees on every run even though the
# train/test split is seeded (the same seed value, 42, is used here).
model = RandomForestRegressor(random_state=42)

model.fit(X_train, y_train)

# Predictions for the held-out rows, used for evaluation.
y_pred = model.predict(X_test)

In [None]:
# Score the model: mean squared error on the held-out split.
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(mse)

In [None]:
# Predict for every row of X (this includes the rows the model was fitted on),
# then cast to int, which drops the fractional part of each prediction.
x_pred = model.predict(X).astype(int)

In [None]:
# Row counts of the raw frame, the feature matrix and the predictions, side by side.
tuple(len(obj) for obj in (_df, X, x_pred))

In [None]:
# Show the last five rows of the cleaned frame.
df.tail(n=5)

In [None]:
def create_submission_id(df):
    """Return a copy of ``df`` with date parts and a ``submission_id`` column added.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ID`` column and a ``date`` column that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is left unmodified) in which ``date`` is
        converted to datetime and the columns ``year``, ``month``, ``day`` and
        ``submission_id`` are added. ``submission_id`` has the form
        ``<ID>_YYYY_MM_DD``.
    """
    # Work on a copy so the caller's frame is not changed as a side effect.
    out = df.copy()
    out['date'] = pd.to_datetime(out['date'])
    out['year'] = out['date'].dt.year
    out['month'] = out['date'].dt.month
    out['day'] = out['date'].dt.day
    # Zero-pad month and day so these IDs use the same `_YYYY_MM_DD` layout as
    # the IDs built by generate_weekly_clicks_df in this notebook.
    out['submission_id'] = out['ID'].astype(str) + '_' + out['date'].dt.strftime('%Y_%m_%d')
    return out

# Add the date parts and the submission_id column to the working frame.
df = df.pipe(create_submission_id)

In [None]:
# Pair each row's ID and date with its predicted click count.
submission_df = df[['ID', 'date']].assign(clicks=x_pred)

In [None]:
# Save the per-row predictions without the index column.
submission_df.to_csv(path_or_buf="random_forest_regressor_.csv", index=False)

In [None]:
def generate_weekly_clicks_df(df):
    """Build two placeholder week-ending rows (clicks = 0) for every ID.

    For each ID the latest date is found, then the Monday of the week that
    contains it. Two rows are produced per ID: one for the Sunday ending that
    week and one for the Sunday a week later. If the latest date is itself a
    Sunday, the first row falls on that same date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``ID`` column and a ``date`` column that
        ``pd.to_datetime`` can parse. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``ID`` (formatted ``<ID>_YYYY_MM_DD`` with zero-padded month
        and day) and ``clicks`` (always 0), two rows per distinct ID, ordered
        by ID. For an input with no rows the result is empty but still has
        both columns.
    """
    # Parse into a separate Series so the caller's 'date' column is left as-is.
    dates = pd.to_datetime(df['date'])

    # Latest date per ID; grouping sorts the IDs, which fixes the output order.
    last_date_by_id = dates.groupby(df['ID']).max()

    one_week = pd.Timedelta(days=7)
    rows = []
    for ad_id, max_date in last_date_by_id.items():
        # dayofweek is 0 for Monday, so this steps back to the week's Monday.
        week_start = max_date - pd.Timedelta(days=max_date.dayofweek)
        first_week_end = week_start + pd.Timedelta(days=6)  # Sunday of that week
        for week_end in (first_week_end, first_week_end + one_week):
            rows.append({
                'ID': f"{ad_id}_{week_end.year}_{week_end.month:02d}_{week_end.day:02d}",
                'clicks': 0,
            })

    # Name the columns explicitly so an empty input still yields the right header.
    return pd.DataFrame(rows, columns=['ID', 'clicks'])

# Example usage:
# Assuming 'df' is your dataframe containing the advertising data
# weekly_clicks_df = generate_weekly_clicks_df(df)


In [None]:

# Build the two week-ending placeholder rows for every ID in the submission frame.
c_df = submission_df.pipe(generate_weekly_clicks_df)

In [None]:
# Number of weekly rows generated (two per ID).
c_df.shape[0]

In [None]:
# Save the weekly placeholder rows without the index column.
c_df.to_csv(path_or_buf="grouped_dates.csv", index=False)