In [38]:
import pandas as pd
import tensorflow as tf
import numpy as np


In [39]:
# Read the workbook 'Data.xlsx' into `df`; the path is relative, so it is
# resolved against the notebook's current working directory.
df = pd.read_excel('Data.xlsx')

In [42]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,event_date,Date,Weekday,session_id,User_Id,event_name,Channel,Device,Item_Category,revenue,Made_Purchase,item_quantity,Quantity
0,20221031,2022-10-31,Monday,1667222766,1.180241e+32,session_start,Paid Search,mobile,Other,0,0,0,0.0
1,20221031,2022-10-31,Monday,1667211122,6.940301e+32,session_start,Paid Search,mobile,Other,0,0,0,0.0
2,20221031,2022-10-31,Monday,1667265090,1.708097e+33,session_start,Paid Search,mobile,Other,0,0,0,0.0
3,20221031,2022-10-31,Monday,1667225629,3.2594460000000005e+31,session_start,Paid Search,mobile,Other,0,0,0,0.0
4,20221031,2022-10-31,Monday,1667238370,4.414687e+32,session_start,Paid Search,mobile,Other,0,0,0,0.0


In [43]:
# Fill missing values with neutral defaults, one default per column.
# This is done with a single DataFrame-level fillna instead of
# `df[col].fillna(..., inplace=True)`: the column-level in-place form operates
# on a temporary Series, which emits a FutureWarning on pandas 2.x and leaves
# `df` unchanged once Copy-on-Write is active.
missing_defaults = {
    'Item_Category': 'Other',  # events without an item category
    'revenue': 0,              # events that produced no revenue
    'Quantity': 0,             # events without a quantity
}
df = df.fillna(value=missing_defaults)

In [44]:
# Force the category column to plain strings.
df['Item_Category'] = df['Item_Category'].astype(str)

# Strip currency formatting ('$' and thousands separators) from string values,
# then cast revenue to float. The pattern is a raw string: '\$' in a normal
# literal is an invalid escape sequence and triggers a warning on modern Python.
df['revenue'] = (
    df['revenue']
    .replace({r'\$': '', ',': ''}, regex=True)
    .astype(float)
)

In [45]:
# Show every column's dtype to confirm the casts in the previous cell took effect.
print(df.dtypes)

event_date                int64
Date             datetime64[ns]
Weekday                  object
session_id                int64
User_Id                 float64
event_name               object
Channel                  object
Device                   object
Item_Category            object
revenue                 float64
Made_Purchase             int64
item_quantity             int64
Quantity                float64
dtype: object


In [46]:
def _counts_as_dict(values):
    """Return a {value: number of occurrences} dict for one user's values."""
    return values.value_counts().to_dict()


# Collapse the event rows to one row per user:
#   channel_counts / device_counts - per-user frequency dicts (expanded later)
#   total_revenue                  - sum of revenue over the user's events
#   total_purchases                - max of Made_Purchase over the user's events
per_user_spec = {
    'channel_counts': pd.NamedAgg(column='Channel', aggfunc=_counts_as_dict),
    'device_counts': pd.NamedAgg(column='Device', aggfunc=_counts_as_dict),
    'total_revenue': pd.NamedAgg(column='revenue', aggfunc='sum'),
    'total_purchases': pd.NamedAgg(column='Made_Purchase', aggfunc='max'),
}
grouped_df = df.groupby('User_Id').agg(**per_user_spec).reset_index()

In [47]:
# One indicator column per item category, kept alongside the user id so the
# indicators can be summed per user afterwards.
category_frame = df.loc[:, ['User_Id', 'Item_Category']]
one_hot_encoded = pd.get_dummies(category_frame, columns=['Item_Category'])

In [48]:
# Sum the category indicators per user, giving a count of events per category.
category_totals = one_hot_encoded.groupby('User_Id').sum()
one_hot_encoded_grouped = category_totals.reset_index()

# Attach the category counts to the per-user table, keeping every user row.
grouped_df = pd.merge(grouped_df, one_hot_encoded_grouped, how='left', on='User_Id')

In [49]:
from pandas import json_normalize

# Expand the per-user channel dicts into one numeric column per channel.
# Channels a user never used come out as NaN and are filled with 0 in the
# copy that is joined (channel_df itself keeps the NaNs).
channel_df = json_normalize(grouped_df['channel_counts'])
channel_filled = channel_df.fillna(0)
grouped_df = grouped_df.drop(columns='channel_counts').join(channel_filled)

# Same expansion for the per-user device dicts.
device_df = json_normalize(grouped_df['device_counts'])
device_filled = device_df.fillna(0)
grouped_df = grouped_df.drop(columns='device_counts').join(device_filled)

# Everything left is numeric; cast the whole table to float.
grouped_df = grouped_df.astype('float64')

In [53]:
# Write the per-user feature table to an Excel workbook; the path is relative,
# so the file lands in the notebook's current working directory.
grouped_df.to_excel('Correlation_data_two.xlsx')

In [54]:
#classification model
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline  # not used in the cells shown here; kept for any later use
from sklearn.model_selection import train_test_split


# Features: every per-user column except the two outcome columns and the id.
X_class = grouped_df.drop(columns=['total_revenue', 'total_purchases', 'User_Id'])
# Target: the per-user purchase flag (max of Made_Purchase over the user's events).
y_class = grouped_df['total_purchases']

# Hold out 30% for testing. The split is stratified on the target so both
# partitions keep the same class ratio; the classes are imbalanced (SMOTE is
# applied in the next cell), and an unstratified split can skew the test set.
X_train, X_test, y_train, y_test = train_test_split(
    X_class,
    y_class,
    test_size=0.3,
    random_state=42,
    stratify=y_class,
)

In [55]:
# Oversample the training partition only, so the held-out test rows stay
# untouched; the seed makes the synthetic samples reproducible.
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train, y_train)
X_class_resampled, y_class_resampled = resampled_pair

In [61]:
from tensorflow.keras.models import Sequential # type: ignore
from tensorflow.keras.layers import Dense # type: ignore


# Keras takes the `validation_split` slice from the END of the arrays, before
# any shuffling. SMOTE output keeps the original rows first and the synthetic
# minority rows last, so without shuffling the validation slice would consist
# almost entirely of synthetic minority samples. Shuffle once with a fixed seed.
shuffle_order = np.random.default_rng(42).permutation(len(y_class_resampled))
X_class_fit = np.asarray(X_class_resampled, dtype='float32')[shuffle_order]
y_class_fit = np.asarray(y_class_resampled, dtype='float32')[shuffle_order]

# Binary classifier. The output unit uses a sigmoid so it emits a probability
# in [0, 1], which is what binary_crossentropy expects; a ReLU output is
# unbounded above and can be exactly 0, which breaks the loss.
model_class = Sequential([
    Dense(64, activation='relu', input_shape=(X_class_fit.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])


model_class.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# 20% of the (shuffled) resampled training data is held out for validation.
model_class.fit(X_class_fit, y_class_fit, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x23e845dd190>

In [68]:
# Persist the trained classifier to disk.
# NOTE(review): the destination is an absolute, user-specific Windows path, so
# this cell only works on a machine that has this exact directory.
model_class.save(r'C:\Users\Bozo\Desktop\Scandiweb_ML\Classification_model.h5')

In [65]:
# Regression data: same feature rows as the classifier's training partition,
# with per-user total revenue as the target. Previously this model was fit on
# the SMOTE-resampled binary purchase labels, i.e. it was a second classifier
# trained with MSE and never saw `total_revenue`.
X_reg_train = X_train
y_reg_train = grouped_df.loc[X_train.index, 'total_revenue']

# Same hidden architecture as the classifier; the single linear output unit
# predicts an unbounded real value.
model_regression = Sequential([
    Dense(64, activation='relu', input_shape=(X_reg_train.shape[1],)),
    Dense(32, activation='relu'),
    Dense(1)
])

model_regression.compile(optimizer='adam', loss='mean_squared_error')


# 20% of the training rows are held out for validation.
model_regression.fit(X_reg_train, y_reg_train, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x23e8efde090>

In [69]:
# Persist the trained regression model to disk.
# NOTE(review): the destination is an absolute, user-specific Windows path, so
# this cell only works on a machine that has this exact directory.
model_regression.save(r'C:\Users\Bozo\Desktop\Scandiweb_ML\Regression_model.h5')