In [20]:
import pandas as pd
import sys
import os

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression 

# Admin Libraries
sys.path.append(os.path.abspath(".."))
from utils import helper_functions as hf

In [21]:
print(tf.__version__)

2.16.2


In [22]:
filepath = '../data/events_ml.csv' 
df = pd.read_csv(filepath) 
df.head()

Unnamed: 0,product_id,category_id,category_code,brand,last_view_before_cart,already_in_cart,time_of_day,day_of_week,month,price_range,last_view_before_purchase
0,4101974,2144415939364389423,electronics.clocks,honor,0,0,11:00:00,Thursday,2020-10,50-100,0
1,3506650,2144415935673401802,Unknown,kester,0,0,06:00:00,Tuesday,2020-10,10-50,0
2,124883,2144415924424278172,electronics.audio.acoustic,logitech,0,0,08:00:00,Monday,2020-11,10-50,0
3,125325,2144415924424278172,electronics.audio.acoustic,logitech,0,0,09:00:00,Monday,2020-11,10-50,0
4,254763,2144415924424278172,electronics.audio.acoustic,creative,0,0,10:00:00,Wednesday,2020-11,50-100,0


In [23]:
#convert all integer columns to object
int_columns = df.select_dtypes(include='int64').columns
df[int_columns] = df[int_columns].astype('object')
df.dtypes

product_id                   object
category_id                  object
category_code                object
brand                        object
last_view_before_cart        object
already_in_cart              object
time_of_day                  object
day_of_week                  object
month                        object
price_range                  object
last_view_before_purchase    object
dtype: object

### TO DO
- analyse category_code as it could be split
- transform lesser often product_ids into "other"
- downsampling with TomekLinks
- take out "last_view_before_cart"

In [25]:
df.head()

Unnamed: 0,product_id,category_id,category_code,brand,last_view_before_cart,already_in_cart,time_of_day,day_of_week,month,price_range,last_view_before_purchase
0,4101974,2144415939364389423,electronics.clocks,honor,0,0,11:00:00,Thursday,2020-10,50-100,0
1,3506650,2144415935673401802,Unknown,kester,0,0,06:00:00,Tuesday,2020-10,10-50,0
2,124883,2144415924424278172,electronics.audio.acoustic,logitech,0,0,08:00:00,Monday,2020-11,10-50,0
3,125325,2144415924424278172,electronics.audio.acoustic,logitech,0,0,09:00:00,Monday,2020-11,10-50,0
4,254763,2144415924424278172,electronics.audio.acoustic,creative,0,0,10:00:00,Wednesday,2020-11,50-100,0


# Create X and y, train and test set

In [26]:
y = df.pop('last_view_before_purchase').astype('int')
X = df

In [27]:
X

Unnamed: 0,product_id,category_id,category_code,brand,last_view_before_cart,already_in_cart,time_of_day,day_of_week,month,price_range
0,4101974,2144415939364389423,electronics.clocks,honor,0,0,11:00:00,Thursday,2020-10,50-100
1,3506650,2144415935673401802,Unknown,kester,0,0,06:00:00,Tuesday,2020-10,10-50
2,124883,2144415924424278172,electronics.audio.acoustic,logitech,0,0,08:00:00,Monday,2020-11,10-50
3,125325,2144415924424278172,electronics.audio.acoustic,logitech,0,0,09:00:00,Monday,2020-11,10-50
4,254763,2144415924424278172,electronics.audio.acoustic,creative,0,0,10:00:00,Wednesday,2020-11,50-100
...,...,...,...,...,...,...,...,...,...,...
884469,3829355,2144415922528452715,electronics.telephone,Unknown,0,0,23:00:00,Sunday,2021-02,10-50
884470,953226,2144415927553229037,Unknown,Unknown,0,0,23:00:00,Sunday,2021-02,200-500
884471,1715907,2144415927049912542,electronics.video.tv,starwind,0,0,23:00:00,Sunday,2021-02,50-100
884472,4170534,2144415939364389423,electronics.clocks,amazfit,0,0,23:00:00,Sunday,2021-02,50-100


In [28]:
# Convert categorical string values to integers
for column in X.columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])


In [29]:
X = X.values

In [30]:
X

array([[51449,   379,    86, ...,     4,     1,     5],
       [42316,   312,     0, ...,     5,     1,     1],
       [ 1563,    78,    78, ...,     1,     2,     1],
       ...,
       [38146,   138,    91, ...,     3,     5,     5],
       [52972,   379,    86, ...,     3,     5,     5],
       [23265,    19,    89, ...,     3,     5,     1]])

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# SKlearn models
### TO DO
- grid search, 
- cross validation, 
- confusion matrix, AUC (ROC), all KPI --> think which one is best
- check feature_importance/co-efficent etc.
- take out one column 'last_view_before_cart' and re-run. How much was model dependant on it..

In [12]:
def calculate_models_score(X_train, y_train, X_test, y_test, model_object, modelName, doPrint=True):
    model_object.fit(X_train, y_train)
    y_pred = model_object.predict(X_test)
    score = model_object.score(X_test, y_test)
    if doPrint:
        print(f"{modelName} Score: {score:.4f}")
    return score  

In [17]:
# Logistic regression model
log_reg_score = calculate_models_score(X_train, y_train, X_test, y_test, 
                                       LogisticRegression(max_iter=1000), "Logistic Regression")
# Gradient Boosting Classifier
gbc_score = calculate_models_score(X_train, y_train, X_test, y_test, 
                                       GradientBoostingClassifier(), "Gradient Boosting Classifier")
# Random Forest Classifier               
rfc_score = calculate_models_score(X_train, y_train, X_test, y_test, 
                                       RandomForestClassifier(), "Random Forest Classifier")

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Score: 0.9659
Gradient Boosting Classifier Score: 0.9691
Random Forest Classifier Score: 0.9672


# Deep learning
### TO DO
- optimise first with sklearn
- see how model performs without the columns 'last_view_before_cart'
- save epoch results in .json
- save the model
- call the saved model and apply on X_test to compare results with y_test

In [None]:
# Model setup
model = Sequential()

# Example of applying the Embedding layer; adjust input_dim and output_dim as necessary
input_dim = X_train.max() + 1  # Unique values in the largest label-encoded column
output_dim = 8  # Example dimensionality for the embeddings

model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=X_train.shape[1]))
model.add(Flatten())
model.add(Dense(10, activation='relu'))  # Example Dense layer
model.add(Dense(1, activation='sigmoid'))  # Example output layer for binary classification

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Display the model summary
model.summary()

# Assume you have Y_train
# model.fit(X_train, y_train, epochs=5, batch_size=32)



Epoch 1/5
[1m19348/19348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m90s[0m 5ms/step - accuracy: 0.9671 - loss: 0.0824
Epoch 2/5
[1m19348/19348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 6ms/step - accuracy: 0.9706 - loss: 0.0678
Epoch 3/5
[1m19348/19348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 6ms/step - accuracy: 0.9724 - loss: 0.0667
Epoch 4/5
[1m19348/19348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 5ms/step - accuracy: 0.9735 - loss: 0.0649
Epoch 5/5
[1m19348/19348[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 5ms/step - accuracy: 0.9733 - loss: 0.0647


<keras.src.callbacks.history.History at 0x14dd15270>