In [1]:
import pandas as pd
import sys
import os
import json

import matplotlib.pyplot as plt
import seaborn as sns

import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# admin libraries
sys.path.append(os.path.abspath(".."))
from utils import helper_functions as hf

2025-04-17 17:38:33.106362: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# record the installed tensorflow release alongside the results below
print(f"{tf.__version__}")

In [None]:
# get the data
filepath = '../data/events_ml.csv'
df = pd.read_csv(filepath)

# convert all integer columns to object so they are handled as categories
int_columns = df.select_dtypes(include='int64').columns
df[int_columns] = df[int_columns].astype('object')

# define x and y; pop() removes the target from df so only feature columns remain
y = df.pop('last_view_before_purchase').astype('int')
X = df

# convert categorical values to integer codes, one independent encoder per column
for column in X.columns:
    le = LabelEncoder()
    X[column] = le.fit_transform(X[column])
X = X.values

# the features are deliberately NOT standardised: the model feeds them to an
# Embedding layer, which needs the raw non-negative integer codes as indices.
# (the split previously referenced `X_normalised`, which was never defined
# because the scaler lines were commented out, so the cell raised a NameError.)

# set up x and y for test and train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# show how the target classes are balanced in each split
for split_label, split_target in (
    ("class distribution in y_train:", y_train),
    ("class distribution in y_test:", y_test),
):
    print(split_label, pd.Series(split_target).value_counts())

In [None]:
# model setup
model = Sequential()

# applying the embedding layer; adjust input_dim and output_dim as necessary.
# the table must cover every integer code the model will ever see, so size it
# from both splits: a code that only occurs in X_test would otherwise be an
# out-of-range index at predict time. int() turns the numpy scalar into a plain
# python int so the layer config stays serialisable when the model is saved.
input_dim = int(max(X_train.max(), X_test.max())) + 1
output_dim = 1  # dimensionality for the embeddings

# NOTE(review): a single embedding table is shared by all feature columns, so
# equal codes coming from different columns map to the same vector - confirm
# this is intended.
model.add(Embedding(input_dim=input_dim, output_dim=output_dim))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))  # output layer for binary classification


model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.1),
    loss=tf.keras.losses.BinaryCrossentropy(),
    metrics=[tf.keras.metrics.BinaryAccuracy(threshold=0.5, name='accuracy')]
)


# display the model summary
model.summary()

# make sure the output directory exists before anything is written into it
os.makedirs('../models', exist_ok=True)

# checkpoints
# NOTE(review): both callbacks watch the *training* loss, and patience (5) equals
# the number of epochs (5), so early stopping can never trigger in this run.
early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)
model_checkpoint = ModelCheckpoint(filepath='../models/best_model.keras', monitor='loss', save_best_only=True)

# running the model
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=32,
    callbacks=[early_stopping, model_checkpoint],
    verbose=2)

# save training history; values are coerced to plain floats because json cannot
# serialise numpy scalars (assumed: some keras versions store them in the history)
serialisable_history = {
    metric: [float(value) for value in values]
    for metric, values in history.history.items()
}
with open('../models/training_history.json', 'w') as file:  # open creates a file
    json.dump(serialisable_history, file)  # dump saves it as a json file
     

In [None]:
# score the held-out rows; the sigmoid output is one probability per row
probabilities = model.predict(x=X_test)

In [None]:
# cut the probabilities at 0.5, collapse the result to one dimension and wrap it
# in a named pandas series (the series is optional, kept for compatibility)
predictions = pd.Series(
    (probabilities > 0.5).astype("int32").flatten(),
    name="predictions",
)

In [None]:
# headline accuracy first, then the per-class precision / recall / f1 table
overall_accuracy = accuracy_score(y_test, predictions)
print("accuracy:", overall_accuracy)

per_class_report = classification_report(y_test, predictions)
print("classification report:\n", per_class_report)

In [None]:
# confusion matrix: rows are the true classes, columns the predicted classes
cm = confusion_matrix(y_test, predictions)

# draw the counts as an annotated heatmap on an explicit figure / axes pair
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("confusion matrix")
ax.set_xlabel("predicted class")
ax.set_ylabel("true class")
plt.show()