In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTENC
import joblib
import tensorflow as tf
from tensorflow.keras import layers, models

In [2]:
# 1. Load data
df = pd.read_csv('online_shoppers_clean.csv')

# 2. Identify categorical/numerical columns
categorical_cols = ['Month', 'VisitorType', 'Weekend']
numerical_cols = [col for col in df.columns if col not in categorical_cols + ['Revenue']]

# 3. Split data
X = df.drop('Revenue', axis=1)
y = df['Revenue'].astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.3, random_state=42
)

# 4. Preprocessing pipeline (no encoding for SMOTENC)
preprocessor = ColumnTransformer([
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('num', StandardScaler(), numerical_cols)
])

# 5. Apply SMOTENC (specify categorical indices after preprocessing)
# After OneHotEncoder, categorical features start at index 0
smote_nc = SMOTENC(
    categorical_features=[0, 1, 2],  # Indices of OneHot-encoded categorical columns
    random_state=42
)

# 6. Preprocess training data
X_train_preprocessed = preprocessor.fit_transform(X_train)
X_train_res, y_train_res = smote_nc.fit_resample(X_train_preprocessed, y_train)

In [3]:
# 7. Build and train Keras model
input_dim = X_train_res.shape[1]
model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(input_dim,)),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [4]:
model.fit(X_train_res, y_train_res, epochs=20, batch_size=64, validation_split=0.1, verbose=1)

Epoch 1/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.6477 - loss: 0.6077 - val_accuracy: 0.7843 - val_loss: 0.4492
Epoch 2/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8095 - loss: 0.4183 - val_accuracy: 0.8308 - val_loss: 0.3875
Epoch 3/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8305 - loss: 0.3930 - val_accuracy: 0.8564 - val_loss: 0.3576
Epoch 4/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8424 - loss: 0.3639 - val_accuracy: 0.8530 - val_loss: 0.3641
Epoch 5/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8442 - loss: 0.3602 - val_accuracy: 0.8800 - val_loss: 0.3092
Epoch 6/20
[1m203/203[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8547 - loss: 0.3273 - val_accuracy: 0.8460 - val_loss: 0.3649
Epoch 7/20
[1m203/203[0m 

<keras.src.callbacks.history.History at 0x7e21ed7259d0>

In [5]:
# 8. Evaluate
X_test_preprocessed = preprocessor.transform(X_test)
y_pred = (model.predict(X_test_preprocessed) > 0.55).astype(int)
print("F1 Score:", f1_score(y_test, y_pred))

[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
F1 Score: 0.6477438136826783


In [6]:
# 9. Save pipeline and model
joblib.dump(preprocessor, 'preprocessor.joblib')
model.save('shopper_model.keras')

In [8]:
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

def check_overfitting(model, X_train, y_train, X_test, y_test):
    y_train_pred = (model.predict(X_train) > 0.5).astype(int)
    y_test_pred = (model.predict(X_test) > 0.5).astype(int)
    f1_train = f1_score(y_train, y_train_pred)
    f1_test = f1_score(y_test, y_test_pred)
    print(f"Train F1: {f1_train:.3f}")
    print(f"Test F1:  {f1_test:.3f}")
    if abs(f1_train - f1_test) > 0.1:
        print("Warning: Possible overfitting detected (F1 gap > 0.1).")
    else:
        print("No significant overfitting detected.")



In [None]:

check_overfitting(model, X_train_res, y_train_res, X_test_preprocessed, y_test)

[1m456/456[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m116/116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Train F1: 0.904
Test F1:  0.650


In [9]:
# Display classification report
from sklearn.metrics import classification_report

y_pred = (model.predict(X_test_preprocessed) > 0.5).astype(int)
print(classification_report(y_test, y_pred, digits=3))

[1m115/115[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
              precision    recall  f1-score   support

           0      0.960     0.873     0.915      3090
           1      0.540     0.806     0.647       572

    accuracy                          0.862      3662
   macro avg      0.750     0.839     0.781      3662
weighted avg      0.895     0.862     0.873      3662

