## Initializing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
# The bundled seaborn styles were renamed with a "seaborn-v0_8-" prefix in matplotlib 3.6 and the
# old names were later removed, so pick whichever spelling this matplotlib installation provides.
plt.style.use("seaborn-v0_8-dark" if "seaborn-v0_8-dark" in plt.style.available else "seaborn-dark")

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.utils import class_weight
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import RandomUnderSampler

import statsmodels.api as sm
import statsmodels.formula.api as smf

import xgboost as xgb
import lightgbm as lgb

from flaml import AutoML

## Importing Data

In [2]:
# Load the raw dataset from a CSV file located in the current working directory.
df = pd.read_csv(filepath_or_buffer='coding_round_data.csv')

### Fixing import issues

In [3]:
# Inspect column names, non-null counts and dtypes to see which columns need re-typing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [4]:
# These columns are nominal labels (several are integer-coded identifiers), so store them
# as pandas categoricals rather than as numbers/strings.
categorical_columns = ['OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Month']

# Weekend and Revenue are cast to int (presumably boolean flags in the CSV — TODO confirm).
flag_columns = ['Weekend', 'Revenue']

dtype_map = {name: 'category' for name in categorical_columns}
dtype_map.update({name: 'int' for name in flag_columns})

df = df.astype(dtype_map)

## Splitting Data

Using a stratified split as the data is very imbalanced

In [5]:
# Two-stage stratified split on the target:
#   1. hold out 10% of all rows as the test set;
#   2. take 1/9 of the remaining 90% (another 10% of the full data) as the validation set.
# This leaves roughly 80% / 10% / 10% for train / validation / test.
df_remaining, df_test = train_test_split(
    df,
    test_size=0.10,
    stratify=df['Revenue'],
    random_state=42,
)
df_train, df_val = train_test_split(
    df_remaining,
    test_size=1/9,
    stratify=df_remaining['Revenue'],
    random_state=42,
)

## Data Manipulation

In [6]:
# Column groups and the transformer applied to each group.
nominal_features = ['Month','OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType','Weekend']
page_duration_features = ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']
ga_rates_features = ['BounceRates', 'ExitRates']

nominal_transformer = OneHotEncoder(handle_unknown="ignore")
ss_transformer = StandardScaler()

# One-hot encode the nominal columns and standardise the duration and rate columns.
# Columns not listed in any group (including the Revenue target) are passed through unchanged,
# and sparse_threshold=0 makes the combined output a dense array.
column_steps = [
    ("nominal", nominal_transformer, nominal_features),
    ("page_duration", ss_transformer, page_duration_features),
    ("ga_rates", ss_transformer, ga_rates_features),
]
preprocessor = ColumnTransformer(
    transformers=column_steps,
    sparse_threshold=0,
    remainder='passthrough'
)

# Fit on the training split only; validation and test are transformed with the same fitted state.
preprocessor.fit(df_train)


def _transform_to_frame(frame):
    """Apply the fitted preprocessor and wrap the result in a DataFrame with named columns."""
    return pd.DataFrame(preprocessor.transform(frame), columns=preprocessor.get_feature_names_out())


df_train, df_val, df_test = (_transform_to_frame(part) for part in (df_train, df_val, df_test))

In [7]:
# Name given to the passed-through target column by the ColumnTransformer.
target_column = 'remainder__Revenue'


def _split_features_target(frame):
    """Return (features, target) where target is kept as a single-column DataFrame."""
    return frame.drop(columns=target_column), frame[[target_column]]


X_train, y_train = _split_features_target(df_train)
X_val, y_val = _split_features_target(df_val)
X_test, y_test = _split_features_target(df_test)

In [None]:
# SMOTENC needs the positional indices of the categorical columns in the matrix it resamples;
# here those are the one-hot columns that the preprocessor prefixed with "nominal".
# The indices are taken from X_train itself (not from the preprocessor's full output, which
# also contains the target column) so they always refer to the frame passed to fit_resample.
categorical_features = [idx for idx, name in enumerate(X_train.columns) if name.startswith('nominal')]

# Named `smote_nc` rather than `sm` so the `statsmodels.api as sm` import alias is not overwritten.
smote_nc = SMOTENC(random_state=42, categorical_features=categorical_features)

# Oversample the minority class in the training data only; validation/test stay untouched.
X_train, y_train = smote_nc.fit_resample(X_train, y_train)

## Modelling

Two main modelling approaches are tried below. They are compared on the validation set (`df_val`) using the F1-score of the positive class, because the goal is to predict whether a user makes a purchase.

### Approach 1 - Logistic Regression

In [None]:
# Build the model formula: target on the left, every feature column on the right.
predictor_terms = " + ".join(X_train.columns)
formula = f"remainder__Revenue ~ {predictor_terms}"

# NOTE(review): the model is fit on df_train (the preprocessed training frame), not on the
# SMOTENC-resampled X_train / y_train — confirm this is intended.
logit_spec = smf.logit(formula=formula, data=df_train)
LR_model = logit_spec.fit_regularized(method='l1')
LR_model.summary()

As the classes are not balanced, the intercept/threshold needs to be scaled accordingly.

In [None]:
# Class counts of the validation labels, used to derive the decision threshold.
# NOTE(review): the threshold is computed from validation labels — confirm this is intended.
weights = y_val.value_counts()
n_positive, n_negative = weights[1], weights[0]

# Ratio of positive to negative examples; predictions above it are labelled positive.
threshold = n_positive / n_negative

LR_val_preds_raw = LR_model.predict(exog=X_val)
is_positive = LR_val_preds_raw > threshold
LR_val_preds = is_positive.astype('int')

lr_val_report = classification_report_imbalanced(y_val, LR_val_preds)
print(lr_val_report)

### Approach 2 - Boosted Decision Trees

Implemented using XGBoost

In [None]:
# Per-row weights from sklearn's 'balanced' scheme, to help tackle imbalanced classes.
sample_weights = class_weight.compute_sample_weight(
    class_weight='balanced',
    y=y_train,
)

# Fit an XGBoost classifier with default hyperparameters on the training data.
xgb_classifier = xgb.XGBClassifier()
xgb_model = xgb_classifier.fit(X_train, y_train, sample_weight=sample_weights)

In [None]:
# Score the XGBoost model on the validation split.
xgb_val_preds = xgb_model.predict(X_val)

xgb_val_report = classification_report_imbalanced(y_val, xgb_val_preds)
print(xgb_val_report)

## Results

The results from both approaches are close: logistic regression reaches an F1 score of 0.59 (for Revenue=True), compared with 0.63 for XGBoost.

Additionally, the precision for the positive class is noticeably higher for XGBoost (0.58 vs 0.51), while the recall is the same.

Based on these results, we can focus on using boosted decision trees

### Approach 3 - More Boosted Trees

Comparing XGBoost with LightGBM

In [None]:
# Fit a LightGBM classifier with default hyperparameters, reusing the sample weights
# computed for the XGBoost model so both are trained under the same weighting.
lgb_classifier = lgb.LGBMClassifier()
lgb_model = lgb_classifier.fit(X_train, y_train, sample_weight=sample_weights)

In [None]:
# Score the LightGBM model on the validation split.
lgb_val_preds = lgb_model.predict(X_val)

# Report on the LightGBM predictions (the original cell printed the XGBoost predictions here,
# so the two models' reports were identical by construction).
print(classification_report_imbalanced(y_val, lgb_val_preds))

## Results

Since they both have matching results, I would prefer LightGBM due to its speed advantage, allowing for better scalability

## Hyperparameter tuning

Via FLAML

In [None]:
# Search LightGBM hyperparameters with FLAML, optimising F1 within a fixed time budget.
automl = AutoML()

settings = dict(
    time_budget=60,
    metric='f1',
    estimator_list=['lgbm'],
    task='classification',
    seed=42,
)

# y_train is a single-column DataFrame, so pass the target column as a Series.
automl.fit(X_train=X_train, y_train=y_train['remainder__Revenue'], **settings)

In [None]:
# Underlying fitted estimator of the best configuration found by FLAML.
best_estimator = automl.model.estimator

# Predict on both held-out splits up front, then print the two reports.
best_val_preds = best_estimator.predict(X_val)
best_test_preds = best_estimator.predict(X_test)

separator = "-"*100

print("Val set")
print(classification_report_imbalanced(y_val, best_val_preds))

print(separator)

print("Test set")
print(classification_report_imbalanced(y_test, best_test_preds))

## Results

Based on these results, we can expect a final F1 score of 0.64

## Feature Importance

In [None]:
# Plot the 20 most important features of the tuned LightGBM model.
lgb.plot_importance(
    booster=best_estimator,
    max_num_features=20,
)