## Initializing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
plt.style.use("seaborn-dark")

from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.utils import class_weight
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

from imblearn.metrics import classification_report_imbalanced

import statsmodels.api as sm
import statsmodels.formula.api as smf

import xgboost as xgb
import lightgbm as lgb

## Importing Data

In [2]:
df = pd.read_csv('coding_round_data.csv')

### Fixing import issues

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [4]:
for col in ['OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Month']:
    df[col] = df[col].astype('category')

for col in ['Weekend', 'Revenue']:
    df[col] = df[col].astype('int')

## Splitting Data

Using a stratified split as the data is very imbalanced

In [5]:
df_train, df_test = train_test_split(df, stratify=df['Revenue'], test_size=0.10, random_state=42)
df_train, df_val = train_test_split(df_train, stratify=df_train['Revenue'], test_size=1/9, random_state=42)

## Column Manipulation

In [6]:
nominal_features = ['Month','OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType','Weekend']
nominal_transformer = OneHotEncoder(handle_unknown="ignore")

page_duration_features = ['Administrative_Duration', 'Informational_Duration', 'ProductRelated_Duration']
ss_transformer = StandardScaler()

ga_rates_features = ['BounceRates', 'ExitRates']

preprocessor = ColumnTransformer(
    transformers=[
        ("nominal", nominal_transformer, nominal_features),
        ("page_duration", ss_transformer, page_duration_features),
        ("ga_rates", ss_transformer, ga_rates_features),
    ],
    sparse_threshold=0,
    remainder='passthrough'
)

preprocessor.fit(df_train)

df_train = pd.DataFrame(preprocessor.transform(df_train), columns=preprocessor.get_feature_names_out())
df_val = pd.DataFrame(preprocessor.transform(df_val), columns=preprocessor.get_feature_names_out())
df_test = pd.DataFrame(preprocessor.transform(df_test), columns=preprocessor.get_feature_names_out())

## Modelling

There are 2 main approaches that I will be taking for modelling. Their results will be compared on the df_val set to decide a final approach through the F1-score of the positive class as the goal is to predict whether a user makes a purchase or not.

### Approach 1 - Logistic Regression

In [7]:
formula = "remainder__Revenue ~ " + " + ".join(df_train.columns.drop('remainder__Revenue'))

LR_model = smf.logit(formula=formula, data=df_train).fit_regularized(method='l1')
LR_model.summary()

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.286191599843355
            Iterations: 487
            Function evaluations: 495
            Gradient evaluations: 487


0,1,2,3
Dep. Variable:,remainder__Revenue,No. Observations:,9864.0
Model:,Logit,Df Residuals:,9789.0
Method:,MLE,Df Model:,74.0
Date:,"Sun, 03 Apr 2022",Pseudo R-squ.:,0.3356
Time:,18:38:44,Log-Likelihood:,-2823.0
converged:,True,LL-Null:,-4249.2
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-2.1836,1.48e+06,-1.48e-06,1.000,-2.9e+06,2.9e+06
nominal__Month_Aug,0.1609,2.09e+06,7.71e-08,1.000,-4.09e+06,4.09e+06
nominal__Month_Dec,-0.5800,2.08e+06,-2.79e-07,1.000,-4.07e+06,4.07e+06
nominal__Month_Feb,-1.2774,2.11e+06,-6.05e-07,1.000,-4.14e+06,4.14e+06
nominal__Month_Jul,0.1823,2.1e+06,8.69e-08,1.000,-4.11e+06,4.11e+06
nominal__Month_June,-0.3670,2.09e+06,-1.76e-07,1.000,-4.09e+06,4.09e+06
nominal__Month_Mar,-0.3864,2.09e+06,-1.84e-07,1.000,-4.11e+06,4.11e+06
nominal__Month_May,-0.3565,2.1e+06,-1.7e-07,1.000,-4.11e+06,4.11e+06
nominal__Month_Nov,0.5742,2.09e+06,2.75e-07,1.000,-4.1e+06,4.1e+06


As the classes are not balanced, the intercept/threshold needs to be scaled accordingly.

In [8]:
weights = df_train.remainder__Revenue.value_counts()

threshold = weights[1]/weights[0]

LR_val_preds_raw = LR_model.predict(exog=df_val.drop(columns=['remainder__Revenue']))
LR_val_preds = (LR_val_preds_raw > threshold).astype('int')

print(classification_report_imbalanced(df_val['remainder__Revenue'], LR_val_preds))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.87      0.70      0.91      0.78      0.62      1042
        1.0       0.50      0.70      0.87      0.59      0.78      0.60       191

avg / total       0.87      0.85      0.73      0.86      0.78      0.62      1233



### Approach 2 - Boosted Decision Trees

Implemented using XGBoost

In [9]:
X_train = df_train.drop(columns='remainder__Revenue')
y_train = df_train[['remainder__Revenue']]

X_val = df_val.drop(columns='remainder__Revenue')
y_val = df_val[['remainder__Revenue']]

In [10]:
sample_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y_train)  # To help tackle imbalanced classes

xgb_model = xgb.XGBClassifier().fit(X_train, y_train, sample_weight=sample_weights)



  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [11]:
xgb_val_preds = xgb_model.predict(X_val)

print(classification_report_imbalanced(y_val, xgb_val_preds))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.91      0.70      0.93      0.80      0.65      1042
        1.0       0.58      0.70      0.91      0.63      0.80      0.62       191

avg / total       0.89      0.88      0.73      0.88      0.80      0.64      1233



## Results

The results from both approaches are close with the logistic regression having an F1 score of 0.59 (for Revenue=True) to the 0.63 for XGBoost

Additionally, the precision for TP is significantly higher for XGBoost (0.58 vs 0.5)

Based on these results, we can focus on using boosted decision trees

### Approach 3 - More Boosted Trees

Comparing XGBoost with LightGBM

In [12]:
lgb_model = lgb.LGBMClassifier().fit(X_train, y_train, sample_weight=sample_weights)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [13]:
lgb_val_preds = lgb_model.predict(X_val)

print(classification_report_imbalanced(y_val, xgb_val_preds))

                   pre       rec       spe        f1       geo       iba       sup

        0.0       0.94      0.91      0.70      0.93      0.80      0.65      1042
        1.0       0.58      0.70      0.91      0.63      0.80      0.62       191

avg / total       0.89      0.88      0.73      0.88      0.80      0.64      1233



## Results

Since they both have matching results, I would prefer LightGBM due to its speed advantage, allowing for better scalability