## Initializing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and later
# removed the old names, so use whichever spelling this installation provides.
plt.style.use(
    "seaborn-v0_8-dark" if "seaborn-v0_8-dark" in plt.style.available else "seaborn-dark"
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import *

from imblearn.metrics import classification_report_imbalanced

import statsmodels.api as sm
import statsmodels.formula.api as smf

import xgboost as xgb

## Importing Data

In [2]:
# Load the raw dataset from the CSV file that sits next to the notebook.
df = pd.read_csv(filepath_or_buffer="coding_round_data.csv")

### Fixing import issues

In [3]:
# Summarise every column's non-null count and dtype so that columns read with an
# unsuitable type can be identified before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

In [4]:
# Recast columns in a single pass: the coded/label columns become pandas
# categoricals, and the Weekend/Revenue columns are converted to integers.
df = df.astype({
    **dict.fromkeys(
        ['OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Month'],
        'category',
    ),
    **dict.fromkeys(['Weekend', 'Revenue'], 'int'),
})

## Splitting Data

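A stratified split (on `Revenue`) is used because the data is very imbalanced; stratifying keeps the class proportions the same in the training and test sets.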

In [5]:
# Hold out 20% of the rows as a test set. Stratifying on Revenue keeps the class
# proportions equal in both partitions, and the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.20,
    random_state=42,
    stratify=df['Revenue'],
)

## Modelling

I will take two main modelling approaches. Their results will be compared on the `df_test` set to decide on a final approach.

### Approach 1 - Logistic Regression

In [6]:
# Model Revenue as a function of every other column in the training frame.
formula = f"Revenue ~ {' + '.join(df_train.columns.drop('Revenue'))}"

# NOTE(review): fit_regularized is called without an explicit `alpha`; statsmodels
# documents the default as alpha=0 (i.e. no penalty) - confirm whether an actual
# L1 penalty was intended here.
LR_model = (
    smf.logit(formula=formula, data=df_train)
    .fit_regularized(method='l1')
)
LR_model.summary()

  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.2831479736005282
            Iterations: 444
            Function evaluations: 465
            Gradient evaluations: 444


0,1,2,3
Dep. Variable:,Revenue,No. Observations:,9864.0
Model:,Logit,Df Residuals:,9795.0
Method:,MLE,Df Model:,68.0
Date:,"Sun, 03 Apr 2022",Pseudo R-squ.:,0.3427
Time:,17:06:08,Log-Likelihood:,-2793.0
converged:,True,LL-Null:,-4249.2
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.9108,,,,,
Month[T.Dec],-0.7004,0.175,-3.996,0.000,-1.044,-0.357
Month[T.Feb],-2.0139,0.620,-3.249,0.001,-3.229,-0.799
Month[T.Jul],-0.1133,0.447,-0.254,0.800,-0.989,0.762
Month[T.June],-0.3465,0.491,-0.706,0.480,-1.308,0.615
Month[T.Mar],-0.6664,0.218,-3.058,0.002,-1.093,-0.239
Month[T.May],-0.5919,0.250,-2.369,0.018,-1.082,-0.102
Month[T.Nov],0.3265,0.129,2.539,0.011,0.074,0.578
Month[T.Oct],-0.0968,0.160,-0.607,0.544,-0.410,0.216


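Because the classes are imbalanced, the decision threshold (equivalently, the model intercept) needs to be rescaled to reflect the class proportions instead of using the default 0.5 cut-off.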

In [7]:
# Number of training rows per Revenue label (indexed by the label values 0 and 1).
weights = df_train['Revenue'].value_counts()

# Decision cut-off = (# rows with Revenue == 1) / (# rows with Revenue == 0).
# NOTE(review): this is the class odds, not the positive-class share
# weights[1] / weights.sum(); confirm which of the two was intended.
threshold = weights.loc[1] / weights.loc[0]

# Predicted purchase probabilities for the held-out rows (target column removed),
# then converted to 0/1 labels using the cut-off above.
LR_test_preds_raw = LR_model.predict(exog=df_test.drop(columns='Revenue'))
LR_test_preds = (LR_test_preds_raw > threshold).astype('int')

# Per-class precision/recall/specificity/F1/geometric-mean/IBA on the test set.
print(
    classification_report_imbalanced(
        y_true=df_test['Revenue'],
        y_pred=LR_test_preds,
    )
)

                   pre       rec       spe        f1       geo       iba       sup

          0       0.94      0.87      0.71      0.91      0.79      0.63      2084
          1       0.50      0.71      0.87      0.59      0.79      0.61       382

avg / total       0.88      0.85      0.74      0.86      0.79      0.63      2466

