In [66]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats.stats import pearsonr
from scipy.stats import norm

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

## Loading Data

In [90]:
# Read the pre-split feature matrices and label vectors from their .npy files.
# np.load accepts a path directly and opens/closes the file itself.
X_train = np.load('../data/X_train.npy')
X_test = np.load('../data/X_test.npy')
y_train = np.load('../data/y_train.npy')
y_test = np.load('../data/y_test.npy')

# Sanity check: feature matrices and label vectors should agree on row counts.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(13972, 29) (6872, 29) (13972,) (6872,)


In [91]:
# Column names of the encoded dataset. "Revenue" is the label column and is
# excluded from the feature names below.
# NOTE(review): the order is assumed to match the column order of the saved
# .npy feature arrays -- confirm against the preprocessing notebook.
columns = [
    'Administrative',
    'Administrative_Duration',
    'Informational',
    'Informational_Duration',
    'ProductRelated',
    'ProductRelated_Duration',
    'BounceRates',
    'ExitRates',
    'PageValues',
    'SpecialDay',
    'OperatingSystems',
    'Browser',
    'Region',
    'TrafficType',
    'Month_Aug',
    'Month_Dec',
    'Month_Feb',
    'Month_Jul',
    'Month_June',
    'Month_Mar',
    'Month_May',
    'Month_Nov',
    'Month_Oct',
    'Month_Sep',
    'VisitorType_New_Visitor',
    'VisitorType_Other',
    'VisitorType_Returning_Visitor',
    'Weekend_False',
    'Weekend_True',
    'Revenue',
]

# Build the feature list as a *new* list. Plain assignment would only alias
# `columns`, so removing "Revenue" would silently strip it from `columns` too.
feature_names = [name for name in columns if name != "Revenue"]

In [92]:
# Attach the feature names to the raw arrays, then discard "Weekend_False"
# (presumably redundant with "Weekend_True" -- TODO confirm the intent).
X_train = pd.DataFrame(X_train, columns=feature_names).drop(columns=["Weekend_False"])
X_test = pd.DataFrame(X_test, columns=feature_names).drop(columns=["Weekend_False"])

## Tuning Hyperparameters

In [93]:
# Base estimator with default settings; it is handed to the hyperparameter search below.
model = LogisticRegression()

In [94]:
# Search space: L1 vs. L2 penalty across 20 log-spaced values of C (1e-4 .. 1e4),
# always with the liblinear solver.
param_dist = {
    'penalty': ['l1', 'l2'],
    'C': np.logspace(-4, 4, 20),
    'solver': ['liblinear']
}

# Every entry above is a finite list, so the space is a grid of
# 2 * 20 * 1 = 40 combinations. Asking for more iterations than that makes
# scikit-learn warn and fall back to the grid size, so cap n_iter explicitly.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))

# Random search of parameters, using 3-fold cross validation, use all cores
model_rand = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=min(100, n_candidates),
    cv=3,
    verbose=2,
    random_state=0,
    n_jobs=-1,
)

In [95]:
# Run the hyperparameter search on the training split only.
model_rand.fit(X_train, y_train)

Fitting 3 folds for each of 40 candidates, totalling 120 fits




RandomizedSearchCV(cv=3, estimator=LogisticRegression(), n_iter=100, n_jobs=-1,
                   param_distributions={'C': array([1.00000000e-04, 2.63665090e-04, 6.95192796e-04, 1.83298071e-03,
       4.83293024e-03, 1.27427499e-02, 3.35981829e-02, 8.85866790e-02,
       2.33572147e-01, 6.15848211e-01, 1.62377674e+00, 4.28133240e+00,
       1.12883789e+01, 2.97635144e+01, 7.84759970e+01, 2.06913808e+02,
       5.45559478e+02, 1.43844989e+03, 3.79269019e+03, 1.00000000e+04]),
                                        'penalty': ['l1', 'l2'],
                                        'solver': ['liblinear']},
                   random_state=0, verbose=2)

In [99]:
# Keep the winning parameter combination; it is reused for every model fitted below.
best_params = model_rand.best_params_
best_params

{'solver': 'liblinear', 'penalty': 'l1', 'C': 0.00026366508987303583}

## Fitting the Model

In [100]:
# Build a fresh estimator from the parameters selected by the search.
model_best = LogisticRegression(**best_params)

model_best.fit(X_train,y_train)  # fit the model to the training data

LogisticRegression(C=0.00026366508987303583, penalty='l1', solver='liblinear')

In [133]:
def cross_validate_scores(model, X, y, scoring=['accuracy', 'precision', 'f1', 'roc_auc'], cv=5):
    """Cross-validate `model` and print the mean of each returned result array.

    Parameters
    ----------
    model : estimator passed through to `cross_validate`.
    X, y : features and labels passed through to `cross_validate`.
    scoring : metric names forwarded to `cross_validate`.
    cv : cross-validation setting forwarded to `cross_validate`.

    Returns
    -------
    None. The model's string form, a separator line, and one
    "<key>: <mean>" line per key in the result are printed.
    """
    results = cross_validate(model, X, y, scoring=scoring, cv=cv)
    header = "%s\n-------------------" % str(model)
    print(header)
    # Every key in the result is reported, not only the requested metrics.
    for name, values in results.items():
        print("%s: %f" % (name, np.mean(values)))

In [101]:
# Mean cross-validated score of the tuned model on the training data
# (cv and scoring are left at cross_val_score's defaults).
score = cross_val_score(model_best, X_train, y_train)
print(np.round(score.mean(), 4))

0.8605


In [129]:
def get_performance(model, X, y):
    """Print a classification report, confusion matrix, accuracy and ROC-AUC.

    Parameters
    ----------
    model : fitted classifier exposing `predict` and `predict_proba`.
    X : features to predict on.
    y : true labels for `X`; every metric is computed against these.

    Returns
    -------
    None. All results are printed.
    """
    y_pred = model.predict(X)
    target_names = ['0: No Revenue', '1: Revenue']
    # Score against the labels passed in. The report previously read the
    # notebook-level `y_test`, which is wrong for any other dataset.
    report = classification_report(y, y_pred, target_names=target_names)

    print(report)
    print("Confusion Matrix:\n", confusion_matrix(y, y_pred))
    print("Accuracy: ", accuracy_score(y, y_pred))
    # Column 1 of predict_proba is used as the score for the ROC-AUC.
    print("ROC-AUC: ", roc_auc_score(y, model.predict_proba(X)[:, 1]))

In [131]:
# Evaluate the tuned baseline model on the held-out test split.
get_performance(model_best,X_test,y_test)

               precision    recall  f1-score   support

0: No Revenue       0.84      0.89      0.86      3436
   1: Revenue       0.88      0.83      0.85      3436

     accuracy                           0.86      6872
    macro avg       0.86      0.86      0.86      6872
 weighted avg       0.86      0.86      0.86      6872

Confusion Matrix:
 [[3045  391]
 [ 582 2854]]
Accuracy:  0.8584109429569267
ROC-AUC:  0.8861682134382103


In [128]:
## Calc Prec, Recall, F1 scores on test set
# y_pred = model_best.predict(X_test)  # predict on test set
# target_names = ['0: No Revenue', '1: Revenue']
# report1 = classification_report(y_test, y_pred, target_names=target_names)
# print(report1)
#report1_df = pd.DataFrame(report1).transpose()
# model_name = str(model_rand).strip('()')
# report1_df.to_csv(f"{model_name}_report1.csv", index=False)

## Feature Engineering

In [104]:
# Load the second version of the feature matrices.
# NOTE(review): these paths are relative to the working directory, while the
# base splits are read from '../data/' -- confirm the files live here.
X_train2 = np.load('X_train2.npy')
X_test2 = np.load('X_test2.npy')

In [105]:
# Reuse the hyperparameters chosen by the earlier search; no new search is run
# for this feature set.
LR_model = LogisticRegression(**best_params)
LR_model.fit(X_train2,y_train)  # fit the model to the training data

LogisticRegression(C=0.00026366508987303583, penalty='l1', solver='liblinear')

In [106]:
# Mean cross-validated score on the second training feature set
# (cross_val_score defaults for cv and scoring). Overwrites the earlier `score`.
score = cross_val_score(LR_model, X_train2, y_train)
print(np.round(score.mean(), 4))

0.8605


In [130]:
# Evaluate on the matching second test feature set; labels are the original y_test.
get_performance(LR_model,X_test2,y_test)

               precision    recall  f1-score   support

0: No Revenue       0.84      0.89      0.86      3436
   1: Revenue       0.88      0.83      0.85      3436

     accuracy                           0.86      6872
    macro avg       0.86      0.86      0.86      6872
 weighted avg       0.86      0.86      0.86      6872

Confusion Matrix:
 [[3045  391]
 [ 582 2854]]
Accuracy:  0.8584109429569267
ROC-AUC:  0.8861789705928192


## Feature Selection and Feature Engineering

In [114]:
# Load the third version of the feature matrices.
# NOTE(review): these paths are relative to the working directory, while the
# base splits are read from '../data/' -- confirm the files live here.
X_train3 = np.load('X_train3.npy')
X_test3 = np.load('X_test3.npy')

In [115]:
# Reuse the hyperparameters chosen by the earlier search; no new search is run
# for this feature set.
LR_model2 = LogisticRegression(**best_params)
LR_model2.fit(X_train3,y_train)  # fit the model to the training data

LogisticRegression(C=0.00026366508987303583, penalty='l1', solver='liblinear')

In [116]:
# Mean cross-validated score on the third training feature set
# (cross_val_score defaults for cv and scoring). Overwrites the earlier `score`.
score = cross_val_score(LR_model2, X_train3, y_train)
print(np.round(score.mean(), 4))

0.8604


In [132]:
# Evaluate on the matching third test feature set; labels are the original y_test.
get_performance(LR_model2,X_test3,y_test)

               precision    recall  f1-score   support

0: No Revenue       0.84      0.89      0.86      3436
   1: Revenue       0.88      0.83      0.85      3436

     accuracy                           0.86      6872
    macro avg       0.86      0.86      0.86      6872
 weighted avg       0.86      0.86      0.86      6872

Confusion Matrix:
 [[3045  391]
 [ 582 2854]]
Accuracy:  0.8584109429569267
ROC-AUC:  0.8861694839682821
