In [1]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LogisticRegression
from statsmodels.stats.outliers_influence import variance_inflation_factor
from scipy.stats.stats import pearsonr
from scipy.stats import norm

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
#from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate

from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score

## Loading Data

In [2]:
# Read the pre-split feature matrices and label vectors from their .npy files.
X_train = np.load('X_train.npy')
X_test = np.load('X_test.npy')
y_train = np.load('y_train.npy')
y_test = np.load('y_test.npy')

# Sanity check: print the shapes of all four arrays.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(13972, 28) (4069, 28) (13972,) (4069,)


In [3]:
# Second feature set, used in the "Feature Engineering" section.
X_train2 = np.load('X_train2.npy')
X_test2 = np.load('X_test2.npy')
# X_train3 / X_test3 are not read from disk here; they are assembled further
# down by get_X().

In [4]:
# Names for the 28 columns of X_train / X_test, listed in positional order.
columns = [
    # Page-category counts and durations
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    # Session-level metrics
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
    # Month_* columns
    'Month_Aug', 'Month_Dec', 'Month_Feb', 'Month_Jul', 'Month_June',
    'Month_Mar', 'Month_May', 'Month_Nov', 'Month_Oct', 'Month_Sep',
    # VisitorType_* columns
    'VisitorType_New_Visitor', 'VisitorType_Other',
    'VisitorType_Returning_Visitor',
    # Weekend flag
    'Weekend_True',
    # Remaining columns
    'OperatingSystems', 'Browser', 'Region', 'TrafficType',
]

# Alias for the same list object; used when the DataFrames are built.
feature_names = columns

In [5]:
# Attach the feature names to both raw arrays by wrapping them in DataFrames.
X_train, X_test = (
    pd.DataFrame(raw, columns=feature_names) for raw in (X_train, X_test)
)

## Tuning Hyperparameters

In [6]:
# Base estimator handed to get_best_params() for every feature set.
# random_state is pinned because the search space includes the 'sag' and
# 'saga' solvers, which shuffle the data; without a seed the search can pick
# a different winner from one run to the next.
model = LogisticRegression(random_state=0)

In [7]:
def get_best_params(model, param_dist, X, y, n_iter=100, cv=3, verbose=2,
                    random_state=0, n_jobs=-1):
    """Run a randomized hyper-parameter search and return the winning settings.

    Parameters
    ----------
    model : estimator
        Unfitted estimator passed to ``RandomizedSearchCV`` as ``estimator``.
    param_dist : dict
        Search space, passed through as ``param_distributions``.
    X, y : array-like
        Training features and labels the search is fitted on.
    n_iter : int, default=100
        Number of parameter settings to sample. The notebook's 4-candidate
        search space is smaller than this default; the recorded run shows
        4 candidates being fitted.
    cv : int, default=3
        Cross-validation setting forwarded to the search.
    verbose : int, default=2
        Verbosity forwarded to the search.
    random_state : int, default=0
        Seed forwarded to the search so the sampled candidates are repeatable.
    n_jobs : int, default=-1
        Parallelism setting forwarded to the search.

    Returns
    -------
    dict
        The ``best_params_`` attribute of the fitted search.
    """
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        verbose=verbose,
        random_state=random_state,
        n_jobs=n_jobs,
    )
    search.fit(X, y)
    return search.best_params_

In [8]:
# Search space: logistic regression with the penalty switched off, varying
# only the solver. 'C' (e.g. np.logspace(-4, 4, 20)) is left out of the search.
# NOTE(review): the string 'none' for `penalty` is deprecated in newer
# scikit-learn releases in favour of None - confirm against the installed version.
param_dist = dict(
    penalty=['none'],
    solver=['newton-cg', 'lbfgs', 'sag', 'saga'],
)

# Pick the best solver for the original 28-feature training set.
best_params = get_best_params(model, param_dist, X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits




In [9]:
# Show the parameter combination returned by the search on X_train.
print(best_params)

{'solver': 'newton-cg', 'penalty': 'none'}


## Fitting the Model

In [10]:
# Rebuild the estimator from the winning search settings. random_state is
# pinned so the fit is repeatable if a stochastic solver ('sag'/'saga') wins;
# a random_state supplied through best_params would take precedence.
model_best = LogisticRegression(**{"random_state": 0, **best_params})
model_best.fit(X_train,y_train)  # fit the model to the training data

LogisticRegression(penalty='none', solver='newton-cg')

In [11]:
def cross_validate_scores(model,X,y,scoring=['accuracy','precision','f1','roc_auc'],cv=5):
    """Cross-validate ``model`` and print the mean of every returned entry.

    Calls ``cross_validate`` with the given ``scoring`` and ``cv``, prints the
    model's string form followed by a separator line, then one
    ``name: mean`` line per key of the result (the recorded run shows
    ``fit_time`` and ``score_time`` alongside the ``test_*`` metrics).
    Nothing is returned.
    """
    results = cross_validate(model, X, y, scoring=scoring, cv=cv)

    header = "%s\n-------------------" % str(model)
    print(header)

    for name, values in results.items():
        print("%s: %f" % (name, np.mean(values)))

In [13]:
# Cross-validated metrics for the tuned model on the training set, using the
# helper's defaults (accuracy, precision, f1, roc_auc; cv=5).
cross_validate_scores(model_best, X_train, y_train)

LogisticRegression(penalty='none', solver='newton-cg')
-------------------
fit_time: 1.587415
score_time: 0.022830
test_accuracy: 0.851349
test_precision: 0.866965
test_f1: 0.845907
test_roc_auc: 0.925838


In [14]:
# Mean cross-validated score of the tuned model, rounded to 4 decimals.
score = cross_val_score(model_best, X_train, y_train)
print(score.mean().round(4))

0.8513


In [15]:
def get_performance(model,X,y,report_path=None):
    """Print evaluation metrics for a fitted classifier and save the report as CSV.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.
    X : array-like
        Features to evaluate on.
    y : array-like
        True labels for ``X``.
    report_path : str or path-like, optional
        Destination of the CSV report. Defaults to
        ``f"{str(model)}_report1.csv"``. Two models with the same string form
        (for example the same estimator settings fitted on different feature
        sets) share that default name and overwrite each other's report, so
        pass an explicit path to keep them apart.

    Prints, in order: the classification report as a dict, a blank line, the
    text classification report, the confusion matrix, accuracy, and ROC-AUC
    computed from the positive-class probabilities. Returns nothing.
    """
    target_names = ['0: No Revenue', '1: Revenue']
    y_pred = model.predict(X)

    report_dict = classification_report(y, y_pred, target_names=target_names, output_dict=True)

    print(report_dict)
    print('\n')

    if report_path is None:
        report_path = f"{str(model)}_report1.csv"
    # Keep the index when writing: after the transpose it holds the row labels
    # (class names, accuracy, averages); without it the CSV rows are anonymous.
    pd.DataFrame(report_dict).transpose().to_csv(report_path, index=True, index_label="label")

    print(classification_report(y, y_pred, target_names=target_names))
    print("Confusion Matrix:\n",confusion_matrix(y,y_pred))
    print("Accuracy: ",accuracy_score(y,y_pred))
    # Column 1 of predict_proba is the probability of the positive class.
    print("ROC-AUC: ",roc_auc_score(y,model.predict_proba(X)[:,1]))

In [16]:
# Evaluate the tuned model on the held-out test split; get_performance also
# writes the classification report to a CSV file as a side effect.
get_performance(model_best,X_test,y_test)

{'0: No Revenue': {'precision': 0.9376558603491272, 'recall': 0.8754365541327125, 'f1-score': 0.905478627332932, 'support': 3436}, '1: Revenue': {'precision': 0.502903600464576, 'recall': 0.684044233807267, 'f1-score': 0.5796519410977242, 'support': 633}, 'accuracy': 0.8456623248955517, 'macro avg': {'precision': 0.7202797304068516, 'recall': 0.7797403939699897, 'f1-score': 0.7425652842153281, 'support': 4069}, 'weighted avg': {'precision': 0.8700229823675787, 'recall': 0.8456623248955517, 'f1-score': 0.854790917235393, 'support': 4069}}


               precision    recall  f1-score   support

0: No Revenue       0.94      0.88      0.91      3436
   1: Revenue       0.50      0.68      0.58       633

     accuracy                           0.85      4069
    macro avg       0.72      0.78      0.74      4069
 weighted avg       0.87      0.85      0.85      4069

Confusion Matrix:
 [[3008  428]
 [ 200  433]]
Accuracy:  0.8456623248955517
ROC-AUC:  0.8716365331670796


## Feature Engineering

In [17]:
# Repeat the solver search on the second feature set.
best_params = get_best_params(model,param_dist,X_train2,y_train)
print(best_params)

# random_state is pinned so the fit is repeatable if a stochastic solver
# ('sag'/'saga') wins; a random_state supplied through best_params would
# take precedence.
model2 = LogisticRegression(**{"random_state": 0, **best_params})
model2.fit(X_train2,y_train)  # fit the model to the training data

Fitting 3 folds for each of 4 candidates, totalling 12 fits




{'solver': 'newton-cg', 'penalty': 'none'}


LogisticRegression(penalty='none', solver='newton-cg')

In [18]:
# Mean cross-validated score on the second feature set, rounded to 4 decimals.
score = cross_val_score(model2, X_train2, y_train)
print(score.mean().round(4))

0.8797


In [19]:
# Evaluate the second-feature-set model on its matching test matrix.
# NOTE(review): the report CSV name is derived from str(model); the recorded
# repr of model2 matches model_best's, so this call appears to overwrite the
# earlier report file - confirm.
get_performance(model2,X_test2,y_test)

{'0: No Revenue': {'precision': 0.9473846153846154, 'recall': 0.8961001164144354, 'f1-score': 0.9210290158540234, 'support': 3436}, '1: Revenue': {'precision': 0.5641025641025641, 'recall': 0.7298578199052133, 'f1-score': 0.6363636363636364, 'support': 633}, 'accuracy': 0.8702383878102727, 'macro avg': {'precision': 0.7557435897435898, 'recall': 0.8129789681598243, 'f1-score': 0.7786963261088299, 'support': 4069}, 'weighted avg': {'precision': 0.8877587764901601, 'recall': 0.8702383878102727, 'f1-score': 0.8767446252869516, 'support': 4069}}


               precision    recall  f1-score   support

0: No Revenue       0.95      0.90      0.92      3436
   1: Revenue       0.56      0.73      0.64       633

     accuracy                           0.87      4069
    macro avg       0.76      0.81      0.78      4069
 weighted avg       0.89      0.87      0.88      4069

Confusion Matrix:
 [[3079  357]
 [ 171  462]]
Accuracy:  0.8702383878102727
ROC-AUC:  0.9018265847903527


## Feature Selection and Feature Engineering

In [21]:
# Alternative feature blocks that get_X() appends to the selected base columns.
X_alt_train = np.load('X_alt_train.npy')
X_alt_test = np.load('X_alt_test.npy')

In [22]:
# Positional indices of the base feature columns to retain; get_X() applies
# them to the columns of X_train / X_test.
to_keep = [10, 17, 11, 22, 1, 9, 27, 3, 13, 2, 0, 4, 21, 6, 5, 24, 23, 19, 25, 20]

def get_X(to_keep,X_train,X_test,X_alt_train,X_alt_test):
    """Build train/test matrices from selected base columns plus extra features.

    Parameters
    ----------
    to_keep : list of int
        Positional column indices to select from ``X_train`` / ``X_test``,
        in the order they should appear in the output.
    X_train, X_test : pandas.DataFrame or array-like
        Base feature matrices. DataFrames are indexed positionally with
        ``.iloc``; anything else is converted with ``np.asarray`` and indexed
        directly.
    X_alt_train, X_alt_test : array-like
        Extra feature blocks appended column-wise; each must have the same
        number of rows as its base matrix.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train3, X_test3)``: the selected base columns followed by the
        extra columns.
    """
    def _select(X):
        # DataFrames keep their original code path; plain arrays are sliced directly.
        if hasattr(X, "iloc"):
            return X.iloc[:, to_keep].to_numpy()
        return np.asarray(X)[:, to_keep]

    X_train3 = np.hstack((_select(X_train), X_alt_train))
    X_test3 = np.hstack((_select(X_test), X_alt_test))

    return X_train3, X_test3

In [24]:
# Assemble the third feature set: selected base columns plus the alternative
# feature blocks.
X_train3, X_test3 = get_X(to_keep, X_train, X_test, X_alt_train, X_alt_test)

# Persist both matrices to disk.
np.save(file="X_trainLR.npy", arr=X_train3)
np.save(file="X_testLR.npy", arr=X_test3)

In [25]:
# Repeat the solver search on the third (selected + alternative) feature set.
best_params = get_best_params(model,param_dist,X_train3,y_train)
print(best_params)

# The recorded search picked 'sag' here, a solver that shuffles the data, so
# random_state is pinned to make the fit and the metrics below repeatable.
# A random_state supplied through best_params would take precedence.
model3 = LogisticRegression(**{"random_state": 0, **best_params})
model3.fit(X_train3,y_train)  # fit the model to the training data

Fitting 3 folds for each of 4 candidates, totalling 12 fits




{'solver': 'sag', 'penalty': 'none'}




LogisticRegression(penalty='none', solver='sag')

In [26]:
# Mean cross-validated score on the third feature set, rounded to 4 decimals.
score = cross_val_score(model3, X_train3, y_train)
print(score.mean().round(4))



0.8735




In [27]:
# Evaluate the third-feature-set model on its matching test matrix; the
# classification report is also written to a CSV file as a side effect.
get_performance(model3,X_test3,y_test)

{'0: No Revenue': {'precision': 0.9568664763717095, 'recall': 0.8780558789289872, 'f1-score': 0.9157687054181212, 'support': 3436}, '1: Revenue': {'precision': 0.5425764192139738, 'recall': 0.7851500789889415, 'f1-score': 0.6417043253712073, 'support': 633}, 'accuracy': 0.8636028508232981, 'macro avg': {'precision': 0.7497214477928417, 'recall': 0.8316029789589643, 'f1-score': 0.7787365153946642, 'support': 4069}, 'weighted avg': {'precision': 0.8924168312056129, 'recall': 0.8636028508232981, 'f1-score': 0.8731334750004027, 'support': 4069}}


               precision    recall  f1-score   support

0: No Revenue       0.96      0.88      0.92      3436
   1: Revenue       0.54      0.79      0.64       633

     accuracy                           0.86      4069
    macro avg       0.75      0.83      0.78      4069
 weighted avg       0.89      0.86      0.87      4069

Confusion Matrix:
 [[3017  419]
 [ 136  497]]
Accuracy:  0.8636028508232981
ROC-AUC:  0.8994720890414107
