# 6. Data Preparation (7 Points):

In [11]:
#Importing all relevant packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import plotly.express as px
from sklearn.model_selection import train_test_split
from matplotlib import rcParams
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier, GradientBoostingClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
import pickle
from plotly.subplots import make_subplots
import plotly.graph_objs as go
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

# 6.1 Load the dataset and display the dataframe (2 Points).

In [42]:
#Reading in Data Set and displaying first 5 rows
OSI_g2 = pd.read_csv("../datasets/online_shoppers_intention.csv")

# Converting to Integer
OSI_g2["Weekend"] = OSI_g2["Weekend"].astype(int)
OSI_g2["Revenue"] = OSI_g2["Revenue"].astype(int)

#Adding a numeric version of visitor type and dropping original column
OSI_g2.insert(loc=16, column="VisitorTypeNumeric", value=pd.factorize(OSI_g2['VisitorType'])[0] + 1)
OSI_g2 = OSI_g2.drop('VisitorType', axis=1)

#Renaming columns to remove the underscore
new_col_names_g2 = {
    'Administrative_Duration' : 'AdministrativeDuration',
    'Informational_Duration' : 'InformationalDuration',
    'ProductRelated_Duration' : 'ProductRelatedDuration'
}
OSI_g2.rename(columns=new_col_names_g2, inplace=True)

#Converting Months to Numeric so that our models can correctly analyze its input
month_numeric_encoding_g2 = {
    "Jan": 1, "Feb": 2, "Mar": 3, "Apr": 4,
    "May": 5, "Jun": 6, "Jul": 7, "Aug": 8,
    "Sep": 9, "Oct": 10, "Nov": 11, "Dec": 12
}
OSI_g2["Month"] = OSI_g2["Month"].map(month_numeric_encoding_g2)

#Printing data set
OSI_g2.head(6)

Unnamed: 0,Administrative,AdministrativeDuration,Informational,InformationalDuration,ProductRelated,ProductRelatedDuration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorTypeNumeric,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2.0,1,1,1,1,1,0,0
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,2.0,2,2,1,2,1,0,0
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,2.0,4,1,9,3,1,0,0
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,2.0,3,2,2,4,1,0,0
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,2.0,3,3,1,4,1,1,0
5,0,0.0,0,0.0,19,154.216667,0.015789,0.024561,0.0,0.0,2.0,2,2,1,3,1,0,0


# 6.2 Use describe to provide statistics on the pandas Dataframe (2 Points).

In [43]:
#Using describe function to display statistics on the dataframe
OSI_g2.describe()

Unnamed: 0,Administrative,AdministrativeDuration,Informational,InformationalDuration,ProductRelated,ProductRelatedDuration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorTypeNumeric,Weekend,Revenue
count,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12042.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0,12330.0
mean,2.315166,80.818611,0.503569,34.472398,31.731468,1194.74622,0.022191,0.043073,5.889258,0.061427,7.691496,2.124006,2.357097,3.147364,4.069586,1.151176,0.232603,0.154745
std,3.321784,176.779107,1.270156,140.749294,44.475503,1913.669288,0.048488,0.048597,18.568437,0.198917,3.423429,0.911325,1.717277,2.401591,4.025169,0.376989,0.422509,0.361676
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,7.0,184.1375,0.0,0.014286,0.0,0.0,5.0,2.0,2.0,1.0,2.0,1.0,0.0,0.0
50%,1.0,7.5,0.0,0.0,18.0,598.936905,0.003112,0.025156,0.0,0.0,8.0,2.0,2.0,3.0,2.0,1.0,0.0,0.0
75%,4.0,93.25625,0.0,0.0,38.0,1464.157214,0.016813,0.05,0.0,0.0,11.0,3.0,2.0,4.0,4.0,1.0,0.0,0.0
max,27.0,3398.75,24.0,2549.375,705.0,63973.52223,0.2,0.2,361.763742,1.0,12.0,8.0,13.0,9.0,20.0,3.0,1.0,1.0


# 6.3 Split the dataset into a Training set and a Test set. Justify your preferred split (3 Points).

In [44]:
#First, we need to impute missing values before proceeding with train/test
imputer = SimpleImputer(strategy='most_frequent')
OSI_imputed_g2 = pd.DataFrame(imputer.fit_transform(OSI_g2), columns=OSI_g2.columns)

In [45]:
#Setting up our X and Y based on our input variables and our target variable being Revenue
X_g2 = OSI_imputed_g2.loc[:, OSI_imputed_g2.columns != 'Revenue'].to_numpy()
y_g2 = OSI_imputed_g2.iloc[:,-1:].to_numpy()

#Using the train_test_split function with 70/30 split since the dataset is imbalanced.
X_train_g2, X_test_g2, y_train_g2, y_test_g2 = train_test_split(X_g2, y_g2, test_size=0.30, random_state=385)

**Because our data set is imbalanced, we decided to use a 70-30 split, rather than 80-20. This will allow our models to train on a greater percentage of the data set to hopefully include the handful of positives in our target variable.**

# 7. Classification Routine (12 Points):
Execute a classification routine using RandomForestClassifier(), BaggingClassifier(), and XGboostclassifier(). Independently output the accuracy box plot as discussed in class. Use any package you are comfortable with (seaborn, matplotlib).

## 7.1 RandomForestClassifier():

In [48]:
#Creating our instance of Bagging Classifier using 100 trees.
clf_g2 = RandomForestClassifier(n_estimators=100, random_state=385)

#Creating a new variable that represents the fitted BaggingClassifier model. 
rf_fit_g2 = clf_g2.fit(X_train_g2, y_train_g2)

#Store the predicted target values in variable new variable.
y_pred_g2 = rf_fit_g2.predict(X_test_g2)

print(classification_report(y_test_g2, y_pred_g2))

  rf_fit_g2 = clf_g2.fit(X_train_g2, y_train_g2)


              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      3127
         1.0       0.74      0.55      0.63       572

    accuracy                           0.90      3699
   macro avg       0.83      0.76      0.79      3699
weighted avg       0.89      0.90      0.89      3699



## 7.2. BaggingClassifier():

In [49]:
#Creating our instance of Bagging Classifier using 100 trees.
clf_g2 = BaggingClassifier(n_estimators=100, random_state=385)

#Creating a new variable that represents the fitted BaggingClassifier model. 
bag_fit_g2 = clf_g2.fit(X_train_g2, y_train_g2)

#Store the predicted target values in variable new variable.
y_pred_g2 = bag_fit_g2.predict(X_test_g2)

print(classification_report(y_test_g2, y_pred_g2))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

         0.0       0.92      0.96      0.94      3127
         1.0       0.73      0.57      0.64       572

    accuracy                           0.90      3699
   macro avg       0.83      0.77      0.79      3699
weighted avg       0.90      0.90      0.90      3699



## 7.3. XGboostclassifier():

In [50]:
#Creating our instance of Bagging Classifier using 100 trees.
clf_g2 = XGBClassifier(n_estimators=100, random_state=385)

#Creating a new variable that represents the fitted BaggingClassifier model. 
xgb_fit_g2 = clf_g2.fit(X_train_g2, y_train_g2)

#Store the predicted target values in variable new variable.
y_pred_g2 = xgb_fit_g2.predict(X_test_g2)

print(classification_report(y_test_g2, y_pred_g2))

              precision    recall  f1-score   support

         0.0       0.93      0.96      0.94      3127
         1.0       0.71      0.58      0.64       572

    accuracy                           0.90      3699
   macro avg       0.82      0.77      0.79      3699
weighted avg       0.89      0.90      0.89      3699



## Accuracy Boxplot:

# 8. Classification with GridSearchCV (8 Points):

# 9. Classification with RandomSearchCV (8 Points):

# 10. Comparison and Analysis (5 Points):