## Group 3 - Project 2
# Online Shopper Purchase Prediction

In [24]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import model as Group3Models

In [None]:
#Read in CSV
online_shopping_df = pd.read_csv('online_shoppers_intention.csv')

# online_shopping_df.info()

online_shopping_df.value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12330 entries, 0 to 12329
Data columns (total 18 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Administrative           12330 non-null  int64  
 1   Administrative_Duration  12330 non-null  float64
 2   Informational            12330 non-null  int64  
 3   Informational_Duration   12330 non-null  float64
 4   ProductRelated           12330 non-null  int64  
 5   ProductRelated_Duration  12330 non-null  float64
 6   BounceRates              12330 non-null  float64
 7   ExitRates                12330 non-null  float64
 8   PageValues               12330 non-null  float64
 9   SpecialDay               12330 non-null  float64
 10  Month                    12330 non-null  object 
 11  OperatingSystems         12330 non-null  int64  
 12  Browser                  12330 non-null  int64  
 13  Region                   12330 non-null  int64  
 14  TrafficType           

Administrative  Administrative_Duration  Informational  Informational_Duration  ProductRelated  ProductRelated_Duration  BounceRates  ExitRates  PageValues  SpecialDay  Month  OperatingSystems  Browser  Region  TrafficType  VisitorType        Weekend  Revenue
0               0.000000                 0              0.0                     1               0.000000                 0.200000     0.200000   0.000000    0.0         Mar    2                 2        1       1            Returning_Visitor  False    False      14
                                                                                                                                                                                3                 2        3       1            Returning_Visitor  False    False       7
                                                                                                                                                                         May    2                 2        1    

In [26]:
#Drop the target column, setup X and y datasets for train test split
X = online_shopping_df.drop('Revenue', axis=1)
y = online_shopping_df['Revenue']

X.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True


## Add Encodings to Feature Columns

In [27]:
#Encode VisitorType Column
ordinal_encoder = OrdinalEncoder()
X['VisitorType'] = ordinal_encoder.fit_transform(X[['VisitorType']])

#Ordinal Encode the Month Column
ordinal_encoder = OrdinalEncoder(categories=[['Jan', 'Feb', 'Mar', 'Apr', 'May', 'June', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']])
X['Month'] = ordinal_encoder.fit_transform(X[['Month']])
X.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1.0,1,1,1,1,2.0,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,1.0,2,2,1,2,2.0,False
2,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,1.0,4,1,9,3,2.0,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,1.0,3,2,2,4,2.0,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,1.0,3,3,1,4,2.0,True


## Train Test Split

In [28]:
#Train Test Split
X_test, X_train, y_test, y_train = train_test_split(X, y)
X_train.describe()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType
count,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0,3083.0
mean,2.352579,84.518821,0.481025,31.500625,32.035031,1195.911306,0.022436,0.043363,6.272058,0.06338,6.604282,2.132339,2.323711,3.174505,4.026273,1.71132
std,3.42604,188.487127,1.214722,134.15618,44.905439,1892.318667,0.048992,0.049104,20.546699,0.201982,3.378811,0.932854,1.718976,2.390775,3.950245,0.698351
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,7.0,183.9,0.0,0.013832,0.0,0.0,4.0,2.0,2.0,1.0,2.0,2.0
50%,1.0,5.5,0.0,0.0,18.0,589.421429,0.002817,0.025181,0.0,0.0,6.0,2.0,2.0,3.0,2.0,2.0
75%,4.0,95.25,0.0,0.0,38.0,1481.438726,0.017647,0.05,0.0,0.0,10.0,3.0,2.0,4.0,4.0,2.0
max,27.0,2720.5,13.0,2256.916667,584.0,29970.46597,0.2,0.2,360.953384,1.0,11.0,8.0,13.0,9.0,20.0,2.0


## Create a Model and predict

In [29]:
#Create a Random Forest Classifier
model = Group3Models.don_model()
model.fit(X_train, y_train)
model.score(X_test, y_test)
y_pred = model.predict(X_test)

In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       False       0.93      0.95      0.94      7842
        True       0.67      0.60      0.63      1405

    accuracy                           0.89      9247
   macro avg       0.80      0.77      0.79      9247
weighted avg       0.89      0.89      0.89      9247

