# Classification Models

In [1]:
# Do not change this cell
import datetime
print(datetime.datetime.now())

2020-12-13 14:24:24.749043


In [2]:
# Do not change this cell

import pandas as pd
import numpy as np
import sklearn

print('scikit-learn: {}'.format(sklearn.__version__))
print('pandas: {}'.format(pd.__version__))
print('numpy: {}'.format(np.__version__))

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

scikit-learn: 0.23.2
pandas: 1.1.1
numpy: 1.19.1


In [3]:
# Do not change this cell

import os
os.getcwd()

'/Users/chelsea/Desktop/Individual'

In [4]:
# TODO: Import other packages as necessary
from sklearn.metrics import classification_report, accuracy_score, cohen_kappa_score, f1_score, log_loss, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.neighbors import KNeighborsClassifier
import warnings; warnings.simplefilter('ignore')

In [5]:
# Read in the data
df = pd.read_csv("OJ.csv") 

In [6]:
#to see a few examples of the instances
df.head()

Unnamed: 0,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,Purchase
0,237,1,1.75,1.99,0.0,0.0,0,0,0.5,CH
1,239,1,1.75,1.99,0.0,0.3,0,1,0.6,CH
2,245,1,1.86,2.09,0.17,0.0,0,0,0.68,CH
3,227,1,1.69,1.69,0.0,0.0,0,0,0.4,MM
4,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,CH


In [7]:
#to get the summary statistics of the dataset
df.shape
df.info()
df.describe().transpose()

(1070, 10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070 entries, 0 to 1069
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   WeekofPurchase  1070 non-null   int64  
 1   StoreID         1070 non-null   int64  
 2   PriceCH         1070 non-null   float64
 3   PriceMM         1070 non-null   float64
 4   DiscCH          1070 non-null   float64
 5   DiscMM          1070 non-null   float64
 6   SpecialCH       1070 non-null   int64  
 7   SpecialMM       1070 non-null   int64  
 8   LoyalCH         1070 non-null   float64
 9   Purchase        1070 non-null   object 
dtypes: float64(5), int64(4), object(1)
memory usage: 83.7+ KB


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
WeekofPurchase,1070.0,254.381308,15.558286,227.0,240.0,257.0,268.0,278.0
StoreID,1070.0,3.959813,2.308984,1.0,2.0,3.0,7.0,7.0
PriceCH,1070.0,1.867421,0.10197,1.69,1.79,1.86,1.99,2.09
PriceMM,1070.0,2.085411,0.134386,1.69,1.99,2.09,2.18,2.29
DiscCH,1070.0,0.05186,0.117474,0.0,0.0,0.0,0.0,0.5
DiscMM,1070.0,0.123364,0.213834,0.0,0.0,0.0,0.23,0.8
SpecialCH,1070.0,0.147664,0.354932,0.0,0.0,0.0,0.0,1.0
SpecialMM,1070.0,0.161682,0.368331,0.0,0.0,0.0,0.0,1.0
LoyalCH,1070.0,0.565782,0.307843,1.1e-05,0.325257,0.6,0.850873,0.999947


In [8]:
#create label for each instance
#assume an instance with CH purchased is a postitive case, and an instance with MM purchased is a negative case.
df['PurchaseLabel']= 0
df['PurchaseLabel'][df['Purchase']=='CH']=1

In [9]:
#Check if there is missing values in the dataset
df.isnull().values.any()

False

# Training/Test Split

In [10]:
#remove the columns 'purchase' and 'PurchaseLabel' as they contain label information
remove_cols = ['Purchase','PurchaseLabel']
df2 = df.drop(remove_cols, axis=1)

In [11]:
#to see a few examples of the instances
df2.head()

Unnamed: 0,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH
0,237,1,1.75,1.99,0.0,0.0,0,0,0.5
1,239,1,1.75,1.99,0.0,0.3,0,1,0.6
2,245,1,1.86,2.09,0.17,0.0,0,0,0.68
3,227,1,1.69,1.69,0.0,0.0,0,0,0.4
4,228,7,1.69,1.69,0.0,0.0,0,0,0.956535


In [12]:
#tried to use MinMax method to scale numerical features, however, it doesn't work better than log transformation
#df2['PriceMM'].hist()
#df2['PriceMM'].max()
#df2['PriceMM'].min()
#b=(df2['PriceMM']-1.69)/(2.29-1.69)
#b.hist()

In [13]:
#do apply log transformation on numerical features
df2['PriceCH']=np.log(df2['PriceCH']+1)
df2['PriceMM']=np.log(df2['PriceMM']+1)
df2['DiscCH']=np.log(df2['DiscCH']+1)
df2['DiscMM']=np.log(df2['DiscMM']+1)
df2['LoyalCH']=np.log(df2['LoyalCH']+1)

In [14]:
#split the text and train based on week of purchase 
#so the training dataset has earlier purchases and testing dataset has more recent purchases
#this will help with reducing the issue of data leakage
target_col = 'PurchaseLabel'

y = df[target_col]

train_index = df2['WeekofPurchase'] <= 268
test_index = ~train_index

X_train, X_test = df2[train_index], df2[test_index]
y_train, y_test = y[train_index], y[test_index]

X_train.shape
y_train.shape

X_test.shape
y_test.shape

(810, 9)

(810,)

(260, 9)

(260,)

# Model Development

In [15]:
#Decision Tree

In [16]:
#randomly selected a few hyperparameters as generate model performance baseline
clf = DecisionTreeClassifier(random_state=42, criterion="entropy",
                             min_samples_split=10, min_samples_leaf=10, max_depth=3, max_leaf_nodes=5)
clf.fit(X_train, y_train)

DecisionTreeClassifier(criterion='entropy', max_depth=3, max_leaf_nodes=5,
                       min_samples_leaf=10, min_samples_split=10,
                       random_state=42)

In [17]:
#Predict using the test dataset and generate the model performance measures
y_pred = clf.predict(X_test)

print("F1 Score   = {:.3f}".format(f1_score(y_test, y_pred, average="macro")))
print("Accuracy   = {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Kappa      = {:.3f}".format(cohen_kappa_score(y_test, y_pred)))
print("Log Loss   = {:.3f}".format(log_loss(y_test, y_pred)))
print("\nConfusion Matrix:")
unique_label = np.unique([y_test, y_pred])
cmtx = pd.DataFrame(
confusion_matrix(y_test, y_pred, labels=unique_label), 
index=['true:{:}'.format(x) for x in unique_label], 
columns=['pred:{:}'.format(x) for x in unique_label])
print(cmtx)
print("\nClassification  Report:")
print(classification_report(y_test, y_pred)) 

F1 Score   = 0.762
Accuracy   = 0.781
Kappa      = 0.524
Log Loss   = 7.572

Confusion Matrix:
        pred:0  pred:1
true:0      65      28
true:1      29     138

Classification  Report:
              precision    recall  f1-score   support

           0       0.69      0.70      0.70        93
           1       0.83      0.83      0.83       167

    accuracy                           0.78       260
   macro avg       0.76      0.76      0.76       260
weighted avg       0.78      0.78      0.78       260



In [18]:
#Hyperparameter tunning using the grid search approach
clf = DecisionTreeClassifier(splitter='best', class_weight=None, random_state=42)

params = {'criterion': ('gini', 'entropy'), 
              'max_depth': [2, 10, 20], 
              'min_samples_leaf': [1, 5, 10],
              'max_features':[None, 'auto'], 
              'max_leaf_nodes':[None, 10, 50],
              'min_samples_split':[1, 5, 10]}

gridsearch = GridSearchCV(clf, params, scoring='f1_macro', cv=5, return_train_score=True)

%time gridsearch.fit(X_train, y_train)

CPU times: user 9.8 s, sys: 113 ms, total: 9.91 s
Wall time: 10.1 s


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=42),
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_depth': [2, 10, 20],
                         'max_features': [None, 'auto'],
                         'max_leaf_nodes': [None, 10, 50],
                         'min_samples_leaf': [1, 5, 10],
                         'min_samples_split': [1, 5, 10]},
             return_train_score=True, scoring='f1_macro')

In [19]:
#display the best hyperparameters from the grid search
gridsearch.best_params_
gridsearch.best_score_
gridsearch.best_estimator_

{'criterion': 'gini',
 'max_depth': 2,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_samples_split': 5}

0.8032331390929995

DecisionTreeClassifier(max_depth=2, min_samples_split=5, random_state=42)

In [20]:
#re-fit the model using the best hyperparameters obtained from hyperparameter tuning
clf = DecisionTreeClassifier(random_state=42, criterion="gini",
                             min_samples_split=5, min_samples_leaf=1, max_depth=2, max_leaf_nodes=None, max_features = None)
clf.fit(X_train, y_train)


DecisionTreeClassifier(max_depth=2, min_samples_split=5, random_state=42)

In [21]:
#generate and display the performance measures for the model
y_pred = clf.predict(X_test)

print("F1 Score   = {:.3f}".format(f1_score(y_test, y_pred, average="macro")))
print("Accuracy   = {:.3f}".format(accuracy_score(y_test, y_pred)))
print("Kappa      = {:.3f}".format(cohen_kappa_score(y_test, y_pred)))
print("Log Loss   = {:.3f}".format(log_loss(y_test, y_pred)))
print("\nConfusion Matrix:")
unique_label = np.unique([y_test, y_pred])
cmtx = pd.DataFrame(
confusion_matrix(y_test, y_pred, labels=unique_label), 
index=['true:{:}'.format(x) for x in unique_label], 
columns=['pred:{:}'.format(x) for x in unique_label])
print(cmtx)
print("\nClassification  Report:")
print(classification_report(y_test, y_pred)) 

F1 Score   = 0.766
Accuracy   = 0.788
Kappa      = 0.532
Log Loss   = 7.306

Confusion Matrix:
        pred:0  pred:1
true:0      62      31
true:1      24     143

Classification  Report:
              precision    recall  f1-score   support

           0       0.72      0.67      0.69        93
           1       0.82      0.86      0.84       167

    accuracy                           0.79       260
   macro avg       0.77      0.76      0.77       260
weighted avg       0.79      0.79      0.79       260



In [22]:
#KNN

In [23]:
#randomly selected a few hyperparameters as generate model performance baseline
knn_clf = KNeighborsClassifier(n_neighbors=3)
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=3)

In [24]:
#Predict using the test dataset and generate the model performance measures
y_pred_knn = knn_clf.predict(X_test)

print("F1 Score   = {:.3f}".format(f1_score(y_test, y_pred_knn, average="macro")))
print("Accuracy   = {:.3f}".format(accuracy_score(y_test, y_pred_knn)))
print("Kappa      = {:.3f}".format(cohen_kappa_score(y_test, y_pred_knn)))
print("Log Loss   = {:.3f}".format(log_loss(y_test, y_pred_knn)))
print("\nConfusion Matrix:")
unique_label = np.unique([y_test, y_pred_knn])
cmtx = pd.DataFrame(
confusion_matrix(y_test, y_pred_knn, labels=unique_label), 
index=['true:{:}'.format(x) for x in unique_label], 
columns=['pred:{:}'.format(x) for x in unique_label])
print(cmtx)
print("\nClassification  Report:")
print(classification_report(y_test, y_pred_knn)) 

F1 Score   = 0.535
Accuracy   = 0.558
Kappa      = 0.075
Log Loss   = 15.277

Confusion Matrix:
        pred:0  pred:1
true:0      44      49
true:1      66     101

Classification  Report:
              precision    recall  f1-score   support

           0       0.40      0.47      0.43        93
           1       0.67      0.60      0.64       167

    accuracy                           0.56       260
   macro avg       0.54      0.54      0.54       260
weighted avg       0.58      0.56      0.56       260



In [25]:
#Hyperparameter tunning using the grid search approach
knn = KNeighborsClassifier()

params = {'weights': ('uniform', 'distance'), 
              'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
              'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
              'leaf_size':[10, 20, 30]}

gridsearch = GridSearchCV(knn, params, scoring='f1_macro', cv=5, return_train_score=True)

%time gridsearch.fit(X_train, y_train)

CPU times: user 32.3 s, sys: 1.56 s, total: 33.8 s
Wall time: 26.6 s


GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'algorithm': ('auto', 'ball_tree', 'kd_tree', 'brute'),
                         'leaf_size': [10, 20, 30],
                         'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'weights': ('uniform', 'distance')},
             return_train_score=True, scoring='f1_macro')

In [26]:
#display the best hyperparameters from the grid search
gridsearch.best_params_
gridsearch.best_score_
gridsearch.best_estimator_

{'algorithm': 'brute', 'leaf_size': 10, 'n_neighbors': 7, 'weights': 'uniform'}

0.7202633747891805

KNeighborsClassifier(algorithm='brute', leaf_size=10, n_neighbors=7)

In [27]:
#re-fit the model using the best hyperparameters obtained from hyperparameter tuning
knn_clf = KNeighborsClassifier(n_neighbors=7,leaf_size= 10, algorithm ='brute', weights ='uniform')
knn_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='brute', leaf_size=10, n_neighbors=7)

In [28]:
#generate and display the performance measures for the model
y_pred_knn = knn_clf.predict(X_test)

print("F1 Score   = {:.3f}".format(f1_score(y_test, y_pred_knn, average="macro")))
print("Accuracy   = {:.3f}".format(accuracy_score(y_test, y_pred_knn)))
print("Kappa      = {:.3f}".format(cohen_kappa_score(y_test, y_pred_knn)))
print("Log Loss   = {:.3f}".format(log_loss(y_test, y_pred_knn)))
print("\nConfusion Matrix:")
unique_label = np.unique([y_test, y_pred_knn])
cmtx = pd.DataFrame(
confusion_matrix(y_test, y_pred_knn, labels=unique_label), 
index=['true:{:}'.format(x) for x in unique_label], 
columns=['pred:{:}'.format(x) for x in unique_label])
print(cmtx)
print("\nClassification  Report:")
print(classification_report(y_test, y_pred_knn)) 

F1 Score   = 0.559
Accuracy   = 0.704
Kappa      = 0.215
Log Loss   = 10.229

Confusion Matrix:
        pred:0  pred:1
true:0      17      76
true:1       1     166

Classification  Report:
              precision    recall  f1-score   support

           0       0.94      0.18      0.31        93
           1       0.69      0.99      0.81       167

    accuracy                           0.70       260
   macro avg       0.82      0.59      0.56       260
weighted avg       0.78      0.70      0.63       260



In [29]:
#Random Forest 

In [30]:
#randomly selected a few hyperparameters as generate model performance baseline
RF_clf = RandomForestClassifier(max_depth= None, criterion="gini", max_features = 'auto', random_state=42)
RF_clf.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [31]:
#Predict using the test dataset and generate the model performance measures
y_pred_RF = RF_clf.predict(X_test)

print("F1 Score   = {:.3f}".format(f1_score(y_test, y_pred_RF, average="macro")))
print("Accuracy   = {:.3f}".format(accuracy_score(y_test, y_pred_RF)))
print("Kappa      = {:.3f}".format(cohen_kappa_score(y_test, y_pred_RF)))
print("Log Loss   = {:.3f}".format(log_loss(y_test, y_pred_RF)))
print("\nConfusion Matrix:")
unique_label = np.unique([y_test, y_pred_RF])
cmtx = pd.DataFrame(
confusion_matrix(y_test, y_pred_RF, labels=unique_label), 
index=['true:{:}'.format(x) for x in unique_label], 
columns=['pred:{:}'.format(x) for x in unique_label])
print(cmtx)
print("\nClassification  Report:")
print(classification_report(y_test, y_pred_RF)) 

F1 Score   = 0.787
Accuracy   = 0.812
Kappa      = 0.575
Log Loss   = 6.509

Confusion Matrix:
        pred:0  pred:1
true:0      61      32
true:1      17     150

Classification  Report:
              precision    recall  f1-score   support

           0       0.78      0.66      0.71        93
           1       0.82      0.90      0.86       167

    accuracy                           0.81       260
   macro avg       0.80      0.78      0.79       260
weighted avg       0.81      0.81      0.81       260



In [32]:
#Hyperparameter tunning using the grid search approach
clf = RandomForestClassifier(random_state=42, criterion='gini')
params = { 
              'n_estimators':[10, 25, 50, 100],
              'min_samples_leaf': [1, 5, 10],
              'min_samples_split':[1, 2, 5, 10],
              'max_depth': [None, 10, 20], 
              'max_features':[None, 'auto'], 
              'max_leaf_nodes':[None, 10, 50]}

gridsearch = GridSearchCV(clf, params, scoring='f1_macro', cv=5, return_train_score=True)

%time gridsearch.fit(X_train, y_train)

CPU times: user 4min 44s, sys: 3.51 s, total: 4min 48s
Wall time: 4min 52s


GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
             param_grid={'max_depth': [None, 10, 20],
                         'max_features': [None, 'auto'],
                         'max_leaf_nodes': [None, 10, 50],
                         'min_samples_leaf': [1, 5, 10],
                         'min_samples_split': [1, 2, 5, 10],
                         'n_estimators': [10, 25, 50, 100]},
             return_train_score=True, scoring='f1_macro')

In [33]:
#display the best hyperparameters from the grid search
gridsearch.best_params_
gridsearch.best_score_
gridsearch.best_estimator_

{'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': 10,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 50}

0.8229326335595044

RandomForestClassifier(max_leaf_nodes=10, n_estimators=50, random_state=42)

In [34]:
#re-fit the model using the best hyperparameters obtained from hyperparameter tuning
RF_clf = RandomForestClassifier(max_depth= None, criterion="gini", max_features = 'auto',max_leaf_nodes =10,min_samples_leaf=1,min_samples_split=2,n_estimators=50, random_state=42)
RF_clf.fit(X_train, y_train)

RandomForestClassifier(max_leaf_nodes=10, n_estimators=50, random_state=42)

In [35]:
#generate and display the performance measures for the model
y_pred_RF = RF_clf.predict(X_test)

print("F1 Score   = {:.3f}".format(f1_score(y_test, y_pred_RF, average="macro")))
print("Accuracy   = {:.3f}".format(accuracy_score(y_test, y_pred_RF)))
print("Kappa      = {:.3f}".format(cohen_kappa_score(y_test, y_pred_RF)))
print("Log Loss   = {:.3f}".format(log_loss(y_test, y_pred_RF)))
print("\nConfusion Matrix:")
unique_label = np.unique([y_test, y_pred_RF])
cmtx = pd.DataFrame(
confusion_matrix(y_test, y_pred_RF, labels=unique_label), 
index=['true:{:}'.format(x) for x in unique_label], 
columns=['pred:{:}'.format(x) for x in unique_label])
print(cmtx)
print("\nClassification  Report:")
print(classification_report(y_test, y_pred_RF)) 

F1 Score   = 0.738
Accuracy   = 0.777
Kappa      = 0.482
Log Loss   = 7.705

Confusion Matrix:
        pred:0  pred:1
true:0      51      42
true:1      16     151

Classification  Report:
              precision    recall  f1-score   support

           0       0.76      0.55      0.64        93
           1       0.78      0.90      0.84       167

    accuracy                           0.78       260
   macro avg       0.77      0.73      0.74       260
weighted avg       0.77      0.78      0.77       260

