# Classification Modeling

# Loading the Libraries.

In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
#import statsmodels.api as sm
#import statsmodels.formula.api as smf

sns.set(style='ticks', palette='Set2')
%matplotlib inline 

# Reading in the Data

In [2]:
#Import the OJ data set. 

df_oj_raw = pd.read_csv('OJ.csv')
print(df_oj_raw.head())
print(df_oj_raw.shape)

   Unnamed: 0 Purchase  WeekofPurchase  StoreID  PriceCH  PriceMM  DiscCH  \
0           1       CH             237        1     1.75     1.99    0.00   
1           2       CH             239        1     1.75     1.99    0.00   
2           3       CH             245        1     1.86     2.09    0.17   
3           4       MM             227        1     1.69     1.69    0.00   
4           5       CH             228        7     1.69     1.69    0.00   

   DiscMM  SpecialCH  SpecialMM   LoyalCH  SalePriceMM  SalePriceCH  \
0     0.0          0          0  0.500000         1.99         1.75   
1     0.3          0          1  0.600000         1.69         1.75   
2     0.0          0          0  0.680000         2.09         1.69   
3     0.0          0          0  0.400000         1.69         1.69   
4     0.0          0          0  0.956535         1.69         1.69   

   PriceDiff Store7  PctDiscMM  PctDiscCH  ListPriceDiff  STORE  
0       0.24     No   0.000000   0.000000   

# Data Cleaning and Processing

In [3]:
df_oj_raw.columns.values

array(['Unnamed: 0', 'Purchase', 'WeekofPurchase', 'StoreID', 'PriceCH',
       'PriceMM', 'DiscCH', 'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH',
       'SalePriceMM', 'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM',
       'PctDiscCH', 'ListPriceDiff', 'STORE'], dtype=object)

In [4]:
df_oj = df_oj_raw.drop(columns=['Unnamed: 0'])
df_oj.head()

# Determine which variables are Categorical and which are continuous.

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


In [5]:
print(df_oj.shape)

(1070, 18)


In [6]:
df_oj.columns.values

array(['Purchase', 'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM',
       'DiscCH', 'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH',
       'SalePriceMM', 'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM',
       'PctDiscCH', 'ListPriceDiff', 'STORE'], dtype=object)

In [8]:
df_oj.describe()

# Note non-numeric columns were dropped. 

Unnamed: 0,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
count,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,254.381308,3.959813,1.867421,2.085411,0.05186,0.123364,0.147664,0.161682,0.565782,1.962047,1.815561,0.146486,0.059298,0.027314,0.217991,1.630841
std,15.558286,2.308984,0.10197,0.134386,0.117474,0.213834,0.354932,0.368331,0.307843,0.252697,0.143384,0.271563,0.10176,0.062232,0.107535,1.430387
min,227.0,1.0,1.69,1.69,0.0,0.0,0.0,0.0,1.1e-05,1.19,1.39,-0.67,0.0,0.0,0.0,0.0
25%,240.0,2.0,1.79,1.99,0.0,0.0,0.0,0.0,0.325257,1.69,1.75,0.0,0.0,0.0,0.14,0.0
50%,257.0,3.0,1.86,2.09,0.0,0.0,0.0,0.0,0.6,2.09,1.86,0.23,0.0,0.0,0.24,2.0
75%,268.0,7.0,1.99,2.18,0.0,0.23,0.0,0.0,0.850873,2.13,1.89,0.32,0.112676,0.0,0.3,3.0
max,278.0,7.0,2.09,2.29,0.5,0.8,1.0,1.0,0.999947,2.29,2.09,0.64,0.40201,0.252688,0.44,4.0


In [9]:
df_oj.head()

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


In [10]:
# seperate the data into target and features. 

# Convert the target feature into 0 and 1 instead of CH and MM. with MM = 1 (MinuteMaid) and CH = 0.

df_oj['MM'] = df_oj['Purchase'].apply(lambda x: 1 if x == 'MM' else 0)

df_oj.head()

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE,MM
0,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1,0
1,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1,0
2,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1,0
3,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1,1
4,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0,0


## Separating the Data into Training, Validation, and Testing.

In [11]:
# Create matrices
from patsy import dmatrices
y, X = dmatrices('MM ~ WeekofPurchase + C(StoreID) + PriceCH + PriceMM + DiscCH + DiscMM + SpecialCH + SpecialMM + LoyalCH + SalePriceMM + SalePriceCH + PriceDiff +  PctDiscMM + PctDiscCH + ListPriceDiff', df_oj, return_type = 'dataframe')
# Do not keep Store7 and STORE variables, because are only keeping StoreID. And the target feature is MM. 

#y = df_oj[['Purchase']]
print(y.shape)
print(y.head())
#X = df_oj.drop(columns=['Purchase'])
print(X.shape)
X.head()

(1070, 1)
    MM
0  0.0
1  0.0
2  0.0
3  1.0
4  0.0
(1070, 19)


Unnamed: 0,Intercept,C(StoreID)[T.2],C(StoreID)[T.3],C(StoreID)[T.4],C(StoreID)[T.7],WeekofPurchase,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,PctDiscMM,PctDiscCH,ListPriceDiff
0,1.0,0.0,0.0,0.0,0.0,237.0,1.75,1.99,0.0,0.0,0.0,0.0,0.5,1.99,1.75,0.24,0.0,0.0,0.24
1,1.0,0.0,0.0,0.0,0.0,239.0,1.75,1.99,0.0,0.3,0.0,1.0,0.6,1.69,1.75,-0.06,0.150754,0.0,0.24
2,1.0,0.0,0.0,0.0,0.0,245.0,1.86,2.09,0.17,0.0,0.0,0.0,0.68,2.09,1.69,0.4,0.0,0.091398,0.23
3,1.0,0.0,0.0,0.0,0.0,227.0,1.69,1.69,0.0,0.0,0.0,0.0,0.4,1.69,1.69,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0,1.0,228.0,1.69,1.69,0.0,0.0,0.0,0.0,0.956535,1.69,1.69,0.0,0.0,0.0,0.0


In [12]:
# Seperate the data into train, validation, and test sets by doing a 60-20-20 split. 
# Training = train + validation

from sklearn.model_selection import train_test_split

# First split data into Training and Test data 80-20
#
X_training, X_test, y_training, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Now split the Training data into Train and validation sets with 75-25 ratio so that the final split is 60-20-20. 
#
X_train, X_val, y_train, y_val = train_test_split(X_training, y_training, test_size=0.25, random_state=42)

# Print and check if things make sense
print(X.shape[0]*.6, X.shape[0]*.2)

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

# Note that we are doing a 60-20-20 split for ease of computation. In fact, since our data size is actually relatively small, 
# only 1070 observations, cross-validation should be done. 


642.0 214.0
(642, 19) (214, 19) (214, 19)
(642, 1) (214, 1) (214, 1)


## Scaling the Data

In [13]:
# Scale the training, validation and test sets.

# Data has categorical variables, the below will not work. Need to take care of those, e.g. using embeddings, before
# scaling the data. 

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

print('shape for training dataset:', X_train_s.shape)
print('shape for validation dataset:', X_val_s.shape)
print('shape for testing dataset:', X_test_s.shape)




shape for training dataset: (642, 19)
shape for validation dataset: (642, 19)
shape for testing dataset: (214, 19)


In [14]:
# This MinMax method perhaps is not as appropriate as the normalization method. 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_ss = scaler.fit_transform(X_train)
X_val_ss = scaler.transform(X_val)
X_test_ss = scaler.transform(X_test)

print('shape for training dataset:', X_train.shape)
print('shape for validation dataset:', X_val.shape)
print('shape for testing dataset:', X_test.shape)

shape for training dataset: (642, 19)
shape for validation dataset: (214, 19)
shape for testing dataset: (214, 19)


In [15]:
# Build models with Purchase as the target feature. 

# Model 1: Logistic regression model
# Model 2: RandomForest model
# Model 3: XGboost model
# Model 4: NeuralNet
# Model 5: 

print(X_train.columns.values)
#X_train.head()

['Intercept' 'C(StoreID)[T.2]' 'C(StoreID)[T.3]' 'C(StoreID)[T.4]'
 'C(StoreID)[T.7]' 'WeekofPurchase' 'PriceCH' 'PriceMM' 'DiscCH' 'DiscMM'
 'SpecialCH' 'SpecialMM' 'LoyalCH' 'SalePriceMM' 'SalePriceCH' 'PriceDiff'
 'PctDiscMM' 'PctDiscCH' 'ListPriceDiff']


In [16]:
X_test.head()
y_test.head()

Unnamed: 0,MM
644,0.0
629,1.0
70,0.0
962,0.0
787,0.0


In [17]:
import re

fm = re.sub(' ', '+', X_train.columns.values)
print(fm)

[re.sub("(,[ ]*!.*)$", "", x) for x in strings]

TypeError: ignored

In [18]:
import re
strings = ["Important text,      !Comment that could be removed", "Other String"]
[re.sub("(,[ ]*!.*)$", "", x) for x in strings]

['Important text', 'Other String']

# Machine Learning Models

#### Helper function for Evaluation metrics

In [0]:
# Defining a function to produce Evaluation metrics

from sklearn.metrics.classification import classification_report, accuracy_score, confusion_matrix, ...

# Writing the evaluation metrics as a function. 

def eval_metrics(y_test, y_pred):

  # Using functions from sklearn.metrics.classification. 
  print("Confusion Matrix")
  print(confusion_matrix( y_test, y_pred))
  print("Othere Accuracy Metrics")
  accuracy = accuracy_score(y_pred, y_test)
  print('accuracy: %.1f%%' % (accuracy * 100.0))
  print(classification_report(y_pred, y_test))


## Logistic Regression

In [19]:
# Model 1: Logistic regression model

#from sklearn.linear_model import LogisticRegression
#clf = LogisticRegression(random_state=0, solver='lbfgs',
#                         multi_class='multinomial').fit(X_train, y_train)

#clf.predict(X_train)


# module imports
from sklearn.linear_model import LogisticRegression
import statsmodels.discrete.discrete_model as sm

# sklearn output
model = LogisticRegression(fit_intercept = False) #, solver='lbfgs')
#y_train_array = column_or_1d(y_train, warn=True)
mdl = model.fit(X_training, y_training.values.ravel())
model.coef_
y_test_pred = model.predict(X_test)
y_training_pred = model.predict(X_training)

# Statsmodels
# sm
#logit = sm.Logit(y_train, X_train)
#results = logit.fit()




In [27]:
# Evaluation Metrics for Training data - Logistic Regression

eval_metrics(y_training, y_training_pred)

Confusion Matrix
[[465  58]
 [ 76 257]]
Othere Accuracy Metrics
accuracy: 84.3%
              precision    recall  f1-score   support

         0.0       0.89      0.86      0.87       541
         1.0       0.77      0.82      0.79       315

    accuracy                           0.84       856
   macro avg       0.83      0.84      0.83       856
weighted avg       0.85      0.84      0.84       856



In [28]:
# Evaluation Metrics for Test data - Logistic Regression


eval_metrics(y_test, y_test_pred)

Confusion Matrix
[[117  13]
 [ 25  59]]
Othere Accuracy Metrics
accuracy: 82.2%
              precision    recall  f1-score   support

         0.0       0.90      0.82      0.86       142
         1.0       0.70      0.82      0.76        72

    accuracy                           0.82       214
   macro avg       0.80      0.82      0.81       214
weighted avg       0.83      0.82      0.83       214



## Random Forest

In [24]:
# Model 2: RandomForest model
from sklearn.ensemble import RandomForestClassifier

# Create a random forest Classifier. # number of trees, depth of each tree. 
rf_clf = RandomForestClassifier(n_jobs=2, random_state=42)

# Training the random forest Classifier 
rf_clf.fit(X_training, y_training.values.ravel())

rf_training_pred = rf_clf.predict(X_training)

rf_test_pred = rf_clf.predict(X_test)





In [25]:
# Evaluation metrics for Training set

eval_metrics(y_training, rf_training_pred)

Confusion Matrix
[[513  10]
 [ 15 318]]
Othere Accuracy Metrics
accuracy: 97.1%
              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98       528
         1.0       0.95      0.97      0.96       328

    accuracy                           0.97       856
   macro avg       0.97      0.97      0.97       856
weighted avg       0.97      0.97      0.97       856



In [26]:
# Evaluation metrics for Testing 

eval_metrics(y_test, rf_test_pred)

Confusion Matrix
[[102  28]
 [ 25  59]]
Othere Accuracy Metrics
accuracy: 75.2%
              precision    recall  f1-score   support

         0.0       0.78      0.80      0.79       127
         1.0       0.70      0.68      0.69        87

    accuracy                           0.75       214
   macro avg       0.74      0.74      0.74       214
weighted avg       0.75      0.75      0.75       214



## Naive Bayes

In [0]:
from sklearn.naive_bayes import GaussianNB

nb = GaussianNB()
nb = nb.fit(X_training, y_training.values.ravel())
nb

nb_training_pred = nb.predict(X_training)
nb_test_pred = nb.predict(X_test)




In [32]:
# Evaluation metrics for Naive Bayes - Training Set

eval_metrics(y_training, nb_training_pred)

Confusion Matrix
[[384 139]
 [ 62 271]]
Othere Accuracy Metrics
accuracy: 76.5%
              precision    recall  f1-score   support

         0.0       0.73      0.86      0.79       446
         1.0       0.81      0.66      0.73       410

    accuracy                           0.77       856
   macro avg       0.77      0.76      0.76       856
weighted avg       0.77      0.77      0.76       856



In [33]:
# Evaluation metrics for Naive Bayes - Test Set


eval_metrics(y_test, nb_test_pred)

Confusion Matrix
[[95 35]
 [22 62]]
Othere Accuracy Metrics
accuracy: 73.4%
              precision    recall  f1-score   support

         0.0       0.73      0.81      0.77       117
         1.0       0.74      0.64      0.69        97

    accuracy                           0.73       214
   macro avg       0.73      0.73      0.73       214
weighted avg       0.73      0.73      0.73       214



## KNN

In [0]:
# Model 5: KNN

from sklearn.neighbors import KNeighborsClassifier

k_knn = 10

model_knn =  KNeighborsClassifier(n_neighbors=k_knn)
model_knn.fit(X_training, y_training.values.ravel())

knn_training_pred = model_knn.predict(X_training)
knn_test_pred = model_knn.predict(X_test)

# Grid search for optimal k_knn
# for k in 1:20
#       .....

In [35]:
# Evaluation metrics for KNN - Training Set

eval_metrics(y_training, knn_training_pred)

Confusion Matrix
[[476  47]
 [140 193]]
Othere Accuracy Metrics
accuracy: 78.2%
              precision    recall  f1-score   support

         0.0       0.91      0.77      0.84       616
         1.0       0.58      0.80      0.67       240

    accuracy                           0.78       856
   macro avg       0.74      0.79      0.75       856
weighted avg       0.82      0.78      0.79       856



In [36]:
# Evaluation metrics for KNN - Test Set

eval_metrics(y_test, knn_test_pred)

Confusion Matrix
[[107  23]
 [ 48  36]]
Othere Accuracy Metrics
accuracy: 66.8%
              precision    recall  f1-score   support

         0.0       0.82      0.69      0.75       155
         1.0       0.43      0.61      0.50        59

    accuracy                           0.67       214
   macro avg       0.63      0.65      0.63       214
weighted avg       0.71      0.67      0.68       214



## SVM

In [0]:
# Model  SVM

from sklearn.svm import SVC

svm_model = SVC(gamma='auto')
svm_model.fit(X_training, y_training.values.ravel())

svm_training_pred = svm_model.predict(X_training)
svm_test_pred = svm_model.predict(X_test)


In [39]:
# Evaluation metrics for KNN - Test Set

eval_metrics(y_training, svm_training_pred)

Confusion Matrix
[[478  45]
 [120 213]]
Othere Accuracy Metrics
accuracy: 80.7%
              precision    recall  f1-score   support

         0.0       0.91      0.80      0.85       598
         1.0       0.64      0.83      0.72       258

    accuracy                           0.81       856
   macro avg       0.78      0.81      0.79       856
weighted avg       0.83      0.81      0.81       856



In [40]:
# Evaluation metrics for KNN - Test Set

eval_metrics(y_test, svm_test_pred)

Confusion Matrix
[[105  25]
 [ 41  43]]
Othere Accuracy Metrics
accuracy: 69.2%
              precision    recall  f1-score   support

         0.0       0.81      0.72      0.76       146
         1.0       0.51      0.63      0.57        68

    accuracy                           0.69       214
   macro avg       0.66      0.68      0.66       214
weighted avg       0.71      0.69      0.70       214



## XGBoost

In [0]:
import xgboost as xgb

# Model with just default settings.
xgboost_model1 = xgb.XGBClassifier()

# Can do hyperparameter search. 
# xgboost_model2 = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5)


xgboost_model1.fit(X_training, y_training.values.ravel())

xgb_training_pred = xgboost_model1.predict(X_training)
xgb_test_pred = xgboost_model1.predict(X_test)

In [0]:
# Evaluation metrics for SVM - Test Set

eval_metrics(y_training, xgb_training_pred)

In [0]:
# Evaluation metrics for SVM - Test Set

eval_metrics(y_test, xgb_test_pred)

## Neural Net

In [0]:
# Model 4: NeuralNet
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense

import pytorch as pt

# define the NN architecture in keras. 


# Architecture 1
model = keras.Sequential([keras.layers.Flatten(input_shape (28,28)),
                keras.layers.Dense(128,activation = tf.nn.sigmoid),                          
                keras.layers.Dense(10,activation = tf.nn.softmax)])
model.compile(optimizer = 'adam',loss='sparse_categorical_crossentropy',metrics =['accuracy'])
model.fit(x_train, y_train,epochs = 5)



# Architecture 2
model = Sequential()
model.add(Dense(12, input_dim=8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# compile the keras model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


# fit the keras model on the dataset
model.fit(X, y, epochs=150, batch_size=10)

# evaluate the keras model
_, accuracy = model.evaluate(X, y)
print('Accuracy: %.2f' % (accuracy*100))


# make class predictions with the model
nn_training_pred = model.predict_classes(X)
nn_test_pred = model.predict_classes(X_test)

# Comparing the Models