In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import statsmodels.api as sm
import statsmodels.formula.api as smf

sns.set(style='ticks', palette='Set2')
%matplotlib inline 

In [2]:
#Import the OJ data set. 

df_oj_raw = pd.read_csv('OJ.csv')
print(df_oj_raw.head())
print(df_oj_raw.shape)

   Unnamed: 0 Purchase  WeekofPurchase  StoreID  PriceCH  PriceMM  DiscCH  \
0           1       CH             237        1     1.75     1.99    0.00   
1           2       CH             239        1     1.75     1.99    0.00   
2           3       CH             245        1     1.86     2.09    0.17   
3           4       MM             227        1     1.69     1.69    0.00   
4           5       CH             228        7     1.69     1.69    0.00   

   DiscMM  SpecialCH  SpecialMM   LoyalCH  SalePriceMM  SalePriceCH  \
0     0.0          0          0  0.500000         1.99         1.75   
1     0.3          0          1  0.600000         1.69         1.75   
2     0.0          0          0  0.680000         2.09         1.69   
3     0.0          0          0  0.400000         1.69         1.69   
4     0.0          0          0  0.956535         1.69         1.69   

   PriceDiff Store7  PctDiscMM  PctDiscCH  ListPriceDiff  STORE  
0       0.24     No   0.000000   0.000000   

In [3]:
df_oj_raw.columns.values

array(['Unnamed: 0', 'Purchase', 'WeekofPurchase', 'StoreID', 'PriceCH',
       'PriceMM', 'DiscCH', 'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH',
       'SalePriceMM', 'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM',
       'PctDiscCH', 'ListPriceDiff', 'STORE'], dtype=object)

In [4]:
df_oj = df_oj_raw.drop(columns=['Unnamed: 0'])
df_oj.head()

# Determine which variables are Categorical and which are continuous.

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


In [5]:
print(df_oj.shape)

(1070, 18)


In [6]:
df_oj.columns.values

array(['Purchase', 'WeekofPurchase', 'StoreID', 'PriceCH', 'PriceMM',
       'DiscCH', 'DiscMM', 'SpecialCH', 'SpecialMM', 'LoyalCH',
       'SalePriceMM', 'SalePriceCH', 'PriceDiff', 'Store7', 'PctDiscMM',
       'PctDiscCH', 'ListPriceDiff', 'STORE'], dtype=object)

In [7]:
df_oj.describe()

# Note non-numeric columns were dropped. 

Unnamed: 0,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
count,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0,1070.0
mean,254.381308,3.959813,1.867421,2.085411,0.05186,0.123364,0.147664,0.161682,0.565782,1.962047,1.815561,0.146486,0.059298,0.027314,0.217991,1.630841
std,15.558286,2.308984,0.10197,0.134386,0.117474,0.213834,0.354932,0.368331,0.307843,0.252697,0.143384,0.271563,0.10176,0.062232,0.107535,1.430387
min,227.0,1.0,1.69,1.69,0.0,0.0,0.0,0.0,1.1e-05,1.19,1.39,-0.67,0.0,0.0,0.0,0.0
25%,240.0,2.0,1.79,1.99,0.0,0.0,0.0,0.0,0.325257,1.69,1.75,0.0,0.0,0.0,0.14,0.0
50%,257.0,3.0,1.86,2.09,0.0,0.0,0.0,0.0,0.6,2.09,1.86,0.23,0.0,0.0,0.24,2.0
75%,268.0,7.0,1.99,2.18,0.0,0.23,0.0,0.0,0.850873,2.13,1.89,0.32,0.112676,0.0,0.3,3.0
max,278.0,7.0,2.09,2.29,0.5,0.8,1.0,1.0,0.999947,2.29,2.09,0.64,0.40201,0.252688,0.44,4.0


In [8]:
df_oj.head()

Unnamed: 0,Purchase,WeekofPurchase,StoreID,PriceCH,PriceMM,DiscCH,DiscMM,SpecialCH,SpecialMM,LoyalCH,SalePriceMM,SalePriceCH,PriceDiff,Store7,PctDiscMM,PctDiscCH,ListPriceDiff,STORE
0,CH,237,1,1.75,1.99,0.0,0.0,0,0,0.5,1.99,1.75,0.24,No,0.0,0.0,0.24,1
1,CH,239,1,1.75,1.99,0.0,0.3,0,1,0.6,1.69,1.75,-0.06,No,0.150754,0.0,0.24,1
2,CH,245,1,1.86,2.09,0.17,0.0,0,0,0.68,2.09,1.69,0.4,No,0.0,0.091398,0.23,1
3,MM,227,1,1.69,1.69,0.0,0.0,0,0,0.4,1.69,1.69,0.0,No,0.0,0.0,0.0,1
4,CH,228,7,1.69,1.69,0.0,0.0,0,0,0.956535,1.69,1.69,0.0,Yes,0.0,0.0,0.0,0


In [9]:
# seperate the data into target and features. 

# Convert the target feature into 0 and 1 instead of CH and MM. with MM = 1 (MinuteMaid) and CH = 0.




# Create matrices
from patsy import dmatrices
y, X = dmatrices('Purchase ~ WeekofPurchase + C(StoreID) + PriceCH + PriceMM + DiscCH + DiscMM + SpecialCH + SpecialMM + LoyalCH + SalePriceMM + SalePriceCH + PriceDiff + C(Store7) + PctDiscMM + PctDiscCH + ListPriceDiff + C(STORE)', df_oj, return_type = 'dataframe')


#y = df_oj[['Purchase']]
print(y.shape)
#X = df_oj.drop(columns=['Purchase'])
print(X.shape)

(1070, 1)
(1070, 17)


In [10]:
# Seperate the data into train, validation, and test sets by doing a 60-20-20 split. 

from sklearn.model_selection import train_test_split

# First split data into Training and Test data 80-20
#
X_training, X_test, y_training, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Now split the Training data into Train and validation sets with 75-25 ratio so that the final split is 60-20-20. 
#
X_train, X_val, y_train, y_val = train_test_split(X_training, y_training, test_size=0.25, random_state=42)

# Print and check if things make sense
print(X.shape[0]*.6, X.shape[0]*.2)

print(X_train.shape, X_val.shape, X_test.shape)
print(y_train.shape, y_val.shape, y_test.shape)

# Note that we are doing a 60-20-20 split for ease of computation. In fact, since our data size is actually relatively small, 
# only 1070 observations, cross-validation should be done. 


642.0 214.0
(642, 17) (214, 17) (214, 17)
(642, 1) (214, 1) (214, 1)


In [11]:
# Scale the training, validation and test sets.

# Data has categorical variables, the below will not work. Need to take care of those, e.g. using embeddings, before
# scaling the data. 

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_val_s = scaler.transform(X_val)
X_test_s = scaler.transform(X_test)

print('shape for training dataset:', X_train_s.shape)
print('shape for validation dataset:', X_train_s.shape)
print('shape for testing dataset:', X_test_s.shape)




ValueError: could not convert string to float: 'No'

In [12]:
# This MinMax method perhaps is not as appropriate as the normalization method. 
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_ss = scaler.fit_transform(X_train)
X_val_ss = scaler.transform(X_val)
X_test_ss = scaler.transform(X_test)

print('shape for training dataset:', X_train.shape)
print('shape for validation dataset:', X_val.shape)
print('shape for testing dataset:', X_test.shape)

ValueError: could not convert string to float: 'No'

In [19]:
# Build models with Purchase as the target feature. 

# Model 1: Logistic regression model
# Model 2: RandomForest model
# Model 3: XGboost model
# Model 4: NeuralNet
# Model 5: KNN, Naive Bayes, SVM

print(X_train.columns.values)
#X_train.head()

['WeekofPurchase' 'StoreID' 'PriceCH' 'PriceMM' 'DiscCH' 'DiscMM'
 'SpecialCH' 'SpecialMM' 'LoyalCH' 'SalePriceMM' 'SalePriceCH' 'PriceDiff'
 'Store7' 'PctDiscMM' 'PctDiscCH' 'ListPriceDiff' 'STORE']


In [16]:
import re

fm = re.sub(' ', '+', X_train.columns.values)
print(fm)

[re.sub("(,[ ]*!.*)$", "", x) for x in strings]

TypeError: cannot use a string pattern on a bytes-like object

In [17]:
import re
strings = ["Important text,      !Comment that could be removed", "Other String"]
[re.sub("(,[ ]*!.*)$", "", x) for x in strings]

['Important text', 'Other String']

In [24]:
from patsy import dmatrices

dff = pd.read_csv("https://stats.idre.ucla.edu/stat/data/binary.csv")
print(dff.head())
yy, XX = dmatrices('admit ~ gre + gpa + C(rank)', dff, return_type = 'dataframe')
XX.head()
yy.head()

   admit  gre   gpa  rank
0      0  380  3.61     3
1      1  660  3.67     3
2      1  800  4.00     1
3      1  640  3.19     4
4      0  520  2.93     4


Unnamed: 0,admit
0,0.0
1,1.0
2,1.0
3,1.0
4,0.0


In [None]:
# Model 1: Logistic regression model

#from sklearn.linear_model import LogisticRegression
#clf = LogisticRegression(random_state=0, solver='lbfgs',
#                         multi_class='multinomial').fit(X_train, y_train)

#clf.predict(X_train)


# module imports
from sklearn.linear_model import LogisticRegression
import statsmodels.discrete.discrete_model as sm

# sklearn output
model = LogisticRegression(fit_intercept = False, C = 1e9, random_state=0)
mdl = model.fit(X_train, y_train)
model.coef_

# sm
logit = sm.Logit(y_train, X_train)
logit.fit().params



In [None]:
# Model 2: RandomForest model


In [None]:
# Model 3: XGboost model
 

In [None]:
# Model 4: NeuralNet


In [None]:
# Model 5:

