In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import operator as op
%matplotlib inline

#### Importing, Cleaning and Selecting Data:

In [2]:
# Load the raw recipe table, then derive a second frame without the four
# nutritional columns so both variants can be modelled side by side.
data = pd.read_csv('Epicurious.csv')
nutrition_cols = ['calories', 'protein', 'fat', 'sodium']
data2 = data.drop(columns=nutrition_cols)

In [4]:
# With Nutritional Info:
# Rank the columns by their correlation with `rating` (highest first) and keep
# the top 50. `rating` correlates perfectly with itself, so it is one of the
# 50 kept columns and is dropped again when the feature matrix is built.
# Restricting to numeric/bool dtypes first keeps .corr() working on
# pandas >= 2.0, which no longer silently skips non-numeric columns.

rating_corr = data.select_dtypes(include=['number', 'bool']).corr()['rating']
rating_corr = rating_corr.sort_values(ascending = False)

key_list = list(rating_corr.index[:50])
# Explicit copy: later cells add a `binary_rating` column to this frame.
data = data[key_list].copy()

In [5]:
# Without Nutritional Info:
# Same ranking as for `data`, but on the frame without the nutritional columns
# and keeping only the top 31 entries (`rating` itself is one of them).
# Restricting to numeric/bool dtypes first keeps .corr() working on
# pandas >= 2.0, which no longer silently skips non-numeric columns.

rating_corr2 = data2.select_dtypes(include=['number', 'bool']).corr()['rating']
rating_corr2 = rating_corr2.sort_values(ascending = False)

key_list2 = list(rating_corr2.index[:31])
# Explicit copy: later cells add a `binary_rating` column to this frame.
data2 = data2[key_list2].copy()

In [6]:
# Binary classification target: 1 for ratings of 4 and above, 0 for anything
# lower. Both frames get the same `binary_rating` column via boolean masks.
for frame in (data, data2):
    rated_high = frame['rating'] >= 4
    rated_low = frame['rating'] < 4
    frame.loc[rated_high, 'binary_rating'] = 1
    frame.loc[rated_low, 'binary_rating'] = 0

#### Splitting our Training and Test Sets:

In [7]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix

# Features: every selected column except the raw rating and the label derived
# from it. `columns=` replaces the positional axis argument, which pandas 2.0
# no longer accepts.
X1 = data2.drop(columns=['rating', 'binary_rating'])
y1 = data2.binary_rating

# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every score reported below, reproducible on Restart & Run All.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = .2, random_state = 42)

  from collections import Sequence


#### First Model (Small Dataset):

In [8]:
from sklearn.svm import SVC

# Linear-kernel SVM trained on the training split of the small feature set.
svc_1 = SVC(kernel='linear').fit(X1_train, y1_train)

# Accuracy on the rows the model was fitted on and on the held-out rows.
svc_1_trainscore = svc_1.score(X1_train, y1_train)
svc_1_testscore = svc_1.score(X1_test, y1_test)

# Held-out predictions, tabulated against the true labels for classes 0 and 1.
y1_pred_ = svc_1.predict(X1_test)
svc_1_cfmat = confusion_matrix(y_true=y1_test, y_pred=y1_pred_, labels=[0, 1])

# 10-fold cross-validation over the complete feature matrix.
svc_1_cvscores = cross_val_score(svc_1, X1, y1, cv=10)

In [9]:
# One weight per feature column of X1, as learned by the linear kernel.
print(f"The coefficients are: \n {svc_1.coef_}")

The coefficients are: 
 [[ 1.99860836e+00  8.59749428e-04  2.05594448e-04 -3.20998315e-04
   1.58906287e-04  1.02660349e-03  9.66509176e-05 -1.07255392e-04
   5.16524306e-04 -5.87026161e-04 -2.20392046e-04  5.77719992e-04
   5.21931170e-04  3.39467966e-04  1.05890835e-03 -9.67217955e-05
   1.99758571e+00 -4.42523820e-04  3.04112173e-04 -2.86772277e-04
  -1.16023226e-04  6.66492278e-04  6.09964607e-04  2.02119523e-04
   7.06738111e-04  4.46270631e-04  6.64260310e-04 -1.81036898e-04
   5.75005480e-04  1.29589169e-05]]


In [10]:
# Accuracy of each of the 10 cross-validation folds.
print(f"The CV Scores are: \n {svc_1_cvscores}")

The CV Scores are: 
 [0.55284148 0.55383848 0.55732802 0.56530409 0.54912718 0.55062344
 0.57406484 0.53516209 0.55239521 0.55339321]


In [52]:
# Summary for model 1: cross-validation spread, train/test accuracy and the
# confusion matrix with the rates derived from it.
print("The Mean and STD of the CV Scores are:\n Mean:{}. STD: {}.".format(svc_1_cvscores.mean(),
                                                                          svc_1_cvscores.std()))
print("The Training score is: {}".format(svc_1_trainscore))
print("The Test score is: {}".format(svc_1_testscore))
print(svc_1_cfmat)

# The matrix was built with labels=[0, 1], so flattening it row by row yields
# (TN, FP, FN, TP). Working with scalars prints plain numbers instead of the
# nested 1x1 arrays that 2-D slicing produced.
tn, fp, fn, tp = svc_1_cfmat.ravel()
print("Specificity:{}".format(tn / (tn + fp)))
print("Sensitivity:{}".format(tp / (fn + tp)))

The Mean and STD of the CV Scores are:
 Mean:0.5544078043071993. STD: 0.009664971552262066.
The Training score is: 0.5536437877937784
The Test score is: 0.5574669658439292
[[ 454 1401]
 [ 374 1782]]
Specificity:[[0.24474394]]
Sensitivity:[[0.82653061]]


In [12]:
# Distribution of the raw ratings and of the derived binary label in data2.
# Each label ends with a newline so the first table row lines up with the rest.
print("Value Counts for Data2: \n {}".format(data2.rating.value_counts()))
print("Value Counts for our Binary Column: \n {}".format(data2.binary_rating.value_counts()))

Value Counts for Data2: 4.375    8019
3.750    5169
5.000    2719
0.000    1836
3.125    1489
2.500     532
1.250     164
1.875     124
Name: rating, dtype: int64
Value Counts for our Binary Column: 
 1.0    10738
0.0     9314
Name: binary_rating, dtype: int64


> Given that our binary classes are divided fairly evenly, we don't have to worry about class imbalance here - our main concern is the lack of correlation between the features and the outcome variable.  Using a binary outcome variable has helped, and it would perhaps help as well to lower our cut-off to 3.75 instead of 4, which we'll do below, after investigating our larger data frame ('data'), which includes the nutritional information and a larger number of columns.

#### Second Model (Large Dataset):

In [13]:
# Features: every selected column of the large frame except the raw rating and
# the label derived from it. `columns=` replaces the positional axis argument,
# which pandas 2.0 no longer accepts.
X2 = data.drop(columns=['rating', 'binary_rating'])
y2 = data.binary_rating

# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every score reported below, reproducible on Restart & Run All.
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = .2, random_state = 42)

In [14]:
# Linear-kernel SVM trained on the training split of the large feature set.
svc_2 = SVC(kernel='linear').fit(X2_train, y2_train)

# Accuracy on the rows the model was fitted on and on the held-out rows.
svc_2_trainscore = svc_2.score(X2_train, y2_train)
svc_2_testscore = svc_2.score(X2_test, y2_test)

# Held-out predictions, tabulated against the true labels for classes 0 and 1.
y2_pred_ = svc_2.predict(X2_test)
svc_2_cfmat = confusion_matrix(y_true=y2_test, y_pred=y2_pred_, labels=[0, 1])

# 10-fold cross-validation over the complete feature matrix.
svc_2_cvscores = cross_val_score(svc_2, X2, y2, cv=10)

In [15]:
# One weight per feature column of X2, as learned by the linear kernel.
print(f"The coefficients are: \n {svc_2.coef_}")

The coefficients are: 
 [[ 1.24847418  0.15716924  0.24507018 -0.04072991  0.15790374  0.43825547
   0.07248807 -0.02526013  0.23835764 -0.07920606 -0.07817526  0.28581627
   0.34238851  0.20884113  0.43808281  0.08278367  0.76124946 -0.2090276
   0.20526987 -0.12059186  0.04838711  0.09694742  0.24045194  0.11577093
   0.39982624  0.19171237  0.39590641 -0.08369852  0.47475643  0.11344484
   0.6539698   0.28893508  0.04175122  0.21631996  0.3363189   0.19718094
   0.33130265  0.31289277  0.33595707  0.06784213 -0.36401064  0.33032709
   0.34855891  0.23676866  0.31151587  0.09601881  0.36725806  0.17222813
  -0.0092481 ]]


In [16]:
# Accuracy of each of the 10 cross-validation folds.
print(f"The CV Scores are: \n {svc_2_cvscores}")

The CV Scores are: 
 [0.56779661 0.5667996  0.55633101 0.58624128 0.56608479 0.57605985
 0.59351621 0.5446384  0.57235529 0.55289421]


In [53]:
# Summary for model 2: cross-validation spread, train/test accuracy and the
# confusion matrix with the rates derived from it.
print("The Mean and STD of the CV Scores are:\n Mean:{}. STD: {}.".format(svc_2_cvscores.mean(),
                                                                          svc_2_cvscores.std()))
print("The Training score is: {}".format(svc_2_trainscore))
print("The Test score is: {}".format(svc_2_testscore))
print(svc_2_cfmat)

# The matrix was built with labels=[0, 1], so flattening it row by row yields
# (TN, FP, FN, TP). Working with scalars prints plain numbers instead of the
# nested 1x1 arrays that 2-D slicing produced.
tn, fp, fn, tp = svc_2_cfmat.ravel()
print("Specificity:{}".format(tn / (tn + fp)))
print("Sensitivity:{}".format(tp / (fn + tp)))

The Mean and STD of the CV Scores are:
 Mean:0.5682717247384779. STD: 0.014079336862241885.
The Training score is: 0.5760239386571909
The Test score is: 0.5676888556469708
[[ 773 1073]
 [ 661 1504]]
Specificity:[[0.41874323]]
Sensitivity:[[0.69468822]]


In [29]:
# Distribution of the raw ratings and of the derived binary label in data.
# Each label ends with a newline so the first table row lines up with the rest.
print("Value Counts for Data: \n {}".format(data.rating.value_counts()))
print("Value Counts for our Binary Column: \n {}".format(data.binary_rating.value_counts()))

Value Counts for Data: 4.375    8019
3.750    5169
5.000    2719
0.000    1836
3.125    1489
2.500     532
1.250     164
1.875     124
Name: rating, dtype: int64
Value Counts for our Binary Column: 
 1.0    10738
0.0     9314
Name: binary_rating, dtype: int64


> In this second model the training score is higher than the test score (the inverse of the model above, where the test score was slightly higher).  Preferably, we would get a better test score with the training score a bit lower.  The mean CV score of the second model is also higher, but with a larger standard deviation than the first. You would probably want to go with the model that has the least standard deviation between these folds, as you would have a better idea of how it performs on new data input. I would imagine that the second model is somewhat more prone to overfitting since it classifies the positives a bit more accurately and would therefore probably over-classify new input data - especially since this type of data is pretty sparse. 

> Regarding bias, it is definitely present in this dataset, given that people who have chosen these recipes are more than likely to give them a rating of four or five. People tend not to choose recipes that they believe they will not like.  Given that they have most likely chosen a recipe which seemed good in the first place, there is a distinct head start towards a positive rating.  This would explain why the single elements of food are not necessarily the greatest indicator of a positive rating, but the fact that the recipe was selected in the first place is significantly more indicative. The value counts for the rating column demonstrate this: roughly 2,650 datapoints are rated below 3.125, while the remaining ~17,400 are rated 3.125 or above.

> A way to avoid such a bias would be to distribute the recipes, and especially the foods, to people who haven't necessarily chosen them on the internet. That way, people would be rating a recipe that they had not chosen themselves, but one placed in front of them (relatively) randomly. 

#### Third Model (Small Dataset):

In [54]:
# Re-label both frames with a lower cut-off: 1 for ratings of 3.5 and above,
# 0 otherwise. The printed value counts show no rating between 3.125 and 3.75,
# so this is the 3.75 cut-off mentioned in the text.
# NOTE: this overwrites the `binary_rating` column used by the first two models.
for frame in (data, data2):
    rated_high = frame['rating'] >= 3.5
    rated_low = frame['rating'] < 3.5
    frame.loc[rated_high, 'binary_rating'] = 1
    frame.loc[rated_low, 'binary_rating'] = 0

In [55]:
# Features: every selected column of the small frame except the raw rating and
# the label derived from it. `columns=` replaces the positional axis argument,
# which pandas 2.0 no longer accepts.
X3 = data2.drop(columns=['rating', 'binary_rating'])
y3 = data2.binary_rating

# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every score reported below, reproducible on Restart & Run All.
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size = .2, random_state = 42)

In [56]:
# Linear-kernel SVM trained on the small feature set with the re-cut label.
svc_3 = SVC(kernel='linear').fit(X3_train, y3_train)

# Accuracy on the rows the model was fitted on and on the held-out rows.
svc_3_trainscore = svc_3.score(X3_train, y3_train)
svc_3_testscore = svc_3.score(X3_test, y3_test)

# Held-out predictions, tabulated against the true labels for classes 0 and 1.
y3_pred_ = svc_3.predict(X3_test)
svc_3_cfmat = confusion_matrix(y_true=y3_test, y_pred=y3_pred_, labels=[0, 1])

# 10-fold cross-validation over the complete feature matrix.
svc_3_cvscores = cross_val_score(svc_3, X3, y3, cv=10)

In [57]:
# One weight per feature column of X3, as learned by the linear kernel.
print(f"The coefficients are: \n {svc_3.coef_}")

The coefficients are: 
 [[-2.04935124e-05  9.75497741e-05 -2.58093008e-05 -6.86485562e-06
  -8.38012996e-05 -6.27144819e-05 -6.18147127e-05 -7.92615103e-05
  -3.06446228e-06  6.46776675e-05  6.01074262e-06 -4.08997081e-05
  -1.53747742e-04  2.86105675e-05 -1.04992195e-04 -2.21949164e-07
  -1.93808427e-05  1.13841233e-04 -9.72464017e-05 -7.21174716e-05
  -5.78212052e-05 -8.56428173e-05 -5.82905961e-05 -4.84404828e-05
  -7.70382015e-05  0.00000000e+00 -5.05779041e-05 -3.11551321e-05
  -1.60520607e-04  9.32241948e-05]]


In [58]:
# Accuracy of each of the 10 cross-validation folds.
print(f"The CV Scores are: \n {svc_3_cvscores}")

The CV Scores are: 
 [0.79312064 0.79312064 0.79312064 0.79312064 0.79312064 0.79351621
 0.79351621 0.79341317 0.79341317 0.79341317]


In [59]:
# Summary for model 3: cross-validation spread, train/test accuracy and the
# confusion matrix with the rates derived from it.
print("The Mean and STD of the CV Scores are:\n Mean:{}. STD: {}.".format(svc_3_cvscores.mean(),
                                                                          svc_3_cvscores.std()))
print("The Training score is: {}".format(svc_3_trainscore))
print("The Test score is: {}".format(svc_3_testscore))
print(svc_3_cfmat)

# The matrix was built with labels=[0, 1], so flattening it row by row yields
# (TN, FP, FN, TP). Working with scalars prints plain numbers instead of the
# nested 1x1 arrays that 2-D slicing produced.
tn, fp, fn, tp = svc_3_cfmat.ravel()
print("Specificity:{}".format(tn / (tn + fp)))
print("Sensitivity:{}".format(tp / (fn + tp)))

The Mean and STD of the CV Scores are:
 Mean:0.7932875130339416. STD: 0.00017064938924625704.
The Training score is: 0.7932173804625646
The Test score is: 0.793567688855647
[[   0  828]
 [   0 3183]]
Specificity:[[0.]]
Sensitivity:[[1.]]


In [70]:
# Class balance of the re-cut binary label in data2.
binary_counts = data2.binary_rating.value_counts()
print(f"Value Counts for our Binary Column: \n {binary_counts}")

Value Counts for our Binary Column: 
 1.0    15907
0.0     4145
Name: binary_rating, dtype: int64


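This model is not useful: you can see from the confusion matrix that it labels every sample as positive, so it gets all of the positives right but cannot predict a single negative. Its ~79% accuracy is simply the share of positives in the data (15,907 of 20,052). Because the training and test scores are nearly identical, this is a class-imbalance problem introduced by the lower cut-off rather than overfitting.  The models above are better.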

#### Model 4: Large Dataset:

In [61]:
# Features: every selected column of the large frame except the raw rating and
# the label derived from it. `columns=` replaces the positional axis argument,
# which pandas 2.0 no longer accepts.
X4 = data.drop(columns=['rating', 'binary_rating'])
y4 = data.binary_rating

# Hold out 20% of the rows for testing. The fixed random_state makes the split,
# and therefore every score reported below, reproducible on Restart & Run All.
X4_train, X4_test, y4_train, y4_test = train_test_split(X4, y4, test_size = .2, random_state = 42)

In [62]:
# Linear-kernel SVM trained on the large feature set with the re-cut label.
svc_4 = SVC(kernel='linear').fit(X4_train, y4_train)

# Accuracy on the rows the model was fitted on and on the held-out rows.
svc_4_trainscore = svc_4.score(X4_train, y4_train)
svc_4_testscore = svc_4.score(X4_test, y4_test)

# Held-out predictions, tabulated against the true labels for classes 0 and 1.
y4_pred_ = svc_4.predict(X4_test)
svc_4_cfmat = confusion_matrix(y_true=y4_test, y_pred=y4_pred_, labels=[0, 1])

# 10-fold cross-validation over the complete feature matrix.
svc_4_cvscores = cross_val_score(svc_4, X4, y4, cv=10)

In [63]:
# One weight per feature column of X4, as learned by the linear kernel.
print(f"The coefficients are: \n {svc_4.coef_}")

The coefficients are: 
 [[-1.94446192e-05  2.48987730e-05  7.61503003e-05 -4.12478901e-05
  -3.03689424e-05  1.37543224e-05 -1.85004142e-05 -3.37188868e-05
  -5.03534927e-05  8.82173318e-05 -5.40017646e-06 -1.35280422e-04
  -2.17738830e-04  1.82102590e-05  4.53181033e-06 -5.17545988e-05
   2.57162075e-05  3.60692191e-06 -4.18469303e-05 -5.21892699e-05
  -1.99512718e-04 -1.48084041e-04 -1.77163532e-05 -9.68824548e-05
   7.46144469e-05 -9.51424175e-05  1.85763834e-06  7.29908088e-05
  -4.62212743e-05 -1.55326413e-05 -5.57243860e-06 -2.39195940e-06
   3.15921653e-06 -2.80248264e-05 -4.73412330e-05 -8.41629069e-05
  -4.24725257e-05 -2.18802853e-04  2.72503526e-04 -9.11411913e-05
  -1.27997084e-04 -5.51753333e-05 -1.53215097e-04  6.19080035e-05
  -3.39788614e-04  3.12158791e-05 -2.29209481e-04  3.07804763e-05
  -1.42308639e-04]]


In [64]:
# Accuracy of each of the 10 cross-validation folds.
print(f"The CV Scores are: \n {svc_4_cvscores}")

The CV Scores are: 
 [0.79312064 0.79312064 0.79312064 0.79312064 0.79312064 0.79351621
 0.79351621 0.79341317 0.79341317 0.79341317]


In [71]:
# Summary for model 4: cross-validation spread, train/test accuracy and the
# confusion matrix with the rates derived from it.
print("The Mean and STD of the CV Scores are:\n Mean:{}. STD: {}.".format(svc_4_cvscores.mean(),
                                                                          svc_4_cvscores.std()))
print("The Training score is: {}".format(svc_4_trainscore))
print("The Test score is: {}".format(svc_4_testscore))
print(svc_4_cfmat)

# The matrix was built with labels=[0, 1], so flattening it row by row yields
# (TN, FP, FN, TP). Working with scalars prints plain numbers instead of the
# nested 1x1 arrays that 2-D slicing produced.
tn, fp, fn, tp = svc_4_cfmat.ravel()
print("Specificity:{}".format(tn / (tn + fp)))
print("Sensitivity:{}".format(tp / (fn + tp)))

The Mean and STD of the CV Scores are:
 Mean:0.7932875130339416. STD: 0.00017064938924625704.
The Training score is: 0.7913471728695218
The Test score is: 0.8010471204188482
[[   0  798]
 [   0 3213]]
Specificity:[[0.]]
Sensitivity:[[1.]]


In [72]:
# Distribution of the raw ratings and of the re-cut binary label in data.
# Each label ends with a newline so the first table row lines up with the rest.
print("Value Counts for Data: \n {}".format(data.rating.value_counts()))
print("Value Counts for our Binary Column: \n {}".format(data.binary_rating.value_counts()))

Value Counts for Data: 4.375    8019
3.750    5169
5.000    2719
0.000    1836
3.125    1489
2.500     532
1.250     164
1.875     124
Name: rating, dtype: int64
Value Counts for our Binary Column: 
 1.0    15907
0.0     4145
Name: binary_rating, dtype: int64


Same here - the model predicts the positive class for every sample (a class-imbalance effect rather than overfitting), so it is not as good as the models above.