In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import operator as op
%matplotlib inline

#### Importing, Cleaning and Selecting Data:

In [2]:
data = pd.read_csv('Epicurious.csv')
data2 = data.drop(['calories', 'protein', 'fat', 'sodium'], axis = 1)

In [4]:
# With Nutrtional Info:

rating_corr = data.corr().rating
rating_corr = rating_corr.sort_values(ascending = False)

key_list = list(rating_corr[0:50].keys())
data = pd.DataFrame(data[key_list])

In [5]:
# Without Nutrtional Info:

rating_corr2 = data2.corr().rating
rating_corr2 = rating_corr2.sort_values(ascending = False)

key_list2 = list(rating_corr2[0:31].keys())
data2 = pd.DataFrame(data2[key_list2])

In [6]:
# Binary Classification Party:
data.loc[data['rating'] >= 4, 'binary_rating'] = 1
data.loc[data['rating'] < 4, 'binary_rating'] = 0

data2.loc[data2['rating'] >= 4, 'binary_rating'] = 1
data2.loc[data2['rating'] < 4, 'binary_rating'] = 0

#### Splitting our Training and Test Sets:

In [7]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import confusion_matrix

X1 = data2.drop(['rating', 'binary_rating'], 1)
y1 = data2.binary_rating

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size = .2)

  from collections import Sequence


#### First Model:

In [8]:
from sklearn.svm import SVC

svc_1 = SVC(kernel = 'linear')
svc_1.fit(X1_train, y1_train)

y1_pred_ = svc_1.predict(X1_test)

svc_1_cfmat = confusion_matrix(y1_test, y1_pred_, labels = [0, 1])
svc_1_cvscores = cross_val_score(svc_1, X1, y1, cv = 10)
svc_1_trainscore = svc_1.score(X1_train, y1_train)
svc_1_testscore = svc_1.score(X1_test, y1_test)

In [9]:
print("The coefficients are: \n {}".format(svc_1.coef_))

The coefficients are: 
 [[ 1.99860836e+00  8.59749428e-04  2.05594448e-04 -3.20998315e-04
   1.58906287e-04  1.02660349e-03  9.66509176e-05 -1.07255392e-04
   5.16524306e-04 -5.87026161e-04 -2.20392046e-04  5.77719992e-04
   5.21931170e-04  3.39467966e-04  1.05890835e-03 -9.67217955e-05
   1.99758571e+00 -4.42523820e-04  3.04112173e-04 -2.86772277e-04
  -1.16023226e-04  6.66492278e-04  6.09964607e-04  2.02119523e-04
   7.06738111e-04  4.46270631e-04  6.64260310e-04 -1.81036898e-04
   5.75005480e-04  1.29589169e-05]]


In [10]:
print("The CV Scores are: \n {}".format(svc_1_cvscores))

The CV Scores are: 
 [0.55284148 0.55383848 0.55732802 0.56530409 0.54912718 0.55062344
 0.57406484 0.53516209 0.55239521 0.55339321]


In [30]:
print("The Mean and STD of the CV Scores are:\n Mean:{}. STD: {}.".format(svc_1_cvscores.mean(), 
                                                                          svc_1_cvscores.std()))
print("The Training score is: {}".format(svc_1_trainscore))
print("The Test score is: {}".format(svc_1_testscore))
print(svc_1_cfmat)
print("Specificity:{}".format(svc_1_cfmat[1:2, 1:2]/(svc_1_cfmat[1:2, 0:1] + svc_1_cfmat[1:2, 1:2])))
print("Sensitivity:{}".format(svc_1_cfmat[0:1, 0:1]/(svc_1_cfmat[0:1, 0:1] + svc_1_cfmat[0:1, 1:2])))

The Mean and STD of the CV Scores are:
 Mean:0.5544078043071993. STD: 0.009664971552262066.
The Training score is: 0.5536437877937784
The Test score is: 0.5574669658439292
[[ 454 1401]
 [ 374 1782]]
Specificity:[[0.82653061]]
Sensitivity:[[0.24474394]]


In [12]:
print("Value Counts for Data2: {}".format(data2.rating.value_counts()))
print("Value Counts for our Binary Column: \n {}".format(data2.binary_rating.value_counts()))

Value Counts for Data2: 4.375    8019
3.750    5169
5.000    2719
0.000    1836
3.125    1489
2.500     532
1.250     164
1.875     124
Name: rating, dtype: int64
Value Counts for our Binary Column: 
 1.0    10738
0.0     9314
Name: binary_rating, dtype: int64


#### Second Model:

In [13]:
X2 = data.drop(['rating', 'binary_rating'], 1)
y2 = data.binary_rating

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size = .2)

In [14]:
svc_2 = SVC(kernel = 'linear')
svc_2.fit(X2_train, y2_train)

y2_pred_ = svc_2.predict(X2_test)

svc_2_cfmat = confusion_matrix(y2_test, y2_pred_, labels = [0, 1])
svc_2_cvscores = cross_val_score(svc_2, X2, y2, cv = 10)
svc_2_trainscore = svc_2.score(X2_train, y2_train)
svc_2_testscore = svc_2.score(X2_test, y2_test)

In [15]:
print("The coefficients are: \n {}".format(svc_2.coef_))

The coefficients are: 
 [[ 1.24847418  0.15716924  0.24507018 -0.04072991  0.15790374  0.43825547
   0.07248807 -0.02526013  0.23835764 -0.07920606 -0.07817526  0.28581627
   0.34238851  0.20884113  0.43808281  0.08278367  0.76124946 -0.2090276
   0.20526987 -0.12059186  0.04838711  0.09694742  0.24045194  0.11577093
   0.39982624  0.19171237  0.39590641 -0.08369852  0.47475643  0.11344484
   0.6539698   0.28893508  0.04175122  0.21631996  0.3363189   0.19718094
   0.33130265  0.31289277  0.33595707  0.06784213 -0.36401064  0.33032709
   0.34855891  0.23676866  0.31151587  0.09601881  0.36725806  0.17222813
  -0.0092481 ]]


In [16]:
print("The CV Scores are: \n {}".format(svc_2_cvscores))

The CV Scores are: 
 [0.56779661 0.5667996  0.55633101 0.58624128 0.56608479 0.57605985
 0.59351621 0.5446384  0.57235529 0.55289421]


In [31]:
print("The Mean and STD of the CV Scores are:\n Mean:{}. STD: {}.".format(svc_2_cvscores.mean(), 
                                                                          svc_2_cvscores.std()))
print("The Training score is: {}".format(svc_2_trainscore))
print("The Test score is: {}".format(svc_2_testscore))
print(svc_2_cfmat)
print("Specificity:{}".format(svc_2_cfmat[1:2, 1:2]/(svc_2_cfmat[1:2, 0:1] + svc_2_cfmat[1:2, 1:2])))
print("Sensitivity:{}".format(svc_2_cfmat[0:1, 0:1]/(svc_2_cfmat[0:1, 0:1] + svc_2_cfmat[0:1, 1:2])))

The Mean and STD of the CV Scores are:
 Mean:0.5682717247384779. STD: 0.014079336862241885.
The Training score is: 0.5760239386571909
The Test score is: 0.5676888556469708
[[ 773 1073]
 [ 661 1504]]
Specificity:[[0.69468822]]
Sensitivity:[[0.41874323]]


In [29]:
print("Value Counts for Data: {}".format(data.rating.value_counts()))
print("Value Counts for our Binary Column: \n {}".format(data.binary_rating.value_counts()))

Value Counts for Data: 4.375    8019
3.750    5169
5.000    2719
0.000    1836
3.125    1489
2.500     532
1.250     164
1.875     124
Name: rating, dtype: int64
Value Counts for our Binary Column: 
 1.0    10738
0.0     9314
Name: binary_rating, dtype: int64


> Based on these correlation maps, you can see that there is definitely a bias here: people who have choosen these recipes are more than likely to give them a rating of four or five. People tend not to choose recipes that they believe they will not like.  Given that they have most-likely choosen a recipe which seemed good in the first place, there is a distinct head-start towards a positive rating.  This would explain why the single elements of food are not necessarily the greatest indicator of a positive rating, but the fact that the recipe was selected in the first place is significantly more indicative.

> A way to avoid such a bias would be to distribute the recipes, and especially the foods, to people who haven't necessarily choosen them on the internet. That way, people would be rating a recipe that they had not choosen themselves, but one place in front of them (relatively) randomly. 

> Given that our binary classes are diveded fairly evenly, we don't have to worry about class imbalance here - our main concern is the lack of correlation amongst the features and the outcome variable.