In [2]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import svm
from sklearn.model_selection import cross_val_score
import time

%matplotlib inline

Now it's time for another guided example. This time we're going to look at recipes. Specifically we'll use the epicurious dataset, which has a collection of recipes, key terms and ingredients, and their ratings.

What we want to see is if we can use the ingredient and keyword list to predict the rating. For someone writing a cookbook this could be really useful information that could help them choose which recipes to include because they're more likely to be enjoyed and therefore make the book more likely to be successful.

First let's load the dataset. It's [available on Kaggle](https://www.kaggle.com/hugodarwood/epirecipes). We'll use the csv file here and as pull out column names and some summary statistics for ratings.

In [3]:
df_recipe = pd.read_csv('https://tf-assets-prod.s3.amazonaws.com/tf-curric/data-science/epi_r.csv')

In [4]:
df_recipe.head(3)

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
list(df_recipe.columns)

['title',
 'rating',
 'calories',
 'protein',
 'fat',
 'sodium',
 '#cakeweek',
 '#wasteless',
 '22-minute meals',
 '3-ingredient recipes',
 '30 days of groceries',
 'advance prep required',
 'alabama',
 'alaska',
 'alcoholic',
 'almond',
 'amaretto',
 'anchovy',
 'anise',
 'anniversary',
 'anthony bourdain',
 'aperitif',
 'appetizer',
 'apple',
 'apple juice',
 'apricot',
 'arizona',
 'artichoke',
 'arugula',
 'asian pear',
 'asparagus',
 'aspen',
 'atlanta',
 'australia',
 'avocado',
 'back to school',
 'backyard bbq',
 'bacon',
 'bake',
 'banana',
 'barley',
 'basil',
 'bass',
 'bastille day',
 'bean',
 'beef',
 'beef rib',
 'beef shank',
 'beef tenderloin',
 'beer',
 'beet',
 'bell pepper',
 'berry',
 'beverly hills',
 'birthday',
 'biscuit',
 'bitters',
 'blackberry',
 'blender',
 'blue cheese',
 'blueberry',
 'boil',
 'bok choy',
 'bon appétit',
 'bon app��tit',
 'boston',
 'bourbon',
 'braise',
 'bran',
 'brandy',
 'bread',
 'breadcrumbs',
 'breakfast',
 'brie',
 'brine',
 'brisk

In [5]:
df_recipe.shape

(20052, 680)

In [6]:
df_recipe.index

RangeIndex(start=0, stop=20052, step=1)

In [7]:
df_recipe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20052 entries, 0 to 20051
Columns: 680 entries, title to turkey
dtypes: float64(679), object(1)
memory usage: 104.0+ MB


In [8]:
#Number of non-NA values
df_recipe.count()

title                    20052
rating                   20052
calories                 15935
protein                  15890
fat                      15869
sodium                   15933
#cakeweek                20052
#wasteless               20052
22-minute meals          20052
3-ingredient recipes     20052
30 days of groceries     20052
advance prep required    20052
alabama                  20052
alaska                   20052
alcoholic                20052
almond                   20052
amaretto                 20052
anchovy                  20052
anise                    20052
anniversary              20052
anthony bourdain         20052
aperitif                 20052
appetizer                20052
apple                    20052
apple juice              20052
apricot                  20052
arizona                  20052
artichoke                20052
arugula                  20052
asian pear               20052
                         ...  
walnut                   20052
wasabi  

In [9]:
# Count nulls 
df_recipe.isnull().sum()

title                       0
rating                      0
calories                 4117
protein                  4162
fat                      4183
sodium                   4119
#cakeweek                   0
#wasteless                  0
22-minute meals             0
3-ingredient recipes        0
30 days of groceries        0
advance prep required       0
alabama                     0
alaska                      0
alcoholic                   0
almond                      0
amaretto                    0
anchovy                     0
anise                       0
anniversary                 0
anthony bourdain            0
aperitif                    0
appetizer                   0
apple                       0
apple juice                 0
apricot                     0
arizona                     0
artichoke                   0
arugula                     0
asian pear                  0
                         ... 
walnut                      0
wasabi                      0
washington

In [10]:
# Count nulls 
null_count = df_recipe.isnull().sum()
null_count[null_count>0]

calories    4117
protein     4162
fat         4183
sodium      4119
dtype: int64

In [11]:
df_recipe.rating.describe()

count    20052.000000
mean         3.714467
std          1.340829
min          0.000000
25%          3.750000
50%          4.375000
75%          4.375000
max          5.000000
Name: rating, dtype: float64

In [12]:
set(list(df_recipe.columns))

{'#cakeweek',
 '#wasteless',
 '22-minute meals',
 '3-ingredient recipes',
 '30 days of groceries',
 'advance prep required',
 'alabama',
 'alaska',
 'alcoholic',
 'almond',
 'amaretto',
 'anchovy',
 'anise',
 'anniversary',
 'anthony bourdain',
 'aperitif',
 'appetizer',
 'apple',
 'apple juice',
 'apricot',
 'arizona',
 'artichoke',
 'arugula',
 'asian pear',
 'asparagus',
 'aspen',
 'atlanta',
 'australia',
 'avocado',
 'back to school',
 'backyard bbq',
 'bacon',
 'bake',
 'banana',
 'barley',
 'basil',
 'bass',
 'bastille day',
 'bean',
 'beef',
 'beef rib',
 'beef shank',
 'beef tenderloin',
 'beer',
 'beet',
 'bell pepper',
 'berry',
 'beverly hills',
 'birthday',
 'biscuit',
 'bitters',
 'blackberry',
 'blender',
 'blue cheese',
 'blueberry',
 'boil',
 'bok choy',
 'bon appétit',
 'bon app��tit',
 'boston',
 'bourbon',
 'braise',
 'bran',
 'brandy',
 'bread',
 'breadcrumbs',
 'breakfast',
 'brie',
 'brine',
 'brisket',
 'broccoli',
 'broccoli rabe',
 'broil',
 'brooklyn',
 'brow

In [13]:
set(['rating', 'title'])


{'rating', 'title'}

In [14]:
X = df_recipe[list(set(list(df_recipe.columns)) - set(['rating', 'title']))]
y= df_recipe['rating']

In [15]:
X.head(2)

Unnamed: 0,raw,paris,beet,sandwich,onion,almond,portland,smoker,flaming hot summer,pot pie,...,michigan,squash,blue cheese,houston,thyme,rack of lamb,port,arugula,wisconsin,cognac/armagnac
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [16]:
#Classifies rating, >4 = 1 (high rating) & <4 = 0 (Low rating).
y_class = y.copy()
y_class[y_class < 4] = 0
y_class[y_class > 4] = 1
y_class.value_counts()

1.0    10738
0.0     9314
Name: rating, dtype: int64

In [17]:
y_class

0        0.0
1        1.0
2        0.0
3        1.0
4        0.0
5        1.0
6        1.0
7        0.0
8        1.0
9        0.0
10       0.0
11       1.0
12       1.0
13       1.0
14       0.0
15       1.0
16       0.0
17       1.0
18       1.0
19       1.0
20       1.0
21       0.0
22       0.0
23       0.0
24       0.0
25       0.0
26       0.0
27       0.0
28       1.0
29       1.0
        ... 
20022    1.0
20023    0.0
20024    1.0
20025    1.0
20026    1.0
20027    1.0
20028    0.0
20029    0.0
20030    0.0
20031    1.0
20032    1.0
20033    1.0
20034    1.0
20035    0.0
20036    1.0
20037    0.0
20038    0.0
20039    0.0
20040    1.0
20041    0.0
20042    0.0
20043    1.0
20044    0.0
20045    0.0
20046    0.0
20047    0.0
20048    1.0
20049    1.0
20050    1.0
20051    1.0
Name: rating, Length: 20052, dtype: float64

In [18]:
X.describe()

Unnamed: 0,raw,paris,beet,sandwich,onion,almond,portland,smoker,flaming hot summer,pot pie,...,michigan,squash,blue cheese,houston,thyme,rack of lamb,port,arugula,wisconsin,cognac/armagnac
count,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,...,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0
mean,0.00389,0.000199,0.010473,0.021345,0.11161,0.029124,0.000748,0.000349,0.000598,0.0001,...,0.001197,0.00753,0.01172,5e-05,0.016607,0.001446,0.005486,0.013864,0.000199,0.004538
std,0.062249,0.014123,0.101802,0.144534,0.314894,0.168159,0.027341,0.018681,0.024456,0.009987,...,0.034576,0.086453,0.107623,0.007062,0.127796,0.038003,0.073864,0.116929,0.014123,0.067215
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [19]:
# Count nulls 
null_count = df_recipe.isnull().sum()
col_null = null_count[null_count>0]
col_null

calories    4117
protein     4162
fat         4183
sodium      4119
dtype: int64

In [20]:
# Drop NAN, columns['protein', 'calories', 'fat', 'sodium']
X_null = X.isnull().sum()
null_cols = list(X_null[X_null > 0].index)
X_clean = X.drop(columns = null_cols)

In [21]:
# Drop NAN, columns['protein', 'calories', 'fat', 'sodium']
X_null = X.isnull().sum()
null_cols 


['sodium', 'calories', 'fat', 'protein']

In [22]:
#Check
null_count = X_clean.isnull().sum()
col_null = null_count[null_count>0]
col_null

Series([], dtype: int64)

In [23]:
#Drop skewed means
low_means = []
for col in X_clean.columns:
    if X_clean[col].mean() < 0.1:
        low_means.append(col)
print("Dropping {} columns due to insignificant mean".format(len(low_means)))
X_clean = X_clean.drop(columns = low_means)
       

Dropping 648 columns due to insignificant mean


In [24]:
X_clean.head()

Unnamed: 0,onion,dessert,quick & easy,tomato,gourmet,sauté,no sugar added,peanut free,side,bon appétit,...,pescatarian,soy free,wheat/gluten-free,dairy free,fall,vegetable,tree nut free,healthy,kidney friendly,dinner
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0


In [25]:
X_clean.columns

Index(['onion', 'dessert', 'quick & easy', 'tomato', 'gourmet', 'sauté',
       'no sugar added', 'peanut free', 'side', 'bon appétit', 'summer',
       'bake', 'kosher', 'vegetarian', 'winter', 'sugar conscious',
       'pescatarian', 'soy free', 'wheat/gluten-free', 'dairy free', 'fall',
       'vegetable', 'tree nut free', 'healthy', 'kidney friendly', 'dinner'],
      dtype='object')

In [26]:
# Feature list
print ("Number of features: {}".format(len(X_clean.columns)))

Number of features: 26


In [27]:
X_clean.shape

(20052, 26)

In [28]:
from sklearn.svm import SVR
svr = SVR()
X = X_clean
Y = y_class
svr.fit(X,Y)



SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
  gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
  tol=0.001, verbose=False)

In [29]:
print("score : {}".format(svr.score(X, Y)))

score : -0.12700701079231402


In [30]:
from sklearn.model_selection import cross_val_score
cross_val_score(svr, X, Y, cv=5)



array([-0.16021793, -0.17294904, -0.16748199, -0.1674532 , -0.17035568])

In [31]:
import time
#SV Classifier
start = time.time()
svc_model = svm.SVC()
fit = svc_model.fit(X_clean, y_class)
y_pred = svc_model.predict(X_clean)
print ("Runtime: %0.2f seconds" % (time.time() - start))



Runtime: 29.28 seconds


In [32]:
start = time.time()
svc_score = svc_model.score(X_clean, y_class)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Baseline Score: %0.3f" % (svc_score))

Runtime: 11.26 seconds
Baseline Score: 0.579


In [33]:
#Improving

start = time.time()
svc_iter_model = svm.SVC(C = 100)
print ("Runtime: %0.2f seconds" % (time.time() - start))

Runtime: 0.00 seconds


In [34]:
# Add nutritional information, imputing nulls with median
X_nut = df_recipe[list(set(list(df_recipe.columns)) - set(['rating', 'title']))]
X_nut.drop(columns = low_means, inplace = True)
X_nut = X_nut.fillna(X_nut.median())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [35]:
#Add nutritional information, imputing nulls w/ median
X_nut = df_recipe[list(set(list(df_recipe.columns)) - set(['rating', 'title']))]
X_nut.drop(columns = low_means, inplace = True)
X_nut = X_nut.fillna(X_nut.median())
X_nut.head()

Unnamed: 0,onion,dessert,quick & easy,tomato,gourmet,sauté,no sugar added,peanut free,side,bon appétit,...,wheat/gluten-free,fat,dairy free,protein,fall,vegetable,tree nut free,healthy,kidney friendly,dinner
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,30.0,0.0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,23.0,0.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,7.0,0.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,17.0,1.0,8.0,0.0,0.0,1.0,1.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,32.0,0.0,20.0,1.0,1.0,0.0,0.0,0.0,0.0


In [36]:
X_nut.shape

(20052, 30)

In [37]:
start = time.time()
svc_iter_cross_val_scores = cross_val_score(svc_iter_model, X_nut, y_class, cv=5)
print ("Runtime: %0.2f seconds" % (time.time() - start))
print ("Mean Accuracy: %0.3f (+/- %0.3f)" % (svc_iter_cross_val_scores.mean(), svc_iter_cross_val_scores.std()))



Runtime: 373.12 seconds
Mean Accuracy: 0.603 (+/- 0.007)
