In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
from math import e
%matplotlib inline


Epicurious data from [Kaggle](https://www.kaggle.com/hugodarwood/epirecipes). 

In [2]:
raw_data = pd.read_csv('epi_r.csv')
full = pd.read_json('full_format_recipes.json')

bin_ft = list(raw_data.columns)
bin_ft.remove('title')
bin_ft.remove('rating')
bin_ft.remove('calories')
bin_ft.remove('fat')
bin_ft.remove('protein')
bin_ft.remove('sodium')

missing = []
for row in raw_data.index:      #Find recipes missing from full-form set
    if raw_data.loc[row].title not in list(full.title):
        missing.append(row) 
        

print('dropping {} rows...'.format(len(missing)))
for row in missing:             #Drop the recipes without full-form available
    raw_data = raw_data.drop(row)
    
full_ind = []
for row in raw_data.index:      #Record indices for each full-form recipe in order of csv
    full_ind.append(list(full[full.title==raw_data.loc[row].title].index)[0])

#Grab wanted data from complete recipes:
categories = []
directions = []
ingredients = []                
for i in full_ind:
    row = full.loc[i]
    cat, dr, ing = row.categories, row.directions, row.ingredients
    categories.append(cat)                          #List of categories
    directions.append(dr)                           #List of directions
    ingredients.append(ing)                         #List of ingredients
    
raw_data['full_cats'] = categories                  #append to our df
raw_data['full_dir'] = directions
raw_data['full_ingr'] = ingredients

raw_data['four_stars'] = np.where(raw_data.rating>4, 1, 0)
raw_data['no_stars'] = np.where(raw_data.rating==0, 1, 0)

dropping 5 rows...


In [3]:
#Basic features: lengths of ingredients list, instruction steps, complete instructions
def get_length(lst):
    length = 0
    for step in lst:
        length += len(step)
    return length

#consolidates directions from list to single string
def get_full_dir(lst):
    doc = ''
    for step in lst:
        doc += step + ' '
    return doc

#Build features
raw_data['n_ingredients'] = raw_data.full_ingr.apply(len)
raw_data.n_ingredients = raw_data.n_ingredients/raw_data.n_ingredients.max()
raw_data['n_steps'] = raw_data.full_dir.apply(len)
raw_data.n_steps = raw_data.n_steps/raw_data.n_steps.max()
raw_data['n_words'] = raw_data.full_dir.apply(get_length)
raw_data.n_words = raw_data.n_words/raw_data.n_words.max()
raw_data['dir_doc'] = raw_data.full_dir.apply(get_full_dir)

In [4]:
#Build tfid vectorizer
vctzr = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
vctzr.build_analyzer()

#Train on recipe corpus
vctzr = vctzr.fit(raw_data.dir_doc)

#Get sparse matrix of recipe text features
A = vctzr.transform(raw_data.dir_doc)
X = A
Y = raw_data['no_stars']

#Use random forest classifier to select most important models
clf = ExtraTreesClassifier(class_weight='balanced')
clf.fit(X, Y)

sel = SelectFromModel(clf, prefit=True, threshold=.0035)
print(X.shape)
X_txt = sel.transform(X)
X_txt.shape

txt_map = list(sel.transform(vctzr.get_feature_names()).ravel())
print('most important recipe text features: ', txt_map)
    

(20047, 5000)
most important recipe text features:  ['and pepper', 'cocktail glass', 'heavy large', 'inch', 'into cocktail', 'let cool', 'minutes', 'pepper', 'salt', 'salt and', 'sprinkle', 'stand', 'the', 'until', 'with']




In [5]:
#Find most important features from categories
X = raw_data[bin_ft]

clf = ExtraTreesClassifier(class_weight='balanced')
clf.fit(X, Y)
sel = SelectFromModel(clf, prefit=True, threshold=.008)
print(len(X.columns))
X_cts = sel.transform(X)
print(X_cts.shape)

#Find the corresponding categories for selected features
keepers = sel.transform(range(0, len(bin_ft)))

cts_map = []
print('most important categories:')
for num in keepers.ravel():
    cts_map.append(bin_ft[int(num)])
    print(bin_ft[int(num)])

674
(20047, 19)
most important categories:
alcoholic
bake
bon appétit
cocktail party
drink
fall
gin
gourmet
house & garden
kid-friendly
onion
peanut free
quick & easy
sauce
sauté
summer
vegetable
vegetarian
winter




In [6]:
#Combine all the best features
truca = pd.concat([pd.DataFrame(data=X_txt.todense(), columns=txt_map, index=raw_data.index), pd.DataFrame(data=X_cts, columns=cts_map, index=raw_data.index)], axis=1)
truca = pd.concat([truca, raw_data[['n_steps', 'n_words', 'n_ingredients']]], axis=1)

X = truca
Y = raw_data['no_stars']

svc = SVC(class_weight='balanced')
cross_val_score(svc, X, Y, cv=5)

array([ 0.78004988,  0.79002494,  0.77625343,  0.79047144,  0.76677476])

In [7]:
#Get sparse matrix of recipe text features

A = vctzr.transform(raw_data.dir_doc)
X = A
Y = raw_data['four_stars']

#Use random forest classifier to select most important models
clf = ExtraTreesClassifier(class_weight='balanced')
clf.fit(X, Y)

sel = SelectFromModel(clf, prefit=True, threshold=.0011)
print(X.shape)
X_txt = sel.transform(X)
X_txt.shape

txt_map = list(sel.transform(vctzr.get_feature_names()).ravel())
print('most important recipe text features: ', txt_map)

(20047, 5000)
most important recipe text features:  ['about', 'add', 'cocktail', 'glass', 'in', 'ingredients', 'into cocktail glass', 'minutes', 'salt', 'stir', 'strain into cocktail', 'the', 'to', 'using']




In [8]:
#Find most important features from categories
X = raw_data[bin_ft]

clf = ExtraTreesClassifier(class_weight='balanced')
clf.fit(X, Y)
sel = SelectFromModel(clf, prefit=True, threshold=.008)
print(len(X.columns))
X_cts = sel.transform(X)
print(X_cts.shape)

#Find the corresponding categories for selected features
keepers = sel.transform(range(0, len(bin_ft)))

cts_map = []
print('most important categories:')
for num in keepers.ravel():
    cts_map.append(bin_ft[int(num)])
    print(bin_ft[int(num)])

674
(20047, 15)
most important categories:
bake
bon appétit
dairy
fall
gourmet
herb
kid-friendly
onion
quick & easy
spring
summer
tomato
vegetarian
wheat/gluten-free
winter




In [9]:
#Combine all the best features
truca = pd.concat([pd.DataFrame(data=X_txt.todense(), columns=txt_map, index=raw_data.index), pd.DataFrame(data=X_cts, columns=cts_map, index=raw_data.index)], axis=1)
truca = pd.concat([truca, raw_data[['n_steps', 'n_words', 'n_ingredients']]], axis=1)

X = truca
Y = raw_data['four_stars']

svc = SVC()
#cross_val_score(svc, X, Y, cv=5)

In [34]:
mst = pd.concat([truca, raw_data['four_stars']], axis=1)
samp = mst.sample(frac=.25)
fts = list(samp.columns)
fts.remove('four_stars')

X=samp[fts]
Y=samp['four_stars']

svc = SVC()
prms = {
    'C' : [3.945, 3.95, 3.955],
    'gamma' : [.0475, .045, .0425]
}
srch = GridSearchCV(svc, prms, cv=5)

srch.fit(X, Y)



GridSearchCV(cv=5, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [3.945, 3.95, 3.955], 'gamma': [0.0475, 0.045, 0.0425]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [36]:
srch.score(mst[fts], mst['four_stars'])

0.57569711178729988

In [35]:
srch.best_params_

{'C': 3.945, 'gamma': 0.0425}

In [37]:
svc = SVC(C=3.945, gamma=.0425)
cross_val_score(svc, mst[fts], mst['four_stars'], cv=5)

array([ 0.5719272 ,  0.57844849,  0.56198553,  0.57919681,  0.57046645])

In [14]:
pd.Series([ 0.60383944,  0.59790471,  0.6081317 ,  0.59940135,  0.60089798]).std()

0.004053207892648741

In [15]:
mst = pd.concat([truca, raw_data['four_stars']], axis=1)
samp = mst.sample(frac=.2)
fts = list(samp.columns)
fts.remove('four_stars')

X=samp[fts]
Y=samp['four_stars']

svc = SVC(kernel='poly')
prms = {
    'C' : [1, 5, 10, 15],
    'gamma' : [1, .1, .01],
    'degree': [1, 2, 3]
}
srch = GridSearchCV(svc, prms, cv=5)

srch.fit(X, Y)

print(srch.best_params_)
srch.best_score_

{'C': 5, 'degree': 1, 'gamma': 0.1}


0.57395859316537789

In [20]:
svc_poly = SVC(degree=1, C=5, gamma=.1)
cross_val_score(svc_poly, mst[fts], mst['four_stars'], cv=5)

array([ 0.57591623,  0.57695186,  0.56847094,  0.58169119,  0.56921926])

In [44]:
mst = pd.concat([truca, raw_data['four_stars']], axis=1)
samp = mst.sample(frac=.2)
fts = list(samp.columns)
fts.remove('four_stars')

X=samp[fts]
Y=samp['four_stars']

svc = SVC(kernel='sigmoid')
prms = {
    'C' : [850, 800],
    'gamma' : [.0103, .0102]
}
srch = GridSearchCV(svc, prms, cv=5)

srch.fit(X, Y)

print(srch.best_params_)
srch.best_score_

{'C': 850, 'gamma': 0.0103}


0.56572711399351461