In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn import random_projection
from math import e
%matplotlib inline


Epicurious data from [Kaggle](https://www.kaggle.com/hugodarwood/epirecipes). 

In [2]:
raw_data = pd.read_csv('epi_r.csv')
full = pd.read_json('full_format_recipes.json')

bin_ft = list(raw_data.columns)
bin_ft.remove('title')
bin_ft.remove('rating')
bin_ft.remove('calories')
bin_ft.remove('fat')
bin_ft.remove('protein')
bin_ft.remove('sodium')

missing = []
for row in raw_data.index:      #Find recipes missing from full-form set
    if raw_data.loc[row].title not in list(full.title):
        missing.append(row) 
        

print('dropping {} rows...'.format(len(missing)))
for row in missing:             #Drop the recipes without full-form available
    raw_data = raw_data.drop(row)
    
full_ind = []
for row in raw_data.index:      #Record indices for each full-form recipe in order of csv
    full_ind.append(list(full[full.title==raw_data.loc[row].title].index)[0])

#Grab wanted data from complete recipes:
categories = []
directions = []
ingredients = []                
for i in full_ind:
    row = full.loc[i]
    cat, dr, ing = row.categories, row.directions, row.ingredients
    categories.append(cat)                          #List of categories
    directions.append(dr)                           #List of directions
    ingredients.append(ing)                         #List of ingredients
    
raw_data['full_cats'] = categories                  #append to our df
raw_data['full_dir'] = directions
raw_data['full_ingr'] = ingredients

raw_data['four_stars'] = np.where(raw_data.rating>4, 1, 0)
raw_data['no_stars'] = np.where(raw_data.rating==0, 1, 0)

dropping 5 rows...


In [3]:
#Basic features: lengths of ingredients list, instruction steps, complete instructions
def get_length(lst):
    length = 0
    for step in lst:
        length += len(step)
    return length

#consolidates directions from list to single string
def get_full_dir(lst):
    doc = ''
    for step in lst:
        doc += step + ' '
    return doc

#Build features
raw_data['n_ingredients'] = raw_data.full_ingr.apply(len)
raw_data.n_ingredients = raw_data.n_ingredients/raw_data.n_ingredients.max()
raw_data['n_steps'] = raw_data.full_dir.apply(len)
raw_data.n_steps = raw_data.n_steps/raw_data.n_steps.max()
raw_data['n_words'] = raw_data.full_dir.apply(get_length)
raw_data.n_words = raw_data.n_words/raw_data.n_words.max()
raw_data['n_cats'] = raw_data.full_cats.apply(len)
raw_data.n_cats = raw_data.n_cats/raw_data.n_cats.max()
raw_data['dir_doc'] = raw_data.full_dir.apply(get_full_dir)

In [29]:
#Build tfid vectorizer
vctzr = TfidfVectorizer(ngram_range=(1, 4), max_features=1000)
vctzr.build_analyzer()

#Train on recipe corpus
vctzr = vctzr.fit(raw_data.dir_doc)

#Get sparse matrix of recipe text features
A = vctzr.transform(raw_data.dir_doc)
trans = random_projection.GaussianRandomProjection(eps=.4)
print(A.shape)
A_trans = trans.fit_transform(A)
print(A_trans.shape)


(20047, 1000)
(20047, 675)


In [30]:
all_features = np.concatenate([A_trans, np.array(raw_data[bin_ft + ['n_ingredients', 'n_steps', 'n_words', 'n_cats']])], axis=1)
Y = raw_data['four_stars']

(20047, 40)

In [31]:
#Use random forest classifier to select most important features
clf = RandomForestClassifier(n_estimators=100)
clf.fit(all_features, Y)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [46]:
sel = SelectFromModel(clf, prefit=True, threshold=.0019)
X_sel = sel.transform(all_features)

print('number of features selected: ', X_sel.shape[1])

number of features selected:  10


In [33]:
model_data = pd.DataFrame(data=X_txt, index=raw_data.index)
model_data['four_stars'] = raw_data['four_stars']
samp = model_data.sample(frac=.2)
Y = samp['four_stars']
fts = list(samp.columns)
fts.remove('four_stars')
X = samp[fts]

svc = SVC()
prms = {
    'C' : [3.9, 4],
    'gamma' : [.4, .41]
}
srch = GridSearchCV(svc, prms)
srch.fit(X, Y)

print(srch.best_params_, srch.best_score_)

{'C': 4, 'gamma': 0.41} 0.565228236468


In [47]:
model_data = pd.DataFrame(data=X_txt, index=raw_data.index)
model_data['four_stars'] = raw_data['four_stars']
samp = model_data.sample(frac=1)
Y = samp['four_stars']
fts = list(samp.columns)
fts.remove('four_stars')
X = samp[fts]

svc = SVC(C=4, gamma=.4)
result = pd.Series(cross_val_score(svc, X, Y, cv=5))
print(result.mean(), result.std())

0.5894156746174262 0.006330587429076657


In [35]:
print(result.mean(), result.std())

0.5897632968553241 0.009233711069839182


In [36]:
raw_data.four_stars.sum()/len(raw_data)

0.5355414775278097

In [None]:
X = raw_data[[
    'n_ingredients',
    'n_steps',
    'n_cats', 
    'n_words'
]
]