In [2]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from math import e
%matplotlib inline


The Epicurious recipe data used below (`epi_r.csv` and `full_format_recipes.json`) comes from [Kaggle](https://www.kaggle.com/hugodarwood/epirecipes).

In [3]:
raw_data = pd.read_csv('epi_r.csv')
full = pd.read_json('full_format_recipes.json')

#Feature columns: everything in the csv except the title, the rating and the
#nutrition numbers (presumably 0/1 tag columns, hence the name -- confirm against the csv)
bin_ft = list(raw_data.columns)
for col in ['title', 'rating', 'calories', 'fat', 'protein', 'sodium']:
    bin_ft.remove(col)

#Map every full-form title to the index of its FIRST occurrence. Building this once
#turns each later lookup into a dict access instead of a scan over the whole frame.
full_titles = full['title'].dropna()
first_ind = {}
for ind, title in zip(full_titles.index, full_titles):
    first_ind.setdefault(title, ind)

#Find recipes missing from full-form set
missing = [row for row, title in zip(raw_data.index, raw_data['title'])
           if title not in first_ind]

print('dropping {} rows...'.format(len(missing)))
raw_data = raw_data.drop(missing)                   #Drop the recipes without full-form available

#Record indices for each full-form recipe in order of csv
#(a title that appears several times in the full set resolves to its first record)
full_ind = [first_ind[title] for title in raw_data['title']]

#Grab wanted data from complete recipes:
matched = full.loc[full_ind]
categories = list(matched['categories'])            #List of categories
directions = list(matched['directions'])            #List of directions
ingredients = list(matched['ingredients'])          #List of ingredients

raw_data['full_cats'] = categories                  #append to our df
raw_data['full_dir'] = directions
raw_data['full_ingr'] = ingredients

#Binary targets derived from the rating column
raw_data['four_stars'] = np.where(raw_data.rating>4, 1, 0)
raw_data['no_stars'] = np.where(raw_data.rating==0, 1, 0)

dropping 5 rows...


In [4]:
#Basic features: lengths of ingredients list, instruction steps, complete instructions
def get_length(lst):
    """Return the sum of ``len(step)`` over every element of ``lst``.

    For a list of strings this is the total number of characters,
    not a count of words. An empty ``lst`` gives 0.
    """
    return sum(len(step) for step in lst)

#consolidates directions from list to single string
def get_full_dir(lst):
    """Concatenate the strings in ``lst`` into one document.

    Every step is followed by a single space, so a non-empty result
    always ends with a trailing space. An empty ``lst`` gives ''.
    """
    return ''.join(step + ' ' for step in lst)

#Build features
#Each raw count is divided by the maximum of its own column
ingredient_counts = raw_data['full_ingr'].apply(len)
raw_data['n_ingredients'] = ingredient_counts / ingredient_counts.max()

step_counts = raw_data['full_dir'].apply(len)
raw_data['n_steps'] = step_counts / step_counts.max()

#NOTE: despite the column name, get_length adds up len() of each step,
#i.e. characters rather than words when the steps are strings
direction_lengths = raw_data['full_dir'].apply(get_length)
raw_data['n_words'] = direction_lengths / direction_lengths.max()

#All steps of a recipe joined into one string
raw_data['dir_doc'] = raw_data['full_dir'].apply(get_full_dir)

In [10]:
#Build tf-idf vectorizer over 1- to 3-grams, vocabulary capped at 5000 terms
vctzr = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

#Train on recipe corpus and get sparse matrix of recipe text features in one pass
A = vctzr.fit_transform(raw_data.dir_doc)

#toarray() returns a plain ndarray. todense() returns np.matrix, which makes the
#concatenated X an np.matrix as well -- a type recent scikit-learn estimators reject.
X = np.concatenate([A.toarray(), np.array(raw_data[bin_ft + ['n_ingredients', 'n_steps', 'n_words']])], axis=1)
Y = raw_data['four_stars']

X.shape

(20047, 5677)

In [12]:
#Column names for X, in the same order the columns were concatenated.
#get_feature_names() was dropped from newer scikit-learn releases in favour of
#get_feature_names_out(); use whichever one this installation provides.
if hasattr(vctzr, 'get_feature_names_out'):
    vocab = vctzr.get_feature_names_out()
else:
    vocab = vctzr.get_feature_names()
feature_names = list(vocab) + bin_ft + ['n_ingredients', 'n_steps', 'n_words']

#Use random forest classifier to select most important features
#Fixed seed so the importances, and anything selected from them, are repeatable across runs
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X, Y)


5677
(20047, 5677)
most important recipe text features:  ['n_ingredients', 'n_words']




In [15]:
#Keep the columns whose importance in the already-fitted forest reaches the threshold
sel = SelectFromModel(clf, prefit=True, threshold=.0015)
print(X.shape)
X_txt = sel.transform(X)

#Names of the kept columns, read off the selector's boolean mask.
#(transform() is meant for a 2-D sample matrix; a flat list of names is rejected
#by current scikit-learn, so the mask is used instead.)
txt_map = [name for name, kept in zip(feature_names, sel.get_support()) if kept]
print('most important recipe text features: ', txt_map)
print('number of features selected: ', len(txt_map))

(20047, 5677)
most important recipe text features:  ['about', 'add', 'and', 'bowl', 'can', 'combine', 'glass', 'heat', 'in', 'in large', 'into', 'large', 'medium', 'minutes', 'mixture', 'of', 'or', 'over', 'pepper', 'salt', 'serve', 'small', 'stir', 'the', 'to', 'until', 'water', 'with', 'n_ingredients', 'n_steps', 'n_words']
number of features selected:  31




In [None]:
#Selected features as a labelled frame, with the target column alongside
model_data = pd.DataFrame(data=X_txt, columns=txt_map, index=raw_data.index)
model_data['four_stars'] = raw_data['four_stars']
Y = model_data['four_stars']

svc = SVC(gamma=.03, C=4)
#Score on the selected features (X_txt), not on the full matrix X: the selection
#above is otherwise unused. np.asarray guarantees a plain ndarray input.
cross_val_score(svc, np.asarray(X_txt), Y, cv=5)