In [69]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn import random_projection
import datetime
from math import e
%matplotlib inline


This notebook uses the Epicurious recipe dataset from [Kaggle](https://www.kaggle.com/hugodarwood/epirecipes).

In [70]:
#Load the tag/nutrition table (csv) and the full-form recipes (json)
raw_data = pd.read_csv('epi_r.csv')
full = pd.read_json('full_format_recipes.json')

#Every csv column except the title and the numeric fields is kept in bin_ft.
#list.remove raises ValueError if an expected column is absent, so a changed schema fails loudly.
bin_ft = list(raw_data.columns)
for non_tag in ('title', 'rating', 'calories', 'fat', 'protein', 'sodium'):
    bin_ft.remove(non_tag)

#Map each full-form title to the index of its FIRST occurrence. Built in a single pass so the
#full-form table is not rescanned (and re-listed) once per csv row.
first_full_index = {}
for idx, title in zip(full.index, full['title']):
    first_full_index.setdefault(title, idx)

#Find recipes missing from full-form set
missing = [row for row, title in zip(raw_data.index, raw_data['title'])
           if title not in first_full_index]

print('dropping {} rows...'.format(len(missing)))
#Drop the recipes without full-form available (one drop call instead of one frame copy per row)
raw_data = raw_data.drop(missing)

#Record indices for each full-form recipe in order of csv
full_ind = [first_full_index[title] for title in raw_data['title']]


dropping 5 rows...


In [71]:
#Grab wanted data from complete recipes:
#select the matched full-form rows once, in csv order, then peel off each column as a list
matched = full.loc[full_ind]
categories = matched['categories'].tolist()      #List of categories
directions = matched['directions'].tolist()      #List of directions
ingredients = matched['ingredients'].tolist()    #List of ingredients
descrips = matched['desc'].tolist()              #Description when available
ratings = matched['rating'].tolist()             #Ratings taken from the full-form data
dates = matched['date'].tolist()                 #Dates

#append to our df (the csv rating column is overwritten with the full-form rating)
raw_data['full_cats'] = categories
raw_data['full_dir'] = directions
raw_data['full_ingr'] = ingredients
raw_data['description'] = descrips
raw_data['rating'] = ratings
raw_data['date_'] = dates

#Binary targets: rated strictly above 4, and rated exactly 0
raw_data['four_stars'] = np.where(raw_data['rating'] > 4, 1, 0)
raw_data['no_stars'] = np.where(raw_data['rating'] == 0, 1, 0)


In [72]:
#Recipe age in days relative to the newest recipe, divided by the largest age.
#Columns are created with bracket assignment: `raw_data.t_delta = ...` on a name that is not
#yet a column only sets a Python attribute (pandas emits a UserWarning), and that attribute
#is lost whenever the frame is copied, filtered or sampled.
latest_date = raw_data['date_'].max()
raw_data['t_delta'] = latest_date - raw_data['date_']
raw_data['days_old'] = raw_data['t_delta'].dt.days
raw_data['age'] = raw_data['days_old'] / raw_data['days_old'].max()

In [73]:
#Basic features: lengths of ingredients list, instruction steps, complete instructions
def get_length(lst):
    """Return the summed length of every item in `lst`.

    For a list of strings this is the total number of characters
    (not the number of words); an empty list gives 0.
    """
    return sum(len(step) for step in lst)

#joins a list of direction steps into one string
def get_full_dir(lst):
    """Concatenate the steps in `lst`, appending one space after each step.

    The result therefore ends with a trailing space, and is '' for an empty list.
    """
    return ''.join(step + ' ' for step in lst)

#Build features
#Each count is divided by its column maximum, so the largest value in each column becomes 1.0
count_features = [
    ('n_ingredients', 'full_ingr', len),     #number of ingredient entries
    ('n_steps', 'full_dir', len),            #number of direction steps
    ('n_words', 'full_dir', get_length),     #summed length of all direction steps
    ('n_cats', 'full_cats', len),            #number of categories
]
for name, source, measure in count_features:
    counts = raw_data[source].apply(measure)
    raw_data[name] = counts / counts.max()

#All direction steps of a recipe as one document (text-feature input)
raw_data['dir_doc'] = raw_data.full_dir.apply(get_full_dir)

#Flag recipes that carry a non-empty text description. The test is explicit rather than
#truthiness-based because a missing value stored as NaN is truthy and would be counted
#as "has a description".
has_text = np.array(
    [isinstance(d, str) and d != '' for d in raw_data.description], dtype=bool)
raw_data['has_desc'] = np.where(has_text, 1, 0)

In [74]:
#Per-tag summary: one entry per column in bin_ft, computed over the recipes where the tag equals 1
cat, mean_rating, class_imb, rating_std, freq = [], [], [], [], []
for ft in bin_ft:
    df = raw_data[raw_data[ft] == 1]
    n_tagged = len(df)
    cat.append(ft)
    freq.append(n_tagged)
    mean_rating.append(df['rating'].mean())
    rating_std.append(df['rating'].std())
    class_imb.append(df['four_stars'].sum() / n_tagged)    #share of tagged recipes with four_stars == 1

result = pd.DataFrame({
    'category': cat,
    'mean_rating': mean_rating,
    'class_imbalance': class_imb,
    'rating_std': rating_std,
    'freq': freq,
})

In [77]:
#Overall share of recipes with four_stars == 1, then each tag's absolute distance from that share
total_class_imb = raw_data['four_stars'].sum() / len(raw_data)
print(total_class_imb)
result['cb_dev'] = (total_class_imb - result['class_imbalance']).abs()

0.5371377263430938


In [None]:
#Tags present in more than 500 recipes, ordered by descending cb_dev
frequent = result[result.freq > 500]
result.loc[frequent.cb_dev.sort_values(ascending=False).index]

In [110]:
#Literal list of tag column names. The SVC cells further down append this list to the
#numeric feature columns when selecting X from raw_data.
features=[
    'drink',
    'bon appétit',
    'gin',
    'house & garden',
    'alcoholic',
    'goat cheese',
    'roast',
    'family reunion',
    'cabbage',
    'fourth of july',
    'thanksgiving',
    'low fat',
    'low carb',
    'christmas'
]


In [89]:
def get_features(result, min_freq=500, n_features=30):
    """Pick tag names from both ends of the cb_dev ranking.

    Rows of `result` with freq >= min_freq are ordered by descending cb_dev;
    the first n_features // 2 and the last n_features // 2 categories are
    returned, top group first.

    Parameters
    ----------
    result : DataFrame with 'category', 'freq' and 'cb_dev' columns.
    min_freq : minimum value of 'freq' for a row to be considered.
    n_features : requested number of names; an odd value yields one fewer.

    Returns
    -------
    list of category names, without duplicates. Fewer than n_features names
    are returned when not enough rows pass the frequency filter.
    """
    eligible = result[result.freq >= min_freq]
    ranked = result.loc[eligible.cb_dev.sort_values(ascending=False).index]

    cutoff = int(n_features / 2)
    if cutoff <= 0:
        # ranked[-0:] would select EVERY row, so handle the empty request explicitly.
        return []

    top = list(ranked[:cutoff].category)
    taken = set(top)
    # When fewer than 2 * cutoff rows qualify the two slices overlap; skip names already
    # chosen so no feature column is selected twice.
    bottom = [name for name in ranked[-cutoff:].category if name not in taken]
    return top + bottom

#Request 24 tag names (both ends of the cb_dev ranking) among tags with freq >= 500.
#NOTE(review): the SVC cells below select columns with the hand-written `features` list,
#not with `fts` — confirm which one is intended.
fts = get_features(result, n_features=24, min_freq=500)

In [90]:
#Show the tag names returned by get_features
fts

['alcoholic',
 'drink',
 'cocktail party',
 'roast',
 'grill',
 'thanksgiving',
 'low fat',
 'christmas',
 'backyard bbq',
 'grill/barbecue',
 'pasta',
 'yogurt',
 'citrus',
 'carrot',
 'sauté',
 'egg',
 'spring',
 'lime',
 'soup/stew',
 'picnic',
 'kid-friendly',
 'vegetarian',
 'breakfast',
 'dairy free']

In [116]:
#Search C and gamma for an SVC on a seeded 20% sample of the recipes
samp = raw_data.sample(frac=.2, random_state=101)
Y = samp['four_stars']
X = samp[['n_ingredients', 'n_steps', 'n_words', 'n_cats', 'age', 'has_desc'] + features]

params = dict(C=[43, 41], gamma=[.135, .145])
svc = SVC()
srch = GridSearchCV(estimator=svc, param_grid=params)
srch.fit(X, Y)
print(srch.best_params_, srch.best_score_)

{'C': 43, 'gamma': 0.145} 0.590670990272


In [None]:
#Cross-validate an SVC with fixed C and gamma on every recipe (no subsampling)
samp = raw_data
Y = samp['four_stars']
X = samp[['n_ingredients', 'n_steps', 'n_words', 'n_cats', 'age', 'has_desc'] + features]

svc = SVC(C=43, gamma=.145)
rslt = cross_val_score(estimator=svc, X=X, y=Y)
print(rslt.mean(), rslt.std())

In [29]:
#Build tf-idf vectorizer: 1- to 4-grams, vocabulary capped at 1000 terms
vctzr = TfidfVectorizer(ngram_range=(1, 4), max_features=1000)

#Train on recipe corpus and get the sparse matrix of recipe text features in one pass
A = vctzr.fit_transform(raw_data.dir_doc)
print(A.shape)

#Reduce the text features with a Gaussian random projection. The projection matrix is drawn
#at random, so it is seeded here; without a seed every run produces different columns and
#everything computed from them downstream changes from run to run.
trans = random_projection.GaussianRandomProjection(eps=.4, random_state=101)
A_trans = trans.fit_transform(A)
print(A_trans.shape)


(20047, 1000)
(20047, 675)


In [30]:
#Place the projected text features side by side with the bin_ft columns and the count features
tag_and_counts = raw_data[bin_ft + ['n_ingredients', 'n_steps', 'n_words', 'n_cats']].to_numpy()
all_features = np.hstack([A_trans, tag_and_counts])
Y = raw_data['four_stars']

(20047, 40)

In [31]:
#Use random forest classifier to select most important features
clf = RandomForestClassifier(n_estimators=100)
clf.fit(all_features, Y)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [46]:
#Keep only the columns whose importance in the already-fitted forest meets the threshold
sel = SelectFromModel(estimator=clf, threshold=.0019, prefit=True)
X_sel = sel.transform(all_features)

print('number of features selected: ', X_sel.shape[1])

number of features selected:  10


In [33]:
#Search C and gamma for an SVC on a 20% sample of the forest-selected features.
#X_sel (output of the SelectFromModel cell) is used here: the previous name, X_txt, is never
#defined in this notebook, so the cell raised NameError on a fresh kernel.
#NOTE(review): assumes X_txt was an earlier name for X_sel — confirm.
model_data = pd.DataFrame(data=X_sel, index=raw_data.index)
model_data['four_stars'] = raw_data['four_stars']
samp = model_data.sample(frac=.2, random_state=101)    #seeded so the search is repeatable
Y = samp['four_stars']
fts = [col for col in samp.columns if col != 'four_stars']
X = samp[fts]

svc = SVC()
prms = {
    'C' : [3.9, 4],
    'gamma' : [.4, .41]
}
srch = GridSearchCV(svc, prms)
srch.fit(X, Y)

print(srch.best_params_, srch.best_score_)

{'C': 4, 'gamma': 0.41} 0.565228236468


In [47]:
#5-fold cross-validation of an SVC on all rows of the forest-selected features.
#X_sel (output of the SelectFromModel cell) is used here: the previous name, X_txt, is never
#defined in this notebook, so the cell raised NameError on a fresh kernel.
#NOTE(review): assumes X_txt was an earlier name for X_sel — confirm.
model_data = pd.DataFrame(data=X_sel, index=raw_data.index)
model_data['four_stars'] = raw_data['four_stars']
samp = model_data.sample(frac=1, random_state=101)     #seeded shuffle of every row
Y = samp['four_stars']
fts = [col for col in samp.columns if col != 'four_stars']
X = samp[fts]

#NOTE(review): the saved grid-search output reports gamma 0.41 as best, while 0.4 is used here — confirm.
svc = SVC(C=4, gamma=.4)
#NOTE(review): `result` also names the per-tag statistics frame built earlier; after this line
#get_features(result, ...) can no longer be re-run without rebuilding that frame.
result = pd.Series(cross_val_score(svc, X, Y, cv=5))
print(result.mean(), result.std())

0.5894156746174262 0.006330587429076657


In [35]:
#Print the mean and standard deviation of `result` again.
#NOTE(review): repeats the print that ends the previous cell; the saved numbers differ, so this
#output presumably comes from a separate run — confirm whether the cell is still needed.
print(result.mean(), result.std())

0.5897632968553241 0.009233711069839182


In [36]:
#Share of recipes with four_stars == 1 (reference point for the accuracies printed above)
raw_data['four_stars'].sum() / len(raw_data)

0.5355414775278097

In [None]:
#Select only the four count-based features
X = raw_data.loc[:, ['n_ingredients', 'n_steps', 'n_cats', 'n_words']]