In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn import random_projection
import datetime
from math import e
%matplotlib inline


Epicurious data from [Kaggle](https://www.kaggle.com/hugodarwood/epirecipes). 

In [2]:
# Load the tag matrix (csv) and the full-form recipes (json).
raw_data = pd.read_csv('epi_r.csv')
full = pd.read_json('full_format_recipes.json')

# Binary tag columns: every csv column except the title and the numeric fields.
non_tag_cols = ('title', 'rating', 'calories', 'fat', 'protein', 'sodium')
bin_ft = [col for col in raw_data.columns if col not in non_tag_cols]

# Map each full-form title to the index of its FIRST occurrence in `full`.
# Built once, instead of rescanning `full` for every csv row.
first_full_row = {}
for full_idx, full_title in full['title'].dropna().items():
    first_full_row.setdefault(full_title, full_idx)

# Find recipes missing from the full-form set. A csv row whose title is NaN
# never matches (NaN titles were excluded from the map above).
has_full_form = raw_data['title'].isin(list(first_full_row))
missing = list(raw_data.index[~has_full_form])

# Drop the recipes without full-form available, in a single pass.
print('dropping {} rows...'.format(len(missing)))
raw_data = raw_data.drop(index=missing)

# Record indices for each full-form recipe in order of csv
full_ind = [first_full_row[title] for title in raw_data['title']]


dropping 5 rows...


In [3]:
# Pull the wanted fields out of the full-form recipes, one row per csv row
# (same order as full_ind, repeated indices included).
matched_full = full.loc[full_ind]
categories = matched_full['categories'].tolist()     # list of categories
directions = matched_full['directions'].tolist()     # list of directions
ingredients = matched_full['ingredients'].tolist()   # list of ingredients
descrips = matched_full['desc'].tolist()             # description when available
ratings = matched_full['rating'].tolist()            # full-form rating replaces the csv one
dates = matched_full['date'].tolist()                # dates

# Append to our df.
raw_data['full_cats'] = categories
raw_data['full_dir'] = directions
raw_data['full_ingr'] = ingredients
raw_data['description'] = descrips
raw_data['rating'] = ratings
raw_data['date_'] = dates

# Binary targets: rating strictly above 4, and rating exactly 0.
raw_data['four_stars'] = (raw_data['rating'] > 4).astype(int)
raw_data['no_stars'] = (raw_data['rating'] == 0).astype(int)


In [4]:
# Recipe age relative to the newest recipe, rescaled by the largest age.
# The intermediate values are stored with bracket assignment so they become
# real columns: `raw_data.t_delta = ...` only sets an attribute on this one
# DataFrame object, which pandas warns about and which is lost on any copy.
latest_date = raw_data['date_'].max()
raw_data['t_delta'] = latest_date - raw_data['date_']
raw_data['days_old'] = raw_data['t_delta'].dt.days
raw_data['age'] = raw_data['days_old'] / raw_data['days_old'].max()

In [5]:
#Basic features: lengths of ingredients list, instruction steps, complete instructions
def get_length(lst):
    """Return the combined length of every item in `lst`.

    Each item must support `len()`; an empty `lst` gives 0.
    """
    return sum(len(item) for item in lst)

def get_full_dir(lst):
    """Merge a list of direction strings into one string.

    Every step is followed by a single space, so a non-empty result always
    ends with a space; an empty list gives ''.
    """
    return ''.join(piece + ' ' for piece in lst)

# Build features. Each count is divided by its column maximum.
# Note: 'n_words' is whatever get_length returns for the directions list
# (the summed len() of the steps), not a tokenised word count.
raw_data['n_ingredients'] = raw_data['full_ingr'].apply(len)
raw_data['n_steps'] = raw_data['full_dir'].apply(len)
raw_data['n_words'] = raw_data['full_dir'].apply(get_length)
raw_data['n_cats'] = raw_data['full_cats'].apply(len)
for count_col in ('n_ingredients', 'n_steps', 'n_words', 'n_cats'):
    raw_data[count_col] = raw_data[count_col] / raw_data[count_col].max()

# All direction steps merged into a single document string.
raw_data['dir_doc'] = raw_data['full_dir'].apply(get_full_dir)

# 1 when a description is present, else 0. A plain truthiness test is not
# enough on an object column: NaN is truthy, so a missing description stored
# as NaN would be flagged as present. Require "not missing" as well.
desc_present = raw_data['description'].notna() & raw_data['description'].map(bool)
raw_data['has_desc'] = np.where(desc_present, 1, 0)

In [6]:
def _tag_summary(tag):
    """Rating statistics over the recipes whose `tag` column equals 1."""
    tagged = raw_data[raw_data[tag] == 1]
    n_tagged = len(tagged)
    return (
        tag,
        tagged.rating.mean(),
        tagged.four_stars.sum() / n_tagged,   # share of tagged recipes with four_stars == 1
        tagged.rating.std(),
        n_tagged,
    )

# One summary row per binary tag column.
result = pd.DataFrame(
    [_tag_summary(tag) for tag in bin_ft],
    columns=['category', 'mean_rating', 'class_imbalance', 'rating_std', 'freq'],
)

# Plain-list views of each column, under the names used for them so far.
cat = result['category'].tolist()
mean_rating = result['mean_rating'].tolist()
class_imb = result['class_imbalance'].tolist()
rating_std = result['rating_std'].tolist()
freq = result['freq'].tolist()

In [7]:
# Overall share of recipes with four_stars == 1.
n_four_star = raw_data['four_stars'].sum()
total_class_imb = n_four_star / len(raw_data)
print(total_class_imb)

# Absolute gap between each tag's share and the overall share.
result['cb_dev'] = (result['class_imbalance'] - total_class_imb).abs()

0.5371377263430938


In [8]:
# Tags on more than 500 recipes, ordered by descending cb_dev; show the first five.
frequent_tags = result[result['freq'] > 500]
ranked_index = frequent_tags['cb_dev'].sort_values(ascending=False).index
result.loc[ranked_index].head()

Unnamed: 0,category,mean_rating,class_imbalance,rating_std,freq,cb_dev
8,alcoholic,2.288174,0.354491,2.218813,835,0.182647
186,drink,2.398804,0.367985,2.197344,1087,0.169152
134,cocktail party,2.956564,0.405546,1.930393,1154,0.131592
520,roast,4.091641,0.667172,0.920815,1319,0.130034
250,grill,3.963523,0.661922,1.237236,562,0.124784


In [9]:
# Hand-written list of tag column names (order is significant: it fixes the
# column order wherever this list is used to select columns).
features = [
    "drink", "bon appétit", "gin", "house & garden",
    "alcoholic", "goat cheese", "roast", "family reunion",
    "cabbage", "fourth of july", "thanksgiving",
    "low fat", "low carb", "christmas",
]


In [10]:
def get_features(result, min_freq=500, n_features=30):
    """Pick category names from both ends of the `cb_dev` ranking.

    Rows of `result` with `freq >= min_freq` are ordered by descending
    `cb_dev`; the first and the last `int(n_features / 2)` categories of that
    ordering are returned, head first.

    Parameters
    ----------
    result : DataFrame with 'category', 'freq' and 'cb_dev' columns.
    min_freq : minimum `freq` a row needs to be considered.
    n_features : total number of names wanted (an odd value yields one fewer).

    Returns
    -------
    list of category names. Empty when fewer than two features are requested
    or no row passes `min_freq`. When the head and tail slices would overlap,
    every eligible category is returned once, in ranked order.

    NOTE(review): the tail of a descending `cb_dev` ranking holds the rows
    with the SMALLEST `cb_dev` -- confirm that selecting those is intended.
    """
    eligible = result[result.freq >= min_freq]
    order = eligible.cb_dev.sort_values(ascending=False).index
    ranked = list(result.loc[order].category)

    cutoff = int(n_features / 2)
    # Guard: with cutoff == 0 the slice [-0:] is the whole list, which used to
    # return every category when 0 or 1 features were requested.
    if cutoff <= 0 or not ranked:
        return []
    # Guard: head and tail would overlap and produce duplicate names.
    if 2 * cutoff >= len(ranked):
        return ranked
    return ranked[:cutoff] + ranked[-cutoff:]

# Automatic pick: 24 names drawn from tags that appear on at least 500 recipes.
fts = get_features(result, min_freq=500, n_features=24)

In [11]:
# Display the category names returned by get_features.
fts

['alcoholic',
 'drink',
 'cocktail party',
 'roast',
 'grill',
 'thanksgiving',
 'low fat',
 'christmas',
 'backyard bbq',
 'grill/barbecue',
 'pasta',
 'yogurt',
 'citrus',
 'carrot',
 'sauté',
 'egg',
 'spring',
 'lime',
 'soup/stew',
 'picnic',
 'kid-friendly',
 'vegetarian',
 'breakfast',
 'dairy free']

In [13]:
# Model inputs: the six engineered columns followed by the hand-written tag
# columns in `features`; target is the four_stars flag.
samp = raw_data
engineered_cols = ['n_ingredients', 'n_steps', 'n_words', 'n_cats', 'age', 'has_desc']
X = samp.loc[:, engineered_cols + features]
Y = samp.loc[:, 'four_stars']

# 5-fold cross-validated score of an SVC with fixed C and gamma;
# prints the mean and standard deviation across folds.
svc = SVC(C=43, gamma=.145)
rslt = cross_val_score(estimator=svc, X=X, y=Y, cv=5)
print(rslt.mean(), rslt.std())

0.599242592142 0.00513893522897


In [14]:
# Baseline for comparison: the overall share of recipes with four_stars == 1.
total_class_imb

0.5371377263430938

<H2>Conclusion</H2><br>
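After all this work, I'm able to predict whether a recipe's rating falls above or below the four-star threshold with about 60% cross-validated accuracy, which is a little over 6 percentage points better than always guessing the majority class (about 54% of recipes).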