In [None]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, GridSearchCV
from math import e
%matplotlib inline


This notebook uses the Epicurious recipe dataset from [Kaggle](https://www.kaggle.com/hugodarwood/epirecipes).

In [None]:
# Load the tabular recipe data and the full-format JSON export of the same recipes.
raw_data = pd.read_csv('epi_r.csv')
full = pd.read_json('full_format_recipes.json')

# Build a title -> record lookup once. Keeping only the first JSON record per title
# reproduces "take the first matching row" without re-scanning `full` for every recipe.
full_by_title = (
    full.dropna(subset=['title'])
        .drop_duplicates(subset='title')
        .set_index('title')
)

# Drop every recipe whose title has no counterpart in the JSON file.
has_match = raw_data['title'].isin(full_by_title.index)
print('dropping {} rows...'.format(int((~has_match).sum())))
raw_data = raw_data[has_match].copy()  # copy so the column assignments below hit a real frame

# Pull the list-valued fields of the matching JSON record for each remaining recipe.
matched = full_by_title.loc[raw_data['title'].tolist()]
# `.values` yields a 1-D object array, so each list is stored as a single cell.
raw_data['full_cats'] = matched['categories'].values
raw_data['full_dir'] = matched['directions'].values
raw_data['full_ingr'] = matched['ingredients'].values

# Binary target: 1 when the rating is strictly above 4, else 0.
raw_data['four_stars'] = np.where(raw_data.rating > 4, 1, 0)

In [None]:
# Gather every distinct label that appears in any recipe's `full_cats` list.
categories = {
    label
    for recipe_labels in raw_data['full_cats']
    for label in recipe_labels
}

In [None]:
# Show the directions stored for the recipe whose index label is 5001.
raw_data.loc[5001, 'full_dir']

In [None]:
# Columns that are not tag indicators: the target, the title, the numeric
# nutrition/rating fields and the list-valued columns copied from the JSON file.
excluded_columns = (
    'four_stars', 'title', 'rating', 'calories', 'fat',
    'protein', 'sodium', 'full_dir', 'full_cats', 'full_ingr',
)

# Everything that remains is treated as a per-recipe tag feature.
bin_ft = list(raw_data.columns)
for column_name in excluded_columns:
    bin_ft.remove(column_name)  # raises ValueError if the column is absent

In [None]:
# For each tag column, look at the ratings of the recipes carrying that tag (value == 1)
# and summarise them: mean rating, rating spread and how many recipes have the tag.
cat = list(bin_ft)
ratings_by_tag = [raw_data.loc[raw_data[tag] == 1, 'rating'] for tag in cat]

mean_rating = [ratings.mean() for ratings in ratings_by_tag]
rating_std = [ratings.std() for ratings in ratings_by_tag]
freq = [len(ratings) for ratings in ratings_by_tag]

# One row per tag.
result = pd.DataFrame({
    'category': cat,
    'mean_rating': mean_rating,
    'rating_std': rating_std,
    'freq': freq,
})

In [None]:
# Among tags attached to more than 3 recipes, show the one with the highest mean rating.
seen_more_than_thrice = result[result.freq > 3]
result.loc[seen_more_than_thrice.mean_rating.idxmax()]

In [None]:
def get_features(result, min_freq=500, n_features=30):
    """Pick the best- and worst-rated categories among the sufficiently frequent ones.

    Parameters
    ----------
    result : DataFrame
        Must provide the columns ``category``, ``mean_rating`` and ``freq``.
    min_freq : int
        Only rows with ``freq >= min_freq`` are considered.
    n_features : int
        Target number of names; half (rounded down) is taken from the top of the
        mean-rating ranking and half from the bottom.

    Returns
    -------
    list
        Category names, top-rated first, then bottom-rated, without repeats.
        Fewer than ``n_features`` names are returned when not enough rows
        qualify; an empty list is returned when ``n_features`` is below 2.
    """
    frequent = result[result.freq >= min_freq]
    ranked = result.loc[frequent.mean_rating.sort_values(ascending=False).index]
    ranked_names = list(ranked.category)

    cutoff = int(n_features / 2)
    if cutoff <= 0:
        # A slice of the form [-0:] would return *everything*, so handle this explicitly.
        return []

    selected = ranked_names[:cutoff] + ranked_names[-cutoff:]
    # When fewer than 2 * cutoff rows qualify the two slices overlap; drop the
    # repeats (keeping first occurrence) so no feature column is selected twice.
    return list(dict.fromkeys(selected))

# Preview the selection: with n_features=14 the cutoff is 7, so this lists up to 7 categories
# from each end of the mean-rating ranking among those with freq >= 500 (the default min_freq).
get_features(result, n_features=14)

In [None]:
# Grid over the minimum tag frequency and the number of selected features,
# scoring a default SVC with 5-fold cross-validation for each combination.

# The evaluation sample has a fixed random_state, so it is identical on every
# iteration; draw it (and the target) once instead of re-sampling inside the loops.
samp = raw_data.sample(frac=.1, random_state=100)
Y = samp.four_stars

grid_rows = []
for freq in range(100, 2000, 100):
    for num in range(4, 30, 2):
        features = get_features(result, min_freq=freq, n_features=num)
        X = samp[features]
        svc = SVC()
        scores = cross_val_score(svc, X, Y, cv=5)
        print('freq: {}, n: {},  mean score: {}+/-{}'.format(freq, num, scores.mean(), scores.std()))
        grid_rows.append({
            'frequency_param': freq,
            'mean_score': scores.mean(),
            'sc_std': scores.std(),
            'n_features': num,
        })

# One row per (frequency, n_features) combination, built in a single step.
ressy = pd.DataFrame(grid_rows, columns=['frequency_param', 'mean_score', 'sc_std', 'n_features'])

# Show the combination with the highest mean cross-validation score.
ressy.loc[ressy.mean_score.idxmax()]

In [None]:
# Start from the best/worst rated categories among those with freq >= 1100 ...
selected = get_features(result, min_freq=1100, n_features=26)

# ... then add the four most frequent categories overall.
most_frequent_idx = result.freq.sort_values(ascending=False).index[:4]
selected += list(result.loc[most_frequent_idx, 'category'])

# A frequent category may already be in the rated selection; keep only the first
# occurrence of each name so X does not contain the same column twice.
fts = list(dict.fromkeys(selected))

X = raw_data[fts]
Y = raw_data['four_stars']

In [None]:
# Baseline: 5-fold cross-validation scores of an SVC with default settings on the current X and Y.
svc = SVC()
cross_val_score(estimator=svc, X=X, y=Y, cv=5)

In [None]:
# Sweep the SVC regularisation parameter C over powers of ten on a 20% sample.
# The sample is seeded (same random_state as the other sampling cells) so the
# printed scores are reproducible from run to run.
samp = raw_data.sample(frac=.2, random_state=100)
X = samp[fts]
Y = samp.four_stars

# gamma is held fixed across the sweep, so set it once outside the loop.
# NOTE(review): the origin of this particular value is not shown in the notebook.
gm = .011867

for rump in range(0, 5):
    c = 10 ** rump  # C = 1, 10, 100, 1000, 10000
    mod = SVC(gamma=gm, C=c)
    res = pd.Series(cross_val_score(mod, X, Y, cv=5))
    print(c, res.mean(), res.std())

In [None]:
# Coarse sweep of the minimum tag frequency (100..1900 in steps of 100) using
# get_features' default n_features, scored with a default SVC and 5-fold CV.

# Seeded sample: identical on every iteration, so draw it and the target once.
samp = raw_data.sample(frac=.1, random_state=100)
Y = samp.four_stars

freq_prm = []
mean_score = []
sc_std = []

for freq in range(100, 2000, 100):
    freq_prm.append(freq)
    features = get_features(result, min_freq=freq)
    X = samp[features]
    svc = SVC()
    scores = cross_val_score(svc, X, Y, cv=5)
    print('freq: {}  mean score: {}+/-{}'.format(freq, scores.mean(), scores.std()))
    mean_score.append(scores.mean())
    sc_std.append(scores.std())

# One row per tested frequency threshold.
ressy = pd.DataFrame({
    'frequency_param': freq_prm,
    'mean_score': mean_score,
    'sc_std': sc_std,
})


In [None]:
# Fine sweep of the minimum tag frequency (900..1090 in steps of 10) using
# get_features' default n_features, scored with a default SVC and 5-fold CV.

# Seeded sample: identical on every iteration, so draw it and the target once.
samp = raw_data.sample(frac=.1, random_state=100)
Y = samp.four_stars

freq_prm = []
mean_score = []
sc_std = []

for freq in range(900, 1100, 10):
    freq_prm.append(freq)
    features = get_features(result, min_freq=freq)
    X = samp[features]
    svc = SVC()
    scores = cross_val_score(svc, X, Y, cv=5)
    print('freq: {}  mean score: {}+/-{}'.format(freq, scores.mean(), scores.std()))
    mean_score.append(scores.mean())
    sc_std.append(scores.std())

# One row per tested frequency threshold.
ressy = pd.DataFrame({
    'frequency_param': freq_prm,
    'mean_score': mean_score,
    'sc_std': sc_std,
})

In [None]:
# Recompute the per-tag rating summary, this time also counting how many of the
# tagged recipes have a rating of exactly 0.
cat = list(bin_ft)
ratings_by_tag = [raw_data.loc[raw_data[tag] == 1, 'rating'] for tag in cat]

mean_rating = [ratings.mean() for ratings in ratings_by_tag]
rating_std = [ratings.std() for ratings in ratings_by_tag]
freq = [len(ratings) for ratings in ratings_by_tag]
zero_rating = [len(ratings[ratings == 0]) for ratings in ratings_by_tag]

# One row per tag.
result = pd.DataFrame({
    'category': cat,
    'mean_rating': mean_rating,
    'rating_std': rating_std,
    'freq': freq,
    'zero_rating': zero_rating,
})
# Share of each tag's recipes that carry a zero rating.
result['zero_prop'] = result['zero_rating'] / result['freq']

In [None]:
# Rank tags attached to more than 10 recipes by their share of zero ratings, highest first.
seen_more_than_ten = result[result.freq > 10]
zero_share_desc = seen_more_than_ten['zero_prop'].sort_values(ascending=False)
result.loc[zero_share_desc.index]

In [None]:
# Feature list used by the grid-search cell below: get_features with min_freq=1100, n_features=26.
# Note: this rebinds `ft`, which earlier cells use as the loop variable over bin_ft.
ft = get_features(result, min_freq=1100, n_features=26)

In [None]:
# Grid-search SVC hyper-parameters on a 10% sample, then evaluate on the full data set.
svc = SVC()
# Seeded (same random_state as the other sampling cells) so the search is reproducible.
samp = raw_data.sample(frac=.1, random_state=100)
X = samp[ft]
Y = samp['four_stars']
params = {
    'gamma': [1, .1, .01, .001],
    'C': [1, 10, 100],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

gs = GridSearchCV(svc, params)

gs.fit(X, Y)

# NOTE(review): raw_data still contains the rows in `samp` that the search was fitted on,
# so the figures below are not a pure held-out estimate.
predicted = gs.predict(raw_data[ft])
tb = pd.crosstab(predicted, raw_data['four_stars'])
print(tb)
# Accuracy computed straight from the predictions: indexing tb.iloc[1, 1] would raise
# IndexError whenever the model predicts a single class (the table then has one row).
print((predicted == raw_data['four_stars']).mean())