In [1]:
import numpy as np
import pandas as pd
from  sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import cross_val_score
from sets import Set

In [2]:
df = pd.read_json('train.json')

In [18]:
IRRELEVANT = ['grated', 
              'ground',
              'extra',
              'virgin',
              'chopped',
              'fresh',
              'large',
              'minced',
              'shredded',
              'extra-virgin',
              'sliced',
              'fine',
              'finely',
              'finely-chopped',
              'freshly',
              'low',
              'sodium',
              'low-fat',
              'sliced',
              'slices',
              'of',
              'a',
              'the',
              'all-purpose',
              'nonfat',
              'non-fat',
              'crumbles',
              'thawed',
              'squeezed']

In [31]:
def all_words(df):
    matrix = []
    for i in xrange(len(df)):
        row = df.iloc[i]
        cuisine = row['cuisine']
        _id = row['id']
        ingredients = row['ingredients']
        for ingredient in ingredients:
            matrix.append([_id, cuisine, ingredient])
    newdf = pd.DataFrame(matrix, columns=['id', 'cuisine', 'ingredient'])
    return newdf

def all_words_from_strs(df):
    matrix = []
    for i in xrange(len(df)):
        row = df.iloc[i]
        cuisine = row['cuisine']
        _id = row['id']
        ingredients = row['ingredients'].split()
        for ingredient in ingredients:
            matrix.append([_id, cuisine, ingredient])
    newdf = pd.DataFrame(matrix, columns=['id', 'cuisine', 'ingredient'])
    return newdf

def ingredient_concat(df):
    matrix = []
    for i in xrange(len(df)):
        row = df.iloc[i]
        cuisine = row['cuisine']
        _id = row['id']
        ingredients = row['ingredients']
        joined_ingredients = " ".join(ingredients)
        matrix.append([_id, cuisine, joined_ingredients])
    newdf = pd.DataFrame(matrix, columns=['id', 'cuisine', 'ingredients'])
    return newdf

def smush(x):
    ret = []
    for ingr in x:
        ingr = ingr.split()
        ingr = "".join(ingr) # Make each ingredient one 'word'
        ret.append(ingr) # Putting all ingredients together
    return " ".join(ret)

def ingredient_join(df):
    matrix = []
    for i in xrange(len(df)):
        row = df.iloc[i]
        cuisine = row['cuisine']
        _id = row['id']
        ingredients = row['ingredients']
        joined_ingredients = smush(ingredients)
        matrix.append([_id, cuisine, joined_ingredients])
    newdf = pd.DataFrame(matrix, columns=['id', 'cuisine', 'ingredients'])
    return newdf

def Xy(df):
    df = df.copy()
    df = remove_irrelevant_words(df, IRRELEVANT)
    my_vocab = high_enough_frequency(df, 0)
    df_mash = ingredient_join(df)
    #dfi = all_words(df)
    #ingredients = dfi['ingredient'].unique()
    #all_ingredients = ["".join(x.split()) for x in ingredients]
    #my_vocab = {x:i for i, x in enumerate(all_ingredients)}
    meh = CountVectorizer(vocabulary=my_vocab)
    #meh = CountVectorizer()
    df_mash = ingredient_join(df)
    matrix = df_mash['ingredients'].values
    sparse_matrix = meh.fit_transform(matrix)
    X = sparse_matrix
    y = df['cuisine'].values
    return X, y.astype(str), meh

def remove_irrelevant_words(df, irrelevant):
    def ingredient_scan(ingr_list):
        recipe_list = []
        for ingr_str in ingr_list:
            ingr_list = ingr_str.split()
            ingr_list = [x for x in ingr_list]
            new_list = [x.replace('-', '') for x in ingr_list if x not in irrelevant]
            new_str = " ".join(new_list)
            recipe_list.append(new_str)
            
        return recipe_list
    df.loc[:, 'ingredients'] =  df['ingredients'].apply(ingredient_scan)
    return df

def high_enough_frequency(df, N):
    dfi = all_words(df)
    countdict = dfi.groupby('ingredient')['ingredient'].count().to_dict()
    
    #construct vocabulary dictionary:
    ret = {}
    i=0
    for ingredient in countdict:
        if countdict[ingredient]>N:
            ret[ingredient] = i
            i += 1   
    #ret = {ingredient:i for i, ingredient in enumerate(countdict) if countdict[ingredient]>N}
    return ret
    
    
    
    

In [32]:
X, y, count_obj = Xy(df)

In [33]:
np.shape(X)

(39774, 6357)

In [34]:
clf = RandomForestClassifier(n_estimators=10)

Random Forest using *all* ~7000 ingredients (with words in IRRELEVANT removed):

In [35]:
scores = cross_val_score(clf, X.toarray(), y, cv=10)
print scores

[ 0.5272271   0.53463855  0.54216867  0.53227832  0.52562814  0.52728187
  0.54101661  0.5327291   0.53576826  0.53642551]


Random Forest Feature importance (with words in IRRELEVANT removed):

In [138]:
clf = RandomForestClassifier(n_estimators=10)
clf.fit(X.toarray(), y)

RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='gini', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0)

In [139]:
importances = clf.feature_importances_
sorted_indices = np.argsort(importances)[::-1]

In [140]:
items = count_obj.vocabulary_.items()
index_as_key = {i:vocab for vocab, i in count_obj.vocabulary_.items()}
N = 500
for i in sorted_indices[:N-1]:
    print index_as_key[i]

oliveoil
parmesancheese
sesameoil
cilantro
ginger
limejuice
soysauce
salt
cumin
chilipowder
turmeric
avocado
mozzarellacheese
salsa
sugar
mirin
eggs
water
garlic
jalapenochilies
butter
onions
fishsauce
blackpepper
corntortillas
pepper
lime
garammasala
allpurposeflour
greenonions
lemonjuice
garliccloves
vegetableoil
ricevinegar
buttermilk
flourtortillas
tumeric
cajunseasoning
cornstarch
carrots
parsley
milk
scallions
unsaltedbutter
currypowder
coconutmilk
basil
cinnamon
coriander
bakingpowder
sourcream
cheddarcheese
koshersalt
shallots
oil
tomatoes
cayennepepper
basilleaves
blackbeans
drywhitewine
lemon
eggyolks
bakingsoda
greenbellpepper
brownsugar
potatoes
driedoregano
shrimp
cucumber
redbellpepper
fetacheesecrumbles
paprika
cuminseed
flour
fetacheese
honey
thyme
mint
chickenbroth
plainyogurt
pecans
oystersauce
heavycream
flatleafparsley
vanillaextract
lemongrass
bayleaves
whitesugar
cookingspray
celery
driedthyme
italianseasoning
purpleonion
onion
nutmeg
parmigianoreggianocheese
cano

In [12]:
dfi = all_words(df)

In [17]:
adict = dfi.groupby('ingredient')['ingredient'].count().to_dict()
newdict = {w}

{u'low-sodium fat-free chicken broth': 22,
 u'sweetened coconut': 3,
 u'baking chocolate': 3,
 u'egg roll wrappers': 57,
 u'bottled low sodium salsa': 1,
 u'vegan parmesan cheese': 4,
 u'clam sauce': 1,
 u'(10 oz.) frozen chopped spinach, thawed and squeezed dry': 2,
 u'figs': 33,
 u'caramels': 10,
 u'broiler': 2,
 u'jalapeno chilies': 1730,
 u'(15 oz.) refried beans': 3,
 u'brioche buns': 3,
 u'broccoli romanesco': 1,
 u'flaked oats': 1,
 u'anise extract': 14,
 u'whole wheat pastry flour': 30,
 u'ravva': 11,
 u'bacon': 620,
 u'millet': 5,
 u'country crock honey spread': 1,
 u'matcha green tea powder': 11,
 u'chopped fresh thyme': 375,
 u'chicken gravy mix': 2,
 u'walnut oil': 14,
 u'Kraft Slim Cut Mozzarella Cheese Slices': 1,
 u'fresh angel hair': 5,
 u'salsify': 2,
 u'galangal': 85,
 u'chicken schmaltz': 3,
 u'butter crackers': 3,
 u'jasmine': 6,
 u'Bisquick Baking Mix': 8,
 u'canned jalapeno peppers': 2,
 u'black grapes': 1,
 u'ground cayenne pepper': 139,
 u'orange soda': 1,
 u'ch