In [1]:
!ls ../data

sample_submission.csv test.json             train.json


In [2]:
!head ../data/sample_submission.csv

id,cuisine
35203,italian
17600,italian
35200,italian
17602,italian
17605,italian
17604,italian
5,italian
7,italian
35209,italian


In [3]:
!head ../data/train.json

[
  {
    "id": 10259,
    "cuisine": "greek",
    "ingredients": [
      "romaine lettuce",
      "black olives",
      "grape tomatoes",
      "garlic",
      "pepper",


In [4]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [5]:
def read(ifn):
    return json.loads(open(ifn, 'r').read())

In [6]:
train_ifn = "../data/train.json"
test_ifn = "../data/test.json"

In [7]:
train_data = read(train_ifn)
test_data = read(test_ifn)

In [8]:
len(train_data)

39774

In [9]:
len(test_data)

9944

In [10]:
cuisine_types = [_[u'cuisine'] for _ in train_data]

In [11]:
len(set(cuisine_types))

20

In [12]:
cuisine_and_ingreds = [(_[u'cuisine'], _["ingredients"]) for _ in train_data]

In [13]:
raw_dset = pd.DataFrame(cuisine_and_ingreds, columns=['cuisine', 'ingredients'])

In [14]:
raw_dset.head()

Unnamed: 0,cuisine,ingredients
0,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,"[water, vegetable oil, wheat, salt]"
4,indian,"[black pepper, shallots, cornflour, cayenne pe..."


## tf-idf method (term frequency–inverse document frequency)
https://en.wikipedia.org/wiki/Tf%E2%80%93idf

### calculate term frequency

In [15]:
def tf(tokens):
    '''take a list of tokens and returns a Python dictionary mapping tokens to TF weights
    '''
    from collections import Counter
    return Counter(tokens)

In [16]:
from itertools import chain

In [17]:
tfs = {}

In [18]:
for name, group in raw_dset.groupby('cuisine'):
    ingredients = list(chain(*group.ingredients.values))
    tfs[name] = tf(ingredients)

### calculate inverse document frequence

In [19]:
cuisines_ingredients = {} # the ingredients of each cuisine
all_ingredients = list(chain(*raw_dset.ingredients.values))

In [20]:
for name, group in raw_dset.groupby('cuisine'):
    ingredients = list(chain(*group.ingredients.values))
    cuisines_ingredients[name] = set(ingredients)

In [21]:
idf = {}

In [22]:
from math import log10
for ingredient in set(all_ingredients):
    num_cuisines_contains_this_ingredient = len([_ for _ in cuisines_ingredients
                                                 if ingredient in cuisines_ingredients[_]])
    idf[ingredient] = log10(20 / num_cuisines_contains_this_ingredient)

### Top 4 most unique ingredients in each cuisine
中国菜多黄酒，泰国菜多鱼露，意大利喜欢千层面，墨西哥菜多吃豆子，日本菜吃味醂，whole lot of sense!!

In [23]:
for cuisine, ingredients in cuisines_ingredients.iteritems():
    tf_idfs = [(ingredient, tfs[cuisine][ingredient] * idf[ingredient]) 
               for ingredient in ingredients]
    tf_idfs.sort(key=lambda t: t[1], reverse=True)
    print cuisine, ":", ", ".join([t[0] for t in tf_idfs[:4]])

irish : Irish whiskey, Guinness Beer, irish cream liqueur, corned beef
mexican : refried beans, enchilada sauce, taco seasoning mix, taco seasoning
chinese : Shaoxing wine, oyster sauce, hoisin sauce, chinese rice wine
filipino : fish sauce, calamansi juice, lumpia wrappers, calamansi
vietnamese : fish sauce, beansprouts, rice paper, thai basil
moroccan : ras el hanout, couscous, preserved lemon, saffron threads
spanish : saffron threads, chorizo sausage, spanish chorizo, serrano ham
japanese : mirin, sake, dashi, nori
french : gruyere cheese, grated Gruyère cheese, fresh tarragon, calvados
greek : feta cheese crumbles, feta cheese, greek seasoning, kalamata
indian : garam masala, curry leaves, paneer, ghee
cajun_creole : cajun seasoning, andouille sausage, creole seasoning, file powder
british : stilton cheese, suet, beef drippings, stilton
brazilian : cachaca, açai, manioc flour, palm oil
russian : sauerkraut, buckwheat flour, pierogi, farmer cheese
jamaican : scotch bonnet chile, ja


### Top 4 most common ingredients in all cuisine

In [24]:
for cuisine, ingredients in cuisines_ingredients.iteritems():
    tf_idfs = [(ingredient, tfs[cuisine][ingredient] * idf[ingredient]) 
               for ingredient in ingredients]
    tf_idfs.sort(key=lambda t: t[1])
    print cuisine, ":", ", ".join([t[0] for t in tf_idfs[:4]])

irish : red cabbage, dijon mustard, yellow corn meal, sweet potatoes
mexican : red cabbage, ground cardamom, pineapple juice, boneless skinless chicken breast halves
chinese : red cabbage, pineapple juice, boneless skinless chicken breast halves, fresh leav spinach
filipino : dijon mustard, sweet potatoes, ground cardamom, milk
vietnamese : red cabbage, dijon mustard, sweet potatoes, seasoning salt
moroccan : red cabbage, dijon mustard, extra large eggs, sweet potatoes
spanish : dijon mustard, green pepper, sausage links, medium shrimp uncook
japanese : red cabbage, ground cardamom, pineapple juice, boneless skinless chicken breast halves
french : red cabbage, ground cardamom, pineapple juice, boneless skinless chicken breast halves
greek : dijon mustard, bell pepper, extra large eggs, green pepper
indian : red cabbage, boneless skinless chicken breast halves, fresh leav spinach, hard-boiled egg
cajun_creole : red cabbage, ground cardamom, pineapple juice, boneless skinless chicken bre

### Use most unique ingredients in each cuisine

In [25]:
unique_ingredients = {}
top_percent = 0.8
for cuisine, ingredients in cuisines_ingredients.iteritems():
    tf_idfs = [(ingredient, tfs[cuisine][ingredient] * idf[ingredient]) 
               for ingredient in ingredients]
    tf_idfs.sort(key=lambda t: t[1], reverse=True)
    num_ingredients = len(ingredients)
    tops = int(num_ingredients * top_percent)
    unique_ones = tf_idfs[0 : tops]
    unique_ones = [t[0] for t in unique_ones]
    unique_ingredients[cuisine] = unique_ones

In [26]:
unique_ingredient_used = list(set(chain(*unique_ingredients.values())))

## Baseline

In [27]:
def createVocabList(ingredients):
    vocabSet = set([])
    for each_ingredient in ingredients:
        vocabSet = vocabSet | set(each_ingredient)
    return list(vocabSet)

In [28]:
ingredient_used = createVocabList(raw_dset.ingredients)

In [29]:
len(ingredient_used)

6714

In [30]:
indices = {word: ingredient_used.index(word) for word in ingredient_used} # cache the indices

In [31]:
def setOfWords2Vec(ingredient_used, one_ingredient):
    returnVec = [0] * len(ingredient_used)
    for word in one_ingredient:
        if word in indices:
            returnVec[indices[word]] = 1
    return np.array(returnVec)

In [32]:
raw_dset['ingred_vec'] = raw_dset.ingredients.apply(lambda x: setOfWords2Vec(ingredient_used, x))

In [33]:
cuisines = list(set(raw_dset.cuisine))
labels = {cuisine: cuisines.index(cuisine) for cuisine in cuisines}

In [34]:
raw_dset['cuisine_label'] = raw_dset['cuisine'].apply(lambda x: labels[x])

## Use more unique ingredients

In [35]:
len(unique_ingredient_used)

6647

In [36]:
uniq_indices = {word: unique_ingredient_used.index(word) for word in unique_ingredient_used} # cache the indices

In [37]:
def setOfWords2Vec(ingredient_used, one_ingredient):
    returnVec = [0] * len(ingredient_used)
    for word in one_ingredient:
        if word in uniq_indices:
            returnVec[uniq_indices[word]] = 1
    return np.array(returnVec)

In [38]:
raw_dset['unique_ingred_vec'] = raw_dset.ingredients.apply(lambda x: setOfWords2Vec(unique_ingredient_used, x))

# split the train and test

In [39]:
from sklearn import cross_validation

In [40]:
X, y = raw_dset.ingred_vec.values, raw_dset.cuisine_label.values
uniq_X = raw_dset.unique_ingred_vec.values

In [41]:
data = []
for row in X:
    data.append(row.tolist())
X = np.array(data)

In [42]:
u_data = []
for row in uniq_X:
    u_data.append(row.tolist())
u_X = np.array(u_data)

In [43]:
random_state = 0
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, 
                                                                     y, 
                                                                     random_state=random_state,
                                                                     test_size=0.33)
u_X_train, u_X_test, u_y_train, u_y_test = cross_validation.train_test_split(u_X, 
                                                                     y, 
                                                                     random_state=random_state,
                                                                     test_size=0.33)

## Try RandomForest
So far the gold stone

In [44]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [47]:
rnd = RandomForestClassifier(n_estimators=10)
my_results = rnd.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, my_results)

0.66852049367667221

### Use only the most unique ingredients

In [48]:
rnd = RandomForestClassifier(n_estimators=10)
my_results = rnd.fit(u_X_train, u_y_train).predict(u_X_test)
accuracy_score(u_y_test, my_results)

0.66387322870638432