In [1]:
!ls ../data

sample_submission.csv test.json             train.json


In [2]:
!head ../data/sample_submission.csv

id,cuisine
35203,italian
17600,italian
35200,italian
17602,italian
17605,italian
17604,italian
5,italian
7,italian
35209,italian


In [3]:
!head ../data/train.json

[
  {
    "id": 10259,
    "cuisine": "greek",
    "ingredients": [
      "romaine lettuce",
      "black olives",
      "grape tomatoes",
      "garlic",
      "pepper",


In [4]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [5]:
def read(ifn):
    """Load and parse the JSON document stored at ``ifn``.

    Parameters
    ----------
    ifn : str
        Path of the JSON file to read.

    Returns
    -------
    object
        The decoded JSON value (whatever ``json.load`` produces for the file).

    Raises
    ------
    IOError / OSError
        If the file cannot be opened.
    ValueError
        If the file content is not valid JSON.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous one-liner left the open file to the garbage collector.
    with open(ifn, 'r') as handle:
        return json.load(handle)

In [6]:
# Relative paths to the training and test JSON files found in ../data.
train_ifn = "../data/train.json"
test_ifn = "../data/test.json"

In [7]:
# Parse both JSON files into in-memory Python objects via read().
train_data = read(train_ifn)
test_data = read(test_ifn)

In [8]:
# Number of records parsed from the training file.
len(train_data)

39774

In [9]:
# Number of records parsed from the test file.
len(test_data)

9944

In [10]:
# One cuisine label per training record. The loop variable has a real name
# instead of `_`: IPython uses `_` for the previous cell's output, and a
# Python 2 list comprehension leaks its loop variable into the notebook
# namespace, silently overwriting it.
cuisine_types = [recipe[u'cuisine'] for recipe in train_data]

In [11]:
# Count the distinct cuisine labels present in the training data.
len(set(cuisine_types))

20

In [12]:
# Pair each training record's cuisine with its ingredient list. The loop
# variable has a real name instead of `_`, which IPython uses for the previous
# cell's output and which a Python 2 list comprehension would overwrite.
cuisine_and_ingreds = [(recipe[u'cuisine'], recipe["ingredients"]) for recipe in train_data]

In [13]:
# Two-column frame with one row per (cuisine, ingredients) pair.
raw_dset = pd.DataFrame(cuisine_and_ingreds, columns=['cuisine', 'ingredients'])

In [14]:
def createVocabList(ingredients):
    """Return the distinct ingredient names found across all recipes.

    Parameters
    ----------
    ingredients : iterable of iterables
        One inner iterable of hashable, mutually orderable items per recipe
        (ingredient-name strings in this notebook).

    Returns
    -------
    list
        Every distinct item exactly once, in sorted order so that the
        position of each item is the same on every run.
    """
    vocabSet = set()
    for each_ingredient in ingredients:
        # Grow the set in place; `vocabSet | set(...)` rebuilt a copy of the
        # whole vocabulary for every single recipe.
        vocabSet.update(each_ingredient)
    # Plain set iteration order is arbitrary; sorting pins the positions.
    return sorted(vocabSet)

In [15]:
# Vocabulary: every distinct ingredient name that appears in the training recipes.
ingredient_used = createVocabList(raw_dset.ingredients)

In [16]:
# Vocabulary size.
len(ingredient_used)

6714

In [17]:
# Cache each ingredient's position in ingredient_used. enumerate() builds the
# mapping in a single pass; list.index() rescanned the list for every word,
# which is quadratic in the vocabulary size.
indices = {word: position for position, word in enumerate(ingredient_used)}

In [18]:
def setOfWords2Vec(ingredient_used, one_ingredient):
    """Encode one recipe as a 0/1 presence vector over the vocabulary.

    Parameters
    ----------
    ingredient_used : sequence
        The vocabulary; only its length is used here, to size the vector.
    one_ingredient : iterable
        Ingredient names of a single recipe.

    Returns
    -------
    numpy.ndarray
        Array with 1 at the position of every recipe ingredient that is
        known to the vocabulary and 0 everywhere else.

    Notes
    -----
    Positions are looked up in the notebook-level ``indices`` mapping, not in
    the ``ingredient_used`` argument, so that mapping must already exist.
    Names absent from ``indices`` are ignored.
    """
    presence = [0] * len(ingredient_used)
    for name in one_ingredient:
        slot = indices.get(name)
        if slot is not None:
            presence[slot] = 1
    return np.array(presence)

In [19]:
# Encode every recipe's ingredient list as a presence vector; each cell of the
# new column holds one NumPy array.
raw_dset['ingred_vec'] = raw_dset.ingredients.apply(lambda x: setOfWords2Vec(ingredient_used, x))

In [20]:
# Check which columns the frame has after adding the encoded vectors.
raw_dset.columns

Index([u'cuisine', u'ingredients', u'ingred_vec'], dtype='object')

In [21]:
# Give each distinct cuisine an integer id equal to its position in `cuisines`.
cuisines = list(set(raw_dset.cuisine))
labels = dict(zip(cuisines, range(len(cuisines))))

In [22]:
# Translate each cuisine name into its integer id from `labels`.
raw_dset['cuisine_label'] = raw_dset['cuisine'].apply(lambda x: labels[x])

In [23]:
# Preview the first rows, including the derived vector and label columns.
raw_dset.head()

Unnamed: 0,cuisine,ingredients,ingred_vec,cuisine_label
0,greek,"[romaine lettuce, black olives, grape tomatoes...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9
1,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",17
2,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3
3,indian,"[water, vegetable oil, wheat, salt]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",10
4,indian,"[black pepper, shallots, cornflour, cayenne pe...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",10


## Top 5 ingredients for each cuisine

In [24]:
# Group the recipes by cuisine so each cuisine can be summarised separately.
grps = raw_dset.groupby('cuisine')

In [25]:
from itertools import chain
from collections import Counter
def findTopFive(ingredients):
    """Return the five most frequent ingredients with their counts.

    Parameters
    ----------
    ingredients : iterable of iterables
        One collection of ingredient names per recipe.

    Returns
    -------
    list of (ingredient, count) tuples
        At most five entries, most frequent first.
    """
    tally = Counter(chain.from_iterable(ingredients))
    return tally.most_common(5)

# Print one line per cuisine: its name followed by its five most frequent
# ingredients. A single pre-formatted string is passed to print so the cell
# emits the same text whether print is the Python 2 statement or the Python 3
# function (the bare `print a, b` form is a SyntaxError on Python 3).
for name, grp in grps:
    top_names = ', '.join(ingredient for ingredient, _count in findTopFive(grp['ingredients'].values))
    print("%s \t\t%s" % (name, top_names))

brazilian 		salt, onions, olive oil, lime, water
british 		salt, all-purpose flour, butter, milk, unsalted butter
cajun_creole 		salt, onions, garlic, green bell pepper, butter
chinese 		soy sauce, sesame oil, salt, corn starch, sugar
filipino 		salt, garlic, water, onions, soy sauce
french 		salt, sugar, all-purpose flour, unsalted butter, olive oil
greek 		salt, olive oil, dried oregano, garlic cloves, feta cheese crumbles
indian 		salt, onions, garam masala, water, ground turmeric
irish 		salt, butter, all-purpose flour, onions, potatoes
italian 		salt, olive oil, garlic cloves, grated parmesan cheese, garlic
jamaican 		salt, onions, water, garlic, ground allspice
japanese 		soy sauce, salt, mirin, sugar, water
korean 		soy sauce, sesame oil, garlic, green onions, sugar
mexican 		salt, onions, ground cumin, garlic, olive oil
moroccan 		salt, olive oil, ground cumin, onions, ground cinnamon
russian 		salt, sugar, onions, all-purpose flour, sour cream
southern_us 		salt, butter, all-p

# Split the data into train and test sets

In [26]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; sklearn.model_selection provides the same train_test_split. Aliasing it
# keeps the `cross_validation` name that the split cell calls, and the fallback
# keeps older installations working.
try:
    from sklearn import model_selection as cross_validation
except ImportError:  # scikit-learn older than 0.18
    from sklearn import cross_validation

In [27]:
# Features are the per-recipe presence vectors; targets are the integer cuisine ids.
X, y = raw_dset.ingred_vec.values, raw_dset.cuisine_label.values

In [28]:
# Convert every per-recipe vector in X into a plain Python list.
data = [row.tolist() for row in X]

In [29]:
# Stack the equal-length per-recipe lists into a single array with one row per
# recipe. Note that this rebinds X, replacing the array of vectors taken from
# the frame.
X = np.array(data)

In [30]:
# Hold out 33% of the rows for evaluation; the fixed seed is passed to the
# splitter so the partition can be repeated.
random_state = 0
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, 
                                                                     y, 
                                                                     random_state=random_state,
                                                                     test_size=0.33)

## Try Gaussian Naive Bayes
Gaussian Naive Bayes has been shown to be robust for text classification.

In [31]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [32]:
# Fit Gaussian Naive Bayes on the training split, then predict labels for the
# held-out split.
gnb = GaussianNB()
my_results = gnb.fit(X_train, y_train).predict(X_test)

In [33]:
# Score the predictions against the true held-out labels.
accuracy_score(y_test, my_results)

0.3850373304891056

## Try RandomForest
So far the gold standard

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
# Fit a random forest on the training split and score it on the held-out
# split. The forest is seeded with the notebook's `random_state` (the same
# value used for the train/test split); without a seed the trees, and
# therefore the reported accuracy, change from run to run.
rnd = RandomForestClassifier(random_state=random_state)
my_results = rnd.fit(X_train, y_train).predict(X_test)
accuracy_score(y_test, my_results)

0.66882523236324853