In [2]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
import json

# Read Data Into Pandas DataFrame

In [3]:
with open('train.json','rU') as f:
    data = json.load(f)

In [4]:
cuisines = pd.DataFrame(data)

In [5]:
cuisines.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [6]:
cuisines['ingredient_length']=cuisines.ingredients.apply(len)

In [7]:
cuisines.sort('cuisine')

Unnamed: 0,cuisine,id,ingredients,ingredient_length
1408,brazilian,8015,"[cachaca, pineapple juice, lemon lime beverage]",3
10062,brazilian,48040,"[cider vinegar, salt, flat leaf parsley, sugar...",11
33999,brazilian,28290,"[water, salt, dried oregano, tofu, paprika, re...",12
19408,brazilian,40885,"[bittersweet chocolate, chocolate sprinkles, s...",4
34190,brazilian,12614,"[olive oil, collard greens, rib, salt, black p...",5
12258,brazilian,23569,"[garlic powder, smoked pork, black beans, baco...",10
2908,brazilian,29826,"[ground black pepper, salt, crushed tomatoes, ...",11
2780,brazilian,28080,"[bananas, protein powder, honey, vanilla, ice ...",9
2944,brazilian,26771,"[dried black beans, bacon slices, pork shoulde...",12
2959,brazilian,9398,"[pure vanilla extract, pumpkin purée, unsalted...",8


In [8]:
cuisines.groupby('cuisine').ingredient_length.mean()

cuisine
brazilian        9.520343
british          9.708955
cajun_creole    12.617076
chinese         11.982791
filipino        10.000000
french           9.817838
greek           10.182128
indian          12.705961
irish            9.299850
italian          9.909033
jamaican        12.214829
japanese         9.735067
korean          11.284337
mexican         10.877446
moroccan        12.909866
russian         10.224949
southern_us      9.634954
spanish         10.423660
thai            12.545809
vietnamese      12.675152
Name: ingredient_length, dtype: float64

# Study The Data

In [9]:
cuisines.shape

(39774, 4)

In [10]:
X = cuisines.ingredients

In [11]:
y = cuisines.cuisine

In [12]:
with open('test.json','rU') as f:
    test_data = json.load(f)

In [13]:
test = pd.DataFrame(test_data)

In [14]:
test

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."
5,38527,"[baking powder, all-purpose flour, peach slice..."
6,19666,"[grape juice, orange, white zinfandel]"
7,41217,"[ground ginger, white pepper, green onions, or..."
8,28753,"[diced onions, taco seasoning mix, all-purpose..."
9,22659,"[eggs, cherries, dates, dark muscovado sugar, ..."


In [15]:
X_test = test.ingredients

In [16]:
X_test

0       [baking powder, eggs, all-purpose flour, raisi...
1       [sugar, egg yolks, corn starch, cream of tarta...
2       [sausage links, fennel bulb, fronds, olive oil...
3       [meat cuts, file powder, smoked sausage, okra,...
4       [ground black pepper, salt, sausage casings, l...
5       [baking powder, all-purpose flour, peach slice...
6                  [grape juice, orange, white zinfandel]
7       [ground ginger, white pepper, green onions, or...
8       [diced onions, taco seasoning mix, all-purpose...
9       [eggs, cherries, dates, dark muscovado sugar, ...
10      [pasta, olive oil, crushed red pepper, cherry ...
11      [water, butter, ground sumac, ground lamb, gro...
12      [curry powder, ground cumin, chicken wings, ch...
13      [pasta, marinara sauce, dried basil, chicken f...
14      [salt, custard powder, white sugar, eggs, marg...
15      [vegetable oil cooking spray, egg whites, apri...
16      [vanilla ice cream, banana liqueur, bananas, p...
17      [molas

In [17]:
X_test = X_test.astype('string')

In [18]:
X_test.shape

(9944L,)

In [19]:
X = X.astype('string')

In [20]:
X.shape

(39774L,)

# Function For Implementing Naive Bayes

In [35]:
vect = CountVectorizer(max_df=5000,max_features=2100)
X_dtm = vect.fit_transform(X)
print 'Features: ', X_dtm.shape[1]
X_test_dtm = vect.transform(X_test)
nb = MultinomialNB()
nb.fit(X_dtm, y)
y_pred_class = nb.predict(X_test_dtm)

Features:  2100


In [36]:
y_pred_class

array([u'southern_us', u'southern_us', u'italian', ..., u'italian',
       u'cajun_creole', u'mexican'], 
      dtype='<U12')

In [37]:
sub = pd.DataFrame({'id':test.id, 'cuisine':y_pred_class}).set_index('id')

In [38]:
sub

Unnamed: 0_level_0,cuisine
id,Unnamed: 1_level_1
18009,southern_us
28583,southern_us
41580,italian
29752,cajun_creole
35687,italian
38527,southern_us
19666,spanish
41217,chinese
28753,mexican
22659,british


In [39]:
sub.to_csv('sub1.csv')

# Add length of ingredient list as a feature and implement logistic regression

In [49]:
feature_cols = ['ingredients','ingredient_length']
X_new = cuisines[feature_cols]
y_new = cuisines.cuisine

In [51]:
test['ingredient_length'] = test.ingredients.apply(len)

In [53]:
X_new_test = test[feature_cols]

In [54]:
vect = CountVectorizer(max_df=5000,max_features=2100)
X_new_dtm = vect.fit_transform(X_new.ingredients.astype('string'))

In [55]:
X_new.drop('ingredients', axis=1).shape

(39774, 1)

In [56]:
extra = sp.sparse.csr_matrix(X_new.drop('ingredients', axis=1).astype(float))
extra.shape

(39774, 1)

In [58]:
X_new_dtm_extra = sp.sparse.hstack((X_new_dtm, extra))
X_new_dtm_extra.shape

(39774, 2101)

In [60]:
X_new_test_dtm = vect.transform(X_new_test.ingredients.astype('string'))

In [63]:
extra = sp.sparse.csr_matrix(X_new_test.drop('ingredients', axis=1).astype(float))
X_new_test_dtm_extra = sp.sparse.hstack((X_new_test_dtm, extra))
X_new_test_dtm_extra.shape

(9944, 2101)

In [65]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression(C=1e9)
logreg.fit(X_new_dtm_extra, y_new)
y_pred_class = logreg.predict(X_new_test_dtm_extra)




In [66]:
y_pred_class

array([u'irish', u'southern_us', u'italian', ..., u'italian',
       u'southern_us', u'mexican'], dtype=object)

In [68]:
sub = pd.DataFrame({'id':test.id, 'cuisine':y_pred_class}).set_index('id')

In [69]:
sub.to_csv('sub2.csv')