In [1]:
# Standard Imports
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Performance
from time import time

In [8]:
# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

In [41]:
# Helper
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn import metrics

# Preliminaries

In [5]:
# Load in the Data
train = pd.read_json('train.json')
test = pd.read_json('test.json')

In [25]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [37]:
# null accuracy (what is accuracy if always predict most prevalent class)
# this gives an idea of the underlying distribution of response values
train.cuisine.value_counts()
train.cuisine.count()
null_accuracy = train.cuisine.value_counts().max()/train.cuisine.count()

# alternative ways of calculating null accuracy
# null_accuracy = train.cuisine.value_counts().head(1) / len(train.cuisine)
# max(train.cuisine.mean(), 1 - train.cuisine.mean()) # if binary classifiation

print("Null accuracy is", null_accuracy)

Null accuracy is 0.197063408257


In [26]:
test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [7]:
# Extract the Unique Ingredients
print("Parsing train ingredients...")
words_train = [' '.join(item) for item in train.ingredients]
print("Parsing test ingredients...")
words_test = [' '.join(item) for item in test.ingredients]
words_train[0:10]

['romaine lettuce black olives grape tomatoes garlic pepper purple onion seasoning garbanzo beans feta cheese crumbles',
 'plain flour ground pepper salt tomatoes ground black pepper thyme eggs green tomatoes yellow corn meal milk vegetable oil',
 'eggs pepper salt mayonaise cooking oil green chilies grilled chicken breasts garlic powder yellow onion soy sauce butter chicken livers',
 'water vegetable oil wheat salt',
 'black pepper shallots cornflour cayenne pepper onions garlic paste milk butter salt lemon juice water chili powder passata oil ground cumin boneless chicken skinless thigh garam masala double cream natural yogurt bay leaf',
 'plain flour sugar butter eggs fresh ginger root salt ground cinnamon milk vanilla extract ground ginger powdered sugar baking powder',
 'olive oil salt medium shrimp pepper garlic chopped cilantro jalapeno chilies flat leaf parsley skirt steak white vinegar sea salt bay leaf chorizo sausage',
 'sugar pistachio nuts white almond bark flour vanilla e

In [9]:
print("Vectorizing...")
vectorizer = TfidfVectorizer(max_df=0.15)
bag_of_words_train = vectorizer.fit_transform(words_train)
bag_of_words_test = vectorizer.transform(words_test)

Vectorizing...


In [None]:
# reduce dimension...
# from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
# fselect = SelectKBest(chi2 , k=70000)
# train_data_features = fselect.fit_transform(bag_of_words_train, train["cuisine"])
# test_data_features = fselect.transform(bag_of_words_test)

# Training

In [10]:
print("Training Naive Bayes...")

model1 = MultinomialNB(alpha=0.0005)
model1.fit( bag_of_words_train, train["cuisine"] )

Training Naive Bayes...


MultinomialNB(alpha=0.0005, class_prior=None, fit_prior=True)

In [11]:
model1_train_pred = cross_val_predict(model1, bag_of_words_train, train.cuisine, cv=2)
print("Naive Bayes accuracy: ", accuracy_score(train.cuisine, model1_train_pred))

Accuracy:  0.717453612913


In [13]:
print("Training stochastic gradient descent...")

model2 = SGDClassifier(loss='modified_huber', n_iter=5, random_state=0, shuffle=True)
model2.fit( bag_of_words_train, train["cuisine"] )

Training stochastic gradient descent...


SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=0, shuffle=True, verbose=0,
       warm_start=False)

In [14]:
model2_train_pred = cross_val_predict(model2, bag_of_words_train, train.cuisine, cv=2)
print("SGD accuracy: ", accuracy_score(train.cuisine, model2_train_pred))

Accuracy:  0.765550359531


In [15]:
print("Training random forest...")
model3 = RandomForestClassifier(n_estimators=100, criterion='gini')
model3.fit(bag_of_words_train, train.cuisine)

Training random forest...


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
model3_train_pred = cross_val_predict(model3, bag_of_words_train, train.cuisine, cv=2)
print("Random forest accuracy: ", accuracy_score(train.cuisine, model3_train_pred))

Random forest accuracy:  0.737416402675


# Model evaluation

In [39]:
# basic comparison (what does model consistently get right or wrong?)
# model may make certain types of errors, but not other types of errors
# we wouldn't see this behavior merely by looking at accuracy scores
print('Pred:', model3_train_pred[30:45])
print('True:', train.cuisine[30:45])

Pred: ['italian' 'cajun_creole' 'mexican' 'thai' 'italian' 'cajun_creole'
 'italian' 'chinese' 'mexican' 'southern_us' 'italian' 'mexican' 'mexican'
 'indian' 'mexican']
True: 30         italian
31    cajun_creole
32         mexican
33            thai
34         italian
35    cajun_creole
36         italian
37        filipino
38     southern_us
39     southern_us
40         italian
41       brazilian
42         mexican
43          indian
44         mexican
Name: cuisine, dtype: object


In [71]:
labels = train.cuisine.unique()
confusion = metrics.confusion_matrix(train.cuisine,model2_train_pred, labels)
for ind in range(0,len(labels)):
    print("Index is", ind, labels[ind])
    print("Most frequently predicted cuisine is", labels[np.argmax(confusion[ind])])
    print(confusion[ind])


Index is 0 greek
Most frequently predicted cuisine is greek
[807  22   1  28   2  20 194  14   4   6   0   0   4   1  45   2   3   1
  17   4]
Index is 1 southern_us
Most frequently predicted cuisine is southern_us
[  20 3274   19   30   19   26  216  135   21   47   10    1  214    6  201
   11   36    5    7   22]
Index is 2 filipino
Most frequently predicted cuisine is filipino
[  1  40 423   5   2   4  31  26  98   6  19  21   5  15  18  19   8  10
   1   3]
Index is 3 indian
Most frequently predicted cuisine is indian
[  22   37   12 2690    9    9   29   57   12    8   27    2    2    4   20
   14    7    1   38    3]
Index is 4 jamaican
Most frequently predicted cuisine is jamaican
[  1  44   7  22 373   2  14  13   4   8   4   1   6   4  13   2   5   0
   1   2]
Index is 5 spanish
Most frequently predicted cuisine is spanish
[ 18  45  11  10   3 414 187  99   6   7   2   4  24   4 118   4  13   1
  12   7]
Index is 6 italian
Most frequently predicted cuisine is italian
[  88  1

In [17]:
p1 = model1.predict_proba( bag_of_words_test )[:,1]
p2 = model2.predict_proba( bag_of_words_test )[:,1]


In [None]:
print("Writing results...")

# output = pd.DataFrame( data = { "id": test["id"], "cuisine": .2*p1 + 1.*p2 } )
# output.to_csv( output_file, index = False, quoting = 3 )