In [1]:
# Standard Imports
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

# Performance
from time import time

# Machine Learning
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB

# Helper
import os, sys
help_path = os.path.split(os.getcwd())[0]
sys.path.append(help_path)
from helper_nlp import featureExtraction
from helper_models import searchHyperparameters

from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.cross_validation import cross_val_predict, train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics



# Preliminaries

In [2]:
# Load in the Data
train = pd.read_json('train.json')
test = pd.read_json('test.json')

In [3]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [5]:
# null accuracy (what is accuracy if always predict most prevalent class)
# this gives an idea of the underlying distribution of response values
train.cuisine.value_counts()
train.cuisine.count()
null_accuracy = train.cuisine.value_counts().max()/train.cuisine.count()
print(null_accuracy)

0.197063408257


# Feature extraction

In [6]:
# see helper_nlp.py for class and methods
text = featureExtraction(train.ingredients)
trainAsStrings = text.listToString()
stemmedStrings = text.stem(trainAsStrings)
vec = text.tfidfVectorize(stemmedStrings, max_features=2000, ngram_range=(1,1), \
               lowercase=True, stop_words=None, max_df=0.5, min_df= 1)
bow_train = text.bag_of_words(vec, stemmedStrings)

In [7]:
# split training data
X_train, X_test, y_train, y_test = train_test_split(bow_train, train["cuisine"])

# Feature selection

In [None]:
# to-do; fuzzy wuzzy (before feature extraction?); hierarchical selection

In [None]:
# reduce dimension...
# from sklearn.feature_selection.univariate_selection import SelectKBest, chi2, f_classif
# fselect = SelectKBest(chi2 , k=70000)
# train_data_features = fselect.fit_transform(bag_of_words_train, train["cuisine"])
# test_data_features = fselect.transform(bag_of_words_test)

# Training

In [11]:
# instantiate algorithm
alg1 = MultinomialNB()
# define parameter values to be searched
alpha = (0.2, 0.1, 0.05)
param_grid = dict(alpha = alpha)
# instantiate searchHyperparameters()
searchHP_Bayes = searchHyperparameters(alg1, param_grid, cval = 10, score = None)
# run grid search
grid = searchHP_Bayes.fullGrid(X_train, y_train)
results = searchHP_Bayes.results(grid)


Time elapsed (s) is: 0.6241753339767456
Overall results: [mean: 0.72652, std: 0.00781, params: {'alpha': 0.2}, mean: 0.73155, std: 0.00775, params: {'alpha': 0.1}, mean: 0.73134, std: 0.00783, params: {'alpha': 0.05}]
Best score: 0.73154542407
Best parameters: {'alpha': 0.1}
Best model: MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)


In [18]:
# instantiate algorithm
alg2 = SGDClassifier()
# define parameter values to be searched
loss = ('modified_huber',)
alpha = (0.0001, 0.00001) # constant that multiplies the regularization term
penalty = ('elasticnet',) # regularization term
n_iter = (20, 30)
param_grid = dict(alpha=alpha, loss=loss, penalty=penalty, n_iter=n_iter)
# instantiate searchHyperparameters()
searchHP_SGD = searchHyperparameters(alg2, param_grid, cval = None, score = None)
# run grid search
grid = searchHP_SGD.fullGrid(X_train, y_train)
results = searchHP_SGD.results(grid)


Time elapsed (s) is: 23.3721724152565
Overall results: [mean: 0.77161, std: 0.00264, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 20, 'alpha': 0.0001}, mean: 0.77472, std: 0.00057, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 30, 'alpha': 0.0001}, mean: 0.73553, std: 0.00363, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 20, 'alpha': 1e-05}, mean: 0.73989, std: 0.00376, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 30, 'alpha': 1e-05}]
Best score: 0.774723432786
Best parameters: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 30, 'alpha': 0.0001}
Best model: SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='modified_huber', n_iter=30, n_jobs=1,
       penalty='elasticnet', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)


In [None]:
#  mean: 0.57767, std: 0.00147, params: {'penalty': 'elasticnet', 'loss': 'log',          'n_iter': 20, 'alpha': 0.001}, 
# [mean: 0.65970, std: 0.00221, params: {'penalty': 'elasticnet', 'loss': 'hinge',        'n_iter': 20, 'alpha': 0.001},
#  mean: 0.73553, std: 0.00363, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 20, 'alpha': 1e-05}, 
#  mean: 0.73989, std: 0.00376, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 30, 'alpha': 1e-05}]
#   mean: 0.73533, std: 0.00344, params: {'penalty': 'l2',        'loss': 'log',           'n_iter': 10, 'alpha': 0.0001}, 
#  mean: 0.73909, std: 0.00349, params: {'penalty': 'l2',         'loss': 'log',           'n_iter': 5, 'alpha': 0.0001}, 
# mean: 0.75612, std: 0.00671, params: {'penalty': 'elasticnet',  'loss': 'modified_huber', 'n_iter': 5, 'alpha': 0.0001}, 
# mean: 0.76131, std: 0.00244, params: {'penalty': 'elasticnet',  'loss': 'hinge',         'n_iter': 10, 'alpha': 0.0001}]
#  mean: 0.76312, std: 0.00341, params: {'penalty': 'elasticnet', 'loss': 'hinge',         'n_iter': 10, 'alpha': 0.0001}]
#  mean: 0.76430, std: 0.00056, params: {'penalty': 'elasticnet', 'loss': 'hinge',          'n_iter': 10, 'alpha': 0.0001}]
# mean: 0.76470, std: 0.00412, params: {'penalty': 'l2',           'loss': 'modified_huber', 'n_iter': 10, 'alpha': 0.0001}, 
# [mean: 0.76473, std: 0.00180, params: {'penalty': 'elasticnet', 'loss': 'hinge',           'n_iter': 20, 'alpha': 0.0001},
#   mean: 0.76772, std: 0.00341, params: {'penalty': 'l2',         'loss': 'hinge',         'n_iter': 20, 'alpha': 0.0001}, 
#   mean: 0.76899, std: 0.00229, params: {'penalty': 'l2',        'loss': 'hinge',          'n_iter': 10, 'alpha': 0.0001},
#   [mean: 0.77124, std: 0.00315, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 10, 'alpha': 0.0001}, 
# [mean: 0.77161, std: 0.00264, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 20, 'alpha': 0.0001},
#  mean: 0.77342, std: 0.00243, params: {'penalty': 'l2',          'loss': 'modified_huber', 'n_iter': 20, 'alpha': 0.0001}, 
#   mean: 0.77395, std: 0.00192, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 20, 'alpha': 0.0001}, 
#  mean: 0.77472, std: 0.00057, params: {'penalty': 'elasticnet', 'loss': 'modified_huber', 'n_iter': 30, 'alpha': 0.0001}, 

In [None]:
# instantiate algorithm
alg3 = RandomForestClassifier()
# define parameter values to be searched
n_estimators = (10, 50, 100, 150)
criterion = ('gini', 'entropy')
max_features = ("sqrt", "auto") # constant that mulitplies the regularization term
max_depth = (3, None)
param_grid = dict(n_estimators=n_estimators, criterion=criterion, max_features=max_features, \
                  max_depth=max_depth)
# instantiate searchHyperparameters()
searchHP_rf = searchHyperparameters(alg3, param_grid, cval = None, score = None)
# run grid search
grid = searchHP_rf.randomGrid(4, X_train, y_train)
results = searchHP_rf.results(grid)

In [15]:
print("Training random forest...")
model3 = RandomForestClassifier(n_estimators=100, criterion='gini')
model3.fit(bag_of_words_train, train.cuisine)

Training random forest...


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
model3_train_pred = cross_val_predict(model3, bag_of_words_train, train.cuisine, cv=2)
print("Random forest accuracy: ", accuracy_score(train.cuisine, model3_train_pred))

Random forest accuracy:  0.737416402675


# Model evaluation

In [19]:
# with-held sample
holdout_pred = grid.predict(X_test)
print("SGD accuracy: ", accuracy_score(y_test, holdout_pred))

SGD accuracy:  0.791934835076


In [39]:
# basic comparison (what does model consistently get right or wrong?)
# model may make certain types of errors, but not other types of errors
# we wouldn't see this behavior merely by looking at accuracy scores
print('Pred:', model3_train_pred[30:45])
print('True:', train.cuisine[30:45])

Pred: ['italian' 'cajun_creole' 'mexican' 'thai' 'italian' 'cajun_creole'
 'italian' 'chinese' 'mexican' 'southern_us' 'italian' 'mexican' 'mexican'
 'indian' 'mexican']
True: 30         italian
31    cajun_creole
32         mexican
33            thai
34         italian
35    cajun_creole
36         italian
37        filipino
38     southern_us
39     southern_us
40         italian
41       brazilian
42         mexican
43          indian
44         mexican
Name: cuisine, dtype: object


In [71]:
labels = train.cuisine.unique()
confusion = metrics.confusion_matrix(train.cuisine,model2_train_pred, labels)
for ind in range(0,len(labels)):
    print("Index is", ind, labels[ind])
    print("Most frequently predicted cuisine is", labels[np.argmax(confusion[ind])])
    print(confusion[ind])


Index is 0 greek
Most frequently predicted cuisine is greek
[807  22   1  28   2  20 194  14   4   6   0   0   4   1  45   2   3   1
  17   4]
Index is 1 southern_us
Most frequently predicted cuisine is southern_us
[  20 3274   19   30   19   26  216  135   21   47   10    1  214    6  201
   11   36    5    7   22]
Index is 2 filipino
Most frequently predicted cuisine is filipino
[  1  40 423   5   2   4  31  26  98   6  19  21   5  15  18  19   8  10
   1   3]
Index is 3 indian
Most frequently predicted cuisine is indian
[  22   37   12 2690    9    9   29   57   12    8   27    2    2    4   20
   14    7    1   38    3]
Index is 4 jamaican
Most frequently predicted cuisine is jamaican
[  1  44   7  22 373   2  14  13   4   8   4   1   6   4  13   2   5   0
   1   2]
Index is 5 spanish
Most frequently predicted cuisine is spanish
[ 18  45  11  10   3 414 187  99   6   7   2   4  24   4 118   4  13   1
  12   7]
Index is 6 italian
Most frequently predicted cuisine is italian
[  88  1

In [17]:
p1 = model1.predict_proba( bag_of_words_test )[:,1]
p2 = model2.predict_proba( bag_of_words_test )[:,1]


# Final step - submission

In [31]:
# see helper_nlp.py for methods
testText = featureExtraction(test.ingredients)
testAsStrings = testText.listToString() # transform list to strings   
testStemmedStrings = testText.stem(testAsStrings)  # stem the strings
bow_test = testText.bag_of_words(vec, testStemmedStrings)  # use the trained vec to transform testStrings

In [34]:
# Make predictions for test set.
test_pred = grid.predict(bow_test)

In [36]:
# Create a new dataframe with only the columns Kaggle wants from the dataset.
submission = pd.DataFrame({
        "id": test["id"],
        "cuisine": test_pred
    })

# Any files you save will be available in the output tab below
submission.to_csv('submission1.csv', index=False)