In [1]:
import numpy as np
import pandas as pd
import pprint as pp
import itertools as it
from sklearn import cross_validation as cv, naive_bayes as nb, linear_model as lm

In [2]:
# Read the training and testing recipe sets; both are supplied as JSON files.
train_data, test_data = map(pd.read_json, ('data/train.json', 'data/test.json'))

In [3]:
# B
# Pull the cuisine labels and the per-recipe ingredient lists out as numpy arrays.
cuisines_train = np.array(train_data['cuisine'])
ingredients_train = np.array(train_data['ingredients'])
ingredients_test = np.array(test_data['ingredients'])

# Distinct cuisine labels (np.unique returns them sorted).
unique_cuisine_train = np.unique(cuisines_train)
# Distinct ingredients: chain every recipe's list into one flat sequence, then de-duplicate.
unique_ingredients_train = np.unique(list(it.chain(*ingredients_train)))

# Summary of the training set, one line per statistic.
print("\n".join([
    "Number of samples in the training data set: {0}".format(len(train_data)),
    "Number of unique cuisine catagories: {0}".format(len(unique_cuisine_train)),
    "Number of unique ingredients in the training set: {0}".format(len(unique_ingredients_train)),
]))

Number of samples in the training data set: 39774
Number of unique cuisine catagories: 20
Number of unique ingredients in the training set: 6714


In [4]:
# C
def binary_feature_vectorization(unique_ingredients, ingredient_lists):
    """Encode recipes as binary presence vectors over a fixed ingredient vocabulary.

    Parameters
    ----------
    unique_ingredients : sequence of str
        The vocabulary. Position ``j`` in this sequence becomes column ``j`` of
        the result. A numpy array or a plain list both work.
    ingredient_lists : iterable of iterables of str
        One inner iterable of ingredient names per recipe.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(n_recipes, len(unique_ingredients))`` where
        entry ``[i, j]`` is 1 if recipe ``i`` contains vocabulary item ``j``
        and 0 otherwise. Ingredients that are not in the vocabulary are
        ignored. With no recipes the result has shape
        ``(0, len(unique_ingredients))``.
    """
    # Build the name -> column lookup once so that each ingredient costs one
    # dict access instead of two linear scans over the whole vocabulary.
    # setdefault keeps the first position should a name occur more than once.
    column_of = {}
    for column, name in enumerate(unique_ingredients):
        column_of.setdefault(name, column)

    # Materialise the recipes so their count is known up front, then fill a
    # preallocated matrix rather than stacking per-recipe vectors afterwards.
    recipes = list(ingredient_lists)
    feature_vectors = np.zeros((len(recipes), len(unique_ingredients)))
    for row, ingredients in enumerate(recipes):
        for ingredient in ingredients:
            column = column_of.get(ingredient)
            if column is not None:  # skip ingredients outside the vocabulary
                feature_vectors[row, column] = 1
    return feature_vectors

# Encode both data sets against the *training* vocabulary so that the columns
# of the two matrices line up.
feature_vectors_train, feature_vectors_test = (
    binary_feature_vectorization(unique_ingredients_train, recipe_lists)
    for recipe_lists in (ingredients_train, ingredients_test)
)

In [5]:
# D, E, F
# Fit three different classifiers on the binary ingredient features and score
# each of them with k-fold cross validation.

# Single source of truth for the fold count, so the number reported below can
# never drift from the number actually passed to cross_val_score.
N_FOLDS = 3

# Fit every model on the full training set. The fitted `lr` is the model used
# for the test-set predictions in the next cell.
gnb = nb.GaussianNB().fit(feature_vectors_train, cuisines_train)
bnb = nb.BernoulliNB().fit(feature_vectors_train, cuisines_train)
lr = lm.LogisticRegression().fit(feature_vectors_train, cuisines_train)

# cross validation, k = N_FOLDS; each call returns one accuracy score per fold
scores_gauss = cv.cross_val_score(gnb, feature_vectors_train, cuisines_train, cv=N_FOLDS)
scores_bernoulli = cv.cross_val_score(bnb, feature_vectors_train, cuisines_train, cv=N_FOLDS)
scores_linear = cv.cross_val_score(lr, feature_vectors_train, cuisines_train, cv=N_FOLDS)

# Report the mean accuracy over the folds. "Linear" labels the logistic
# regression scores. `!s` formats the score with str(), the same conversion a
# bare print applies.
print("Number of folds: {0}".format(N_FOLDS))
print("Mean accuracy Gaussian:  {0!s}".format(scores_gauss.mean()))
print("Mean accuracy Bernoulli:  {0!s}".format(scores_bernoulli.mean()))
print("Mean accuracy Linear:  {0!s}".format(scores_linear.mean()))

Number of folds: 3
Mean accuracy Gaussian:  0.38215893891
Mean accuracy Bernoulli:  0.683587657646
Mean accuracy Linear:  0.775758670409


In [6]:
# G
# Label every test recipe with the logistic-regression model fitted above.
test_predict = lr.predict(feature_vectors_test)
# Assemble the (id, cuisine) table with the columns in the required order,
# then write it to CSV without the row index.
submission = pd.DataFrame(
    {'id': test_data['id'], 'cuisine': test_predict},
    columns=['id', 'cuisine'],
)
submission.to_csv('cuisine_submission.csv', index=False)
print(submission)

         id       cuisine
0     18009       british
1     28583   southern_us
2     41580       italian
3     29752  cajun_creole
4     35687       italian
5     38527   southern_us
6     19666       spanish
7     41217       chinese
8     28753       mexican
9     22659       british
10    21749       italian
11    44967         greek
12    42969        indian
13    44883       italian
14    20827   southern_us
15    23196        french
16    35387       mexican
17    33780   southern_us
18    19001       mexican
19    16526   southern_us
20    42455      japanese
21    47453        indian
22    42478       spanish
23    11885    vietnamese
24    16585       italian
25    29639   southern_us
26    26245    vietnamese
27    38516        korean
28    47520       italian
29    26212       italian
...     ...           ...
9914  49157    vietnamese
9915  40847       italian
9916  14084        french
9917   6802       italian
9918  22381   southern_us
9919  21016     brazilian
9920  29024 