# Submission 6

In [2]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import SGDClassifier, LassoCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
import json
import random

### Loads training data

In [3]:
# Read the raw training recipes. The context manager closes the file handle
# deterministically instead of leaving it open until garbage collection.
with open('data/train.json') as train_file:
    data = json.load(train_file)

# Turn every recipe into a single whitespace-separated string for the
# CountVectorizer. Spaces inside an ingredient name are replaced with
# underscores first, so the only spaces left are the separators between
# ingredients.
X_full = [' '.join(ingredient.replace(' ', '_')
                   for ingredient in food['ingredients'])
          for food in data]

# A list comprehension always produces a concrete list, so np.asarray builds a
# 1-D label array even on interpreters where map() returns a lazy iterator.
y_full = [food['cuisine'] for food in data]

X_full, y_full = np.asarray(X_full), np.asarray(y_full)

# Every recipe must have exactly one label.
assert len(X_full) == len(y_full)

### Loads test data

In [4]:
# Read the unlabeled test recipes; the context manager closes the file handle.
with open('data/test.json') as test_file:
    test_data = json.load(test_file)

# Build one whitespace-separated string per recipe, replacing spaces inside an
# ingredient name with underscores -- the same transformation the training
# recipes receive. The former .encode('ascii', 'ignore') step was applied to
# the test recipes only: it silently dropped non-ASCII characters, so accented
# ingredient names became different tokens from the ones fitted on the
# training data, and on Python 3 it produced bytes that ' '.join rejects.
X_out_test = [' '.join(ingredient.replace(' ', '_')
                       for ingredient in food['ingredients'])
              for food in test_data]
X_out_test = np.asarray(X_out_test)

In [5]:
# Show the first five training recipes, separated by blank lines.
print('\n\n'.join(X_full[:5]))

romaine_lettuce black_olives grape_tomatoes garlic pepper purple_onion seasoning garbanzo_beans feta_cheese_crumbles

plain_flour ground_pepper salt tomatoes ground_black_pepper thyme eggs green_tomatoes yellow_corn_meal milk vegetable_oil

eggs pepper salt mayonaise cooking_oil green_chilies grilled_chicken_breasts garlic_powder yellow_onion soy_sauce butter chicken_livers

water vegetable_oil wheat salt

black_pepper shallots cornflour cayenne_pepper onions garlic_paste milk butter salt lemon_juice water chili_powder passata oil ground_cumin boneless_chicken_skinless_thigh garam_masala double_cream natural_yogurt bay_leaf


In [23]:
# Classification pipeline used for the hold-out check and the submission:
#   vect  - token counts, ignoring tokens whose document frequency exceeds 0.7
#   tfidf - tf-idf re-weighting of the counts
#   kbest - keep the 4900 best features under SelectKBest's default score function
#   clf   - linear model fitted by SGD with an elastic-net penalty
# random_state pins the classifier's internal randomness (same seed as the
# train/test split) so the hold-out score and the submitted predictions can be
# reproduced on a fresh run.
full_clf = Pipeline([('vect', CountVectorizer(max_df=.7)),
                     ('tfidf', TfidfTransformer()),
                     ('kbest', SelectKBest(k=4900)),
                     ('clf', SGDClassifier(alpha=1e-5,
                                           n_iter=70,
                                           penalty='elasticnet',
                                           random_state=7))])

In [24]:
# Keep 20% of the labeled recipes aside as a hold-out set (fixed seed so the
# split is the same on every run).
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y_full, test_size=.2, random_state=7)

# Fit on the training part, predict the hold-out part, and display the
# fraction of hold-out recipes whose cuisine was predicted correctly.
full_clf.fit(X_train, y_train)
preds = full_clf.predict(X_test)
(preds == y_test).mean()

0.78227529855436828

In [43]:
# Sweep the SGDClassifier regularisation strength (alpha) and record the
# hold-out accuracy reached with each value as (alpha, accuracy) tuples.
scores = []

for alpha in [1e1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6, 1e-7]:
    # Fresh pipeline per alpha. random_state fixes the classifier's internal
    # randomness so that differences between runs come from alpha alone
    # rather than from a different shuffle of the training data.
    temp_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('kbest', SelectKBest(k=4900)),
                         ('clf', SGDClassifier(alpha=alpha, random_state=7))])

    temp_clf.fit(X_train, y_train)
    preds = temp_clf.predict(X_test)
    scores.append((alpha, np.mean(preds == y_test)))

In [41]:
# One "alpha, accuracy" line per entry of the sweep results.
print('\n'.join('{}, {}'.format(entry[0], entry[1]) for entry in scores))

10.0, 0.256442489001
0.1, 0.286109365179
0.01, 0.659836580767
0.001, 0.694657448146
0.0001, 0.769201759899
1e-05, 0.7795097423
1e-06, 0.752231301069
1e-07, 0.72155876807


In [42]:
# Report every swept value next to its hold-out accuracy. The first element of
# each tuple in `scores` is the SGD alpha that was tried, not a k, so the
# label now says so.
for alpha, accuracy in scores:
    print('alpha: {}, accuracy: {}'.format(alpha, accuracy))

k: 10.0, accuracy: 0.256442489001
k: 0.1, accuracy: 0.286109365179
k: 0.01, accuracy: 0.659836580767
k: 0.001, accuracy: 0.694657448146
k: 0.0001, accuracy: 0.769201759899
k: 1e-05, accuracy: 0.7795097423
k: 1e-06, accuracy: 0.752231301069
k: 1e-07, accuracy: 0.72155876807


Predict a cuisine for every recipe in the test file and write the predictions to a CSV file for submission.

In [25]:
# Predict a cuisine for each test recipe and pair it with that recipe's id.
preds = full_clf.predict(X_out_test)
recipe_ids = [recipe['id'] for recipe in test_data]

# Single-column frame ('cuisine') indexed by recipe id, so the CSV has the
# header "id,cuisine".
df = pd.DataFrame({'id': recipe_ids, 'cuisine': preds}).set_index('id')

df.to_csv('submissions/submission_9.csv')

In [70]:
# Number of training recipes (X_full holds one joined ingredient string per recipe).
len(X_full)

39774