In [None]:
from imp import reload
from functools import reduce

import pandas as pd
import numpy as np
from scipy import sparse

import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adam
from keras.models import model_from_json
from sklearn.metrics import accuracy_score

from sklearn.cross_validation import StratifiedKFold
from sklearn.svm import SVC

import utils
reload(utils)

# Load data

In [None]:
# Project data container; `test=True` is forwarded to utils.Data
# (presumably so the test-set features are loaded as well -- confirm in utils).
data = utils.Data(test=True)

# Training feature matrices, one per representation.
ingredient_data, word_data, tfidf_word_data = (
    data.ingredient_data,
    data.word_data,
    data.tfidf_word_data,
)

# Raw inputs exposed by the container.
recipes, documents = data.recipes, data.documents

# Targets: `y_model` is the label transformer (its inverse_transform is used
# when writing the submission), `y` is compared against argmax class indices,
# and `y_hc` is the 2-D target matrix the network is trained on.
y_model, y, y_hc = data.y_model, data.y, data.y_hc
test_ids = data.test_ids

# Cross-validation splits, iterated below as (train, test) index pairs.
cv = data.cv10

# NN

The first model is a neural network. It uses the concatenation of ingredient_data and word_data as its input.

In [None]:
def _dense_int8(blocks):
    """Stack sparse feature blocks column-wise and return a dense int8 array."""
    # NOTE(review): the int8 cast assumes every feature value fits in
    # [-128, 127] -- confirm against how utils builds these matrices.
    return sparse.hstack(blocks).toarray().astype(np.int8)


# Ingredient columns come first, then word columns, for both train and test.
x = _dense_int8((ingredient_data, word_data))
x_test = _dense_int8((data.ingredient_test, data.word_test))

In [None]:
# Training settings reused by the cross-validation and final-fit cells.
cv = data.cv10
nb_epoch = 50
batch_size = 2048

n_features = x.shape[1]
n_classes = y_hc.shape[1]
# (units, activation) for each hidden Dense layer, in order.
hidden_layers = [(800, 'relu'), (400, 'tanh'), (200, 'tanh')]

# Dropout on the raw input, then Dense -> Dropout for every hidden layer,
# finishing with a softmax layer that has one unit per target column.
model = Sequential()
model.add(Dropout(0.5, input_shape=(n_features,)))
for n_units, activation in hidden_layers:
    model.add(Dense(n_units, init='he_normal', activation=activation))
    model.add(Dropout(0.5))
model.add(Dense(n_classes, init='he_normal', activation='softmax'))

optimizer = Adam()
model.compile(optimizer=optimizer, loss='categorical_crossentropy')

# JSON description used to rebuild a fresh, untrained copy of the network.
model_description = model.to_json()

In [None]:
print('predicting for train set')

# Out-of-fold predictions: for every CV split, train a fresh network on the
# training indices and store its predictions for the held-out indices in
# 'nn_train_<fold>'. The next cell stitches those files back together.
for i, (train, test) in enumerate(cv):
    model = model_from_json(model_description)
    # On Keras versions where model_from_json restores the architecture only
    # (1.0 and later), fit() fails on an uncompiled model. Compiling here is
    # harmless on older versions and gives every fold a fresh optimizer state.
    model.compile(optimizer=Adam(), loss='categorical_crossentropy')
    model.fit(x[train], y_hc[train], nb_epoch=nb_epoch, batch_size=batch_size)
    p = model.predict(x[test], batch_size=batch_size)
    # ndarray.dump pickles the array; np.load in the next cell reads it back.
    p.dump('nn_train_{}'.format(i))
    print(i)

In [None]:
# Load each fold's saved predictions exactly once. The fold count is taken
# from the CV splits themselves instead of a hard-coded 10.
folds = list(cv)
predictions = [np.load('nn_train_{}'.format(i)) for i in range(len(folds))]

# Scatter every fold's predictions back to the rows it was held out on, so
# `prediction` lines up row-for-row with the training targets.
prediction = np.empty(y_hc.shape)
for p, (train, test) in zip(predictions, folds):
    prediction[test] = p

np.save('nn_train', prediction)

In [None]:
# Rebuild an untrained copy of the network and fit it on the whole training
# set; the next cell uses it to predict the test set.
model = model_from_json(model_description)
# On Keras versions where model_from_json restores the architecture only
# (1.0 and later), fit() fails on an uncompiled model. Compiling here is
# harmless on older versions and starts from a fresh optimizer state.
model.compile(optimizer=Adam(), loss='categorical_crossentropy')
model.fit(x, y_hc, nb_epoch=nb_epoch, batch_size=batch_size)

In [None]:
# Predict the test set with the network fitted on the full training data and
# store the result; np.save appends '.npy', so the file is 'nn_test.npy',
# which the "Layer 2" cells load by that name.
test_prediction = model.predict(x_test, batch_size=batch_size)
np.save('nn_test', test_prediction)

# SVC

In [None]:
# SVC hyper-parameters; C is numerically about 10 ** 0.5.
gamma = 1
C = 3.1622776601683795
# probability=True so the classifier can output class probabilities.
clf = SVC(gamma=gamma, C=C, probability=True)

# NOTE(review): predict_cv is a project helper; presumably it writes
# 'SVC_rbf_train.npy' and 'SVC_rbf_test.npy', which the "Layer 2" cells
# load -- confirm in utils.
data.predict_cv(tfidf_word_data, y, data.tfidf_word_test, clf, 'SVC_rbf')

# Layer 2

In [31]:
# Base models whose saved predictions are combined in this section.
data_names = ['nn', 'SVC_rbf']

# Load each model's training-set predictions and report its accuracy on its
# own before the predictions are averaged.
predictions = []
for model_name in data_names:
    train_scores = np.load('{}_train.npy'.format(model_name))
    print(accuracy_score(y, train_scores.argmax(axis=1)))
    predictions.append(train_scores)

0.814300799517
0.815457333937


In [32]:
# Accuracy of the ensemble: average the per-model prediction matrices
# element-wise, then take the highest-scoring column for every row.
accuracy_score(y, np.argmax(np.mean(np.array(predictions), axis=0), axis=1))

0.82332679639965811

In [None]:
# Test-set predictions of every base model, in the same order as data_names.
predictions = [
    np.load('{}_test.npy'.format(model_name))
    for model_name in data_names
]

In [None]:
# Final class index per test row: element-wise mean of the per-model
# prediction matrices, then argmax across columns.
prediction = np.argmax(np.mean(np.array(predictions), axis=0), axis=1)

In [None]:
def make_submission(p, filename):
    """Write predicted class indices to a CSV submission file.

    `p` is passed through `data.y_model.inverse_transform` to recover the
    original labels, which are written to `filename` as a 'cuisine' column
    indexed by the notebook-level `test_ids` (index column named 'id').
    """
    cuisines = data.y_model.inverse_transform(p)
    submission = pd.DataFrame({'cuisine': cuisines}, index=test_ids)
    submission.index.name = 'id'
    submission.to_csv(filename)


# Submission built from the averaged predictions of the base models.
make_submission(prediction, 'submission_mean.csv')