### Sk_horosh_gus

*  Mikhail Usvyatsov
*  Ekaterina Yakovleva
*  Anastasia Makarova

#### Result $0.81245$



The winning code consists of simple data cleaning and a neural network with tuned parameters.

### Data Cleaning

* Remove all symbols except letters
* Convert letters to lowercase
* Stem each word with `PorterStemmer`

### Feature processing

* Instead of whole ingredients, single words were used as features
* *TF-IDF* and other feature processing tools were not used :)

### Structure of Neural Network
##### 4 layers: 

* Input $l_1$

* 2 Hidden layers: Dropout $l_2$, Original $l_3$

* Output Dropout $l_4$ layer.

Links to the Dropout layers are zeroed out with probabilities $p_2 = 0.4$ and $p_4 = 0.5$, respectively.

Number of units in the Original hidden layer: $7000$; activation function: ReLU.

* Number of epochs $= 400$
* Learning rate $= 0.01$
* Update momentum (nesterov momentum) $= 0.4$





### Data preprocessing

In [1]:
import json
import nltk
import re

# Load the recipe lists. The context managers close each file as soon as it
# has been parsed instead of leaving the handle open for the whole session.
with open('train.json') as train_file:
    train_data = json.load(train_file)

with open('test.json') as test_file:
    test_data = json.load(test_file)

def build_dict(train_data):
    """Build the vocabulary and label lookups from the training recipes.

    Every ingredient string is split on whitespace; each word is lower-cased,
    stripped of every character outside a-z/A-Z and Porter-stemmed. Non-empty
    stems and cuisine names are numbered in order of first appearance.

    Returns a tuple ``(stem -> column index, cuisine -> label,
    label -> cuisine)``.
    """
    porter = nltk.PorterStemmer()
    non_letters = re.compile('[^a-zA-Z]')

    word_to_column = {}
    cuisine_to_label = {}
    label_to_cuisine = {}

    for recipe in train_data:
        cuisine = recipe['cuisine']
        if cuisine not in cuisine_to_label:
            # The next free label is simply the number of cuisines seen so far.
            label = len(cuisine_to_label)
            cuisine_to_label[cuisine] = label
            label_to_cuisine[label] = cuisine

        for ingredient in recipe['ingredients']:
            for token in ingredient.split():
                stem = porter.stem(non_letters.sub('', token.lower()))

                # Skip words that vanish after cleaning and stems already indexed.
                if not stem or stem in word_to_column:
                    continue

                word_to_column[stem] = len(word_to_column)

    return word_to_column, cuisine_to_label, label_to_cuisine
        
# Build the stem -> column, cuisine -> label and label -> cuisine lookups from the training recipes.
ingredient_dictionary, cuisine_dictionary, inverse_cuisine_dictionary = build_dict(train_data)

In [2]:
from scipy.sparse import lil_matrix
import numpy as np

def build_matrix(examples, ingredient_dictionary, cuisine_dictionary):
    """Encode labelled recipes as a binary word-presence matrix.

    Returns ``(matrix, answers)``: ``matrix`` is a float32 array with one row
    per example and one column per entry of ``ingredient_dictionary``, holding
    1 where the recipe contains that stem; ``answers`` is the list of integer
    labels looked up in ``cuisine_dictionary``. Stems that are not in
    ``ingredient_dictionary`` are ignored.
    """
    porter = nltk.PorterStemmer()
    non_letters = re.compile('[^a-zA-Z]')

    shape = (len(examples), len(ingredient_dictionary))
    features = np.zeros(shape, dtype = np.float32)
    labels = []

    for row, recipe in enumerate(examples):
        for ingredient in recipe['ingredients']:
            for token in ingredient.split():
                stem = porter.stem(non_letters.sub('', token.lower()))

                # Nothing to mark for empty or out-of-vocabulary stems.
                if not stem or stem not in ingredient_dictionary:
                    continue

                features[row, ingredient_dictionary[stem]] = 1

        labels.append(cuisine_dictionary[recipe['cuisine']])

    return features, labels

# Encode the whole training set as a binary word-presence matrix plus integer cuisine labels.
train_data_matrix, train_data_labels = build_matrix(train_data, ingredient_dictionary, cuisine_dictionary)

# NOTE(review): `sklearn.cross_validation` exists only in older scikit-learn releases; newer
# ones provide train_test_split in `sklearn.model_selection` - confirm against the installed version.
from sklearn.cross_validation import train_test_split

# Hold out 33% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(train_data_matrix, train_data_labels, test_size = 0.33, random_state = 0)

In [4]:
from sklearn.cross_validation import train_test_split

def more_features_matrix(dataset, ingredient_dictionary, cuisine_dictionary):
    """Return the word-presence matrix with an ingredient-count column appended.

    The first ``len(ingredient_dictionary)`` columns come from ``build_matrix``
    (its labels are discarded); the final column holds the number of entries
    in each recipe's ``'ingredients'`` list as float32.
    """
    bag_of_words, _labels = build_matrix(dataset, ingredient_dictionary, cuisine_dictionary)

    counts = [len(recipe['ingredients']) for recipe in dataset]
    # Shape the counts as an (n_rows, 1) column so they can be stacked next to the matrix.
    count_column = np.array(counts, dtype = np.float32).reshape(bag_of_words.shape[0], 1)

    return np.hstack((bag_of_words, count_column))

# Word-presence features plus one extra column holding the ingredient count of each recipe.
train_data_matrixa = more_features_matrix(train_data, ingredient_dictionary, cuisine_dictionary)

# Same test_size and random_state as the split of train_data_matrix, applied to the augmented features.
Xa_train, Xa_val, ya_train, ya_val = train_test_split(train_data_matrixa, train_data_labels, test_size = 0.33, random_state = 0)

### Neural Network

In [5]:
import numpy as np
import theano
import theano.tensor as T
import lasagne
from nolearn.lasagne import NeuralNet
from lasagne.updates import nesterov_momentum

Using gpu device 0: GeForce GTX 765M


In [80]:
def build_network(features_number, cuisine_dictionary, max_epochs = 30, update_momentum = 0.4):
    """Create the (untrained) nolearn classifier used throughout the notebook.

    Layer stack: input -> dropout (p = 0.4) -> dense layer with 7000 ReLU
    units -> dropout (p = 0.5) -> softmax output with one unit per entry of
    ``cuisine_dictionary``.

    features_number  -- width of the input feature matrix
    max_epochs       -- value passed through as the network's ``max_epochs``
    update_momentum  -- momentum passed to the Nesterov momentum update rule
    """
    layers = lasagne.layers

    hidden_options = {
        'num_units' : 7000,
        'nonlinearity': lasagne.nonlinearities.rectify,
    }
    layer_stack = [
        ('input', layers.InputLayer),
        (layers.DropoutLayer, {'p' : 0.4}),
        (layers.DenseLayer, hidden_options),
        (layers.DropoutLayer, {'p' : 0.5}),
        ('output', layers.DenseLayer),
    ]

    settings = dict(
        layers = layer_stack,
        input_shape = (None, features_number),
        output_nonlinearity = lasagne.nonlinearities.softmax,
        output_num_units = len(cuisine_dictionary),

        # Optimisation: Nesterov momentum with a fixed learning rate of 0.01.
        update = nesterov_momentum,
        update_learning_rate = 0.01,
        update_momentum = update_momentum,

        # False: this is a classification task, not a regression one.
        regression = False,
        max_epochs = max_epochs,
        verbose = 1,
    )

    return NeuralNet(**settings)

def check_accuracy(network, X_val, y_val):
    """Return the fraction of validation rows that ``network`` labels correctly.

    network -- object exposing ``predict(X)`` that returns one label per row
    X_val   -- NumPy feature matrix; cast to float32 before prediction
    y_val   -- sequence of expected integer labels, one per row of ``X_val``

    Returns a float in [0, 1]; an empty validation set yields 0.0 instead of
    a ZeroDivisionError.

    Raises ValueError when the number of predictions differs from the number
    of expected labels, rather than scoring against the wrong denominator.
    """
    predicted = np.asarray(network.predict(X_val.astype(np.float32))).ravel()
    expected = np.asarray(y_val).ravel()

    if predicted.size != expected.size:
        raise ValueError(
            'got {0} predictions for {1} labels'.format(predicted.size, expected.size)
        )

    if expected.size == 0:
        return 0.0

    # float() forces true division on Python 2 as well, where int / int floors.
    return float(np.count_nonzero(predicted == expected)) / expected.size

### Testing

In [None]:
# Train on the augmented features (word presence + ingredient count) for up to 1600 epochs.
network = build_network(Xa_train.shape[1], cuisine_dictionary, 1600)
# Use the labels returned by the same split that produced Xa_train, so the pairing
# does not depend on whatever `y_train` happens to be bound to when this cell runs.
network.fit(Xa_train, np.array(ya_train, dtype = np.int32))

In [None]:
# Validation accuracy of the network on the augmented-feature hold-out split.
check_accuracy(network, Xa_val, ya_val)

In [None]:
# Fresh network on the plain word-presence features, with max_epochs set to 173.
network = build_network(X_train.shape[1], cuisine_dictionary, 173)
network.fit(X_train, np.array(y_train, dtype = np.int32))

In [None]:
# Validation accuracy of the network on the plain-feature hold-out split.
check_accuracy(network, X_val, y_val)

In [None]:
# Rebuilding the network is commented out, so fit() below is called on the
# existing `network` object, this time with the full training matrix.
# network = build_network(train_data_matrix.shape[1], cuisine_dictionary, 173)
network.fit(train_data_matrix, np.array(train_data_labels, dtype = np.int32))

In [None]:
# Persist the current network parameters to disk.
network.save_params_to('80_full.txt')

In [47]:
def make_test_matrices(data, ingredient_dictionary, cuisine_dictionary):
    """Encode unlabelled recipes as a binary word-presence matrix.

    Words are cleaned the same way as in this function's labelled counterpart:
    lower-cased, stripped of non-letters and Porter-stemmed; stems missing
    from ``ingredient_dictionary`` are ignored. ``cuisine_dictionary`` is
    accepted but not used.

    Returns ``(matrix, ids)``: a float32 array with one row per recipe and the
    list of the recipes' ``'id'`` values in the same order.
    """
    porter = nltk.PorterStemmer()
    non_letters = re.compile('[^a-zA-Z]')

    features = np.zeros((len(data), len(ingredient_dictionary)), dtype = np.float32)
    recipe_ids = []

    for row, recipe in enumerate(data):
        tokens = (
            token
            for ingredient in recipe['ingredients']
            for token in ingredient.split()
        )
        stems = (porter.stem(non_letters.sub('', token.lower())) for token in tokens)
        columns = [
            ingredient_dictionary[stem]
            for stem in stems
            if len(stem) > 0 and stem in ingredient_dictionary
        ]

        # Mark every known stem of this recipe in a single assignment.
        if columns:
            features[row, columns] = 1

        recipe_ids.append(recipe['id'])

    return features, recipe_ids

In [48]:
# Encode the test recipes with the vocabulary built from the training set; `ids` keeps their order.
test_data_matrix, ids = make_test_matrices(test_data, ingredient_dictionary, cuisine_dictionary)

In [99]:
# Predicted integer label for every test recipe.
answers = network.predict(test_data_matrix)

In [95]:
def write_answers(answers, ids, inverse_cuisine_dictionary, path = 'answers.txt'):
    """Write the predictions as a two-column ``id,cuisine`` CSV file.

    answers -- sequence of predicted integer labels
    ids     -- recipe ids, in the same order as ``answers``
    inverse_cuisine_dictionary -- mapping from integer label to cuisine name
    path    -- output file; defaults to the original 'answers.txt'

    Raises ValueError when ``answers`` and ``ids`` differ in length, and
    KeyError when a label is missing from ``inverse_cuisine_dictionary``.
    Both are raised before the file is opened, so no partial file is left.
    """
    if len(answers) != len(ids):
        raise ValueError(
            'got {0} answers for {1} ids'.format(len(answers), len(ids))
        )

    # Resolve every label first so a bad label cannot truncate the output.
    rows = [
        '{0},{1}\n'.format(example_id, inverse_cuisine_dictionary[answer])
        for example_id, answer in zip(ids, answers)
    ]

    # The with-block closes the file; no explicit close() is needed.
    with open(path, 'w') as f:
        f.write('id,cuisine\n')
        f.writelines(rows)

In [115]:
# Write the submission file, mapping integer labels back to cuisine names.
write_answers(answers, ids, inverse_cuisine_dictionary)

In [None]:
# New network sized for the full word-presence matrix, with max_epochs set to 173.
network = build_network(train_data_matrix.shape[1], cuisine_dictionary, 173)

In [None]:
# Restore previously saved parameters into the network.
network.load_params_from('82_full.dat')

In [None]:
# Fit on the full training set (no validation rows held out).
network.fit(train_data_matrix, np.array(train_data_labels, dtype = np.int32))

In [None]:
# Write the parameters back to the same file they were loaded from.
network.save_params_to('82_full.dat')

In [108]:
# Per-class scores for every test recipe (one column per label).
proba = network.predict_proba(test_data_matrix)

In [112]:
# Zero the score column of label 9 so the argmax over `proba` cannot select it.
# NOTE(review): which cuisine label 9 maps to depends on inverse_cuisine_dictionary;
# the reason for excluding it is not visible here - confirm.
proba[:, 9] = 0

In [114]:
# Pick the highest-scoring label per row from the (modified) score matrix.
answers = np.argmax(proba, axis=1)

In [68]:
from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF re-weighting of the binary word-presence matrix.
transformer = TfidfTransformer()
# toarray() yields a plain ndarray; todense() would return an np.matrix, which
# stays 2-D under indexing and overloads `*` as matrix multiplication.
tf_idf_m = transformer.fit_transform(train_data_matrix, train_data_labels).toarray().astype(np.float32)

In [None]:
from sklearn.cross_validation import train_test_split

# Rebinds X_train / X_val / y_train / y_val to a split of the TF-IDF features (same test_size and seed).
X_train, X_val, y_train, y_val = train_test_split(tf_idf_m, train_data_labels, test_size = 0.33, random_state = 0)

In [None]:
# Network with max_epochs = 400 and momentum 0.3; parameters are loaded from a
# saved file before fitting on the full training set.
network = build_network(train_data_matrix.shape[1], cuisine_dictionary, 400, 0.3)
network.load_params_from('80.93_big_full.dat')
network.fit(train_data_matrix, np.array(train_data_labels, dtype = np.int32))

In [None]:
# Rebuild the same architecture and restore the saved parameters.
network = build_network(train_data_matrix.shape[1], cuisine_dictionary, 400, 0.3)
network.load_params_from('81.281_big_full.dat')
# predict() needs a feature matrix to score; the original call passed none.
# NOTE(review): the test matrix is assumed to be the intended input - confirm.
network.predict(test_data_matrix)

In [39]:
# Network sized for the current X_train, with max_epochs = 250 and momentum 0.5.
network = build_network(X_train.shape[1], cuisine_dictionary, 250, 0.5)

In [None]:
# Restore previously saved parameters into the network.
network.load_params_from('81_full.dat')

In [97]:
# Persist the current network parameters to disk.
network.save_params_to('80.93_big_full.dat')

In [41]:
# pickle is not imported by any earlier cell, so bring it into scope here.
import pickle

# Snapshot every parameter value of the network and pickle the snapshot.
params = network.get_all_params_values()

# Binary mode is required for pickle output; protocol 2 is kept from the original.
with open('81_full_another.dat', 'wb') as f:
    pickle.dump(params, f, protocol=2)

In [None]:
# pickle is not imported by any earlier cell, so bring it into scope here.
import pickle

# pickle emits bytes, so the file must be opened in binary mode ('wb', not 'w');
# the with-block also closes the handle, which the bare open() never did.
with open('81_full_another.dat', 'wb') as f:
    pickle.dump(network.get_all_params_values(), f, protocol = 2)