In [1]:
from nltk import word_tokenize
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import nltk
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import grid_search
import json
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import word_tokenize

In [3]:
class ExtractRecipe():
    """ 
    Extracts recipe information from JSON.
    """
    def __init__(self, json):
        self.recipe_id = self.set_id(json)
        self.cuisine = self.set_cuisine(json)
        self.ingredients = self.set_ingredients(json)
        self.ingredient_count = len(self.ingredients)
        
    def __str__(self):
        return "ID: %s\nCuisine: %s\nIngredients: %s\nNumber of Ingredients: %s" % (self.recipe_id,
                                    self.cuisine,', '.join(self.ingredients),self.ingredient_count)
    def set_id(self,json):
        """
        sets the recipe id.
        """
        try:
            return json['id']
        except KeyError:
            return '-99'
        
    def set_cuisine(self,json):
        """
        sets the recipe cuisine.
        """
        try:
            return json['cuisine']    
        except KeyError:
            return ''
        
    def set_ingredients(self,json):
        """
        sets the recipe ingredients.
        """
        try:
            return json['ingredients']
        except KeyError:
            return []
        
    def clean_ingredient(self,s):
        """
        returns a cleaned up version of the entered ingredient.
        """
        from re import sub
        return sub('[^A-Za-z0-9]+', ' ', s)
    
    def get_train(self):
        """
        returns a dictionary of data for the training set.
        """
        return {
            'cuisine':self.cuisine,
            'ingredients':', '.join([self.clean_ingredient(x) for x in self.ingredients]),
            'ingredient_count':self.ingredient_count
        }
    
    def get_predict(self):
        """
        returns a dictionary of data for predicting recipes.
        """
        return {
            'id':self.recipe_id,
            'ingredients':', '.join([self.clean_ingredient(x) for x in self.ingredients]),
            'ingredient_count':self.ingredient_count
        }   


def loadTrainSet(dir='train.json'):
    """
    Read in JSON to create training set.
    """
    import json
    from pandas import DataFrame, Series
    from sklearn.preprocessing import LabelEncoder
    X = DataFrame([ExtractRecipe(x).get_train() for x in json.load(open(dir,'rb'))])
    encoder = LabelEncoder()
    X['cuisine'] = encoder.fit_transform(X['cuisine'])
    return X, encoder

def loadTestSet(dir='test.json'):
    """
    Read in JSON to create test set.
    """
    import json
    from pandas import DataFrame
    return DataFrame([ExtractRecipe(x).get_predict() for x in json.load(open(dir,'rb'))])     

In [4]:
import nltk
from re import sub

def make_unique_ingredients_dict(df):
    ingredients = dict()
    stemmer = nltk.PorterStemmer()
    i = 0

    for row in df.itertuples():
        ingredients_list = row[3].split(', ')

        for ingredient in ingredients_list:
            ingredient = sub('[^A-Za-z]+', '', ingredient)
            words = ingredient.split()
            stemmed_words = []

            for word in words:
                stemmed_words.append(stemmer.stem(word.lower()))

            stemmed_ingredient = ' '.join(stemmed_words)

            if stemmed_ingredient not in ingredients:
                ingredients[stemmed_ingredient] = i
                i = i + 1
    
    return ingredients

In [5]:
import scipy as sp
from sklearn.grid_search import GridSearchCV

from sklearn import cross_validation
from scipy.sparse import dok_matrix

def estimate_acc(clf, train_data, train_labels):
    """Calculate accuracy by CV"""
    
    scores = cross_validation.cross_val_score(clf, train_data, train_labels, cv = 5)

    print('Accuracy on training set: {0} +/- {1}'.format(scores.mean(), scores.std() * 2))
    
def write_submission(reasult_dict):
    """File to upload"""
    writer = csv.writer(open('submission.csv', 'wt'))
    writer.writerow(['id','cuisine'])
    for key, value in result_dict.items():
        writer.writerow([key, value])

def make_preprocessed_matrix(df, unique_ingredients, trash_words):
    """Stemm ingredients in dataframe and put into X"""
    
    X = dok_matrix((df.shape[0], 1 + len(unique_ingredients) ))
    
    stemmer = nltk.PorterStemmer()
    #print df
    for dish_number, row in enumerate(df.itertuples()):
        #print row[3], '\n'
        ingredients_list = row[3].split(', ')
        
        for ingredient in ingredients_list:
            ingredient = sub('[^A-Za-z]+', ' ', ingredient)
            words = ingredient.split()
            stemmed_words = []

            for word in words:
                if word not in trash_words:
                    stemmed_words.append(stemmer.stem(word.lower()))
            #print stemmed_words
            stemmed_ingredient = ' '.join(stemmed_words)
            
            if stemmed_ingredient in unique_ingredients:
                ingredient_index = unique_ingredients[stemmed_ingredient]

                X[dish_number, ingredient_index] = 1
            #print X
        
        X[dish_number, len(unique_ingredients)] = row[2]
        
    return X

def find_ingredients_freq(df, unique_ingredients, trash_words):

    frequences = dict()

    stemmer = nltk.PorterStemmer()

    for row in df.itertuples():
        ingredients_list = row[3].split(', ')

        for ingredient in ingredients_list:
            ingredient = sub('[^A-Za-z]+', ' ', ingredient)
            words = ingredient.split()
            stemmed_words = []

            for word in words:
                if word not in trash_words:
                    stemmed_words.append(stemmer.stem(word.lower()))

            stemmed_ingredient = ' '.join(stemmed_words)
            
            if stemmed_ingredient not in frequences:
                frequences[stemmed_ingredient] = 1
            else: 
                frequences[stemmed_ingredient] += 1
    
    return frequences

def find_unique_words(df):

    all_words = []

    stemmer = nltk.PorterStemmer()

    for row in df.itertuples():
        ingredients_list = row[3].split(', ')

        for ingredient in ingredients_list:
            ingredient = sub('[^A-Za-z]+', ' ', ingredient)
            words = ingredient.split()

            for word in words:
                all_words.append(word.lower())
                
    unique, counts = np.unique(np.array(all_words), return_counts=True)
    unique_words = {}
    
    for i in range (len(unique)):
        unique_words[unique[i]] = counts[i]
    
    return unique_words


#def delete_most_freq_ingredients(X): 


#### Load data as DataFrames

In [6]:
train_df, encoder = loadTrainSet()
test_df = loadTestSet()

#### Find trash words in data. First step after loading.

In [None]:
def make_trash_words(unique_words):
    trash_words = {}
    
    k = 0
    for i in range (len(unique_words.keys())):
        if unique_words.values()[i] == 1:
            trash_words[unique_words.keys()[i]] = k
            k += 1
    
    return trash_words

a = find_unique_words(train_df)
trash_words = make_trash_words(a)

#### Generation of X and y

In [7]:
unique_ingredients = make_unique_ingredients_dict(train_df)
ingredients = train_df['ingredients']

X = make_preprocessed_matrix(train_df, unique_ingredients)
y = train_df['cuisine']

print len(y), X.shape

39774 (39774, 6637)


#### Frequences

In [8]:
#trash_words = {'plain', 'whole', 'veri', 'style', 'all', 'the',
#'a', 'of', 'big', 'with', 'full', 'miniatur', 'and'}
frequences = find_ingredients_freq(train_df, unique_ingredients, trash_words)

In [99]:
#print sorted(frequences.items(), key=lambda x: x[1])

In [None]:
for i in range (len(frequences.keys())):
    if len(frequences.keys()[i].split(' ')) > 2:
        print frequences.keys()[i]
        

In [16]:
len(a)

3014

In [11]:
for i in range (len(a.keys())):
    if a.values()[i] == 1:
        print a.keys()[i]
#print frequences.keys()[0].split(' ')
#print sorted(a.items(), key=lambda x: x[1])       
#trash_words = [plain, whole, veri, style, all, the, a, of, big, with, full, miniatur ]

hyssop
fruitcake
woods
moulard
muscovy
speck
dulong
foccacia
brill
chee
tipo
crunch
jonshonville
jamon
mezzetta
chopmeat
seven
pangasius
meats
estancia
ploy
blueberri
ti
langoustines
mahlab
psyllium
jujubes
india
fondant
panetini
lap
kampyo
maraschino
better
muscavado
budweiser
fil
bream
arrow
alexia
velvet
stolichnaya
boquerones
xuxu
farmhouse
mojo
wieners
panch
ribeye
nakano
ocean
lambic
spreadable
victoria
passover
crosswise
te
shiromiso
dickel
cumberland
chong
garbonzo
tropic
do
cappuccino
poundcake
pocket
linguisa
bag
bai
daiya
artificial
mentaiko
honeysuckle
eggnog
uni
gnocchetti
beetroot
parslei
chua
dictine
lesser
taste
tangzhong
matzos
camellia
wheel
moose
elderflower
margherita
piquin
chartreuse
poolish
cortland
arak
opo
portuguese
filipino
breader
soi
fern
ornamental
ikura
beech
true
mora
wensleydale
blueberry
shin
genoise
mondavi
kha
orchid
klondike
garland
ammonium
phoran
graviera
haloumi
samphire
snip
conchiglie
laksa
membrillo
german
vadouvan
mooli
specials
haricot
cooki

#### Test on smaller data

In [6]:
n = 300
df = train_df[0:n]

unique_ingredients = make_unique_ingredients_dict(df)

ingredients = train_df['ingredients']

Xt = make_preprocessed_matrix(df, unique_ingredients)
yt = train_df['cuisine'][0:n]
print len(yt), Xt.shape

#print train_df['ingredients'][0], train_df['ingredients'][1]

300 (300, 989)


#### Look at mean ingredients through classes

In [85]:
nmb_classes = len(set(train_df['cuisine']))

indexs = {}
means_ingredients = []

for i in range (nmb_classes):
    indexs[i] = []
    
for i in range (len(train_df['ingredient_count'])):
    indexs[train_df['cuisine'][i]].append(i)  

In [84]:
for j in range (nmb_classes):
     means_ingredients.append(np.mean(train_df['ingredient_count'][indexs[i]])) 

#### Normalization

features multiplicated by their importances

#### SVC
Todo normarizate

In [68]:
from sklearn.svm import SVC
clf = LinearSVC(C = 0.09)
estimate_acc(clf, X, y)

Accuracy on training set: 0.783729101479 +/- 0.00774332095281


In [None]:
clf = LinearSVC()
parameters = {
    'C': [0.05, 0.09, 1, 1.7]
}

gs = GridSearchCV(clf, parameters, cv=3, scoring = 'accuracy')
gs.fit(X, y)

print gs.best_params_

### Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression()

parameters = {
    'penalty': ['l1', 'l2'],
    'C': [1.3, 1.5, 1.7]
}

gs = GridSearchCV(clf, parameters, cv=3, scoring = 'accuracy')
gs.fit(X, y)

print gs.best_params_

{'penalty': 'l2', 'C': 1.3}


In [74]:
clf = LogisticRegression(penalty = 'l1')
estimate_acc(clf, X, y)

Accuracy on training set: 0.778449185849 +/- 0.00764526771224


In [21]:
clf = LogisticRegression(penalty = 'l2', C = 1.3)
estimate_acc(clf, X, y)

Accuracy on training set: 0.781968682816 +/- 0.00778257043319


#### Bayes

In [60]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()
parameters = {
    'alpha': [0, 0.1, 0.5, 0.6, 0.7, 0.8, 0.9],
    'fit_prior': [True, False]
    }

gs = GridSearchCV(clf, parameters, cv=10, scoring='mean_squared_error')
gs.fit(X, y)
print gs.best_params_

clf = BernoulliNB(alpha = 0.1, fit_prior = True)
estimate_acc(clf, X, y)

{'alpha': 0.1, 'fit_prior': True}
Accuracy on training set: 0.748403970518 +/- 0.00782642342733


In [None]:
from sklearn.ensemble import RandomForestClassifier

### Xgboost


In [None]:
import xgboost as xgb

#dtrain = xgb.DMatrix(X)
#params = {'bst.max.depth': 25, 'eta': 0.3, 'nround': 200, 'objective': "multi:softmax", 'num_class':20}


### Feature Selection

### Forests

find feature importances

In [35]:
from sklearn.tree import DecisionTreeClassifier

forest = DecisionTreeClassifier()
estimate_acc(forest, X, y)

Accuracy on training set: 0.602202368448 +/- 0.00588308033084


In [10]:
forest.fit(X, y)
importances = forest.feature_importances_

In [24]:
#print sorted(importances)
print len(unique_ingredients)
print len(importances)

6681
6682


In [None]:
from sklearn.ensemble import RandomForestClassifier