In [1]:
from nltk import word_tokenize
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import nltk
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import grid_search
from sklearn.linear_model import LogisticRegression
import json
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
class ExtractRecipe():
    """ 
    Extracts recipe information from JSON.
    """
    def __init__(self, json):
        self.recipe_id = self.set_id(json)
        self.cuisine = self.set_cuisine(json)
        self.ingredients = self.set_ingredients(json)
        self.ingredient_count = len(self.ingredients)
        
    def __str__(self):
        return "ID: %s\nCuisine: %s\nIngredients: %s\nNumber of Ingredients: %s" % (self.recipe_id,
                                    self.cuisine,', '.join(self.ingredients),self.ingredient_count)
    def set_id(self,json):
        """
        sets the recipe id.
        """
        try:
            return json['id']
        except KeyError:
            return '-99'
        
    def set_cuisine(self,json):
        """
        sets the recipe cuisine.
        """
        try:
            return json['cuisine']    
        except KeyError:
            return ''
        
    def set_ingredients(self,json):
        """
        sets the recipe ingredients.
        """
        try:
            return json['ingredients']
        except KeyError:
            return []
        
    def clean_ingredient(self,s):
        """
        returns a cleaned up version of the entered ingredient.
        """
        from re import sub
        return sub('[^A-Za-z0-9]+', ' ', s)
    
    def get_train(self):
        """
        returns a dictionary of data for the training set.
        """
        return {
            'cuisine':self.cuisine,
            'ingredients':', '.join([self.clean_ingredient(x) for x in self.ingredients]),
            'ingredient_count':self.ingredient_count
        }
    
    def get_predict(self):
        """
        returns a dictionary of data for predicting recipes.
        """
        return {
            'id':self.recipe_id,
            'ingredients':', '.join([self.clean_ingredient(x) for x in self.ingredients]),
            'ingredient_count':self.ingredient_count
        }   


def loadTrainSet(dir='train.json'):
    """
    Read in JSON to create training set.
    """
    import json
    from pandas import DataFrame, Series
    from sklearn.preprocessing import LabelEncoder
    X = DataFrame([ExtractRecipe(x).get_train() for x in json.load(open(dir,'rb'))])
    encoder = LabelEncoder()
    X['cuisine'] = encoder.fit_transform(X['cuisine'])
    return X, encoder

def loadTestSet(dir='test.json'):
    """
    Read in JSON to create test set.
    """
    import json
    from pandas import DataFrame
    return DataFrame([ExtractRecipe(x).get_predict() for x in json.load(open(dir,'rb'))])     

In [3]:
import nltk

def make_unique_ingredients_dict(df):
    ingredients = dict()
    stemmer = nltk.PorterStemmer()
    i = 0

    for row in df.itertuples():
        ingredients_list = row[3].split(', ')

        for ingredient in ingredients_list:
            words = ingredient.split()
            stemmed_words = []

            for word in words:
                stemmed_words.append(stemmer.stem(word.lower()))

            stemmed_ingredient = ' '.join(stemmed_words)

            if stemmed_ingredient not in ingredients:
                ingredients[stemmed_ingredient] = i
                i = i + 1
    
    return ingredients

In [50]:
import scipy as sp
from sklearn.grid_search import GridSearchCV

from sklearn import cross_validation
from scipy.sparse import dok_matrix

def estimate_acc(clf, train_data, train_labels):
    """Calculate accuracy by CV"""
    
    scores = cross_validation.cross_val_score(clf, train_data, train_labels, cv = 5)

    print('Accuracy on training set: {0} +/- {1}'.format(scores.mean(), scores.std() * 2))
    
def write_submission(reasult_dict):
    """File to upload"""
    writer = csv.writer(open('submission.csv', 'wt'))
    writer.writerow(['id','cuisine'])
    for key, value in result_dict.items():
        writer.writerow([key, value])

def make_preprocessed_matrix(df, unique_ingridients):
    """Stemm ingredients in dataframe and put into X"""
    
    X = dok_matrix((df.shape[0], 1 + len(unique_ingridients) ))
    
    stemmer = nltk.PorterStemmer()
    #print df
    for dish_number, row in enumerate(df.itertuples()):
        #print row[3], '\n'
        ingridients_list = row[3].split(', ')
        
        for ingridient in ingridients_list:
            words = ingridient.split()
            stemmed_words = []

            for word in words:
                stemmed_words.append(stemmer.stem(word.lower()))
            #print stemmed_words
            stemmed_ingridient = ' '.join(stemmed_words)
            
            #if stemmed_indridient in unique_ingridients:
            ingridient_index = unique_ingridients[stemmed_ingridient]

            X[dish_number, ingridient_index] = 1
            #print X
        
        X[dish_number, len(unique_ingridients) - 1] = row[2]
        
    return X


            

#### Load data as DataFrames

In [None]:
train_df, encoder = loadTrainSet()
test_df = loadTestSet()

#### Generation of X and y

In [36]:
unique_ingredients = make_unique_ingredients_dict(train_df)
ingredients = train_df['ingredients']

X = make_preprocessed_matrix(train_df, unique_ingredients)
y = train_df['cuisine']

print len(y), X.shape

39774 (39774, 6682)


#### Test on smaller data

In [31]:
n = 300
df = train_df[0:n]

unique_ingredients = make_unique_ingredients_dict(df)

ingredients = train_df['ingredients']

X = make_preprocessed_matrix(df, unique_ingredients)
y = train_df['cuisine'][0:n]
print len(y), X.shape

#print train_df['ingredients'][0], train_df['ingredients'][1]

300 (300, 989)


#### SVC

In [30]:
from sklearn.svm import SVC
clf = LinearSVC(C = 0.09)
estimate_acc(clf, X, y)

Accuracy on training set: 0.780743265123 +/- 0.0114403347877
[ 0.77486089  0.78894759  0.78131695  0.7739491   0.78464178]


In [None]:
parameters = {
    'C': [0.1, 1, 10, 15]
}

clf = LinearSVC()
gs = GridSearchCV(clf, parameters, cv=10, scoring='mean_squared_error')
gs.fit(X, y)
print gs.best_params_

#### Bayes

In [60]:
from sklearn.naive_bayes import BernoulliNB

clf = BernoulliNB()
parameters = {
    'alpha': [0, 0.1, 0.5, 0.6, 0.7, 0.8, 0.9],
    'fit_prior': [True, False]
    }

gs = GridSearchCV(clf, parameters, cv=10, scoring='mean_squared_error')
gs.fit(X, y)
print gs.best_params_

clf = BernoulliNB(alpha = 0.1, fit_prior = True)
estimate_acc(clf, X, y)

{'alpha': 0.1, 'fit_prior': True}
Accuracy on training set: 0.748403970518 +/- 0.00782642342733


### Xgboost


In [62]:
import xgboost as xgb

dtrain = xgb.DMatrix(X)
params = {'bst.max.depth': 25, 'eta': 0.3, 'nround': 200, 'objective': "multi:softmax", 'num_class':20}


ImportError: No module named xgboost

### Feature Selection

In [None]:
print 