In [49]:
from nltk import word_tokenize
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import nltk
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
import sklearn.metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import grid_search
from sklearn.linear_model import LogisticRegression
import json
from sklearn.base import BaseEstimator, TransformerMixin
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [50]:
class ExtractRecipe():
    """ 
    Class that extracts recipe information from JSON.
    """
    def __init__(self, json):
        self.recipe_id = self.set_id(json)
        self.cuisine = self.set_cuisine(json)
        self.ingredients = self.set_ingredients(json)
        self.ingredient_count = len(self.ingredients)
        
    def __str__(self):
        return "ID: %s\nCuisine: %s\nIngredients: %s\nNumber of Ingredients: %s" % (self.recipe_id,
                                    self.cuisine,', '.join(self.ingredients),self.ingredient_count)
    def set_id(self,json):
        """
        Method that sets the recipe id.
        """
        try:
            return json['id']
        except KeyError:
            return '-99'
        
    def set_cuisine(self,json):
        """
        Method that sets the recipe cuisine.
        """
        try:
            return json['cuisine']    
        except KeyError:
            return ''
        
    def set_ingredients(self,json):
        """
        Method that sets the recipe ingredients.
        """
        try:
            return json['ingredients']
        except KeyError:
            return []
        
    def clean_ingredient(self,s):
        """
        Method that returns a cleaned up version of the entered ingredient.
        """
        from re import sub
        return sub('[^A-Za-z0-9]+', ' ', s)
    
    def get_train(self):
        """
        Method that returns a dictionary of data for the training set.
        """
        return {
            'cuisine':self.cuisine,
            'ingredients':', '.join([self.clean_ingredient(x) for x in self.ingredients]),
            'ingredient_count':self.ingredient_count
        }
    
    def get_predict(self):
        """
        Method that returns a dictionary of data for predicting recipes.
        """
        return {
            'id':self.recipe_id,
            'ingredients':', '.join([self.clean_ingredient(x) for x in self.ingredients]),
            'ingredient_count':self.ingredient_count
        }   


def loadTrainSet(dir='train.json'):
    """
    Read in JSON to create training set.
    """
    import json
    from pandas import DataFrame, Series
    from sklearn.preprocessing import LabelEncoder
    X = DataFrame([ExtractRecipe(x).get_train() for x in json.load(reader(open(dir,'rb')))])
    encoder = LabelEncoder()
    X['cuisine'] = encoder.fit_transform(X['cuisine'])
    return X, encoder

def loadTestSet(dir='test.json'):
    """
    Read in JSON to create test set.
    """
    import json
    from pandas import DataFrame
    return DataFrame([ExtractRecipe(x).get_predict() for x in json.load(reader(open(dir,'rb')))])    

In [102]:
import nltk

train_df, encoder = loadTrainSet()

def make_ingridients_dict(df):
    ingridients = dict()
    stemmer = nltk.PorterStemmer()
    i = 0

    for row in df.itertuples():
        ingridients_list = row[3].split(', ')

        for ingridient in ingridients_list:
            words = ingridient.split()
            stemmed_words = []

            for word in words:
                stemmed_words.append(stemmer.stem(word.lower()))

            stemmed_ingridient = ' '.join(stemmed_words)

            if stemmed_ingridient not in ingridients:
                ingridients[stemmed_ingridient] = i
                i = i + 1
    
    return ingridients


ingridients = make_ingridients_dict(train_df)

In [121]:
import scipy as sp
from scipy.sparse import csr_matrix

def make_preprocessed_matrix(df):
    preprocessed_training_matrix = sp.sparse.csr_matrix((train_df.shape[0], 1 + len(ingridients)))

    for example_number, row in enumerate(df.itertuples()):
        ingridients_list = row[3].split(', ')
        
        for ingridient in ingridients_list:
            words = ingridient.split()
            stemmed_words = []

            for word in words:
                stemmed_words.append(stemmer.stem(word.lower()))

            stemmed_ingridient = ' '.join(stemmed_words)
            ingridient_index = ingridients[stemmed_ingridient]
            preprocessed_training_matrix[example_number, ingridient_index] = 1
        
        preprocessed_training_matrix[example_number, len(ingridients) - 1] = row[2]
        
    return preprocessed_training_matrix

In [122]:
g = make_preprocessed_matrix(train_df)



KeyboardInterrupt: 