In [430]:
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.cross_validation import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
import json

# Read Data Into Pandas DataFrame

In [350]:
# Load the raw recipe records from train.json.
# Default text mode is used instead of 'rU': the 'U' (universal newlines) flag
# is deprecated and rejected by newer Python versions, and JSON parsing treats
# every newline flavour as plain whitespace, so the result is the same.
with open('train.json') as f:
    data = json.load(f)

In [351]:
cuisines = pd.DataFrame(data)

In [435]:
cuisines.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


# Study The Data

In [353]:
cuisines.shape

(39774, 3)

In [354]:
X = cuisines.ingredients

In [355]:
y = cuisines.cuisine

In [356]:
# Hold out part of the data for evaluation. No test_size is passed, so the
# library default applies (the shapes printed below are 29830 training rows
# and 9944 test rows). random_state=1 fixes the shuffle so the split is
# reproducible across runs.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=1)

In [357]:
X_train.shape

(29830L,)

In [358]:
X_test.shape

(9944L,)

In [359]:
# Cast every training recipe to a single string so it can be fed to
# CountVectorizer as text (the ingredients column appears to hold a list of
# ingredient names per recipe, see cuisines.head() above).
X_train = X_train.astype('string')

In [360]:
X_train.shape

(29830L,)

In [361]:
# Cast every test recipe to a single string so it can be fed to
# CountVectorizer as text (the ingredients column appears to hold a list of
# ingredient names per recipe, see cuisines.head() above).
X_test = X_test.astype('string')

In [362]:
X_test.shape

(9944L,)

In [363]:
vect = CountVectorizer()

In [364]:
# Learn the vocabulary from the training text and build the training
# document-term matrix (one row per recipe, one column per vocabulary term).
X_train_dtm = vect.fit_transform(X_train)

In [365]:
# Build the test document-term matrix with the vocabulary already learned from
# the training data. The vectorizer is deliberately NOT re-fitted here: using
# transform() only keeps the test matrix's columns identical to the training
# matrix's columns (both show 2864 features below).
X_test_dtm = vect.transform(X_test)

In [366]:
X_test.shape

(9944L,)

In [367]:
X_train.shape

(29830L,)

In [368]:
X_train_dtm.shape

(29830, 2864)

In [369]:
X_test_dtm.shape

(9944, 2864)

In [370]:
vect.get_feature_names() #get a list of the individual words identified

[u'10',
 u'100',
 u'14',
 u'15',
 u'25',
 u'33',
 u'40',
 u'43',
 u'95',
 u'abalone',
 u'abbamele',
 u'absinthe',
 u'abura',
 u'acai',
 u'accent',
 u'accompaniment',
 u'achiote',
 u'acid',
 u'acini',
 u'ackee',
 u'acorn',
 u'acting',
 u'activ',
 u'active',
 u'added',
 u'adobo',
 u'adzuki',
 u'agar',
 u'agave',
 u'age',
 u'aged',
 u'ahi',
 u'aioli',
 u'ajinomoto',
 u'ajwain',
 u'aka',
 u'alaskan',
 u'albacore',
 u'alcohol',
 u'ale',
 u'aleppo',
 u'alexia',
 u'alfalfa',
 u'alfredo',
 u'all',
 u'allspice',
 u'almond',
 u'almondmilk',
 u'almonds',
 u'alphabet',
 u'alum',
 u'amaranth',
 u'amarena',
 u'amaretti',
 u'amaretto',
 u'amba',
 u'amber',
 u'amberjack',
 u'amchur',
 u'american',
 u'aminos',
 u'ammonium',
 u'amontillado',
 u'ampalaya',
 u'anaheim',
 u'anasazi',
 u'ancho',
 u'anchovies',
 u'anchovy',
 u'and',
 u'andouille',
 u'anejo',
 u'angel',
 u'anglaise',
 u'angled',
 u'angostura',
 u'angus',
 u'anise',
 u'anisette',
 u'anjou',
 u'annatto',
 u'any',
 u'aonori',
 u'apple',
 u'apple

In [436]:
# Preview only the first few rows. Converting the whole 29830 x 2864 sparse
# matrix to a dense int64 array would allocate roughly 680 MB just to print a
# truncated view, so slice the sparse matrix before densifying.
X_train_dtm[:5].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

# Function For Implementing Naive Bayes

In [419]:
def tokenize_test(vect):
    X_train_dtm = vect.fit_transform(X_train)
    print 'Features: ', X_train_dtm.shape[1]
    X_test_dtm = vect.transform(X_test)
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    y_pred_class = nb.predict(X_test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)
    print metrics.confusion_matrix(y_test, y_pred_class)

# Evaluate Model Using Default Count Vectorizer Settings

In [438]:
vect = CountVectorizer()
tokenize_test(vect)

Features:  2864
Accuracy:  0.719831053902
[[  29    1    7    0    1    2    0    4    1   11    1    0    0   13
     0    0   18   11   11    0]
 [   1   80    5    0    0   30    0    7    9    4    2    0    1    0
     1    3   62    4    0    0]
 [   0    6  279    0    0    6    1    2    1   18    0    0    0   12
     2    1   43    2    0    0]
 [   1    2    7  575   10    6    2    4    2    4    1   10    7    0
     1    1    7    0   17   10]
 [   6    0    5   38   82    2    0    4    0    5    0    1    5    7
     0    2   17    1    9    4]
 [   3   24   15    1    2  334   11    2   10  103    0    2    0    7
     4    5  115   16    1    1]
 [   0    9    6    0    0   14  190    2    0   49    0    0    0    4
     9    0   10    8    0    0]
 [   3    5    1    0    1    4    8  650    3    5    1    1    1    8
    19    7   15    0   16    1]
 [   0   11    1    0    0   15    2    2   65    7    0    0    0    4
     2    2   46    3    0    0]
 [   0   31  

# Remove English Stop Words And Evaluate Model

In [439]:
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

Features:  2826
Accuracy:  0.721842316975
[[  29    1    8    0    2    2    0    4    1   11    1    0    0   11
     0    0   17   12   11    0]
 [   1   82    2    0    0   31    0    7    9    5    2    0    1    0
     1    3   62    3    0    0]
 [   0    5  279    0    0    7    1    2    1   18    0    0    0   12
     2    2   42    2    0    0]
 [   1    2    7  574    9    6    2    3    2    4    2   10    7    0
     1    1    8    0   18   10]
 [   6    0    5   38   81    2    0    4    0    5    0    1    5    7
     0    2   18    1    9    4]
 [   3   25   15    1    2  340   10    2    9  105    0    2    0    6
     4    6  109   15    1    1]
 [   0    8    6    0    0   13  189    3    0   50    0    0    0    3
    10    0   11    8    0    0]
 [   3    4    1    0    1    3    8  653    3    6    1    1    1    7
    18    7   15    0   16    1]
 [   0   11    1    0    0   15    2    1   65    7    0    0    0    3
     2    3   47    3    0    0]
 [   0   32  

# Analyze The Features To Determine Repeating Words

In [380]:
vect.get_feature_names()

[u'10',
 u'100',
 u'14',
 u'15',
 u'25',
 u'33',
 u'40',
 u'43',
 u'95',
 u'abalone',
 u'abbamele',
 u'absinthe',
 u'abura',
 u'acai',
 u'accent',
 u'accompaniment',
 u'achiote',
 u'acid',
 u'acini',
 u'ackee',
 u'acorn',
 u'acting',
 u'activ',
 u'active',
 u'added',
 u'adobo',
 u'adzuki',
 u'agar',
 u'agave',
 u'age',
 u'aged',
 u'ahi',
 u'aioli',
 u'ajinomoto',
 u'ajwain',
 u'aka',
 u'alaskan',
 u'albacore',
 u'alcohol',
 u'ale',
 u'aleppo',
 u'alexia',
 u'alfalfa',
 u'alfredo',
 u'allspice',
 u'almond',
 u'almondmilk',
 u'almonds',
 u'alphabet',
 u'alum',
 u'amaranth',
 u'amarena',
 u'amaretti',
 u'amaretto',
 u'amba',
 u'amber',
 u'amberjack',
 u'amchur',
 u'american',
 u'aminos',
 u'ammonium',
 u'amontillado',
 u'ampalaya',
 u'anaheim',
 u'anasazi',
 u'ancho',
 u'anchovies',
 u'anchovy',
 u'andouille',
 u'anejo',
 u'angel',
 u'anglaise',
 u'angled',
 u'angostura',
 u'angus',
 u'anise',
 u'anisette',
 u'anjou',
 u'annatto',
 u'aonori',
 u'apple',
 u'apples',
 u'applesauce',
 u'appl

In [381]:
stop = vect.get_stop_words()

In [382]:
stop = list(stop) # create iterable list of stop words

In [440]:
type(stop)

list

# Find The 20 Most Common Ingredients

In [441]:
from collections import Counter, defaultdict
import seaborn as sns
from nltk.stem.porter import *
# cuisine_dict and index_dict are not used by any cell visible here; they are
# kept in case later cells rely on them.
cuisine_dict = defaultdict(Counter)
index_dict = defaultdict(Counter)

# Count how many times each full ingredient string (e.g. 'olive oil') appears
# across all recipes. Iterating the ingredients column directly avoids building
# a Series per row with iterrows(); Counter.update adds one per list element.
total_counter = Counter()
for recipe_ingredients in cuisines.ingredients:
    total_counter.update(recipe_ingredients)

# Show the 20 most frequent ingredients with their counts.
total_counter.most_common(20)

[(u'salt', 18049),
 (u'olive oil', 7972),
 (u'onions', 7972),
 (u'water', 7457),
 (u'garlic', 7380),
 (u'sugar', 6434),
 (u'garlic cloves', 6237),
 (u'butter', 4848),
 (u'ground black pepper', 4785),
 (u'all-purpose flour', 4632),
 (u'pepper', 4438),
 (u'vegetable oil', 4385),
 (u'eggs', 3388),
 (u'soy sauce', 3296),
 (u'kosher salt', 3113),
 (u'green onions', 3078),
 (u'tomatoes', 3058),
 (u'large eggs', 2948),
 (u'carrots', 2814),
 (u'unsalted butter', 2782)]

# Add Top 10 Common Words To The List Of Stop Words

In [443]:
# Very common ingredient terms to treat as stop words in addition to the
# English stop-word list held in `stop`.
# NOTE(review): 'oliv oil' contains a space and 'oliv'/'onion' look like stems,
# while the vectorizers in this notebook appear to emit single, unstemmed words
# (the feature list contains both 'almond' and 'almonds'). These entries
# probably never match a token - confirm, and consider 'olive', 'oil', 'onions'.
additional_ingredients = ['salt','onion','oliv oil','water','garlic','sugar','garlic','clove','butter','ground','black','pepper','all','purpose','flour']
for i in additional_ingredients:
    # Only add terms that are not present yet, so re-running this cell (or the
    # repeated 'garlic' entry) does not keep growing the list with duplicates.
    if i not in stop:
        stop.append(i)

In [444]:
stop #check list of stop words

[u'all',
 u'six',
 u'less',
 u'be',
 u'inde',
 u'over',
 u'move',
 u'anyway',
 u'four',
 u'not',
 u'own',
 u'through',
 u'yourselv',
 u'fifi',
 u'where',
 u'mill',
 u'onli',
 u'find',
 u'befor',
 u'one',
 u'whose',
 u'system',
 u'how',
 u'somewher',
 u'with',
 u'thick',
 u'show',
 u'had',
 u'enough',
 u'should',
 'to',
 u'must',
 u'whom',
 u'seem',
 u'under',
 u'our',
 u'has',
 u'might',
 u'thereaft',
 u'latter',
 'do',
 u'them',
 u'his',
 u'around',
 u'than',
 u'get',
 u'veri',
 'de',
 u'none',
 u'cannot',
 u'everi',
 u'whether',
 u'they',
 u'front',
 u'dure',
 u'thus',
 u'now',
 u'him',
 u'nor',
 u'name',
 u'sever',
 u'hereaft',
 u'alway',
 u'who',
 u'cri',
 u'whither',
 u'this',
 u'someon',
 u'either',
 u'each',
 u'becom',
 u'thereupon',
 u'sometim',
 u'side',
 u'two',
 u'therein',
 u'twelv',
 u'becaus',
 u'often',
 u'ten',
 u'our',
 'eg',
 u'some',
 u'back',
 'up',
 'go',
 u'name',
 u'toward',
 u'are',
 u'further',
 u'beyond',
 u'ourselv',
 u'yet',
 u'out',
 u'even',
 u'will',
 u'w

# Evaluate model by removing stop words

In [445]:
vect = CountVectorizer(stop_words=stop)
tokenize_test(vect)

Features:  2812
Accuracy:  0.726971037812
[[  29    1    6    0    1    4    1    3    1   10    1    0    0   11
     0    0   15   13   14    0]
 [   0   71    4    0    0   34    0    5   10    9    3    1    1    1
     1    5   57    5    2    0]
 [   0    3  273    0    0   10    1    3    1   17    0    0    0   13
     0    2   47    3    0    0]
 [   2    1    7  581    8    5    2    3    1    4    1   10    7    0
     1    1    8    0   15   10]
 [   5    0    6   41   77    3    0    4    1    3    0    2    4    8
     0    2   16    1   10    5]
 [   3   22   15    2    2  356    7    2   13   97    0    4    0   12
     3    3  100   14    0    1]
 [   0    4    5    0    0   14  191    5    0   51    0    1    0    3
     9    0   12    6    0    0]
 [   4    2    1    0    0    4   11  664    0    7    1    1    1    7
    16    3   11    0   16    0]
 [   0   11    1    1    1   19    2    2   64    6    0    1    0    4
     2    1   44    1    0    0]
 [   0   23  

# Limit features to 2200

In [446]:
vect = CountVectorizer(stop_words=stop,max_features=2200)
tokenize_test(vect)

Features:  2200
Accuracy:  0.727976669348
[[  32    1    6    0    2    4    1    3    1    9    1    0    0   10
     1    0   15   12   12    0]
 [   1   86    5    0    0   26    0    5   13    8    3    1    1    1
     1    5   46    5    2    0]
 [   0    4  276    0    0    8    1    4    2   16    0    0    0   12
     0    4   43    3    0    0]
 [   2    1    6  575    8    5    2    3    0    4    1   10    9    0
     1    3    9    0   15   13]
 [   6    1    6   38   80    3    0    5    1    3    0    2    4    7
     0    2   14    1   10    5]
 [   3   26   15    2    2  361    9    2   17   87    0    4    0   11
     3    4   94   15    0    1]
 [   0    7    7    0    0   14  196    3    0   42    0    1    0    3
    10    0   10    8    0    0]
 [   5    5    1    0    0    4   12  656    2    7    1    1    1    6
    18    3    9    0   18    0]
 [   0   12    0    1    1   15    2    2   73    5    0    1    0    3
     2    3   38    2    0    0]
 [   0   29  

# Function to implement Logistic Regression 

In [449]:
from sklearn.linear_model import LogisticRegression
def logistic_regression(vect):
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)
    logreg = LogisticRegression(C=1e9)
    logreg.fit(X_train_dtm, y_train)
    y_pred_class = logreg.predict(X_test_dtm)
    print 'Accuracy: ', metrics.accuracy_score(y_test, y_pred_class)
    print metrics.confusion_matrix(y_test, y_pred_class)

# Evaluate Logistic Regression With Default Count Vectorizer Settings

In [453]:
vect = CountVectorizer()
logistic_regression(vect)

Accuracy:  0.720635559131
[[  49    3    4    1    4    5    0    1    0    4    4    2    0   10
     0    1   13    4    4    1]
 [   2   75    2    0    1   17    1    4   22   16    9    4    1    4
     1    8   34    5    1    2]
 [   2    2  233    4    1   10    0    0    2   21    4    0    0   15
     1    9   60    5    1    3]
 [   3    1    3  506   11    8    0    6    3   14    4   28   25    6
     2    2    6    1   25   13]
 [   9    1    2   18   85    5    1    4    3   10    8    8    2    8
     0    4    6    1    5    8]
 [   6   17   14    5    2  346    8    2   18  120   13    4    4   13
    10   17   37   16    1    3]
 [   2    0    0    1    0   12  192    9    3   49    2    1    0    4
     7    5    4    7    1    2]
 [   7    5    0    2    3   10    9  605    2    6    9   18    1   16
    23    7   16    0    6    4]
 [   1   13    1    0    1   10    4    3   78    7    1    3    0    3
     1    2   28    4    0    0]
 [  11    9   16    2    3   

# Evaluate Logistic Regression With Stop Words And Max Features = 2200

In [452]:
vect = CountVectorizer(stop_words=stop,max_features=2200)
logistic_regression(vect)

Accuracy:  0.724859211585
[[  48    2    3    0    4    5    0    1    1    4    3    2    0   12
     0    1   12    5    5    2]
 [   2   80    2    0    1   22    1    2   22   17    6    5    3    5
     0    4   29    5    1    2]
 [   3    3  221    3    1    9    1    0    2   24    5    1    1   16
     1    8   68    4    0    2]
 [   3    2    1  508   11    8    1    4    2   17    3   26   28    7
     0    1    6    0   27   12]
 [   4    2    2   20   88    7    1    4    3    9    4    8    1    9
     0    4    7    1    5    9]
 [   5   17   12    5    1  355    8    3   21  120    8    4    4    6
     8   12   45   18    1    3]
 [   2    1    0    1    0   19  189    8    2   45    2    2    0    4
     7    5    5    7    1    1]
 [   5    5    1    2    3   12    9  606    2   13    4   17    0   16
    21    5   19    0    6    3]
 [   0   12    1    0    0    9    3    3   77   11    3    3    2    3
     0    4   25    4    0    0]
 [   2    9   16    2    1   

# Analyze Cuisines That Have Large Errors In Prediction

In [454]:
# Keep only the recipes labelled as Southern US or Brazilian cuisine.
sous_vs_brazilian = cuisines[cuisines.cuisine.isin(['southern_us', 'brazilian'])]

In [461]:
from collections import Counter, defaultdict # Find the 20 most common ingredients in Brazilian and Southern US cuisine
from nltk.stem.porter import *
# cuisine_dict and index_dict are not used by any cell visible here; they are
# kept in case later cells rely on them.
cuisine_dict = defaultdict(Counter)
index_dict = defaultdict(Counter)

# Count how many times each full ingredient string appears across the
# Southern US and Brazilian recipes. Iterating the ingredients column directly
# avoids building a Series per row with iterrows(); Counter.update adds one per
# list element.
total_counter = Counter()
for recipe_ingredients in sous_vs_brazilian.ingredients:
    total_counter.update(recipe_ingredients)

# Show the 20 most frequent ingredients with their counts.
total_counter.most_common(20)

[(u'salt', 2484),
 (u'butter', 1303),
 (u'all-purpose flour', 1240),
 (u'sugar', 1123),
 (u'water', 773),
 (u'large eggs', 764),
 (u'baking powder', 743),
 (u'milk', 689),
 (u'unsalted butter', 678),
 (u'eggs', 674),
 (u'buttermilk', 636),
 (u'onions', 615),
 (u'pepper', 605),
 (u'ground black pepper', 513),
 (u'vegetable oil', 494),
 (u'vanilla extract', 476),
 (u'baking soda', 452),
 (u'olive oil', 430),
 (u'garlic cloves', 384),
 (u'kosher salt', 372)]

In [462]:
stop

[u'all',
 u'six',
 u'less',
 u'be',
 u'inde',
 u'over',
 u'move',
 u'anyway',
 u'four',
 u'not',
 u'own',
 u'through',
 u'yourselv',
 u'fifi',
 u'where',
 u'mill',
 u'onli',
 u'find',
 u'befor',
 u'one',
 u'whose',
 u'system',
 u'how',
 u'somewher',
 u'with',
 u'thick',
 u'show',
 u'had',
 u'enough',
 u'should',
 'to',
 u'must',
 u'whom',
 u'seem',
 u'under',
 u'our',
 u'has',
 u'might',
 u'thereaft',
 u'latter',
 'do',
 u'them',
 u'his',
 u'around',
 u'than',
 u'get',
 u'veri',
 'de',
 u'none',
 u'cannot',
 u'everi',
 u'whether',
 u'they',
 u'front',
 u'dure',
 u'thus',
 u'now',
 u'him',
 u'nor',
 u'name',
 u'sever',
 u'hereaft',
 u'alway',
 u'who',
 u'cri',
 u'whither',
 u'this',
 u'someon',
 u'either',
 u'each',
 u'becom',
 u'thereupon',
 u'sometim',
 u'side',
 u'two',
 u'therein',
 u'twelv',
 u'becaus',
 u'often',
 u'ten',
 u'our',
 'eg',
 u'some',
 u'back',
 'up',
 'go',
 u'name',
 u'toward',
 u'are',
 u'further',
 u'beyond',
 u'ourselv',
 u'yet',
 u'out',
 u'even',
 u'will',
 u'w