In [1]:
# Standard Imports
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline



In [2]:
# Performance
from time import time

In [3]:
# Machine Learning
from sklearn.naive_bayes import MultinomialNB

In [4]:
# Helper
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.cross_validation import cross_val_predict, cross_val_score, train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

# Preliminaries

In [5]:
# Load in the Data
train = pd.read_json('train.json')
test = pd.read_json('test.json')

In [6]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [7]:
# null accuracy (what is accuracy if always predict most prevalent class)
# this gives an idea of the underlying distribution of response values
train.cuisine.value_counts()
train.cuisine.count()
null_accuracy = train.cuisine.value_counts().max()/train.cuisine.count()

print("Null accuracy is", null_accuracy)

Null accuracy is 0.197063408257


In [8]:
# split data! train-test split
X_supvtrain, X_supvtest, y_supvtrain, y_supvtest = train_test_split(train["ingredients"], train["cuisine"])

# String similarity

In [16]:
from fuzzywuzzy import fuzz
print(fuzz.ratio("cheese mozzarella", "part-skim mozzarella cheese"))
# is the shorter string a substring of the longer
print(fuzz.partial_ratio("cheese mozzarella", "part-skim mozzarella cheese"))
# tokenize string, sort alphabeticallly, and join back into string
print(fuzz.token_sort_ratio("cheese mozzarella", "part-skim mozzarella cheese"))
# tokenize and split tokens into 2 groups - intersection of 2 strings and remainder
print(fuzz.token_set_ratio("cheese mozzarella", "part-skim mozzarella cheese"))

55
71
77
100


In [17]:
print(fuzz.ratio("olives black", "black pepper"))
print(fuzz.partial_ratio("olives black", "black pepper"))
print(fuzz.token_sort_ratio("olives black", "black pepper"))
print(fuzz.token_set_ratio("olives black", "black pepper"))

42
42
58
59


# Processing

In [10]:
# Extract the Unique Ingredients

#training
print("Parsing train ingredients...")
wordsX_train = np.array([' '.join(item) for item in X_supvtrain])

wordsX_test = np.array([' '.join(item) for item in X_supvtest])

#test set
print("Parsing test ingredients...")
words_test = np.array([' '.join(item) for item in test.ingredients])


Parsing train ingredients...
Parsing test ingredients...


In [11]:
# finding ngrams (1,2) seems similar to (1,1)

n = [1, 2, 3, 4]
acc_res = []

for i in range(len(n)):
    print("n is ", n[i])
    print("Testing n-grams...")
    vectorizer = TfidfVectorizer(lowercase=True, stop_words=None, ngram_range=(1, n[i]))
    bag_of_words_Xtrain = vectorizer.fit_transform(wordsX_train)

    print("Training Naive Bayes...")

    model1 = MultinomialNB(alpha=0.0005)
    model1_train_pred = cross_val_predict(model1, bag_of_words_Xtrain, y_supvtrain, cv=10)
    acc = accuracy_score(y_supvtrain, model1_train_pred)
    print("Naive Bayes accuracy: ", acc)
    acc_res.append(acc)

n is  1
Testing n-grams...
Training Naive Bayes...
Naive Bayes accuracy:  0.723768018773
n is  2
Testing n-grams...
Training Naive Bayes...
Naive Bayes accuracy:  0.730271538719
n is  3
Testing n-grams...
Training Naive Bayes...
Naive Bayes accuracy:  0.709252430439
n is  4
Testing n-grams...
Training Naive Bayes...
Naive Bayes accuracy:  0.654542406973


In [12]:
# find best min_df (m = 1). max_df = (0.6 to 1.0 are the same)

m = [1, 5, 10, 20, 30, 50]
acc_res = []

for i in range(len(m)):
    print("m is ", m[i])
    print("Testing min_df...")
    vectorizer = TfidfVectorizer(lowercase=True, stop_words=None, ngram_range=(1, 2), min_df = m[i])
    bag_of_words_Xtrain = vectorizer.fit_transform(wordsX_train)

    print("Training Naive Bayes...")

    model1 = MultinomialNB(alpha=0.0005)
    model1_train_pred = cross_val_predict(model1, bag_of_words_Xtrain, y_supvtrain, cv=10)
    acc = accuracy_score(y_supvtrain, model1_train_pred)
    print("Naive Bayes accuracy: ", acc)
    acc_res.append(acc)

m is  1
Testing min_df...
Training Naive Bayes...
Naive Bayes accuracy:  0.730271538719
m is  5
Testing min_df...
Training Naive Bayes...
Naive Bayes accuracy:  0.722292993631
m is  10
Testing min_df...
Training Naive Bayes...
Naive Bayes accuracy:  0.716694602749
m is  20
Testing min_df...
Training Naive Bayes...
Naive Bayes accuracy:  0.714750251425
m is  30
Testing min_df...
Training Naive Bayes...
Naive Bayes accuracy:  0.715018437814
m is  50
Testing min_df...
Training Naive Bayes...
Naive Bayes accuracy:  0.712772376802


In [16]:
# set final tfidf parameters

vectorizer = TfidfVectorizer(lowercase=True, stop_words=None, ngram_range=(1, 2))
bag_of_words_Xtrain = vectorizer.fit_transform(wordsX_train)

# reduce dimension...to 20000 features (with 2-ngrams)
k = [10000, 15000, 20000, 25000]
acc_res = []

for i in range(len(k)):
    print("k is ", k[i])
    print("Reducing dimension...")
    fselect = SelectKBest(k=k[i])
    Xtrain_data_features = fselect.fit_transform(bag_of_words_Xtrain, y_supvtrain)

    print("Training Naive Bayes...")

    model1 = MultinomialNB(alpha=0.0005)
    model1_train_pred = cross_val_predict(model1, Xtrain_data_features, y_supvtrain, cv=10)
    acc = accuracy_score(y_supvtrain, model1_train_pred)
    print("Naive Bayes accuracy: ", acc)
    acc_res.append(acc)

k is  10000
Reducing dimension...
Training Naive Bayes...
Naive Bayes accuracy:  0.755950385518
k is  15000
Reducing dimension...
Training Naive Bayes...
Naive Bayes accuracy:  0.760408984244
k is  20000
Reducing dimension...
Training Naive Bayes...
Naive Bayes accuracy:  0.760811263828
k is  25000
Reducing dimension...
Training Naive Bayes...
Naive Bayes accuracy:  0.757023131076


# Finalize parameters

In [17]:
# stem and clean...
# get tfidf
vectorizer = TfidfVectorizer(lowercase=True, stop_words=None, ngram_range=(1, 2))
bag_of_words_Xtrain = vectorizer.fit_transform(wordsX_train)
bag_of_words_Xtest = vectorizer.transform(wordsX_test)
# # final test set
bag_of_words_test = vectorizer.transform(words_test)


In [18]:
# select best features
fselect = SelectKBest(k=20000)
Xtrain_data_features = fselect.fit_transform(bag_of_words_Xtrain, y_supvtrain)
Xtest_data_features = fselect.transform(bag_of_words_Xtest)

# final test set
test_data_features = fselect.transform(bag_of_words_test)

# Accuracy measure on hold-out set

In [19]:
# Predict class for hold-out set.
model1.fit(Xtrain_data_features, y_supvtrain)
holdout_pred = model1.predict(Xtest_data_features)

In [20]:
accuracy = round(accuracy_score(y_supvtest, holdout_pred), 3)
print("Accuracy:", accuracy)

Accuracy: 0.733
