In [1]:
# Standard Imports
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

In [2]:
# Performance
from time import time

In [10]:
# Machine Learning
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Helper
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import accuracy_score

In [4]:
# Load in the Data
train = pd.read_json('train.json')
test = pd.read_json('test.json')

In [7]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


# Feature extraction

In [5]:
# Extract the Unique Ingredients
words_train = [' '.join(item) for item in train.ingredients]
words_test = [' '.join(item) for item in test.ingredients]

## Tf-idf term weighting

Term-frequency x inverse document-frequency: a term weighting scheme to re-weight count features for very frequent words

TfidfVectorizer combines all the options of CountVectorizer and TfidfTransformer (where re-weighting occurs) into a single model

stop_words='english'

ngram_range : tuple (min_n, max_n)

max_df : ignore terms that have a doc frequency strictly higher than given threshold

min_df : ignore terms that have doc freq strictly lower than given threshold

max_features : if not None, build vocab that only considers top max_features ordered by term frequency across corpus

In [6]:
vectorizer = TfidfVectorizer(max_df=0.5)
bag_of_words_train = vectorizer.fit_transform(words_train)
bag_of_words_test = vectorizer.transform(words_test)

In [16]:
bag_of_words_train.shape

(39774, 3049)

## Grid Search CV ##

In [19]:
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

In [20]:
# define a pipeline combining a text feature extractor with a simple
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])

In [40]:
# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.10, 0.15),
    #'vect__max_features': (None, 1000),
    #'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    #'tfidf__use_idf': (True, False),
    #'tfidf__norm': ('l1', 'l2'),
    'clf__n_estimators': (50, 60, 70),
    #'clf__criterion': ('gini', 'entropy')
}

In [41]:
from pprint import pprint

# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(words_train, train.cuisine)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__n_estimators': (50, 60, 70), 'vect__max_df': (0.1, 0.15)}
Fitting 3 folds for each of 6 candidates, totalling 18 fits

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   26.4s
[Parallel(n_jobs=1)]: Done  18 out of  18 | elapsed:  9.2min finished



done in 615.013s

Best score: 0.745
Best parameters set:
	clf__n_estimators: 70
	vect__max_df: 0.15


# Feature selection

- ## SelectPercentile

In [8]:
selector = SelectPercentile(f_classif, percentile = 10)
selector.fit(bag_of_words_train, train.cuisine)
new_bag_of_words_train = selector.transform(bag_of_words_train).toarray()
new_bag_of_words_test = selector.transform(bag_of_words_test).toarray()

In [11]:
## Random Forest Classification
random_forest = RandomForestClassifier(n_estimators=200, criterion='gini')
start = time()
random_forest.fit(new_bag_of_words_train, train.cuisine)
print("RandomForest Training finished in %.2f s" % (time() - start))

RandomForest Training finished in 55.44 s


In [14]:
# Basic Evaluation on Training Set
start = time()
train_pred = cross_val_predict(random_forest, new_bag_of_words_train, train.cuisine, cv=2)
print("RandomForest Evaluation finished in %.2f s" % (time() - start))

RandomForest Evaluation finished in 50.69 s


In [15]:
# Display Accuracy
print("Accuracy: ", accuracy_score(train.cuisine, train_pred))

Accuracy:  0.718534721175
