In [2]:
# Standard Imports
import pandas as pd
import numpy as np

In [3]:
# Performance
from time import time

In [4]:
# Machine Learning
from sklearn.ensemble import RandomForestClassifier

In [5]:
# Helper
from sklearn.cross_validation import cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Load in the Data
train = pd.read_json('train.json')

In [7]:
# Extract the Unique Ingredients
words = [' '.join(item) for item in train.ingredients]

## CountVectorizer ##

Convert a collection of text documents to a (sparse) matrix of token counts. 

*stop_words*='english'

*ngram_range* : tuple (min_n, max_n)

**Implements both tokenization and occurrence counting.**

- tokenization extracts words of at least 2 letters

- counting is achieved by assigning a unique integer index to each term corresponding to a column in the resulting matrix

- note: words not seen in training corpus will be completely ignored in future calls to the **transform** method

- to preserve local ordering information, extract 2-grams of words in addition to 1-grams


In [20]:
# Construct the Bag of Words
vec = CountVectorizer(max_features=2000, ngram_range=(1, 2), lowercase=True, \
                     stop_words=None)
bag_of_words = vec.fit_transform(words).toarray()

## Random Forest Classifier ##

*n_estimators* = number of trees in the forest

*criterion* = 'gini' (for Gini impurity) or "entropy" (for information gain)

*max_features* = "auto" (that is, sqrt(n_features))

*max_depth* = None (The maximum depth of the tree. If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples. Ignored if max_leaf_nodes is not None.)

In [21]:
## Random Forest Classification
random_forest = RandomForestClassifier(n_estimators=200, criterion='gini')

##random_forest.fit## 

Build a forest of trees from the training set (X, y)

In [22]:
# Recored the time it takes to perform the search
start = time()
random_forest.fit(bag_of_words, train.cuisine)
print("RandomForest Training finished in %.2f s" % (time() - start))

RandomForest Training finished in 170.30 s


##cross_val_predict##

Generate cross-validated estimates for each input data point

*estimator* = random_forest (estimator object implementing 'fit' and 'predict')

*x* = bag_of_words (the data to fit)

*y* = train.cuisine (the target variable to try to predict in case of supervised learning)

*cv* = cross-validation generator or int (if int, determines the number of folds in StratifiedKFold, if y is binary/multiclass, or number of folds in KFold otherwise)

In [23]:
# Basic Evaluation on Training Set
start = time()
train_pred = cross_val_predict(random_forest, bag_of_words, train.cuisine, cv=2)
print("RandomForest Evaluation finished in %.2f s" % (time() - start))

RandomForest Evaluation finished in 148.57 s


##accuracy_score##

In multilabel classification, this function computes subset accuracy: the set of labels predicted for a sample must exactly match the corresponding set of labels in y_true.

In [24]:
# Display Accuracy
print("Accuracy: ", accuracy_score(train.cuisine, train_pred))

Accuracy:  0.730804042842


In [13]:
# Load in Testing Data
test = pd.read_json('test.json')

In [19]:
test.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [14]:
# Create test Bag of Words
test_words = [' '.join(item) for item in test.ingredients]
test_bag = vec.transform(test_words).toarray()

##random_forest.predict##

Predict class for X

In [15]:
# Run the Prediction
result = random_forest.predict(test_bag)