In [30]:
# Standard Imports
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

# Performance
from time import time

# Machine Learning
from sklearn.ensemble import GradientBoostingClassifier

# Helper
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
# feature selection
from sklearn.feature_selection import SelectKBest, chi2
# cross-val
from sklearn.cross_validation import cross_val_predict, cross_val_score, train_test_split
# evaluation
from sklearn.metrics import accuracy_score
from sklearn import metrics
# nlp
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Pre-processing

In [2]:
# Load in the training data
train = pd.read_json('train.json')
# test = pd.read_json('test.json')

In [3]:
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
# GradientBoostingClassifier?

# Feature extraction

In [4]:
class featureExtraction(object):
    """
    featureExtraction represents the stage in which the text data are processed,
    or reduced from words/strings to numbers
    """
    
    def __init__(self, data):
        """Assign raw data. Assumes dataframe """
        self.data = data
        
    def listToString(self):
        """Extract the unique ingredients. Assumes list of strings and 
        returns a string.
        E.g. input = ['romaine lettuce', 'black olives', 'salt', ...]
        E.g. output = 'romaine lettuce black olives salt ...'
        """ 
        words = [' '.join(item) for item in self.data]
        return words
    
    def stem(self, words):
        stemmedTokens = [stemmer.stem(w) for w in words] 
        return stemmedTokens
    
    def countVectorize(self, stemmedTokens, max_features=2500, ngram_range=(1,1), \
                  lowercase=True, stop_words=None, max_df=0.5, min_df= 1):
        """Tokenize and count words.
        1. Instantiate vectorizer 'vec'
        2. Fit: learn vocabulary and idf from training set"""
        
        vec = CountVectorizer(max_features=max_features, ngram_range=ngram_range, \
                              lowercase=lowercase, stop_words=stop_words, max_df=max_df, min_df= min_df)
        vec.fit(stemmedTokens)
    
        return vec
    
    def tfidfVectorize(self, stemmedTokens, max_features=2500, ngram_range=(1,1), \
                  lowercase=True, stop_words=None, max_df=0.5, min_df= 1):
        """Tokenize, count, and weight the words.
        1. Instantiate vectorizer 'vec'
        2. Fit: learn vocabulary and idf from training set"""
        
        vec = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range, \
                              lowercase=lowercase, stop_words=stop_words, max_df=max_df, min_df= min_df)
        vec.fit(stemmedTokens)
        return vec
    
    def bag_of_words(self, vec, stemmedTokens):
        """Transform documents to document-term matrix"""
        bag_of_words = vec.transform(stemmedTokens).toarray()
        return bag_of_words

In [5]:
# feature extraction for training data
text = featureExtraction(train.ingredients)
trainAsStrings = text.listToString()
stemmedStrings = text.stem(trainAsStrings)
vec = text.tfidfVectorize(stemmedStrings, max_features=2000, ngram_range=(1,1), \
               lowercase=True, stop_words=None, max_df=0.5, min_df= 1)
bow_train = text.bag_of_words(vec, stemmedStrings)
bow_train.shape

(39774, 2000)

In [10]:
bow_train[0]

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

# Estimator

In [6]:
# split training data
X_train, X_test, y_train, y_test = train_test_split(bow_train, train["cuisine"])

In [21]:
start = time()
# fit estimator
est = GradientBoostingClassifier(n_estimators=10, max_depth=1, \
                                 max_features = 4, subsample = 0.0005)
est.fit(X_train, y_train)

print("\nTime elapsed (s) is:", time() - start)


Time elapsed (s) is: 951.3565318584442


In [22]:
# predict class labels
pred = est.predict(X_test)

# score on test data (accuracy)
acc = est.score(X_test, y_test)
print('ACC: %.4f' % acc)

# predict class probabilities
est.predict_proba(X_test)[0]

ACC: 0.1693


array([ 0.01710979,  0.02005241,  0.02120421,  0.05777035,  0.02171655,
        0.06850386,  0.03575906,  0.09357744,  0.020346  ,  0.11397561,
        0.02925108,  0.0474852 ,  0.04003642,  0.1165517 ,  0.04607847,
        0.01712492,  0.087594  ,  0.03291803,  0.03833207,  0.07461283])

In [44]:
class searchHyperparameters(object):
    """
    Search a model's hyperparameters using either RandomizedSearchCV or
    GridSearchCV. Print duration of the grid search and a 'results' method
    reveals the best parameters.
    
    Example usage:
        # instantiate algorithm
        alg = GradientBoostingClassifier()
        # define parameter values to be searched
        n_estimators = [1, 3, 5, 10]
        max_depth = [2, ]
        max_features = [1, 4, 9, 12]
        subsample = [0.001, ]
        param_grid = dict(n_estimators = n_estimators, max_depth = max_depth, \
                         max_features = max_features, subsample = subsample)
        # instantiate searchHyperparameters()
        searchHP = searchHyperparameters(alg, param_grid, cval = None, score = None)
        # run grid search
        grid = searchHP.randomGrid(2, X_train, y_train)
        results = searchHP.results(grid)
    """
    def __init__(self, alg, param_grid, cval, score):
        """Give it an instantiated algorithm """
        self.alg = alg
        self.param_grid = param_grid
        self.cval = cval
        self.score = score
        
    def randomGrid(self, n_iter, X, y):
        start = time()
        grid = RandomizedSearchCV(self.alg, self.param_grid, n_iter = n_iter, cv = self.cval, scoring = self.score)
        grid.fit(X,y)
        print("\nTime elapsed (s) is:", time() - start)
        return grid
    
    def fullGrid(self, X, y):
        start = time()
        grid = GridSearchCV(self.alg, self.param_grid, cv = self.cval, scoring = self.score)
        grid.fit(X,y)
        print("\nTime elapsed (s) is:", time() - start)
        return grid
    
    def results(self, grid):
        print("Overall results:", grid.grid_scores_)
        print("Best score:", grid.best_score_)
        print("Best parameters:", grid.best_params_)
        print("Best model:", grid.best_estimator_)

        # note if SD high, cross-val estimates may not be reliable
        results = grid.grid_scores_
        return results

In [55]:
# Initialize our algorithm
alg = GradientBoostingClassifier()
# define parameter values to be searched
n_estimators = [10]
max_depth = [1, ]
max_features = [1, ]
subsample = [0.001, 0.004]
param_grid = dict(n_estimators = n_estimators, max_depth = max_depth, \
                 max_features = max_features, subsample = subsample)
print(param_grid)

{'n_estimators': [10], 'max_features': [1], 'max_depth': [1], 'subsample': [0.001, 0.004]}


In [None]:
# instantiate
searchHP = searchHyperparameters(alg, param_grid, None, None)
# run grid search
grid = searchHP.fullGrid(X_train, y_train)
# results = searchHP.results(grid)

In [53]:
results = searchHP.results(grid)

Overall results: [mean: 0.18703, std: 0.00634, params: {'n_estimators': 5, 'max_features': 1, 'max_depth': 1, 'subsample': 0.003}, mean: 0.22142, std: 0.02409, params: {'n_estimators': 10, 'max_features': 1, 'max_depth': 1, 'subsample': 0.001}]
Best score: 0.221421387865
Best parameters: {'n_estimators': 10, 'max_features': 1, 'max_depth': 1, 'subsample': 0.001}
Best model: GradientBoostingClassifier(init=None, learning_rate=0.1, loss='deviance',
              max_depth=1, max_features=1, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=10,
              presort='auto', random_state=None, subsample=0.001,
              verbose=0, warm_start=False)


In [43]:
# [mean: 0.13973, std: 0.01171, params: {'n_estimators': 3, 'max_features': 12, 'max_depth': 2, 'subsample': 0.001}, 
#  mean: 0.16500, std: 0.00425, params: {'n_estimators': 3, 'max_features': 9, 'max_depth': 2, 'subsample': 0.001}]
[mean: 0.18703, std: 0.00634, params: {'n_estimators': 5, 'max_features': 1, 'max_depth': 1, 'subsample': 0.003},
 # [mean: 0.18877, std: 0.02000, params: {'n_estimators': 1, 'max_features': 12, 'max_depth': 2, 'subsample': 0.001},
    
#  mean: 0.19323, std: 0.02066, params: {'n_estimators': 10, 'max_features': 4, 'max_depth': 2, 'subsample': 0.001}]
#  mean: 0.20164, std: 0.01122, params: {'n_estimators': 10, 'max_features': 1, 'max_depth': 1, 'subsample': 0.001}]
# [mean: 0.21039, std: 0.01131, params: {'n_estimators': 5, 'max_features': 1, 'max_depth': 1, 'subsample': 0.001}, 
 mean: 0.22142, std: 0.02409, params: {'n_estimators': 10, 'max_features': 1, 'max_depth': 1, 'subsample': 0.001}]


[mean: 0.18877, std: 0.02000, params: {'n_estimators': 1, 'max_features': 12, 'max_depth': 2, 'subsample': 0.001},
 mean: 0.16500, std: 0.00425, params: {'n_estimators': 3, 'max_features': 9, 'max_depth': 2, 'subsample': 0.001}]

In [None]:
class BoostClassifier(object):
    """
    class Boost implements a boost model.
    
    Set: 
    number of trees - unlike bagging & random forests, boosting can overfit
     if # of trees is too large, although this occurs slowly. Use cross-val to 
     select # of trees.
    learning rate - this controls the rate at which boosting learns. Typical 
     values are 0.01 and 0.001, and the right choice depends on the problem.
     Very small learning rate requires using larger # of trees. 
    number of splits (d) - often d = 1 works well (a stump). In this case, the
     boosted ensemble is fitting an additive model, since each term involves
     only a single variable. 
    """
    
    def __init__(self, X_train, y_train):
        """ """
        self.X_train = X_train
        self.y_train = y_train
        
    def gradientBoost(self, loss='deviance', learning_rate=0.1, n_estimators=100, subsample=1.0, \
                 min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, \
                 max_depth=3, init=None, random_state=None, max_features=None, verbose=0, \
                 max_leaf_nodes=None, warm_start=False, presort='auto'):
        
        start = time()
        # fit estimator
        est = GradientBoostingClassifier(loss=loss, learning_rate=learning_rate, n_estimators=n_estimators, \
                                         subsample=subsample, min_samples_split=min_samples_split, \
                                         min_samples_leaf=min_samples_leaf, min_weight_fraction_leaf=min_weight_fraction_leaf,\
                                         max_depth=max_depth, init=init, random_state=random_state, max_features=max_features, \
                                         verbose=verbose, max_leaf_nodes=max_leaf_nodes, warm_start=warm_start, presort=presort)
        est.fit(X_train, y_train)

        print("\nTime elapsed (s) is:", time() - start)
        
    def adaBoost(self, base_estimator=None, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None):
        
        start = time()
        # fit estimator
        est = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=n_estimators, learning_rate=learning_rate, \
                                 algorithm=algorithm, random_state=random_state)
        est.fit(X_train, y_train)

        print("\nTime elapsed (s) is:", time() - start)