In [1]:
from __future__ import print_function
import numpy as np
import pandas as pd
%matplotlib inline

In [2]:
# Load training data
train = pd.read_json('../yummly/train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


## Data Preprocessing - Exploration

In [3]:
train.shape

(39774, 3)

In [4]:
train.isnull().sum()

cuisine        0
id             0
ingredients    0
dtype: int64

In [5]:
train.dtypes

cuisine        object
id              int64
ingredients    object
dtype: object

In [6]:
train.loc[0,'ingredients']

[u'romaine lettuce',
 u'black olives',
 u'grape tomatoes',
 u'garlic',
 u'pepper',
 u'purple onion',
 u'seasoning',
 u'garbanzo beans',
 u'feta cheese crumbles']

In [7]:
type(train.loc[0,'ingredients'])

list

In [8]:
train.cuisine.nunique()

20

In [34]:
# Number of recipes per cuisine, most frequent first.
train.cuisine.value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

### *Further Data Preprocessing - Cleaning/Exploration/Feature selection:*
(Suggestions)

* Convert all letters into lowercase
* Strip unicode
* Strip punctuation such as semicolons and commas
* Strip parentheses and the strings they enclose
* Do food descriptors add value to prediction or not
* Do brand names of ingredients add value to prediction or not
* Remove common ingredients such as salt
* Remove 10 least frequently occurring ingredients in each cuisine
* Tf-idf

## Feature Engineering 

In [10]:
# Feature: how many ingredients each recipe lists.
train['num_ingredients'] = train.ingredients.map(len)

In [11]:
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
3,indian,22213,"[water, vegetable oil, wheat, salt]",4
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20


In [12]:
train.groupby('cuisine').num_ingredients.describe().unstack()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
cuisine,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
brazilian,467,9.520343,5.555139,2,5,9,13,59
british,804,9.708955,4.165011,2,7,9,12,30
cajun_creole,1546,12.617076,4.611601,2,9,12,16,31
chinese,2673,11.982791,4.042125,2,9,12,14,38
filipino,755,10.0,3.855135,2,7,10,12,38
french,2646,9.817838,4.144744,1,7,9,12,31
greek,1175,10.182128,3.729461,1,7,10,12,27
indian,3003,12.705961,5.016806,1,9,12,16,49
irish,667,9.29985,3.700505,2,7,9,12,27
italian,7838,9.909033,3.806708,1,7,10,12,65


In [13]:
# Feature: average number of characters per ingredient name within each recipe.
train['ingredient_length'] = train.ingredients.apply(
    lambda names: np.mean([len(name) for name in names]))

In [14]:
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients,ingredient_length
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,12.0
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,6.75
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1


In [134]:
#train.groupby('cuisine').ingredient_length.describe().unstack()

### *Further Feature Engineering:*
(Suggestions)

* Stemming
* Lemmatization    
* Bigrams
* Build Similarity groups 

## Training Model

In [16]:
# Stringify each ingredient list so it can be fed to a text vectorizer. The
# result is the list's printed form, brackets and quote characters included;
# the quote-based token pattern used later in this notebook relies on that.
train['ingredients_str'] = train.ingredients.astype(str)

In [17]:
train.loc[0,'ingredients_str']

"[u'romaine lettuce', u'black olives', u'grape tomatoes', u'garlic', u'pepper', u'purple onion', u'seasoning', u'garbanzo beans', u'feta cheese crumbles']"

In [145]:
X = train.ingredients_str
y = train.cuisine
type(y)

pandas.core.series.Series

#### Count Vectorizer

In [19]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer()
vect

CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern=u'(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [20]:
X_dtm = vect.fit_transform(X)
X_dtm.shape

(39774, 3028)

In [25]:
print(vect.get_feature_names()[0:100])

[u'a taste of thai rice noodles', u'abalone', u'abbamele', u'absinthe', u'abura age', u'acai juice', u'accent', u'accent seasoning', u'accompaniment', u'achiote', u'achiote paste', u'achiote powder', u'acini di pepe', u'ackee', u'acorn squash', u'active dry yeast', u'adobo', u'adobo all purpose seasoning', u'adobo sauce', u'adobo seasoning', u'adobo style seasoning', u'adzuki beans', u'agar', u'agar agar flakes', u'agave nectar', u'agave tequila', u'aged balsamic vinegar', u'aged cheddar cheese', u'aged gouda', u'aged manchego cheese', u'ahi', u'ahi tuna steaks', u'aioli', u'ajinomoto', u'ajwain', u'aka miso', u'alaskan king crab legs', u'alaskan king salmon', u'albacore', u'albacore tuna in water', u'alcohol', u'ale', u'aleppo', u'aleppo pepper', u'alexia waffle fries', u'alfalfa sprouts', u'alfredo sauce', u'alfredo sauce mix', u'all beef hot dogs', u'all potato purpos', u'all purpose seasoning', u'all purpose unbleached flour', u'allspice', u'allspice berries', u'almond butter', u'a

In [37]:
# Treat each quoted ingredient name as a single token: the pattern captures a
# run of lowercase letters and spaces enclosed in single quotes, so a
# multi-word name such as 'acini di pepe' becomes one feature.
# Names containing any other character (digits, hyphens, accents, ...) are not
# matched as a whole by this pattern.
vect = CountVectorizer(token_pattern=r"'([a-z ]+)'")
X_dtm = vect.fit_transform(X)
X_dtm.shape

(39774, 6250)

In [39]:
print(vect.get_feature_names()[0:100])

[u'a taste of thai rice noodles', u'abalone', u'abbamele', u'absinthe', u'abura age', u'acai juice', u'accent', u'accent seasoning', u'accompaniment', u'achiote', u'achiote paste', u'achiote powder', u'acini di pepe', u'ackee', u'acorn squash', u'active dry yeast', u'adobo', u'adobo all purpose seasoning', u'adobo sauce', u'adobo seasoning', u'adobo style seasoning', u'adzuki beans', u'agar', u'agar agar flakes', u'agave nectar', u'agave tequila', u'aged balsamic vinegar', u'aged cheddar cheese', u'aged gouda', u'aged manchego cheese', u'ahi', u'ahi tuna steaks', u'aioli', u'ajinomoto', u'ajwain', u'aka miso', u'alaskan king crab legs', u'alaskan king salmon', u'albacore', u'albacore tuna in water', u'alcohol', u'ale', u'aleppo', u'aleppo pepper', u'alexia waffle fries', u'alfalfa sprouts', u'alfredo sauce', u'alfredo sauce mix', u'all beef hot dogs', u'all potato purpos', u'all purpose seasoning', u'all purpose unbleached flour', u'allspice', u'allspice berries', u'almond butter', u'a

In [41]:
X_train_tokens = vect.get_feature_names()

In [43]:
# Total occurrences of every token across all recipes, as a 1-D array with one
# entry per feature. The sparse matrix is summed directly instead of calling
# .toarray() first, which would allocate the full dense recipes-by-tokens array.
X_train_counts = np.asarray(X_dtm.sum(axis=0)).ravel()

In [45]:
X_train_counts.shape

(6250,)

In [129]:
#pd.DataFrame({'token':X_train_tokens, 'count':X_train_counts}).sort('count', ascending=True).head(10)

In [58]:
print(y[0])

greek


In [73]:
# Rows of the document-term matrix belonging to one cuisine, picked out with a
# boolean mask over the labels.
indian_dtm = X_dtm[(y == 'indian').values, :]
greek_dtm = X_dtm[(y == 'greek').values, :]

In [74]:
# Per-token totals within each cuisine, as 1-D arrays aligned with the
# vectorizer's feature order. Summing the sparse matrices directly avoids
# building dense copies with .toarray().
indian_counts = np.asarray(indian_dtm.sum(axis=0)).ravel()
greek_counts = np.asarray(greek_dtm.sum(axis=0)).ravel()

In [126]:
#pd.DataFrame({'token':X_train_tokens, 'count':indian_counts}).sort('count', ascending=False).head(20)
#pd.DataFrame({'token':X_train_tokens, 'count':greek_counts}).sort('count', ascending=False).head(20)

In [125]:
token_counts = pd.DataFrame({'token':X_train_tokens, 'indian':indian_counts,'greek':greek_counts})
#token_counts

In [130]:
# Add one to every count (presumably so that later ratios between the two
# cuisines never divide by zero -- confirm against the intended use).
# The columns are rebuilt from the raw count arrays rather than from
# themselves, so re-running this cell does not keep incrementing them.
token_counts['indian'] = indian_counts + 1
token_counts['greek'] = greek_counts + 1

#### Optimizing Count Vectorizer

In [135]:
# Hold out part of the labelled data for evaluation; random_state=1 fixes the split.
# NOTE(review): X_test is rebound to the unlabelled Kaggle test set in the
# "Predicting on Test Data" section, so anything that relies on this split
# (e.g. tokenize_test) must run before that cell.
from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [143]:
# define a function that accepts a vectorizer and reports hold-out accuracy
def tokenize_test(vect):
    """Fit `vect` and a Multinomial Naive Bayes model, then print hold-out accuracy.

    Reads the notebook-level `X_train`, `X_test`, `y_train` and `y_test`
    created by the train/test split cell, so that cell must have been run
    first. Note that `X_test` is rebound to the Kaggle test set near the end
    of the notebook; call this function before that cell.

    Parameters
    ----------
    vect : vectorizer exposing `fit_transform` and `transform`
        For example a `CountVectorizer`. It is fitted in place on `X_train`.

    Returns
    -------
    None
        The number of generated features and the accuracy on the hold-out
        split are printed.
    """
    # Imported locally because the notebook imports these names only in later
    # cells, which would make this function fail on a fresh top-to-bottom run.
    from sklearn import metrics
    from sklearn.naive_bayes import MultinomialNB

    # Create document-term matrices; the vocabulary is learned from the
    # training split only and then applied to the hold-out split
    X_train_dtm = vect.fit_transform(X_train)
    X_test_dtm = vect.transform(X_test)

    # Print the number of features that were generated
    print('Features:', X_train_dtm.shape[1])

    # Fit Multinomial Naive Bayes and predict a cuisine for each hold-out recipe
    nb = MultinomialNB()
    nb.fit(X_train_dtm, y_train)
    pred = nb.predict(X_test_dtm)

    # The target has many classes, so report accuracy (the metric used by the
    # cross-validation and search cells) instead of a binary-classification AUC
    print('Accuracy:', metrics.accuracy_score(y_test, pred))

In [142]:
# Baseline: evaluate the default CountVectorizer settings with tokenize_test
from sklearn import metrics
vect = CountVectorizer()
tokenize_test(vect)

Features: 2864


#### Multinomial Naive Bayes

In [28]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

#### *Algorithms to compare:*
(suggestions)

* Logistic Regression
* Random forests
* Deep Neural Networks

#### Building Pipeline

In [29]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(CountVectorizer(), MultinomialNB())

#### Cross Validation

In [81]:
from sklearn.cross_validation import cross_val_score
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()

0.72326159492833919

In [87]:
from sklearn.cross_validation import StratifiedKFold
# Six folds, stratified on the cuisine labels in `y`.
cv = StratifiedKFold(y, n_folds=6)
# `cv` must be passed by keyword: the fourth positional parameter of
# cross_val_score is `scoring`, so a positional `cv` collides with
# scoring='accuracy'.
cross_val_score(pipe, X, y, cv=cv, scoring='accuracy').mean()

### Optimizing Hyper Parameters 

#### RandomizedSearchCV

In [115]:
from sklearn.grid_search import RandomizedSearchCV

In [116]:
# Search space for RandomizedSearchCV; each key is "<pipeline step>__<parameter>".
import scipy as sp
import scipy.stats  # `import scipy` alone does not guarantee the `stats` submodule is loaded
param_grid = {}
param_grid['countvectorizer__token_pattern'] = [r"\b\w\w+\b", r"'([a-z ]+)'"]
param_grid['countvectorizer__min_df'] = [1, 2, 3, 4]
# Smoothing strength is drawn from a continuous uniform distribution on [0, 1]
param_grid['multinomialnb__alpha'] = sp.stats.uniform(scale=1)
# Seed NumPy's global random state
np.random.seed(1)

In [117]:
# Try 18 randomly drawn parameter settings for the pipeline, each scored by
# 5-fold accuracy; random_state=1 makes the draw repeatable.
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=18, random_state=1)
%time rand.fit(X, y)

CPU times: user 1min 36s, sys: 2.5 s, total: 1min 39s
Wall time: 1min 40s


RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=Non..., vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
          fit_params={}, iid=True, n_iter=18, n_jobs=1,
          param_distributions={'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"], 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x10ff46510>, 'countvectorizer__min_df': [1, 2, 3, 4]},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          scoring='accuracy', verbose=0)

In [118]:
rand.grid_scores_

[mean: 0.74986, std: 0.00494, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.417022004702574, 'countvectorizer__min_df': 2},
 mean: 0.72462, std: 0.00446, params: {'countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 0.7203244934421581, 'countvectorizer__min_df': 1},
 mean: 0.72723, std: 0.00601, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.00011437481734488664, 'countvectorizer__min_df': 4},
 mean: 0.74933, std: 0.00356, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.30233257263183977, 'countvectorizer__min_df': 4},
 mean: 0.71941, std: 0.00456, params: {'countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 0.14675589081711304, 'countvectorizer__min_df': 4},
 mean: 0.75190, std: 0.00436, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.0923385947687978, 'countvectorizer__min_df': 1},
 mean: 0.7529

In [119]:
print(rand.best_score_)
print(rand.best_params_)

0.752903907075
{'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.1862602113776709, 'countvectorizer__min_df': 1}


#### GridSearchCV

In [120]:
from sklearn.grid_search import GridSearchCV

In [121]:
# Exhaustive grid for GridSearchCV. Each key names the pipeline step and the
# parameter on that step, joined by a double underscore.
import scipy as sp
param_grid = {
    'countvectorizer__token_pattern': [r"\b\w\w+\b", r"'([a-z ]+)'"],
    'countvectorizer__min_df': [1, 2, 3],
    'multinomialnb__alpha': [0, 0.5, 1],
}
np.random.seed(1)

In [122]:
# pass the pipeline (instead of just the model) to GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
%time grid.fit(X, y)

CPU times: user 1min 31s, sys: 2.47 s, total: 1min 34s
Wall time: 1min 34s


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('countvectorizer', CountVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=Non..., vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"], 'multinomialnb__alpha': [0, 0.5, 1], 'countvectorizer__min_df': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [127]:
# examine the scores for each combination of parameters
grid.grid_scores_

[mean: 0.68565, std: 0.00426, params: {'countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 0, 'countvectorizer__min_df': 1},
 mean: 0.72439, std: 0.00472, params: {'countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 0.5, 'countvectorizer__min_df': 1},
 mean: 0.72326, std: 0.00484, params: {'countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 1, 'countvectorizer__min_df': 1},
 mean: 0.66106, std: 0.00576, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0, 'countvectorizer__min_df': 1},
 mean: 0.74770, std: 0.00460, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.5, 'countvectorizer__min_df': 1},
 mean: 0.73229, std: 0.00552, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 1, 'countvectorizer__min_df': 1},
 mean: 0.68758, std: 0.00429, params: {'countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 

In [128]:
print(grid.best_score_)
print(grid.best_params_)

0.748705184291
{'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.5, 'countvectorizer__min_df': 2}


### Predicting on Test Data

In [169]:
test = pd.read_json('../yummly/test.json')

In [170]:
test['ingredients_str'] = test.ingredients.astype(str)

In [171]:
# Features of the unlabelled Kaggle test set.
# NOTE(review): this rebinds X_test, which earlier held the hold-out split
# paired with y_test; after this cell the two no longer correspond, so
# tokenize_test must not be re-run without redoing the train/test split.
X_test = test.ingredients_str

In [172]:
pred_class_rand = rand.predict(X_test)
pred_class_rand

array([u'british', u'southern_us', u'italian', ..., u'italian',
       u'southern_us', u'mexican'], 
      dtype='<U12')

In [173]:
pred_prob_rand = rand.predict_proba(X_test)
pred_prob_rand.shape

(9944, 20)

#### Kaggle submission

In [174]:
# Write the submission file: one row per test recipe, indexed by `id`, with
# the cuisine predicted by the randomized-search model.
(pd.DataFrame({'id': test.id, 'cuisine': pred_class_rand})
   .set_index('id')
   .to_csv('sub1.csv'))