In [7]:
# -*-coding: utf-8-*-
import pandas as pd
import numpy as np
import re, nltk
import matplotlib.pylab as plt
import matplotlib as mpl
import seaborn as sns
from nltk.stem import WordNetLemmatizer     
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


* 데이터 Load

In [8]:
train = pd.read_json('train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [9]:
train['All_of_ingredients'] = train['ingredients'].map(':'.join)
train.head()

Unnamed: 0,cuisine,id,ingredients,All_of_ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce:black olives:grape tomatoes:ga...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour:ground pepper:salt:tomatoes:ground...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs:pepper:salt:mayonaise:cooking oil:green c...
3,indian,22213,"[water, vegetable oil, wheat, salt]",water:vegetable oil:wheat:salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",black pepper:shallots:cornflour:cayenne pepper...


In [10]:
le = LabelEncoder()
y = le.fit_transform(train['cuisine'])

# Model improvement & Tuning
 
 * TfidfVectorizer로 하고 앙상블 방법을 적용 시켰을 때 성능이 더 좋음
     * => 전처리 방법은 TfidfVectorizer로 진행, 모델은 앙상블 모델 적용.
 * Logistic C : 정규화 정도를 어느정도 해야 할까?
 * SVM C : 정규화 정도를 어느정도 해야 할까?
 
 * stop_words를 사용할 것인지 안할 것인지?
    

In [11]:
stemmer = WordNetLemmatizer()

def stem_tokens(tokens, stemmer):
       
    stemming = [stemmer.lemmatize(ingredients) for ingredients in tokens]
    
    return stemming

def tokenizer(words):

    filter_words = re.sub(r'[^a-zA-Z]', " ", words)
    tokens = nltk.word_tokenize(filter_words)
    stems = stem_tokens(tokens, stemmer)
    return stems

In [12]:
stop_words= ['brazil', 'best', 'ic', 'kim', 'alum', 'bushi','old', 'pla', 'wax', 'truv', 'tip', 'kha',
           'petit','classic', 'tie', 'mole', 'diet', 'navel','preserv', 'ume', 'soi', 'uncle','bengali','shin',
           'rom', 'southwest', 'jerk','cool','p','minute','added', 'port','rome','french','extra','bloody','black',
           'tokyo','cap','edible', 'winter','kong', 'noir','hoi','texas','wagon','frank','non',
           'farmer','artisan','rock','peasant','el','dutch', 'bragg','romano','cara','blood','rins',
           'nutritional','fast','spanish','ring','sheet','white','season','thai','prime','enriched','helix',
           'activ','wood','lotus','america','pain','concentrate','spare','vre','color','mark','single',
           'hot', 'machine','greek','london','hidden','silver','ra','tot','moisture','tree','snow','m','di','dr','mex',
           'n','seven','balance','cracked','split','hand','yellow','unflavored','asian','shaving','deli','rise','jack',
           'softened','hero','cooking','dri','aka','golden', 'cane','elbow','mo','mi','mr','india','lan','green',
           'imo','runny','navy','orang','ha','leav','eau','smart','well','plain','angled','korean','steel','farm','stock',
           'challenge','baby','lea','long','trumpet','chunk','siu','tap','island','bird','curl','young','shape','pace','napa',
           'believ','fiber','food','vie','haas','eye','te','oz','on','world','dark','summer','or','ruby','a','quick',
           'head','o','jose','cortland','clarified','full','himalayan', 'free','japanese','holy','blue','new','straw','olek','shoot',
           'earth', 'fire','acting','fri','lean','i','celtic','multi','ch','layer','well','gray','won','ngo','le','lb','la','lo',
           'bing','imitation', 'pam', 'one','bibb','rich','cloud','cho','age','cup','kasu','swiss','ready','straight','yu','fu','jimmy'
           'clear','self','aged','mountain','everglades','part','mission','rocket', 'cross','game','lower','devil','b',
           'energy','style','good','hard','paper','brown','diamond','bag', 'bai', 'bar','bob', 'torn','not','mae', 
           'finger', 'submarine', 'chua', 'it', 'in', 'hong', 'hanh','tyson','with','pod', 'pop','angel','four']

In [6]:
logistic = LogisticRegression()
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear')
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('vect1', TfidfVectorizer(analyzer='word', tokenizer=tokenizer)),
        ('clf', ens),
    ])


In [7]:
train_clf.get_params().keys()

['clf__svm__random_state',
 'vect1__decode_error',
 'clf__lr__dual',
 'clf__lr__C',
 'vect1__encoding',
 'clf__lr__solver',
 'clf__svm__C',
 'vect1__max_df',
 'vect1__strip_accents',
 'clf__svm__tol',
 'clf__lr__verbose',
 'clf__svm__kernel',
 'clf__lr__n_jobs',
 'vect1__stop_words',
 'clf__svm__max_iter',
 'vect1__dtype',
 'clf__svm__gamma',
 'clf',
 'clf__svm__coef0',
 'clf__lr__max_iter',
 'vect1__lowercase',
 'clf__weights',
 'vect1__binary',
 'clf__lr__fit_intercept',
 'vect1',
 'clf__lr__warm_start',
 'vect1__input',
 'clf__svm',
 'clf__mnb__fit_prior',
 'clf__voting',
 'vect1__ngram_range',
 'vect1__use_idf',
 'clf__lr__multi_class',
 'clf__lr__penalty',
 'vect1__max_features',
 'clf__lr__intercept_scaling',
 'clf__mnb__alpha',
 'clf__svm__shrinking',
 'vect1__smooth_idf',
 'clf__lr__class_weight',
 'clf__mnb__class_prior',
 'clf__svm__cache_size',
 'vect1__sublinear_tf',
 'vect1__analyzer',
 'clf__lr__random_state',
 'clf__svm__decision_function_shape',
 'vect1__vocabulary',
 '

In [None]:
param_grids = [0.001, 0.01, 0.1, 1, 10, 100]

parameters = {
    'vect1__ngram_range': [(1, 1), (1, 2)],
    'vect1__stop_words': [stop_words, None],
    'clf__svm__C' : param_grids,
    'clf__lr__C' : param_grids}
    
gs_clf = GridSearchCV(train_clf, parameters,  n_jobs=-1)
gs_clf = gs_clf.fit(train.All_of_ingredients.tolist()[:3900], y[:3900])
gs_clf

In [None]:
best_parameters, score, _ = max(gs_clf.grid_scores_, key=lambda x: x[1])
for param_name in sorted(parameters.keys()):
    print("{name}: {best}".format(
        name=param_name, best=best_parameters[param_name]
        ))
print("-" * 30)
print('score :', score)

----

* 원래는 grid search를 이용한 모델 튜닝 Process를 진행하려고 하였지만 여러번 시도해 본 결과 물리적으로 불가능 했음.
    * 좀 더 수준을 올려서 Score를 올릴 계획.
    * 다른 모델도 공부해서 모델 적용 할 계획
    
* 그래서 하나하나씩 여러 경우의 수를 따져서 모델링을 해보고 Score를 측정.

In [13]:
X_train, X_test, y_train, y_test = train_test_split(train.All_of_ingredients.tolist(), y, test_size = 0.2)
y_train

array([ 7, 10,  9, ..., 13,  5, 13], dtype=int64)

In [14]:
logistic = LogisticRegression(C= 0.1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 0.1)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                stop_words = stop_words, ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train1 = train_clf.fit(X_train, y_train)

In [15]:
logistic = LogisticRegression(C= 0.1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 0.1)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                stop_words = stop_words, ngram_range=(1,2))),
        ('clf', ens)
    ])


model_train2 = train_clf.fit(X_train, y_train)

In [16]:
logistic = LogisticRegression(C= 0.1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 0.1)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                 ngram_range=(2,6))),
        ('clf', ens)
    ])


model_train3 = train_clf.fit(X_train, y_train)

In [17]:
logistic = LogisticRegression(C= 0.1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 0.1)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                 ngram_range=(2,6))),
        ('clf', ens)
    ])


model_train4 = train_clf.fit(X_train, y_train)

In [18]:
logistic = LogisticRegression(C= 0.01)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 0.1)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train5 = train_clf.fit(X_train, y_train)

In [19]:
logistic = LogisticRegression(C= 0.01)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 0.1)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,2))),
        ('clf', ens)
    ])


model_train6 = train_clf.fit(X_train, y_train)

In [20]:
logistic = LogisticRegression(C= 1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 10)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                stop_words = stop_words, ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train7 = train_clf.fit(X_train, y_train)

In [21]:
logistic = LogisticRegression(C= 1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 10)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train8 = train_clf.fit(X_train, y_train)

In [22]:
logistic = LogisticRegression(C= 1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 10)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                stop_words = stop_words, ngram_range=(2,6))),
        ('clf', ens)
    ])


model_train9 = train_clf.fit(X_train, y_train)

In [23]:
logistic = LogisticRegression(C= 1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 10)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(2,6))),
        ('clf', ens)
    ])


model_train10 = train_clf.fit(X_train, y_train)

In [24]:
logistic = LogisticRegression(C= 0.1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 10)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(2,6))),
        ('clf', ens)
    ])


model_train11 = train_clf.fit(X_train, y_train)

In [25]:
logistic = LogisticRegression(C= 0.1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 10)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                stop_words = stop_words, ngram_range=(2,6))),
        ('clf', ens)
    ])


model_train12 = train_clf.fit(X_train, y_train)

In [38]:
logistic = LogisticRegression(C= 1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 10)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,2))),
        ('clf', ens)
    ])


model_train13 = train_clf.fit(X_train, y_train)

In [39]:
logistic = LogisticRegression(C= 1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 10)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(2,3))),
        ('clf', ens)
    ])


model_train14 = train_clf.fit(X_train, y_train)

In [40]:
logistic = LogisticRegression(C= 1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 100)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train15 = train_clf.fit(X_train, y_train)

In [41]:
logistic = LogisticRegression(C= 1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 100)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,2))),
        ('clf', ens)
    ])


model_train16 = train_clf.fit(X_train, y_train)

In [46]:
logistic = LogisticRegression(C= 0.1)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 100)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train17 = train_clf.fit(X_train, y_train)

In [47]:
logistic = LogisticRegression(C= 10)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 100)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train18 = train_clf.fit(X_train, y_train)

In [50]:
logistic = LogisticRegression(C= 0.01)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 100)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train19 = train_clf.fit(X_train, y_train)

In [51]:
logistic = LogisticRegression(C= 0.001)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 100)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train20 = train_clf.fit(X_train, y_train)

In [54]:
logistic = LogisticRegression(C= 100)
mnb = MultinomialNB()
svm = SVC(probability=True, kernel = 'linear', C = 100)
ens = VotingClassifier(estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)], voting='soft', weights = [3,1,1])

train_clf = Pipeline([
        ('tv' , TfidfVectorizer(analyzer = 'word', tokenizer = tokenizer, 
                                ngram_range=(1,1))),
        ('clf', ens)
    ])


model_train21 = train_clf.fit(X_train, y_train)

In [26]:
model_train1.score(X_test, y_test)

0.732872407291012

In [27]:
model_train2.score(X_test, y_test)

0.67291011942174728

In [28]:
model_train3.score(X_test, y_test)

0.59157762413576365

In [29]:
model_train4.score(X_test, y_test)

0.59597737272155882

In [30]:
model_train5.score(X_test, y_test)

0.72080452545568829

In [31]:
model_train6.score(X_test, y_test)

0.65191703331238215

In [32]:
model_train7.score(X_test, y_test)

0.77272155876807036

In [33]:
model_train8.score(X_test, y_test)

0.77498428661219354

In [34]:
model_train9.score(X_test, y_test)

0.67642991829038346

In [35]:
model_train10.score(X_test, y_test)

0.68698931489629167

In [36]:
model_train11.score(X_test, y_test)

0.6595851665619108

In [37]:
model_train12.score(X_test, y_test)

0.65267127592708984

In [42]:
model_train13.score(X_test, y_test)

0.75876807039597738

In [43]:
model_train14.score(X_test, y_test)

0.70622250157133881

In [44]:
model_train15.score(X_test, y_test)

0.77121307353865498

In [45]:
model_train16.score(X_test, y_test)

0.75826524198617218

In [48]:
model_train17.score(X_test, y_test)

0.72558139534883725

* Best Model(model_train18)

    * Score = 0.78466373350094276

In [49]:
model_train18.score(X_test, y_test)

0.78466373350094276

* -------

In [52]:
model_train19.score(X_test, y_test)

0.70245128849780014

In [53]:
model_train20.score(X_test, y_test)

0.72847265870521682

In [55]:
model_train21.score(X_test, y_test)

0.7808925204274042