In [17]:
# -*-coding: utf-8-*-
import pandas as pd
import numpy as np
import re, nltk
from nltk.stem import WordNetLemmatizer     
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.cross_validation import train_test_split, StratifiedKFold
from sklearn.cross_validation import cross_val_score
from scipy.stats import sem

* train 데이터 불러오기

In [2]:
# Load the training set from train.json in the working directory; the preview
# below shows cuisine, id and ingredients (a list of strings per recipe).
train = pd.read_json('train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


* Ingredients 리스트를 ':' 로 이어 붙여 하나의 문자열로 만들기

In [9]:
# Collapse each recipe's ingredient list into one ':'-delimited string so the
# whole recipe can be handed to a text vectorizer as a single document.
train['All_of_ingredients'] = train['ingredients'].map(':'.join)
train.head()

Unnamed: 0,cuisine,id,ingredients,All_of_ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce:black olives:grape tomatoes:ga...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour:ground pepper:salt:tomatoes:ground...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs:pepper:salt:mayonaise:cooking oil:green c...
3,indian,22213,"[water, vegetable oil, wheat, salt]",water:vegetable oil:wheat:salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",black pepper:shallots:cornflour:cayenne pepper...


## Ingredients 전처리

* target : cuisine => LabelEncoder로 전처리
* Ingredients 중에 상표가 포함되어 있고 글자가 깨져 있는 것 => Tokenizer로 걸러주기
* Ingredients 중에 tomatoes, tomato 처럼 같은 재료이지만 단수·복수 형태로 나뉘어 있는 것
    => WordNetLemmatizer로 표제어를 추출해 같은 단어로 취급되도록 만들기
    
## CountVectorizer, TfidfVectorizer로 전처리 성능 비교    
1. CountVectorizer로 Preprocessing  

2. TfidfVectorizer로 Preprocessing


## Tokenizer 만들기

In [4]:
# Shared WordNet lemmatizer instance (a lemmatizer, despite the "stemmer" name);
# tokenizer() hands it to stem_tokens().
stemmer = WordNetLemmatizer()

def stem_tokens(tokens, stemmer):
    """Lemmatize every token and return the results as a new list.

    tokens  -- iterable of word strings.
    stemmer -- any object exposing ``lemmatize(word)``; it is called once per
               token, in order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(stemmer.lemmatize(token))
    return lemmas

def tokenizer(words):
    """Turn one ingredient string into a list of lemmatized word tokens.

    Every character that is not an ASCII letter (the ':' separators, digits,
    punctuation, accented or garbled characters) is replaced by a space, the
    result is split with nltk.word_tokenize, and each token is lemmatized with
    the module-level ``stemmer`` via stem_tokens().
    """
    letters_only = re.sub(r'[^a-zA-Z]', " ", words)
    return stem_tokens(nltk.word_tokenize(letters_only), stemmer)

* Ingredients 코퍼스를 만들고 탐색 후 의미 없다고 생각한 단어들 stop_words로 생성


In [5]:
# Hand-picked tokens judged uninformative after exploring the ingredient corpus
# (brand fragments, colours, place names, units, stray letters, ...).
# Fix: a comma was missing between 'jimmy' and 'clear', which made Python
# concatenate the two adjacent literals into the single entry 'jimmyclear'.
# NOTE(review): the CountVectorizer / TfidfVectorizer calls in this notebook do
# not pass stop_words=..., so this list currently has no effect - confirm intent.
stop_words= ['brazil', 'best', 'ic', 'kim', 'alum', 'bushi','old', 'pla', 'wax', 'truv', 'tip', 'kha',
           'petit','classic', 'tie', 'mole', 'diet', 'navel','preserv', 'ume', 'soi', 'uncle','bengali','shin',
           'rom', 'southwest', 'jerk','cool','p','minute','added', 'port','rome','french','extra','bloody','black',
           'tokyo','cap','edible', 'winter','kong', 'noir','hoi','texas','wagon','frank','non',
           'farmer','artisan','rock','peasant','el','dutch', 'bragg','romano','cara','blood','rins',
           'nutritional','fast','spanish','ring','sheet','white','season','thai','prime','enriched','helix',
           'activ','wood','lotus','america','pain','concentrate','spare','vre','color','mark','single',
           'hot', 'machine','greek','london','hidden','silver','ra','tot','moisture','tree','snow','m','di','dr','mex',
           'n','seven','balance','cracked','split','hand','yellow','unflavored','asian','shaving','deli','rise','jack',
           'softened','hero','cooking','dri','aka','golden', 'cane','elbow','mo','mi','mr','india','lan','green',
           'imo','runny','navy','orang','ha','leav','eau','smart','well','plain','angled','korean','steel','farm','stock',
           'challenge','baby','lea','long','trumpet','chunk','siu','tap','island','bird','curl','young','shape','pace','napa',
           'believ','fiber','food','vie','haas','eye','te','oz','on','world','dark','summer','or','ruby','a','quick',
           'head','o','jose','cortland','clarified','full','himalayan', 'free','japanese','holy','blue','new','straw','olek','shoot',
           'earth', 'fire','acting','fri','lean','i','celtic','multi','ch','layer','well','gray','won','ngo','le','lb','la','lo',
           'bing','imitation', 'pam', 'one','bibb','rich','cloud','cho','age','cup','kasu','swiss','ready','straight','yu','fu','jimmy',
           'clear','self','aged','mountain','everglades','part','mission','rocket', 'cross','game','lower','devil','b',
           'energy','style','good','hard','paper','brown','diamond','bag', 'bai', 'bar','bob', 'torn','not','mae',
           'finger', 'submarine', 'chua', 'it', 'in', 'hong', 'hanh','tyson','with','pod', 'pop','angel','four']

## CountVectorizer 사용

* tokenizer 함수 적용
* target = LabelEncoder 전처리

In [10]:
# Bag-of-words counts over the ':'-joined ingredient strings, using the custom
# tokenizer() defined earlier in this notebook.
# NOTE(review): stop_words is not passed here, so the hand-built stop_words list
# has no effect on this vocabulary - confirm whether that is intended.
cv = CountVectorizer(analyzer= 'word', tokenizer = tokenizer)
ingredients_corpus = cv.fit_transform(train.All_of_ingredients.tolist())
ingredients_corpus.shape 
# With CountVectorizer plus the tokenizer function, the vocabulary has 2783 terms

(39774, 2783)

In [11]:
# Peek at 100 of the vocabulary terms learned by the CountVectorizer (dictionary
# key order, not sorted). Python 2 only: uses the print statement and relies on
# dict.keys() returning a sliceable list.

print cv.vocabulary_.keys()[:100]

[u'unflavored', u'mackerel', u'yellow', u'sichuan', u'negi', u'clotted', u'asian', u'bucatini', u'hyssop', u'pancetta', u'shaving', u'manis', u'deli', u'rise', u'gremolata', u'jack', u'softened', u'icing', u'four', u'baking', u'broiler', u'wholemeal', u'acinus', u'chambord', u'frozen', u'moulard', u'cholesterol', u'poppy', u'uncooked', u'muscovy', u'orzo', u'jamaica', u'speck', u'clover', u'ravva', u'almond', u'bacon', u'japanese', u'millet', u'brill', u'soppressata', u'chee', u'cactus', u'blue', u'fontina', u'mascarpone', u'tipo', u'cooking', u'togarashi', u'salsify', u'galangal', u'new', u'zesty', u'crunch', u'hero', u'devein', u'kefalotyri', u'herb', u'jasmine', u'broccolini', u'marnier', u'floret', u'jamon', u'textured', u'active', u'johnsonville', u'mozzarella', u'dry', u'tumeric', u'lychee', u'stevia', u'dri', u'mezzetta', u'chopmeat', u'smoke', u'bertolli', u'aka', u'pack', u'golden', u'canola', u'straw', u'sliced', u'dungeness', u'cane', u'dumpling', u'soybean', u'shoyu', u'est

In [12]:
# Encode the cuisine names as integer class labels for the classifiers.
le = LabelEncoder()
y = le.fit_transform(train['cuisine'])
X_1 = ingredients_corpus 
# Named X_1 / X_2 to tell the CountVectorizer and TfidfVectorizer matrices apart

* 모델 평가를 위해 train, test로 Splitting (train 75% / test 25%)

In [13]:
# Hold-out split on the CountVectorizer features: 75% of the rows for training,
# the rest for testing; random_state fixes the shuffle so it is reproducible.
# NOTE: the TF-IDF section later rebinds these same four names.
X_train, X_test, y_train, y_test = train_test_split(X_1, y, train_size = 0.75, 
                                                    random_state = 30)

## Model 적용

In [14]:
# Three base classifiers plus a soft-voting ensemble over them. The ensemble
# averages predicted class probabilities with logistic regression weighted 3x;
# SVC is built with probability=True so it can supply probabilities for that.
logistic = LogisticRegression()
mnb = MultinomialNB()
svm = SVC(kernel='linear', probability=True)
ens = VotingClassifier(
    estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)],
    voting='soft',
    weights=[3, 1, 1]
)

# Fit everything on the current training split, in the order
# logistic -> naive Bayes -> SVM -> ensemble; model1..model4 keep the results.
model1, model2, model3, model4 = (
    logistic.fit(X_train, y_train),
    mnb.fit(X_train, y_train),
    svm.fit(X_train, y_train),
    ens.fit(X_train, y_train),
)

In [24]:
# Sort the rows by cuisine alphabetically and take the distinct cuisine names in
# that order to use as target_names: LabelEncoder numbers the classes 0-19 in
# alphabetical order, so the names must be listed in the same order.
# Series.order() is deprecated (and removed in later pandas releases);
# sort_values() is its direct replacement with the same default behaviour.
cuisines = train.loc[train['cuisine'].str.lower().sort_values().index]
target_names = cuisines['cuisine'].unique()

  from ipykernel import kernelapp as app


In [25]:
# Per-cuisine precision / recall / F1 for model1 (logistic regression fitted on
# the CountVectorizer training split), evaluated on X_test / y_test.
# NOTE(review): the execution counts show this cell (In [25]) ran after the
# TF-IDF split cell (In [21]) had rebound X_test, so the recorded output may have
# been computed on TF-IDF features - re-run top to bottom to confirm.
y_pred = model1.predict(X_test)
print classification_report(y_test, y_pred, target_names = target_names)

              precision    recall  f1-score   support

   brazilian       0.67      0.46      0.55       106
     british       0.55      0.44      0.49       207
cajun_creole       0.77      0.68      0.72       386
     chinese       0.78      0.88      0.83       613
    filipino       0.78      0.64      0.70       198
      french       0.64      0.61      0.62       666
       greek       0.80      0.72      0.76       290
      indian       0.87      0.89      0.88       765
       irish       0.65      0.46      0.54       189
     italian       0.80      0.90      0.85      1914
    jamaican       0.86      0.72      0.78       134
    japanese       0.87      0.66      0.75       362
      korean       0.82      0.77      0.79       213
     mexican       0.89      0.93      0.91      1625
    moroccan       0.83      0.76      0.79       212
     russian       0.60      0.39      0.48       132
 southern_us       0.68      0.81      0.74      1084
     spanish       0.70    

In [26]:
# Per-cuisine precision / recall / F1 for model2 (multinomial naive Bayes fitted
# on the CountVectorizer training split), evaluated on X_test / y_test.
# NOTE(review): executed (In [26]) after the TF-IDF split (In [21]) rebound
# X_test - confirm which feature matrix the recorded output was scored on.
y_pred = model2.predict(X_test)
print classification_report(y_test, y_pred, target_names = target_names)

              precision    recall  f1-score   support

   brazilian       0.50      0.27      0.35       106
     british       0.30      0.42      0.35       207
cajun_creole       0.54      0.74      0.62       386
     chinese       0.70      0.89      0.78       613
    filipino       0.71      0.46      0.56       198
      french       0.56      0.53      0.55       666
       greek       0.67      0.60      0.63       290
      indian       0.85      0.87      0.86       765
       irish       0.63      0.35      0.45       189
     italian       0.84      0.80      0.82      1914
    jamaican       0.84      0.57      0.68       134
    japanese       0.85      0.56      0.68       362
      korean       0.81      0.70      0.75       213
     mexican       0.90      0.88      0.89      1625
    moroccan       0.74      0.75      0.74       212
     russian       0.54      0.23      0.33       132
 southern_us       0.56      0.69      0.62      1084
     spanish       0.53    

In [27]:
# Per-cuisine precision / recall / F1 for model3 (linear-kernel SVC fitted on
# the CountVectorizer training split), evaluated on X_test / y_test.
# NOTE(review): executed (In [27]) after the TF-IDF split (In [21]) rebound
# X_test - confirm which feature matrix the recorded output was scored on.
y_pred = model3.predict(X_test)
print classification_report(y_test, y_pred, target_names = target_names)

              precision    recall  f1-score   support

   brazilian       0.54      0.52      0.53       106
     british       0.48      0.52      0.50       207
cajun_creole       0.74      0.71      0.72       386
     chinese       0.74      0.85      0.79       613
    filipino       0.67      0.63      0.65       198
      french       0.59      0.62      0.61       666
       greek       0.74      0.70      0.72       290
      indian       0.87      0.87      0.87       765
       irish       0.58      0.49      0.53       189
     italian       0.81      0.87      0.84      1914
    jamaican       0.80      0.67      0.73       134
    japanese       0.80      0.68      0.73       362
      korean       0.83      0.75      0.79       213
     mexican       0.90      0.89      0.90      1625
    moroccan       0.81      0.78      0.80       212
     russian       0.54      0.37      0.44       132
 southern_us       0.72      0.77      0.75      1084
     spanish       0.66    

In [28]:
# Per-cuisine precision / recall / F1 for model4 (soft-voting ensemble fitted on
# the CountVectorizer training split), evaluated on X_test / y_test.
# NOTE(review): executed (In [28]) after the TF-IDF split (In [21]) rebound
# X_test - confirm which feature matrix the recorded output was scored on.
y_pred = model4.predict(X_test)
print classification_report(y_test, y_pred, target_names = target_names)

              precision    recall  f1-score   support

   brazilian       0.65      0.43      0.52       106
     british       0.49      0.45      0.47       207
cajun_creole       0.75      0.70      0.73       386
     chinese       0.76      0.89      0.82       613
    filipino       0.81      0.62      0.70       198
      french       0.63      0.60      0.62       666
       greek       0.80      0.69      0.74       290
      indian       0.88      0.90      0.89       765
       irish       0.71      0.45      0.55       189
     italian       0.82      0.90      0.86      1914
    jamaican       0.90      0.69      0.78       134
    japanese       0.87      0.66      0.75       362
      korean       0.84      0.76      0.80       213
     mexican       0.90      0.92      0.91      1625
    moroccan       0.80      0.75      0.77       212
     russian       0.65      0.34      0.45       132
 southern_us       0.66      0.81      0.73      1084
     spanish       0.70    

## CountVectorizer로 전처리 한 후 Cross Validation (StratifiedKFold 방법으로 했을 때 각 모델별 Accuracy Score)

* 여기서 score는 Accuracy를 의미

In [18]:
# 5-fold stratified cross-validation on the full CountVectorizer matrix X_1.
# Model index: 0 = logistic regression, 1 = multinomial NB, 2 = SVC, 3 = ensemble.
# Prints the mean fold score and the standard error of the mean across folds.
for i, clf in enumerate([logistic, mnb, svm, ens]):
    scores = cross_val_score(clf, X_1, y, cv=5)
    print(("Model {0:d}: Mean score: {1:.3f} (+/-{2:.3f})").format(i, np.mean(scores), sem(scores)))

Model 0: Mean score: 0.785 (+/-0.002)
Model 1: Mean score: 0.722 (+/-0.002)
Model 2: Mean score: 0.771 (+/-0.001)
Model 3: Mean score: 0.786 (+/-0.002)


# TfidfVectorizer 사용

* tokenizer 함수 적용
* target = LabelEncoder 전처리

In [19]:
# TF-IDF weighted features over the same ingredient strings, with the same
# custom tokenizer as the CountVectorizer run.
# NOTE(review): this rebinds ingredients_corpus (X_1 still references the earlier
# count matrix), and stop_words is again not passed to the vectorizer.
tv = TfidfVectorizer(analyzer= 'word', tokenizer = tokenizer)
ingredients_corpus = tv.fit_transform(train.All_of_ingredients.tolist())

In [20]:
# Re-create the integer cuisine labels (same encoding as in the CountVectorizer
# section) and keep the TF-IDF matrix under its own name, X_2.
le = LabelEncoder() 
y = le.fit_transform(train['cuisine'])
X_2 = ingredients_corpus

In [21]:
# Hold-out split on the TF-IDF features with the same train_size and
# random_state as the CountVectorizer split.
# NOTE: this overwrites X_train / X_test / y_train / y_test from that section.
X_train, X_test, y_train, y_test = train_test_split(X_2, y, train_size = 0.75, 
                                                   random_state = 30)

In [32]:
# Fresh instances of the same three classifiers and soft-voting ensemble
# (logistic regression weighted 3x; SVC built with probability=True so it can
# supply class probabilities to the vote), this time for the TF-IDF features.
logistic = LogisticRegression()
mnb = MultinomialNB()
svm = SVC(kernel='linear', probability=True)
ens = VotingClassifier(
    estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)],
    voting='soft',
    weights=[3, 1, 1]
)

# Fit on the current training split, in the order
# logistic -> naive Bayes -> SVM -> ensemble; model5..model8 keep the results.
model5, model6, model7, model8 = (
    logistic.fit(X_train, y_train),
    mnb.fit(X_train, y_train),
    svm.fit(X_train, y_train),
    ens.fit(X_train, y_train),
)

In [33]:
# Per-cuisine precision / recall / F1 for model5 (logistic regression fitted on
# the TF-IDF training split), evaluated on the current X_test / y_test.
y_pred = model5.predict(X_test)
print classification_report(y_test, y_pred, target_names = target_names)

              precision    recall  f1-score   support

   brazilian       0.61      0.35      0.44       106
     british       0.58      0.35      0.44       207
cajun_creole       0.78      0.67      0.72       386
     chinese       0.74      0.89      0.81       613
    filipino       0.78      0.56      0.65       198
      french       0.60      0.59      0.59       666
       greek       0.85      0.64      0.73       290
      indian       0.86      0.90      0.88       765
       irish       0.73      0.40      0.52       189
     italian       0.76      0.91      0.83      1914
    jamaican       0.91      0.64      0.75       134
    japanese       0.89      0.62      0.73       362
      korean       0.83      0.73      0.78       213
     mexican       0.88      0.93      0.91      1625
    moroccan       0.86      0.72      0.78       212
     russian       0.88      0.28      0.43       132
 southern_us       0.66      0.82      0.73      1084
     spanish       0.70    

In [34]:
# Per-cuisine precision / recall / F1 for model6 (multinomial naive Bayes fitted
# on the TF-IDF training split), evaluated on the current X_test / y_test.
y_pred = model6.predict(X_test)
print classification_report(y_test, y_pred, target_names = target_names)

              precision    recall  f1-score   support

   brazilian       0.83      0.14      0.24       106
     british       0.75      0.06      0.11       207
cajun_creole       0.78      0.51      0.62       386
     chinese       0.55      0.93      0.70       613
    filipino       0.95      0.10      0.17       198
      french       0.56      0.40      0.47       666
       greek       0.89      0.30      0.45       290
      indian       0.79      0.91      0.84       765
       irish       1.00      0.05      0.10       189
     italian       0.64      0.91      0.75      1914
    jamaican       1.00      0.16      0.27       134
    japanese       0.94      0.50      0.65       362
      korean       0.99      0.32      0.49       213
     mexican       0.80      0.92      0.86      1625
    moroccan       0.96      0.34      0.50       212
     russian       1.00      0.02      0.03       132
 southern_us       0.49      0.77      0.60      1084
     spanish       0.96    

In [35]:
# Per-cuisine precision / recall / F1 for model7 (linear-kernel SVC fitted on
# the TF-IDF training split), evaluated on the current X_test / y_test.
y_pred = model7.predict(X_test)
print classification_report(y_test, y_pred, target_names = target_names)

              precision    recall  f1-score   support

   brazilian       0.63      0.48      0.55       106
     british       0.54      0.47      0.51       207
cajun_creole       0.77      0.70      0.73       386
     chinese       0.76      0.88      0.82       613
    filipino       0.75      0.64      0.69       198
      french       0.61      0.63      0.62       666
       greek       0.82      0.71      0.76       290
      indian       0.87      0.89      0.88       765
       irish       0.68      0.45      0.54       189
     italian       0.79      0.90      0.84      1914
    jamaican       0.91      0.68      0.78       134
    japanese       0.86      0.64      0.74       362
      korean       0.85      0.77      0.81       213
     mexican       0.90      0.91      0.91      1625
    moroccan       0.85      0.75      0.80       212
     russian       0.64      0.33      0.43       132
 southern_us       0.70      0.80      0.75      1084
     spanish       0.73    

In [36]:
# Per-cuisine precision / recall / F1 for model8 (soft-voting ensemble fitted on
# the TF-IDF training split), evaluated on the current X_test / y_test.
y_pred = model8.predict(X_test)
print classification_report(y_test, y_pred, target_names = target_names)

              precision    recall  f1-score   support

   brazilian       0.73      0.41      0.52       106
     british       0.63      0.34      0.44       207
cajun_creole       0.79      0.65      0.72       386
     chinese       0.74      0.91      0.81       613
    filipino       0.82      0.54      0.65       198
      french       0.63      0.60      0.62       666
       greek       0.85      0.62      0.72       290
      indian       0.86      0.90      0.88       765
       irish       0.77      0.40      0.52       189
     italian       0.76      0.92      0.83      1914
    jamaican       0.92      0.65      0.76       134
    japanese       0.89      0.64      0.74       362
      korean       0.86      0.72      0.78       213
     mexican       0.88      0.93      0.91      1625
    moroccan       0.88      0.73      0.79       212
     russian       0.84      0.28      0.42       132
 southern_us       0.64      0.82      0.72      1084
     spanish       0.76    

## TfidfVectorizer로 전처리 한 후 Cross Validation (StratifiedKFold 방법으로 했을 때 각 모델별 Accuracy Score)

* 여기서 Score는 Accuracy를 의미

In [22]:
# 5-fold stratified cross-validation on the full TF-IDF matrix X_2.
# Model index: 0 = logistic regression, 1 = multinomial NB, 2 = SVC, 3 = ensemble.
# Prints the mean fold score and the standard error of the mean across folds.
for i, clf in enumerate([logistic, mnb, svm, ens]):
    scores = cross_val_score(clf, X_2, y, cv=5)
    print(("Model {0:d}: Mean score: {1:.3f} (+/-{2:.3f})").format(i, np.mean(scores), sem(scores)))

Model 0: Mean score: 0.776 (+/-0.001)
Model 1: Mean score: 0.672 (+/-0.003)
Model 2: Mean score: 0.786 (+/-0.002)
Model 3: Mean score: 0.779 (+/-0.002)
