In [16]:
import pandas as pd
import numpy as np
import re, nltk
from nltk.stem import WordNetLemmatizer     
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Final Test

In [17]:
# Load the labelled training recipes; the preview below shows the
# `cuisine`, `id` and `ingredients` columns.
train = pd.read_json('train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [18]:
# Collapse each recipe's ingredient list into a single ':'-separated string
# so that every recipe becomes one text document.
train["All_of_ingredients"] = [':'.join(items) for items in train["ingredients"]]
train.head()

Unnamed: 0,cuisine,id,ingredients,All_of_ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce:black olives:grape tomatoes:ga...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour:ground pepper:salt:tomatoes:ground...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",eggs:pepper:salt:mayonaise:cooking oil:green c...
3,indian,22213,"[water, vegetable oil, wheat, salt]",water:vegetable oil:wheat:salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",black pepper:shallots:cornflour:cayenne pepper...


In [19]:
# Load the test recipes; per the preview below they carry only the
# `id` and `ingredients` columns (no cuisine label).
test_data = pd.read_json('test.json')
test_data.head()

Unnamed: 0,id,ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,..."
4,35687,"[ground black pepper, salt, sausage casings, l..."


In [20]:
# Same flattening as for the training set: one ':'-separated string per recipe.
test_data["All_of_ingredients"] = [':'.join(items) for items in test_data["ingredients"]]
test_data.head()

Unnamed: 0,id,ingredients,All_of_ingredients
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",baking powder:eggs:all-purpose flour:raisins:m...
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",sugar:egg yolks:corn starch:cream of tartar:ba...
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",sausage links:fennel bulb:fronds:olive oil:cub...
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",meat cuts:file powder:smoked sausage:okra:shri...
4,35687,"[ground black pepper, salt, sausage casings, l...",ground black pepper:salt:sausage casings:leeks...


In [21]:
# Shared lemmatizer that tokenizer() hands to stem_tokens(). Despite the
# variable name this is a WordNet lemmatizer, not a stemmer.
stemmer = WordNetLemmatizer()

def stem_tokens(tokens, stemmer):
    """Lemmatize every token and return the results as a list.

    tokens  -- iterable of word strings.
    stemmer -- any object exposing ``lemmatize(word)``.

    Returns a new list holding ``stemmer.lemmatize(token)`` for each token,
    in the original order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(stemmer.lemmatize(token))
    return lemmas

def tokenizer(words):
    """Turn one ingredient string into a list of lemmatized word tokens.

    Every character outside a-z / A-Z is first replaced by a space, the
    cleaned text is split with ``nltk.word_tokenize``, and the tokens are
    passed through ``stem_tokens`` using the module-level ``stemmer``.
    """
    letters_only = re.sub(r'[^a-zA-Z]', " ", words)
    return stem_tokens(nltk.word_tokenize(letters_only), stemmer)

In [22]:
# Encode cuisine names as integer class ids. `le` is kept around so the
# predicted ids can be decoded back into cuisine names afterwards.
le = LabelEncoder()
y = le.fit(train['cuisine']).transform(train['cuisine'])

In [24]:
# Soft-voting ensemble (logistic regression, multinomial naive Bayes, linear
# SVM) on top of TF-IDF features produced by the custom `tokenizer`.

# Fixed seed: SVC(probability=True) fits its probability model with an
# internal, randomly shuffled cross-validation, so without a seed two runs
# of this notebook can produce different predictions.
RANDOM_STATE = 42

# Seeded as well so that any solver that shuffles data stays repeatable.
logistic = LogisticRegression(C=10, random_state=RANDOM_STATE)
mnb = MultinomialNB()
# probability=True is needed because soft voting averages predict_proba outputs.
svm = SVC(probability=True, kernel='linear', C=100, random_state=RANDOM_STATE)
ens = VotingClassifier(
    estimators=[('lr', logistic), ('mnb', mnb), ('svm', svm)],
    voting='soft',
    weights=[3, 1, 1],  # logistic regression weighs 3x each of the other two
)

train_clf = Pipeline([
    ('tv', TfidfVectorizer(analyzer='word', tokenizer=tokenizer,
                           ngram_range=(1, 1))),
    ('clf', ens),
])

# Pipeline.fit returns the pipeline itself, so fin_clf and train_clf are the
# same fitted object.
fin_clf = train_clf.fit(train.All_of_ingredients.tolist(), y)

In [25]:
# The pipeline was already fitted in the previous cell; display it instead of
# paying for a second, identical (and slow) training run.
fin_clf

Pipeline(steps=[('tv', TfidfVectorizer(analyzer='word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
 ...hrinking=True,
  tol=0.001, verbose=False))],
         n_jobs=1, voting='soft', weights=[3, 1, 1]))])

In [26]:
# Predict integer class ids for the test recipes, then decode them back to
# cuisine names with the encoder fitted on the training labels.
y_pred = fin_clf.predict(test_data["All_of_ingredients"].tolist())
pred_cuisine = le.inverse_transform(y_pred)

In [28]:
# Assemble the Kaggle submission: one row per test recipe with its id and the
# predicted cuisine name.
final_test_id = test_data.id.values

# Column order is given explicitly; relying on dict ordering yields
# alphabetically sorted columns (cuisine, id) on older Python/pandas.
submission = pd.DataFrame(
    {'id': final_test_id, 'cuisine': pred_cuisine},
    columns=['id', 'cuisine'],
)
submission.to_csv('submission(repair).csv', index=False)

# Last expression of the cell so the preview is actually rendered.
submission.head()

# Kaggle 제출 결과

* Score : 0.78650