### Import libraries.

In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.svm import SVC

### Load the data.

In [65]:
# Read the cuisines dataset (path is relative to this notebook) and preview
# the first five rows to check which columns were loaded.
df = pd.read_csv(filepath_or_buffer='../data/cuisines.csv')
df.head(n=5)

Unnamed: 0,id,cuisine,ingredients,no_ingredients,ingredients_str
0,10259,greek,"['romaine lettuce', 'black olives', 'grape tom...",9,"romaine lettuce, black olives, grape tomatoes,..."
1,25693,southern_us,"['plain flour', 'ground pepper', 'salt', 'toma...",11,"plain flour, ground pepper, salt, tomatoes, gr..."
2,20130,filipino,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...",12,"eggs, pepper, salt, mayonaise, cooking oil, gr..."
3,22213,indian,"['water', 'vegetable oil', 'wheat', 'salt']",4,"water, vegetable oil, wheat, salt"
4,13162,indian,"['black pepper', 'shallots', 'cornflour', 'cay...",20,"black pepper, shallots, cornflour, cayenne pep..."


### Define the class for each cuisine.

In [66]:
# Cuisine name -> integer class label (1-20). A cuisine's label is its 1-based
# position in this tuple, so the order below must not be changed.
_cuisine_order = (
    'moroccan', 'indian', 'vietnamese', 'cajun_creole', 'thai',
    'jamaican', 'chinese', 'korean', 'mexican', 'spanish',
    'russian', 'greek', 'filipino', 'italian', 'french',
    'japanese', 'british', 'southern_us', 'brazilian', 'irish',
)
classes = {name: label for label, name in enumerate(_cuisine_order, start=1)}

### Save original dataframe.

In [67]:
# Keep an independent snapshot of the data as loaded. A plain
# `df_original = df` only binds a second name to the same DataFrame, so the
# later in-place overwrite of df['cuisine'] with integer labels would also
# change `df_original`. `.copy()` (deep by default) gives a separate object.
df_original = df.copy()

### Replace names of the cuisines with categorical values, 1-20.

In [68]:
# Translate cuisine names into their integer class labels (1-20).
# Series.map yields NaN for every value that is not a key of `classes`, which
# would silently corrupt the target (e.g. an unexpected cuisine in the CSV, or
# this cell being run a second time on already-encoded labels). Validate the
# mapped result before overwriting the column so the failure is explicit.
cuisine_labels = df['cuisine'].map(classes)
unmapped = df.loc[cuisine_labels.isna(), 'cuisine'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Cannot encode 'cuisine': values without a class label: {list(unmapped)}. "
        "Reload the data if this cell has already been run."
    )
df['cuisine'] = cuisine_labels
df.head()

Unnamed: 0,id,cuisine,ingredients,no_ingredients,ingredients_str
0,10259,12,"['romaine lettuce', 'black olives', 'grape tom...",9,"romaine lettuce, black olives, grape tomatoes,..."
1,25693,18,"['plain flour', 'ground pepper', 'salt', 'toma...",11,"plain flour, ground pepper, salt, tomatoes, gr..."
2,20130,13,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...",12,"eggs, pepper, salt, mayonaise, cooking oil, gr..."
3,22213,2,"['water', 'vegetable oil', 'wheat', 'salt']",4,"water, vegetable oil, wheat, salt"
4,13162,2,"['black pepper', 'shallots', 'cornflour', 'cay...",20,"black pepper, shallots, cornflour, cayenne pep..."


### Prepare X (feature) and y (target).

In [69]:
# Feature: the ingredients text column; target: the cuisine column.
X, y = df['ingredients_str'], df['cuisine']

In [70]:
# Hold out a test split (25%, which is train_test_split's default when neither
# test_size nor train_size is given), stratified on the target so every
# cuisine keeps its proportion in both parts. Seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

In [71]:
# Shared cross-validation splitter for the grid searches below:
# 5 stratified folds, shuffled with a fixed seed.
cv_folds = StratifiedKFold(
    shuffle=True,
    random_state=42,
    n_splits=5,
)

### Multinomial Naive Bayes with TfidfVectorizer.

In [72]:
# Two-step pipeline: TF-IDF vectorisation of the text ('tvec') followed by a
# multinomial Naive Bayes classifier ('nb'). The step names are the prefixes
# used in the parameter grid.
pipe = Pipeline(
    steps=[
        ('tvec', TfidfVectorizer()),
        ('nb', MultinomialNB()),
    ]
)

In [73]:
# Grid over the TF-IDF step only: 3 * 3 * 2 * 2 = 36 parameter combinations.
pipe_params = dict(
    tvec__max_features=[4000, 5000, 6000],
    tvec__min_df=[2, 3, 4],
    tvec__max_df=[0.85, 0.90],
    tvec__ngram_range=[(1, 1), (1, 2)],
)

In [74]:
# Exhaustive search over `pipe_params` for `pipe`, scored on the shared
# stratified folds and run on 4 parallel workers.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=pipe_params,
    cv=cv_folds,
    n_jobs=4,
    verbose=1,
)

In [75]:
# Fit the grid search on the training split; %time reports how long it took.
%time gs.fit(X_train, y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.1s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:   37.5s finished


CPU times: user 2.37 s, sys: 630 ms, total: 3 s
Wall time: 38 s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('tvec', TfidfVectorizer()),
                                       ('nb', MultinomialNB())]),
             n_jobs=4,
             param_grid={'tvec__max_df': [0.85, 0.9],
                         'tvec__max_features': [4000, 5000, 6000],
                         'tvec__min_df': [2, 3, 4],
                         'tvec__ngram_range': [(1, 1), (1, 2)]},
             verbose=1)

In [76]:
# Show the parameter combination selected by the grid search.
gs.best_params_

{'tvec__max_df': 0.85,
 'tvec__max_features': 4000,
 'tvec__min_df': 4,
 'tvec__ngram_range': (1, 1)}

In [77]:
# Predict cuisine labels for the held-out test split with the fitted search.
preds = gs.predict(X=X_test)

In [78]:
# Score the test-set predictions: plain accuracy and the F1 score computed
# with average='weighted'. The trailing expression displays both values.
accuracy, f1_score = (
    metrics.accuracy_score(y_test, preds),
    metrics.f1_score(y_test, preds, average='weighted'),
)
accuracy, f1_score

(0.6858407079646017, 0.6540100304090994)

### SVC with TfidfVectorizer.

In [79]:
# Two-step pipeline: TF-IDF vectorisation of the text ('tvec') followed by a
# support vector classifier ('svc'). The step names are the prefixes used in
# the parameter grid. Note that this rebinds `pipe`.
pipe = Pipeline(
    steps=[
        ('tvec', TfidfVectorizer()),
        ('svc', SVC()),
    ]
)

In [80]:
# Grid over both steps: 2**7 = 128 parameter combinations.
# NOTE(review): with 5 CV folds this is 640 SVC fits, and the recorded run of
# the fit cell shows no completion output — confirm the search is affordable.
pipe_params = dict(
    tvec__max_features=[5000, 6000],
    tvec__min_df=[2, 3],
    tvec__max_df=[0.85, 0.90],
    tvec__ngram_range=[(1, 1), (1, 2)],
    svc__C=[1, 2],
    svc__kernel=['rbf', 'poly'],
    svc__max_iter=[2000, 5000],
)

In [81]:
# Exhaustive search over `pipe_params` for the SVC pipeline, scored on the
# shared stratified folds and run on 4 parallel workers. Note that this
# rebinds `gs`.
gs = GridSearchCV(
    estimator=pipe,
    param_grid=pipe_params,
    cv=cv_folds,
    n_jobs=4,
    verbose=1,
)

In [None]:
# Fit the SVC grid search on the training split; %time reports how long it took.
%time gs.fit(X_train, y_train)

Fitting 5 folds for each of 128 candidates, totalling 640 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
# Show the parameter combination selected by the SVC grid search.
gs.best_params_