# Build Classification Models

In [71]:
# Import previous cleaned data
import pandas as pd
cuisines_df = pd.read_csv("../data/cleaned_cuisines.csv")
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [72]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.svm import SVC
import numpy as np

In [73]:
# Separate X and y as labels and features. Our labels are the "cuisine" column
cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [74]:
# For the feature we drop Unamed: 0 and cuisine columns
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [75]:
# Split the data into train and test sets:
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

In [76]:
# On crée une instance de régression logistique, one vs rest avec le modèle liblinear.
lr = OneVsRestClassifier(LogisticRegression(solver='lbfgs'))
# On optimise le modèle pour qu'il fitte nos données d'entraînement
model = lr.fit(X_train, np.ravel(y_train))

# Calcul de l'accuracy
accuracy = model.score(X_test, y_test)
print ("Accuracy is {}".format(accuracy))

Accuracy is 0.8081734778982486


On calcule l'accuracy : ce n'est pas très pertinent dans ce cas, parce qu'on a balanced les données (inventé des nouvelles pour équilibrer). En plus, on l'a fait avant même de split notre jeu de données, donc on en a possiblement dans le train ET le test set. Donc forcément comme certaines données de test et train ont été générées automatiquement, on va prédire et tester des données qui ont la même source, et retrouver un peu de la logique de la fonction oversample.

Du coup on fait "confiance" à la fonction oversample pour nous donner des données cohérentes, en faisant ça on perd de la contextualisation.

In [77]:
# On va tester le modèle avec la ligne n:
n = 50
print(f'ingredients: {X_test.iloc[n][X_test.iloc[n]!=0].keys()}')
print(f'cuisine: {y_test.iloc[n]}')

ingredients: Index(['cayenne', 'cilantro', 'egg', 'fish', 'scallion', 'wheat'], dtype='object')
cuisine: thai


In [78]:
X_train

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
1604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3663,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2132,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3919,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2776,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1791,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
267,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1691,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [79]:
test= X_test.iloc[n].values.reshape(-1, 1).T
proba = model.predict_proba(test)

classes = model.classes_
resultdf = pd.DataFrame(data=proba, columns=classes)

topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
topPrediction.head()



Unnamed: 0,0
thai,0.573881
korean,0.301973
chinese,0.105465
japanese,0.011857
indian,0.006823


In [80]:
y_pred = model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

     chinese       0.76      0.66      0.71       241
      indian       0.92      0.93      0.93       241
    japanese       0.73      0.76      0.75       240
      korean       0.84      0.80      0.82       228
        thai       0.79      0.88      0.83       249

    accuracy                           0.81      1199
   macro avg       0.81      0.81      0.81      1199
weighted avg       0.81      0.81      0.81      1199

