In [23]:
import pandas as pd
# Read the cleaned cuisines CSV into a DataFrame and preview its first rows.
cuisines_df = pd.read_csv(filepath_or_buffer="cleaned_cuisines.csv")
cuisines_df.head(n=5)

Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,indian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,indian,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,indian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,indian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,indian,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [24]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

Divide the data into features and labels

In [25]:
# The label (target) is the 'cuisine' column; preview its first entries.
cuisines_label_df = cuisines_df.loc[:, "cuisine"]
cuisines_label_df.head(n=5)

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [26]:
# The features are every column except the 'cuisine' label; preview the result.
cuisines_feature_df = cuisines_df.drop(columns="cuisine")
cuisines_feature_df.head(n=5)

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


### Choose classifier

Reasoning:
- Neural networks are too heavy. Given our clean, but minimal dataset, and the fact that we are running training locally via notebooks, neural networks are too heavyweight for this task.
- No two-class classifier. We do not use a two-class classifier, so that rules out one-vs-all.
- Decision tree or logistic regression could work. A decision tree might work, or logistic regression for multiclass data.
- Multiclass Boosted Decision Trees solve a different problem. The multiclass boosted decision tree is most suitable for nonparametric tasks, e.g. tasks designed to build rankings, so it is not useful for us.

### Split the data

In [27]:
# Hold out 30% of the rows for testing.
# A fixed random_state makes the shuffle deterministic, so the split (and every
# number derived from it) is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    cuisines_feature_df,
    cuisines_label_df,
    test_size=0.3,
    random_state=42,
)

### Logistic Regression

In [28]:
# Create a one-vs-rest logistic regression that uses the liblinear solver.
# LogisticRegression's `multi_class` argument is deprecated in recent
# scikit-learn releases, so the one-vs-rest scheme is expressed explicitly by
# wrapping the binary estimator in OneVsRestClassifier.
lr = OneVsRestClassifier(LogisticRegression(solver='liblinear'))

# fit() returns the fitted estimator, so `model` refers to the same object as `lr`.
# y_train is already a 1-D Series, so it is passed as-is.
model = lr.fit(X_train, y_train)

# score() gives the mean accuracy on the held-out test split.
accuracy = model.score(X_test, y_test)
print(f"Accuracy is: {accuracy}")

Accuracy is: 0.8131776480400333


Test one row of data


In [29]:
# Show the non-zero feature columns of the test row at position 50,
# followed by that row's true label.
sample_row = X_test.iloc[50]
print(f'ingredients: {sample_row[sample_row != 0].index}')
print(f'cuisine: {y_test.iloc[50]}')

ingredients: Index(['cashew', 'cayenne', 'coriander', 'cumin', 'fenugreek', 'lime_juice',
       'mustard', 'pepper', 'tamarind', 'turmeric', 'vegetable_oil', 'yogurt'],
      dtype='object')
cuisine: indian


In [33]:
# check for the accuracy of this prediction
# Selecting with a list ([[50]]) keeps the row as a one-row DataFrame, so the
# column names travel with the data. The model was fitted on a DataFrame, and
# passing a bare NumPy array here would drop those feature names.
test = X_test.iloc[[50]]
proba = model.predict_proba(test)
classes = model.classes_
# One row of probabilities, one column per class label.
resultdf = pd.DataFrame(data=proba, columns=classes)

# Transpose so each class is a row, then list the most probable classes first.
topPrediction = resultdf.T.sort_values(by=[0], ascending = [False])
topPrediction.head()



Unnamed: 0,0
indian,0.974965
thai,0.023593
japanese,0.000647
chinese,0.000542
korean,0.000253


Print a classification report


In [31]:
# Predict a label for every row of the test split, then print the report that
# compares those predictions with the true labels.
y_pred = model.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

     chinese       0.76      0.69      0.73       234
      indian       0.90      0.91      0.91       239
    japanese       0.77      0.78      0.78       241
      korean       0.82      0.81      0.82       254
        thai       0.79      0.87      0.83       231

    accuracy                           0.81      1199
   macro avg       0.81      0.81      0.81      1199
weighted avg       0.81      0.81      0.81      1199

