In [35]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
from sklearn.svm import SVC
import numpy as np

#In this lesson, you will use the dataset you saved from the last lesson, full of balanced, clean data all about cuisines.
#You will use this dataset with a variety of classifiers to predict a given national cuisine based on a group of ingredients.
# While doing so, you'll learn more about some of the ways that algorithms can be leveraged for classification tasks.

# Load the cleaned cuisines dataset from disk into a DataFrame.
# The path is written as a raw string so every backslash is taken literally.
# The previous literal mixed escaped ("\\") and unescaped ("\H", "\D")
# backslashes; the unescaped ones are invalid escape sequences and trigger a
# SyntaxWarning on recent Python versions. The resulting path is unchanged.
cleaned_cuisine = pd.read_csv(
    r"C:\Users\HP\Desktop\-ArewaDS-Machine-Learning-Assignments-\Data\cleaned_cuisines.csv"
)
cleaned_cuisine.head()

# Separate the target from the inputs: the 'cuisine' column becomes the labels.

cuisines_label_df = cleaned_cuisine.loc[:, 'cuisine']
cuisines_label_df.head()

# Every column except 'Unnamed: 0' and the 'cuisine' label column is kept as
# the trainable feature set.

cuisines_feature_df = cleaned_cuisine.drop(columns=['Unnamed: 0', 'cuisine'])
cuisines_feature_df.head()

#NOW YOU ARE READY TO TRAIN YOUR MODEL! CHOOSING YOUR CLASSIFIER...
#Now that your data is clean and ready for training, you have to decide which algorithm to use for the job. Scikit-learn groups 
#classification under Supervised Learning, and in that category you will find many ways to classify. The variety is quite bewildering 
#at first sight. The following methods all include classification techniques:
#Linear Models, Support Vector Machines, Stochastic Gradient Descent, Nearest Neighbors, Gaussian Processes, Decision Trees,
# Ensemble methods (voting classifier), and Multiclass and multioutput algorithms (multiclass and multilabel classification,
# multiclass-multioutput classification).

#You can also use neural networks to classify data, but that is outside the scope of this lesson.

#SPLIT THE DATA...

# We can focus on logistic regression for our first training trial since you recently learned about it in a previous lesson.
# Split your data into training and testing groups by calling train_test_split().
# test_size=0.3 holds out 30% of the rows for testing. random_state fixes the
# shuffle so the split -- and therefore every score and per-row result computed
# from it -- is the same on each run instead of changing every time.

X_train, X_test, y_train, y_test = train_test_split(
    cuisines_feature_df,
    cuisines_label_df,
    test_size=0.3,
    random_state=42,
)

#APPLY LOGISTIC REGRESSION

# This is a multiclass problem, so both the multiclass scheme and the solver
# have to be chosen. Here the classifier is built with the one-vs-rest scheme
# (multi_class='ovr') and the liblinear solver.

lr = LogisticRegression(solver='liblinear', multi_class='ovr')
# The labels are flattened to a 1-D array before fitting.
model = lr.fit(X_train, y_train.to_numpy().ravel())

# Evaluate the fitted model on the held-out test rows.
accuracy = model.score(X_test, y_test)
print("Accuracy is {}".format(accuracy))

#Try a different solver like lbfgs, which is often set as default. Note: use NumPy's ravel function (np.ravel) to flatten your data when needed.
# The accuracy is good at roughly 80%!

# You can see this model in action by testing one row of data (#50):

sample_row = X_test.iloc[50]
# Show only the columns whose value in this row is non-zero, then its label.
print(f'ingredients: {sample_row[sample_row != 0].keys()}')
print(f'cuisine: {y_test.iloc[50]}')

#Try a different row number and check the results

# Digging deeper, you can inspect the probability the model assigns to each
# class for this row:

# Index with a list ([[50]]) so the selection stays a one-row DataFrame that
# still carries the column names. Converting the row to a bare NumPy array, as
# before, drops those names and makes scikit-learn warn that X does not have
# valid feature names when the model was fitted on a DataFrame.
test = X_test.iloc[[50]]
proba = model.predict_proba(test)
classes = model.classes_
# One row of probabilities, one column per class label.
resultdf = pd.DataFrame(data=proba, columns=classes)

# Transpose so each class becomes a row (the single probability column is
# labelled 0), then sort with the highest probability first.
topPrediction = resultdf.T.sort_values(by=[0], ascending=[False])
topPrediction.head()

# Can you explain why the model is pretty sure about its top prediction for this row?
# For more detail, print a classification report for the whole test set, as you
# did in the regression lessons:

y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)



Accuracy is 0.786488740617181
ingredients: Index(['coconut', 'coriander', 'cumin', 'fenugreek', 'fish', 'pepper',
       'shrimp', 'turmeric', 'vegetable_oil'],
      dtype='object')
cuisine: thai
              precision    recall  f1-score   support

     chinese       0.75      0.67      0.71       240
      indian       0.90      0.89      0.90       243
    japanese       0.74      0.75      0.74       255
      korean       0.79      0.79      0.79       239
        thai       0.74      0.85      0.79       222

    accuracy                           0.79      1199
   macro avg       0.79      0.79      0.79      1199
weighted avg       0.79      0.79      0.79      1199



