There are a lot of parameters that are set by default when working with these classifiers. Intellisense in VS Code can help you dig into them. Adopt one of the ML Classification Techniques in this lesson and retrain models tweaking various parameter values. Build a notebook explaining why some changes help the model quality while others degrade it. Be detailed in your answer.

In [2]:
#importing the necessary libries 
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

#importing the model libries

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score


In [3]:
#reading the dataset
cuisines_df = pd.read_csv("cleaned_cuisines.csv")
cuisines_df.head()

Unnamed: 0.1,Unnamed: 0,cuisine,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,indian,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,indian,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [4]:
#Divide the X and y coordinates into two dataframes for training. cuisine can be the labels dataframe:

cuisines_label_df = cuisines_df['cuisine']
cuisines_label_df.head()

0    indian
1    indian
2    indian
3    indian
4    indian
Name: cuisine, dtype: object

In [5]:
#Drop that Unnamed: 0 column and the cuisine column, calling drop(). Save the rest of the data as trainable features:

cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)
cuisines_feature_df.head()

Unnamed: 0,almond,angelica,anise,anise_seed,apple,apple_brandy,apricot,armagnac,artemisia,artichoke,...,whiskey,white_bread,white_wine,whole_grain_wheat_flour,wine,wood,yam,yeast,yogurt,zucchini
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [16]:
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

In [9]:
C = 10
# Create different classifiers.
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(C),
    'SVC':SVC(),
    'RFST':RandomForestClassifier(n_estimators= 100),
    'ADA':AdaBoostClassifier(n_estimators=100)


}

In [12]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print('************************************')
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

************************************
Accuracy (train) for Linear SVC: 78.2% 
              precision    recall  f1-score   support

     chinese       0.66      0.70      0.68       249
      indian       0.90      0.92      0.91       238
    japanese       0.77      0.75      0.76       242
      korean       0.81      0.75      0.78       230
        thai       0.78      0.80      0.79       240

    accuracy                           0.78      1199
   macro avg       0.79      0.78      0.78      1199
weighted avg       0.78      0.78      0.78      1199

************************************
Accuracy (train) for KNN classifier: 72.1% 
              precision    recall  f1-score   support

     chinese       0.70      0.69      0.69       249
      indian       0.87      0.77      0.82       238
    japanese       0.62      0.83      0.71       242
      korean       0.86      0.56      0.68       230
        thai       0.66      0.75      0.70       240

    accuracy               

let select one out of the four algortihms to see the difference when changing some of the parameters

In [13]:
#initial estimator
classifier = AdaBoostClassifier(n_estimators=100)
classifier.fit(X_train,np.ravel(y_train))
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f'Adaboost_classifier (train): {accuracy * 100}')
print(classification_report(y_test,y_pred))

Adaboost_classifier (train): 70.05838198498749
              precision    recall  f1-score   support

     chinese       0.61      0.45      0.52       249
      indian       0.89      0.84      0.87       238
    japanese       0.63      0.70      0.67       242
      korean       0.63      0.74      0.68       230
        thai       0.75      0.78      0.76       240

    accuracy                           0.70      1199
   macro avg       0.70      0.70      0.70      1199
weighted avg       0.70      0.70      0.70      1199



In [14]:
#initial estimator
classifier = AdaBoostClassifier(n_estimators=200)
classifier.fit(X_train,np.ravel(y_train))
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f'Adaboost_classifier (train): {accuracy * 100}')
print(classification_report(y_test,y_pred))

Adaboost_classifier (train): 60.46705587989991
              precision    recall  f1-score   support

     chinese       0.55      0.27      0.36       249
      indian       0.83      0.85      0.84       238
    japanese       0.51      0.50      0.51       242
      korean       0.49      0.74      0.59       230
        thai       0.66      0.69      0.67       240

    accuracy                           0.60      1199
   macro avg       0.61      0.61      0.59      1199
weighted avg       0.61      0.60      0.59      1199



**estimator**

An estimator is a mathematical model or algorithm that learns from data in order to make predictions or classify new instances in machine learning. In the context of a classifier, an estimator is a specific algorithm or model used for classification tasks.

The classification process begins with training the estimator on a labeled dataset, where each instance is assigned a known class label. The estimator's goal is to learn patterns and relationships in the data that can be used to accurately classify previously unseen instances.

**result interpretation**

we can see from the above result when we try to increase the estimator vaules the accuracy result tends to reduce, which might be because of overfitting. Overfitting occurs when a model becomes too complex and starts to memorize the training data instead of learning general patterns that can be applied to unseen data.

but below are some possible reasons why the result tends to decrease:
Overfitting: As the number of estimators increases, the model becomes more complex and has a higher capacity to fit the training data. If the number of estimators is set too high, the model may start to overfit by focusing too much on the idiosyncrasies of the training data, resulting in poor generalization to new data.

Insufficient regularization: AdaBoost uses a technique called "boosting" to combine multiple weak learners (estimators) into a strong ensemble model. Regularization is important in AdaBoost to prevent overfitting. If the regularization parameters are not properly tuned, increasing the number of estimators can exacerbate overfitting and lead to a decrease in accuracy.

Limited training data: If the training dataset is relatively small, increasing the number of estimators might lead to overfitting. With a limited amount of data, the model may struggle to find meaningful patterns and end up memorizing noise or outliers instead.

Lack of diversity among estimators: AdaBoost relies on combining weak learners that are diverse and complementary. Each estimator should focus on different aspects of the data. If the additional 100 estimators added to the model do not bring sufficient diversity or introduce new insights, the overall performance might not improve and could even degrade.

To address this issue, you can try the following steps:

Regularization:

Cross-validation: 

Feature engineering: 

Increase the training data: 












In [19]:
#let try and increase the training size form 70:30 to 80:20
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.2)


In [20]:
#initial estimator
classifier = AdaBoostClassifier(n_estimators=100)
classifier.fit(X_train,np.ravel(y_train))
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f'Adaboost_classifier (train): {accuracy * 100}')
print(classification_report(y_test,y_pred))

Adaboost_classifier (train): 70.58823529411765
              precision    recall  f1-score   support

     chinese       0.62      0.45      0.52       164
      indian       0.86      0.87      0.87       162
    japanese       0.66      0.60      0.63       162
      korean       0.66      0.81      0.73       159
        thai       0.71      0.80      0.75       152

    accuracy                           0.71       799
   macro avg       0.70      0.71      0.70       799
weighted avg       0.70      0.71      0.70       799



In [21]:
#increasing the estimator
classifier = AdaBoostClassifier(n_estimators=200)
classifier.fit(X_train,np.ravel(y_train))
y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_pred)
print(f'Adaboost_classifier (train): {accuracy * 100}')
print(classification_report(y_test,y_pred))

Adaboost_classifier (train): 66.08260325406758
              precision    recall  f1-score   support

     chinese       0.61      0.44      0.51       164
      indian       0.84      0.79      0.81       162
    japanese       0.60      0.59      0.60       162
      korean       0.65      0.75      0.70       159
        thai       0.61      0.74      0.67       152

    accuracy                           0.66       799
   macro avg       0.66      0.66      0.66       799
weighted avg       0.66      0.66      0.66       799



**NOTE:**

When the training size is increases to 80:20, and 200 estimator was passed the accurarcy increases with 66% instead of the 60% got when using the 70:30 train_test_split ratio. 

so we can say that option for addressing the issue of overfitting in Adaboost_classifier works for us, that is increasing the training sizes.