In [57]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score,precision_score,confusion_matrix,classification_report, precision_recall_curve
import numpy as np
import pandas as pd
from time import time

In [58]:
cuisines_df = pd.read_csv("../data/cleaned_cuisines.csv")

In [59]:
cuisines_label_df = cuisines_df['cuisine']
cuisines_feature_df = cuisines_df.drop(['Unnamed: 0', 'cuisine'], axis=1)

In [60]:
X_train, X_test, y_train, y_test = train_test_split(cuisines_feature_df, cuisines_label_df, test_size=0.3)

In [71]:
C = 10
# Create different classifiers.
classifiers = {
    'LinearSVC()': LinearSVC(),
    'Linear SVC': SVC(kernel='linear', C=C, probability=True,random_state=0),
    'KNN classifier': KNeighborsClassifier(),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

Base scores from the lesson:

In [72]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, np.ravel(y_train))

    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy (train) for %s: %0.1f%% " % (name, accuracy * 100))
    print(classification_report(y_test,y_pred))

Accuracy (train) for LinearSVC(): 79.6% 
              precision    recall  f1-score   support

     chinese       0.77      0.71      0.74       268
      indian       0.91      0.91      0.91       223
    japanese       0.71      0.78      0.75       232
      korean       0.83      0.73      0.78       244
        thai       0.77      0.86      0.81       232

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.80      0.80      0.80      1199

Accuracy (train) for Linear SVC: 79.1% 
              precision    recall  f1-score   support

     chinese       0.73      0.72      0.72       268
      indian       0.86      0.92      0.89       223
    japanese       0.77      0.79      0.78       232
      korean       0.83      0.71      0.77       244
        thai       0.78      0.83      0.80       232

    accuracy                           0.79      1199
   macro avg       0.79      0.79      0.79      1

In [73]:
X_train.shape

(2796, 380)

As we have n_samples > n_features we set dual to False. The combination of penalty='l2' and loss='hinge' are not supported when dual=False, so we only have with the default squared_hinge option for 'loss' parameter. 

In [74]:
from sklearn.model_selection import GridSearchCV
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

parameters = [
  {
  'penalty': ['l1', 'l2'], 
  'dual': [False], 
  'C': [1.5, 2, 3], 
  'tol': [0.125, 0.122, 0.12, 0.115, 0.11, 0.1],
  'intercept_scaling': [0.5, 1, 1.5, 2],
  'class_weight': [None, 'balanced']
  },
]

lin_svc = LinearSVC()
clf = GridSearchCV(lin_svc, parameters, scoring='accuracy', verbose=1)

clf.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (train) for LinearSVC: %0.1f%% " % (accuracy * 100))
print(classification_report(y_test,y_pred))

Fitting 5 folds for each of 288 candidates, totalling 1440 fits
Accuracy (train) for LinearSVC: 79.9% 
              precision    recall  f1-score   support

     chinese       0.77      0.71      0.74       268
      indian       0.94      0.90      0.92       223
    japanese       0.72      0.79      0.75       232
      korean       0.82      0.74      0.78       244
        thai       0.76      0.88      0.82       232

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.80      0.80      0.80      1199



In [75]:
best_parameters = clf.best_estimator_.get_params()

for param_name in sorted(parameters[0].keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))


	C: 2
	class_weight: None
	dual: False
	intercept_scaling: 0.5
	penalty: 'l2'
	tol: 0.11


In [76]:
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import RandomizedSearchCV

linear = LinearSVC()
distributions = dict(C=loguniform(0.0001,100), intercept_scaling=loguniform(0.0001,100), tol=loguniform(0.0001,100))
clfRand = RandomizedSearchCV(linear, distributions, random_state=0, n_iter=100)

clfRand.fit(X_train, np.ravel(y_train))

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (train) for LinearSVC with randomized search: %0.1f%% " % (accuracy * 100))
print(classification_report(y_test,y_pred))

best_parameters = clfRand.best_estimator_.get_params()

Accuracy (train) for LinearSVC with randomized search: 79.9% 
              precision    recall  f1-score   support

     chinese       0.77      0.71      0.74       268
      indian       0.94      0.90      0.92       223
    japanese       0.72      0.79      0.75       232
      korean       0.82      0.74      0.78       244
        thai       0.76      0.88      0.82       232

    accuracy                           0.80      1199
   macro avg       0.80      0.80      0.80      1199
weighted avg       0.80      0.80      0.80      1199



In [77]:
for param_name in sorted(distributions.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

	C: 0.09646020776080408
	intercept_scaling: 0.6868076096605975
	tol: 0.01627449933008123


The Randomized search didn't give significantly different results from the exhaustive grid search for LinearSVC(), in fact tweaking the parameters didn't any significant changes in the performance of the model in comparison to the default model.

In [68]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier()

param_dist = {'n_neighbors': np.arange(1, 50), 
              'weights': ['uniform', 'distance'],
              'leaf_size': np.arange(1, 60),
              'p': [1, 2],
            }

n_iter_search = 200

randomKNN = RandomizedSearchCV(neigh, param_distributions=param_dist, n_iter = n_iter_search)

def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})"
                  .format(results['mean_test_score'][candidate],
                          results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")
            
start = time()
randomKNN.fit(X_train, np.ravel(y_train))
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(randomKNN.cv_results_)

RandomizedSearchCV took 231.15 seconds for 200 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.784 (std: 0.008)
Parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 1, 'leaf_size': 31}

Model with rank: 1
Mean validation score: 0.784 (std: 0.008)
Parameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 1, 'leaf_size': 54}

Model with rank: 1
Mean validation score: 0.784 (std: 0.008)
Parameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 1, 'leaf_size': 16}

Model with rank: 1
Mean validation score: 0.784 (std: 0.008)
Parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 1, 'leaf_size': 29}

Model with rank: 1
Mean validation score: 0.784 (std: 0.008)
Parameters: {'weights': 'distance', 'p': 1, 'n_neighbors': 1, 'leaf_size': 37}

Model with rank: 1
Mean validation score: 0.784 (std: 0.008)
Parameters: {'weights': 'uniform', 'p': 1, 'n_neighbors': 1, 'leaf_size': 4}

Model with rank: 1
Mean validation score: 0.784 (std: 0.008)
Parameters: {'weigh

In [69]:
y_pred = randomKNN.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (train) for RandomKNN with randomized search: %0.1f%% " % (accuracy * 100))
print(classification_report(y_test,y_pred))

best_parameters = randomKNN.best_estimator_.get_params()

Accuracy (train) for RandomKNN with randomized search: 75.8% 
              precision    recall  f1-score   support

     chinese       0.73      0.71      0.72       268
      indian       0.86      0.86      0.86       223
    japanese       0.72      0.82      0.77       232
      korean       0.80      0.58      0.67       244
        thai       0.72      0.84      0.77       232

    accuracy                           0.76      1199
   macro avg       0.76      0.76      0.76      1199
weighted avg       0.76      0.76      0.76      1199



We got a 4.7% accuracy and 5% f1-score improve by fine tuning parameters of a KNN classifier: 

Parameters of a default KNeighborsClassifier model:
n_neighbors: 5
weights: uniform
leaf_size: 30
p: 2

Parameters of a KNeighborsClassifier fine tuned by RandomizedSearchCV:
n_neighbors: 4
weights: distance
leaf_size: 49, 32, 17 #supposudely is not the most important parameter as the best models can have different leaf_sizes
p: 1

the most important of which can be considered 'weights', as it is alone gives a significant accuracy improve:

In [70]:
classifier = KNeighborsClassifier(weights='distance')

classifier.fit(X_train, np.ravel(y_train))

y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (train) for KNeighborsClassifier with distance weights: %0.1f%% " % (accuracy * 100))
print(classification_report(y_test,y_pred))

Accuracy (train) for KNeighborsClassifier with distance weights: 73.6% 
              precision    recall  f1-score   support

     chinese       0.70      0.68      0.69       268
      indian       0.84      0.86      0.85       223
    japanese       0.60      0.86      0.71       232
      korean       0.86      0.60      0.71       244
        thai       0.80      0.71      0.75       232

    accuracy                           0.74      1199
   macro avg       0.76      0.74      0.74      1199
weighted avg       0.76      0.74      0.74      1199

