In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score

from sklearn.metrics import confusion_matrix

from sklearn.metrics import classification_report

In [None]:
# Importing the method needed to apply SVM

from sklearn.svm import SVC

## SVM applied to the Default dataset

### A polynomial kernel

__Note from the scikit-learn documentation:__

"Support Vector Machine algorithms are not scale invariant, so it is highly recommended to scale your data."

In [None]:
Default_df= pd.read_csv('C:\\Users\\jheredi2\\Documents\\PythonDataAnalytics\\1-Datasets\\Default.csv')

In [None]:
Default_df_dummies= pd.get_dummies(Default_df,columns=['student'], drop_first=True)

In [None]:
X_train_def, X_test_def, y_train_def, y_test_def= train_test_split (Default_df_dummies.iloc[:,1:], Default_df_dummies['default'], test_size=0.2, random_state=1)

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
pipe_default_CVSearch = make_pipeline(StandardScaler(), SVC())

In [None]:
SVC().get_params().keys()

__The hyperparameter C__

Fragment from the ISL book:

"As C increases, the method becomes more tolerant of violations to the margin (the margin is larger) and there are more misclassifications of the training data."

"C=0 amounts to not allowing any misclassification of the training data, which is only possible when the classes are separable"

Conversely, as C decreases, the method becomes less tolerant of violations to the margin (the margin narrows) and there are fewer misclassifications of the training data (but POTENTIALLY MORE OVERFITTING and more misclassifications on the test data !!!)

__The hyperparameter C in scikit-learn__: In scikit-learn, what they call C is inversely related to the real C (the C from the book); in this notebook we treat it as 1/(the real C) (confusing, I know !!!)

When you read the interpretation of C in the scikit learn documentation, the relationships are stated differently. For example, they would say:

"_As C increases, there are LESS misclassifications of the training data_" (because in scikit learn, C is actually 1/ the real C)

"The C parameter trades off correct classification of training examples against maximization of the decision function’s margin. For larger values of C, a smaller margin will be accepted if the decision function is better at classifying all training points correctly. A lower C will encourage a larger margin, therefore a simpler decision function, at the cost of training accuracy. In other words C behaves as a regularization parameter in the SVM."

From the scikit-learn documentation:

"C is 1 by default and it’s a reasonable default choice. If you have a lot of noisy observations you should decrease it: decreasing C corresponds to more regularization."

__Note__: When fitting a polynomial kernel, we could also tune another hyperparameter called gamma. We won't tune it for two reasons:

a) It is not fundamental to do so with a polynomial kernel: you could try different values of gamma and see what happens, but it is not fundamental that you do so. We can always use the default value of gamma.

b) Tuning both C and gamma is VERY computationally intensive... So there is no need to tune them both if it is not fundamental.

Tuning gamma when a radial basis kernel is used is more fundamental (more advisable to do) than doing it with a polynomial kernel.

In [None]:
# "Real" C, i.e. C as used in the book.
# Only a few values are tried because the grid search is computationally expensive.

real_c = np.array([0.01, 0.5, 1, 5, 10, 100])

In [None]:
real_c

In [None]:
c_hyperparameter= 1/real_c

In [None]:
c_hyperparameter

In [None]:
hyperparam_grid_poly = {
    'svc__C': c_hyperparameter,
    'svc__class_weight': ['balanced'],
    'svc__degree':[1,2,3,4], # degree 1 correspond to the linear kernel
    'svc__kernel':['poly']
}

In [None]:
grid_search_default= GridSearchCV(estimator= pipe_default_CVSearch, param_grid=hyperparam_grid_poly, cv=5, scoring='accuracy')

In [None]:
grid_search_default.fit(X_train_def, y_train_def)

In [None]:
grid_search_default.best_params_

In [None]:
pipe_default_poly_svc = make_pipeline(StandardScaler(), SVC(kernel='poly', degree=3, C=0.01, class_weight='balanced'))

In [None]:
pipe_default_poly_svc.fit (X_train_def, y_train_def)

In [None]:
y_predicted_test_default1= pipe_default_poly_svc.predict(X_test_def)

In [None]:
confusion_matrix (y_test_def, y_predicted_test_default1)

In [None]:
print (classification_report (y_test_def, y_predicted_test_default1))

__Choosing the hyperparameters based on f1_score__

In [None]:
from sklearn.metrics import f1_score

from sklearn.metrics import make_scorer

In [None]:
f1_scorer = make_scorer(f1_score, pos_label='Yes')

In [None]:
grid_search_default_f1score= GridSearchCV(estimator= pipe_default_CVSearch, param_grid=hyperparam_grid_default, cv=5, scoring=f1_scorer)

In [None]:
grid_search_default_f1score.fit(X_train_def, y_train_def)

In [None]:
grid_search_default_f1score.best_params_

Same hyperparameter values we got when we based the search on accuracy. Therefore, no need to continue.

### A Radial Basis Function (RBF) kernel

__Note from the scikit-learn documentation:__

" When training an SVM with the Radial Basis Function (RBF) kernel, two parameters must be considered: __C and gamma__." 

"The parameter C, common to all SVM kernels, trades off misclassification of training examples against performance on test data."

"Gamma defines how much influence a single training example has. The larger gamma is, the closer other examples must be to be affected."

"Intuitively, the gamma parameter defines how far the influence of a single training example reaches, with low values meaning ‘far’ and high values meaning ‘close’"

"Proper choice of C and gamma is critical to the SVM’s performance. One is advised to use GridSearchCV to find C and gamma."

What values of gamma to try?

"In practice, a logarithmic grid from 10^-3 to 10^ 3 is enough"

https://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html

In [None]:
# Five log-spaced gamma candidates between 10^-3 and 10^3 (inclusive).
gamma_range = np.logspace(start=-3, stop=3, num=5)
gamma_range

In [None]:
hyperparam_grid_default_rbf = {
    'svc__C': c_hyperparameter,
    'svc__class_weight': ['balanced'],
    'svc__kernel':['rbf'],
    'svc__gamma': gamma_range
}

In [None]:
grid_search_default_rbf= GridSearchCV(estimator= pipe_default_CVSearch, param_grid=hyperparam_grid_default_rbf, cv=5, scoring='accuracy')

In [None]:
grid_search_default_rbf.fit(X_train_def, y_train_def)

In [None]:
grid_search_default_rbf.best_params_

In [None]:
pipe_default_rbf_svc = make_pipeline(StandardScaler(), SVC(kernel='rbf', C= 2, gamma= 1000, class_weight='balanced'))

In [None]:
pipe_default_rbf_svc.fit(X_train_def, y_train_def)

In [None]:
y_predicted_test_default2= pipe_default_rbf_svc.predict(X_test_def)

In [None]:
confusion_matrix (y_test_def, y_predicted_test_default2)

In [None]:
print (classification_report(y_test_def, y_predicted_test_default2))

__Choosing the hyperparameters based on f1_score__ (DO IT AT HOME)

## SVM applied to the Sonar dataset

### A polynomial kernel

In [None]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/sonar.csv"

In [None]:
sonar_df = pd.read_csv(url, header=None)

In [None]:
sonar_df.info()

In [None]:
X_train_sonar, X_test_sonar, y_train_sonar, y_test_sonar= train_test_split (sonar_df.iloc[:,:-1], sonar_df.iloc[:,-1], test_size=0.2, random_state=1)

In [None]:
pipe_sonar_CVSearch = make_pipeline(StandardScaler(), SVC())

In [None]:
real_c_sonar= np.array ([1/100, 1/50, 1/10, 1/2, 1, 5, 10, 20, 50, 100, 200, 500, 1000])

In [None]:
 c_hyperparameter_sonar= 1/ real_c_sonar

In [None]:
hyperparam_grid_poly_sonar = {
    'svc__C': c_hyperparameter_sonar,
    'svc__class_weight': ['balanced'],
    'svc__degree':[1,2,3,4,5], # degree 1 correspond to the linear kernel
    'svc__kernel':['poly']
}

In [None]:
grid_search_sonar= GridSearchCV(estimator= pipe_sonar_CVSearch, param_grid=hyperparam_grid_poly_sonar, cv= 5, scoring='accuracy')

In [None]:
grid_search_sonar.fit(X_train_sonar, y_train_sonar)

In [None]:
grid_search_sonar.best_params_

In [None]:
pipe_sonar_poly_svc = make_pipeline(StandardScaler(), SVC(kernel='poly', degree=3, C= 1, class_weight='balanced'))

In [None]:
pipe_sonar_poly_svc.fit(X_train_sonar, y_train_sonar)

In [None]:
y_predicted_test_sonar1= pipe_sonar_poly_svc.predict(X_test_sonar)

In [None]:
confusion_matrix (y_test_sonar, y_predicted_test_sonar1)

In [None]:
print (classification_report (y_test_sonar, y_predicted_test_sonar1))

### A Radial Basis Function (RBF) kernel

In [None]:
# Twenty log-spaced gamma candidates between 10^-3 and 10^3 (inclusive).
gamma_range_sonar = np.logspace(start=-3, stop=3, num=20)
gamma_range_sonar

In [None]:
# Search space for the RBF-kernel SVC on the Sonar data: both C and gamma are tuned.
# Gamma uses the 20-point Sonar grid (`gamma_range_sonar`), not the 5-point
# `gamma_range` built for the Default dataset.
# NOTE(review): hyperparameters hard-coded in later cells may come from a run that
# used the coarser grid; compare them with best_params_ after re-running the search.
hyperparam_grid_sonar_rbf = {
    'svc__C': c_hyperparameter_sonar,
    'svc__class_weight': ['balanced'],
    'svc__kernel': ['rbf'],
    'svc__gamma': gamma_range_sonar,
}

In [None]:
grid_search_sonar_rbf= GridSearchCV(estimator= pipe_sonar_CVSearch, param_grid= hyperparam_grid_sonar_rbf, cv=5, scoring='accuracy')

In [None]:
grid_search_sonar_rbf.fit(X_train_sonar, y_train_sonar)

In [None]:
grid_search_sonar_rbf.best_params_

In [None]:
pipe_sonar_rbf_svc = make_pipeline(StandardScaler(), SVC(kernel='rbf', C= 100, gamma= 0.03162277660168379, class_weight='balanced'))

In [None]:
pipe_sonar_rbf_svc.fit(X_train_sonar, y_train_sonar)

In [None]:
y_predicted_test_sonar2= pipe_sonar_rbf_svc.predict(X_test_sonar)

In [None]:
print (classification_report (y_test_sonar, y_predicted_test_sonar2))

## SVM applied to the Tips dataset

__WORK ON THIS INDIVIDUALLY NOW IN CLASS !!!__

EVERYONE MUST WORK ON THIS INDIVIDUALLY !!! 

CODE WILL BE DISCUSSED AND GUIDANCE PROVIDED, BUT CODE WILL NOT BE SHARED !!!

__First)__ Use polynomial kernel and select based on f1-score

__Second)__ Use radial basis functions kernel and select based on f1-score

In [None]:
Tips_df= pd.read_csv('C:\\Users\\jheredi2\\Documents\\PythonDataAnalytics\\1-Datasets\\tips.csv')

In [None]:
Tips_df_dummies= pd.get_dummies(Tips_df,columns=['sex','smoker','day','time'], drop_first=True)

In [None]:
Tips_df_dummies['great_tip']=((Tips_df_dummies['tip']/Tips_df_dummies['total_bill'])>=0.20)*1

In [None]:
Tips_df_dummies.info()

In [None]:
X_train_tip, X_test_tip, y_train_tip, y_test_tip= train_test_split (Tips_df_dummies.iloc[:,np.r_[0, 2:9]], Tips_df_dummies.iloc[:,-1], test_size=0.2, random_state=1)

POLYNOMIAL

RADIAL BASIS FUNCTION