# Discriminant Analysis

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, r2_score, f1_score, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, log_loss
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
# Mount Google Drive (Colab only); the dataset paths used below all live under /content/gdrive.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Bankruptcy data: 'D' is the target; 'NO' and 'YR' are dropped from the features with it.
bank_path = '/content/gdrive/MyDrive/Datasets/Bankruptcy/Bankruptcy.csv'
Bankruptcy = pd.read_csv(bank_path)
y = Bankruptcy['D']
X = Bankruptcy.drop(columns=['NO', 'YR', 'D'])

# 70/30 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=2022
)

# Hold-out evaluation of LDA: accuracy on hard predictions, ROC AUC on the
# probability of the second class (column 1 of predict_proba).
ld = LinearDiscriminantAnalysis().fit(X_train, y_train)
y_pred = ld.predict(X_test)
y_pred_prob = ld.predict_proba(X_test)[:, 1]
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

# 5-fold stratified cross-validation on the full data, scored by ROC AUC.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
result = cross_val_score(ld, X, y, scoring='roc_auc', cv=kfold)
print(result.mean())

0.725
0.685
0.7969568892645815


In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Hold-out evaluation of QDA; X_train / X_test / y_train / y_test are whatever
# split is currently bound in the notebook (this cell does not create one).
qda = QuadraticDiscriminantAnalysis().fit(X_train, y_train)
y_pred = qda.predict(X_test)
y_pred_prob = qda.predict_proba(X_test)[:, 1]
print(accuracy_score(y_test, y_pred))
print(roc_auc_score(y_test, y_pred_prob))

# 5-fold stratified cross-validation on the full X / y, scored by ROC AUC.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
result = cross_val_score(qda, X, y, scoring='roc_auc', cv=kfold)
print(result.mean())

0.8
0.8062499999999999
0.821301775147929


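We observe that QDA outperforms LDA on the Bankruptcy data: on the hold-out set (accuracy 0.80 vs 0.725, ROC AUC 0.806 vs 0.685) and in 5-fold cross-validation (mean ROC AUC 0.821 vs 0.797).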


In [None]:
# Multi-class example: LDA vs. QDA on the Vehicle Silhouettes data,
# compared by accuracy and log loss (hold-out) and by negative log loss (5-fold CV).
from sklearn.preprocessing import LabelEncoder
vehi_path = '/content/gdrive/MyDrive/Datasets/Vehicle Silhouettes/Vehicle.csv'
veh_sil = pd.read_csv(vehi_path)

X = veh_sil.drop('Class', axis=1)
y = veh_sil['Class']

# Encode the class labels once and use the encoded target everywhere below, so the
# hold-out metrics and the cross-validated metrics are computed on the same labels.
le = LabelEncoder()
le_y = le.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(X, le_y, stratify=le_y,
                                                    random_state=2022, train_size=0.7)

# A single splitter definition is shared by both models; the fixed random_state
# means every cross_val_score call below sees the same folds.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# LDA
print("LDA Results: ")
ld = LinearDiscriminantAnalysis()
ld.fit(X_train, y_train)
y_pred = ld.predict(X_test)
y_pred_prob = ld.predict_proba(X_test)
print(accuracy_score(y_test, y_pred))
print(log_loss(y_test, y_pred_prob))

# k-fold for LDA (cross_val_score clones the estimator, so the fit above is not reused)
result = cross_val_score(ld,
                         X, le_y,
                         cv=kfold,
                         scoring='neg_log_loss')
print(result.mean())


# QDA
print("\nQDA Results:")
qda = QuadraticDiscriminantAnalysis()
qda.fit(X_train, y_train)
y_pred = qda.predict(X_test)
y_pred_prob = qda.predict_proba(X_test)
print(accuracy_score(y_test, y_pred))
print(log_loss(y_test, y_pred_prob))

# k-fold for QDA
result = cross_val_score(qda,
                         X, le_y,
                         cv=kfold,
                         scoring='neg_log_loss')
print(result.mean())

LDA Results: 
0.7834645669291339
0.4855472216763664
-0.485547869894297

QDA Results:
0.8464566929133859
0.38583457216143385
-0.4107545172345975


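Again, the mean cross-validated negative log loss is higher (closer to zero) for QDA than for LDA: -0.41 > -0.49.
So QDA is the better model on this dataset as well.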

# SVM

1. Linear kernel (`kernel='linear'`)
2. Polynomial kernel (`kernel='poly'`)
3. Radial basis function kernel (`kernel='rbf'`), tuning `gamma` over `np.linspace(0.001, 10, 20)`

In [27]:
from sklearn.svm import SVC

# Reload the bankruptcy data: X / y were last bound to the vehicle data above.
bank_path = '/content/gdrive/MyDrive/Datasets/Bankruptcy/Bankruptcy.csv'
Bankruptcy = pd.read_csv(bank_path)
y = Bankruptcy['D']
X = Bankruptcy.drop(columns=['NO', 'YR', 'D'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=2022
)

# Linear-kernel SVM; probability=True enables the predict_proba call used for ROC AUC.
svm = SVC(kernel='linear', probability=True, random_state=2022)
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
print(accuracy_score(y_test, y_pred))

y_pred_prob = svm.predict_proba(X_test)[:, 1]
print(roc_auc_score(y_test, y_pred_prob))


0.825
0.7875


In [29]:
# Grid search over C for the linear-kernel SVM (unscaled features).

from sklearn.model_selection import GridSearchCV

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
params = {'C': np.linspace(0.001, 10, 20)}
svm = SVC(kernel='linear', probability=True, random_state=2022)

# 20 candidate values of C x 5 stratified folds, ranked by mean ROC AUC.
gcv = GridSearchCV(
    estimator=svm,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=2,
)
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END ............................................C=0.001; total time=   0.0s
[CV] END ............................................C=0.001; total time=   0.0s
[CV] END ............................................C=0.001; total time=   0.0s
[CV] END ............................................C=0.001; total time=   0.0s
[CV] END ............................................C=0.001; total time=   0.0s
[CV] END ...............................C=0.5272631578947369; total time=   0.1s
[CV] END ...............................C=0.5272631578947369; total time=   0.1s
[CV] END ...............................C=0.5272631578947369; total time=   0.1s
[CV] END ...............................C=0.5272631578947369; total time=   0.0s
[CV] END ...............................C=0.5272631578947369; total time=   0.0s
[CV] END ...............................C=1.0535263157894736; total time=   0.1s
[CV] END ...............................C=1.053

In [30]:
# Grid search for the polynomial-kernel SVM (unscaled features).

from sklearn.model_selection import GridSearchCV

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# 20 values of C x 3 degrees x 5 values of coef0 = 300 candidates.
params = {
    'C': np.linspace(0.001, 10, 20),
    'degree': [2, 3, 4],
    'coef0': np.linspace(-2, 4, 5),
}
svm = SVC(kernel='poly', probability=True, random_state=2022)

gcv = GridSearchCV(
    estimator=svm,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=2,
)
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV] END ......................C=0.001, coef0=-2.0, degree=2; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=2; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=2; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=2; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=2; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=3; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=3; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=3; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=3; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=3; total time=   0.0s
[CV] END ......................C=0.001, coef0=-2.0, degree=4; total time=   0.0s
[CV] END ......................C=0.001, coef0

In [32]:
# Grid search for the RBF-kernel SVM (unscaled features).

from sklearn.model_selection import GridSearchCV

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# 20 values of C x 20 values of gamma = 400 candidates.
params = {
    'C': np.linspace(0.001, 10, 20),
    'gamma': np.linspace(0.001, 10, 20),
}
svm = SVC(kernel='rbf', probability=True, random_state=2022)

gcv = GridSearchCV(
    estimator=svm,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=2,
)
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
[CV] END ...............................C=0.001, gamma=0.001; total time=   0.0s
[CV] END ...............................C=0.001, gamma=0.001; total time=   0.0s
[CV] END ...............................C=0.001, gamma=0.001; total time=   0.0s
[CV] END ...............................C=0.001, gamma=0.001; total time=   0.0s
[CV] END ...............................C=0.001, gamma=0.001; total time=   0.0s
[CV] END ..................C=0.001, gamma=0.5272631578947369; total time=   0.0s
[CV] END ..................C=0.001, gamma=0.5272631578947369; total time=   0.0s
[CV] END ..................C=0.001, gamma=0.5272631578947369; total time=   0.0s
[CV] END ..................C=0.001, gamma=0.5272631578947369; total time=   0.0s
[CV] END ..................C=0.001, gamma=0.5272631578947369; total time=   0.0s
[CV] END ..................C=0.001, gamma=1.0535263157894736; total time=   0.0s
[CV] END ..................C=0.001, gamma=1.0

In [35]:
# Linear-kernel SVM on standardized features.
# Pipeline, GridSearchCV and StandardScaler are already imported in the notebook's first cell.
# The scaler lives inside the pipeline, so GridSearchCV re-fits it on each training fold
# together with the SVM.
scaler = StandardScaler()
svm = SVC(kernel='linear', probability=True, random_state=2022)
pipe = Pipeline([('STD', scaler), ('SVM', svm)])
# Printed to show the '<step>__<param>' names that the grid below must use.
print(pipe.get_params())

# The search runs on `pipe`, which already holds `svm`; no second SVC is needed here.
params = {'SVM__C': np.linspace(0.001, 10, 20)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, verbose=2, scoring='roc_auc')
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)
print()




{'memory': None, 'steps': [('STD', StandardScaler()), ('SVM', SVC(kernel='linear', probability=True, random_state=2022))], 'verbose': False, 'STD': StandardScaler(), 'SVM': SVC(kernel='linear', probability=True, random_state=2022), 'STD__copy': True, 'STD__with_mean': True, 'STD__with_std': True, 'SVM__C': 1.0, 'SVM__break_ties': False, 'SVM__cache_size': 200, 'SVM__class_weight': None, 'SVM__coef0': 0.0, 'SVM__decision_function_shape': 'ovr', 'SVM__degree': 3, 'SVM__gamma': 'scale', 'SVM__kernel': 'linear', 'SVM__max_iter': -1, 'SVM__probability': True, 'SVM__random_state': 2022, 'SVM__shrinking': True, 'SVM__tol': 0.001, 'SVM__verbose': False}
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END .......................................SVM__C=0.001; total time=   0.0s
[CV] END .......................................SVM__C=0.001; total time=   0.0s
[CV] END .......................................SVM__C=0.001; total time=   0.0s
[CV] END ................................

In [39]:
# Polynomial-kernel SVM on standardized features.
scaler = StandardScaler()
svm = SVC(kernel='poly', probability=True, random_state=2022)
pipe = Pipeline([('STD', scaler), ('SVM', svm)])
# Printed to show the '<step>__<param>' names that the grid below must use.
print(pipe.get_params())

# 20 values of C x 3 degrees x 5 values of coef0 = 300 candidates.
# The search runs on `pipe`, which already holds `svm`; no second SVC is needed here.
params = {'SVM__C': np.linspace(0.001, 10, 20),
          'SVM__degree': [2, 3, 4],
          'SVM__coef0': np.linspace(-2, 4, 5)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, verbose=2, scoring='roc_auc')
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)
print()




{'memory': None, 'steps': [('STD', StandardScaler()), ('SVM', SVC(kernel='poly', probability=True, random_state=2022))], 'verbose': False, 'STD': StandardScaler(), 'SVM': SVC(kernel='poly', probability=True, random_state=2022), 'STD__copy': True, 'STD__with_mean': True, 'STD__with_std': True, 'SVM__C': 1.0, 'SVM__break_ties': False, 'SVM__cache_size': 200, 'SVM__class_weight': None, 'SVM__coef0': 0.0, 'SVM__decision_function_shape': 'ovr', 'SVM__degree': 3, 'SVM__gamma': 'scale', 'SVM__kernel': 'poly', 'SVM__max_iter': -1, 'SVM__probability': True, 'SVM__random_state': 2022, 'SVM__shrinking': True, 'SVM__tol': 0.001, 'SVM__verbose': False}
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV] END .......SVM__C=0.001, SVM__coef0=-2.0, SVM__degree=2; total time=   0.0s
[CV] END .......SVM__C=0.001, SVM__coef0=-2.0, SVM__degree=2; total time=   0.0s
[CV] END .......SVM__C=0.001, SVM__coef0=-2.0, SVM__degree=2; total time=   0.0s
[CV] END .......SVM__C=0.001, SVM__coef0=-2.0

In [43]:
# RBF-kernel SVM on standardized features.
scaler = StandardScaler()
svm = SVC(kernel='rbf', probability=True, random_state=2022)
pipe = Pipeline(steps=[('STD', scaler), ('SVM', svm)])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# 20 values of C x 20 values of gamma = 400 candidates, ranked by mean ROC AUC.
params = {
    'SVM__C': np.linspace(0.001, 10, 20),
    'SVM__gamma': np.linspace(0.001, 10, 20),
}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=2,
)
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=1.0535263157894736; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=1.0

## Kyphosis dataset

The dataset has 4 columns:
Kyphosis (present/absent), Age, Number, Start

In [46]:
# Read the Kyphosis data and dummy-encode it; drop_first=True drops the first level of
# each encoded column. The resulting column names are displayed as the cell output.
kyp_path = '/content/gdrive/MyDrive/Datasets/Kyphosis/Kyphosis.csv'
Kyphosis = pd.read_csv(kyp_path)
dum_kyp = pd.get_dummies(data=Kyphosis, drop_first=True)
dum_kyp.columns


Index(['Age', 'Number', 'Start', 'Kyphosis_present'], dtype='object')

In [48]:
# Kyphosis: 'Kyphosis_present' is the target, the remaining columns are the features.
X = dum_kyp.drop(['Kyphosis_present'], axis=1)
y = dum_kyp['Kyphosis_present']

# Linear-kernel SVM on standardized features.
scaler = StandardScaler()
svm = SVC(kernel='linear', probability=True, random_state=2022)
pipe = Pipeline([('STD', scaler), ('SVM', svm)])
# Printed to show the '<step>__<param>' names that the grid below must use.
print(pipe.get_params())

# The search runs on `pipe`, which already holds `svm`; no second SVC is needed here.
params = {'SVM__C': np.linspace(0.001, 10, 20)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, verbose=2, scoring='roc_auc')
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)
print()

{'memory': None, 'steps': [('STD', StandardScaler()), ('SVM', SVC(kernel='linear', probability=True, random_state=2022))], 'verbose': False, 'STD': StandardScaler(), 'SVM': SVC(kernel='linear', probability=True, random_state=2022), 'STD__copy': True, 'STD__with_mean': True, 'STD__with_std': True, 'SVM__C': 1.0, 'SVM__break_ties': False, 'SVM__cache_size': 200, 'SVM__class_weight': None, 'SVM__coef0': 0.0, 'SVM__decision_function_shape': 'ovr', 'SVM__degree': 3, 'SVM__gamma': 'scale', 'SVM__kernel': 'linear', 'SVM__max_iter': -1, 'SVM__probability': True, 'SVM__random_state': 2022, 'SVM__shrinking': True, 'SVM__tol': 0.001, 'SVM__verbose': False}
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] END .......................................SVM__C=0.001; total time=   0.0s
[CV] END .......................................SVM__C=0.001; total time=   0.0s
[CV] END .......................................SVM__C=0.001; total time=   0.0s
[CV] END ................................

In [49]:
# Polynomial-kernel SVM on standardized features (uses the X / y currently bound in the notebook).
scaler = StandardScaler()
svm = SVC(kernel='poly', probability=True, random_state=2022)
pipe = Pipeline([('STD', scaler), ('SVM', svm)])
# Printed to show the '<step>__<param>' names that the grid below must use.
print(pipe.get_params())

# 20 values of C x 3 degrees x 5 values of coef0 = 300 candidates.
# The search runs on `pipe`, which already holds `svm`; no second SVC is needed here.
params = {'SVM__C': np.linspace(0.001, 10, 20),
          'SVM__degree': [2, 3, 4],
          'SVM__coef0': np.linspace(-2, 4, 5)}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, verbose=2, scoring='roc_auc')
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)
print()




{'memory': None, 'steps': [('STD', StandardScaler()), ('SVM', SVC(kernel='poly', probability=True, random_state=2022))], 'verbose': False, 'STD': StandardScaler(), 'SVM': SVC(kernel='poly', probability=True, random_state=2022), 'STD__copy': True, 'STD__with_mean': True, 'STD__with_std': True, 'SVM__C': 1.0, 'SVM__break_ties': False, 'SVM__cache_size': 200, 'SVM__class_weight': None, 'SVM__coef0': 0.0, 'SVM__decision_function_shape': 'ovr', 'SVM__degree': 3, 'SVM__gamma': 'scale', 'SVM__kernel': 'poly', 'SVM__max_iter': -1, 'SVM__probability': True, 'SVM__random_state': 2022, 'SVM__shrinking': True, 'SVM__tol': 0.001, 'SVM__verbose': False}
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV] END .......SVM__C=0.001, SVM__coef0=-2.0, SVM__degree=2; total time=   0.0s
[CV] END .......SVM__C=0.001, SVM__coef0=-2.0, SVM__degree=2; total time=   0.0s
[CV] END .......SVM__C=0.001, SVM__coef0=-2.0, SVM__degree=2; total time=   0.0s
[CV] END .......SVM__C=0.001, SVM__coef0=-2.0

In [50]:
# RBF-kernel SVM on standardized features (uses the X / y currently bound in the notebook).
scaler = StandardScaler()
svm = SVC(kernel='rbf', probability=True, random_state=2022)
pipe = Pipeline(steps=[('STD', scaler), ('SVM', svm)])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# 20 values of C x 20 values of gamma = 400 candidates, ranked by mean ROC AUC.
params = {
    'SVM__C': np.linspace(0.001, 10, 20),
    'SVM__gamma': np.linspace(0.001, 10, 20),
}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='roc_auc',
    cv=kfold,
    verbose=2,
)
gcv.fit(X, y)
print(gcv.best_params_)
print(gcv.best_score_)

Fitting 5 folds for each of 400 candidates, totalling 2000 fits
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END .....................SVM__C=0.001, SVM__gamma=0.001; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=1.0535263157894736; total time=   0.0s
[CV] END ........SVM__C=0.001, SVM__gamma=1.0

# SVM for more than 2 classes
1. One-versus-One classification (`ovo`)
2. One-versus-All classification, also called One-versus-Rest (`ovr` in scikit-learn)

In [54]:
from sklearn.preprocessing import LabelEncoder
img_path = '/content/gdrive/MyDrive/Datasets/Image Segmentation/Image_Segmention.csv'
img_seg = pd.read_csv(img_path)

# 'Class' is the multi-class target; it is label-encoded before fitting.
X = img_seg.drop('Class', axis=1)
y = img_seg['Class']

le = LabelEncoder()
le_y = le.fit_transform(y)

# Linear-kernel SVM on standardized features.
scaler = StandardScaler()
svm = SVC(kernel='linear', probability=True, random_state=2022)
pipe = Pipeline([('STD', scaler), ('SVM', svm)])
# Printed to show the '<step>__<param>' names that the grid below must use.
print(pipe.get_params())

# NOTE(review): scikit-learn documents that SVC always trains one-vs-one internally and that
# decision_function_shape only changes the shape of decision_function's output. This search is
# scored with neg_log_loss (i.e. on predict_proba), so the 'ovo' and 'ovr' candidates are
# expected to score identically -- confirm before reading anything into the selected value.
params = {'SVM__C': np.linspace(0.001, 10, 20), 'SVM__decision_function_shape': ['ovo', 'ovr']}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
# The search runs on `pipe`, which already holds `svm`; no second SVC is needed here.
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, verbose=2, scoring='neg_log_loss')
gcv.fit(X, le_y)
print(gcv.best_params_)
print(gcv.best_score_)
print()

{'memory': None, 'steps': [('STD', StandardScaler()), ('SVM', SVC(kernel='linear', probability=True, random_state=2022))], 'verbose': False, 'STD': StandardScaler(), 'SVM': SVC(kernel='linear', probability=True, random_state=2022), 'STD__copy': True, 'STD__with_mean': True, 'STD__with_std': True, 'SVM__C': 1.0, 'SVM__break_ties': False, 'SVM__cache_size': 200, 'SVM__class_weight': None, 'SVM__coef0': 0.0, 'SVM__decision_function_shape': 'ovr', 'SVM__degree': 3, 'SVM__gamma': 'scale', 'SVM__kernel': 'linear', 'SVM__max_iter': -1, 'SVM__probability': True, 'SVM__random_state': 2022, 'SVM__shrinking': True, 'SVM__tol': 0.001, 'SVM__verbose': False}
Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV] END .....SVM__C=0.001, SVM__decision_function_shape=ovo; total time=   0.0s
[CV] END .....SVM__C=0.001, SVM__decision_function_shape=ovo; total time=   0.0s
[CV] END .....SVM__C=0.001, SVM__decision_function_shape=ovo; total time=   0.0s
[CV] END .....SVM__C=0.001, SVM__decision

In [56]:
# Polynomial-kernel SVM on standardized features, multi-class target (le_y).
scaler = StandardScaler()
svm = SVC(kernel='poly', probability=True, random_state=2022)
pipe = Pipeline([('STD', scaler), ('SVM', svm)])
# Printed to show the '<step>__<param>' names that the grid below must use.
print(pipe.get_params())

# 20 values of C x 3 degrees x 5 values of coef0 x 2 decision-function shapes = 600 candidates.
# NOTE(review): scikit-learn documents that SVC always trains one-vs-one internally and that
# decision_function_shape only changes the shape of decision_function's output. This search is
# scored with neg_log_loss (i.e. on predict_proba), so the 'ovo' and 'ovr' candidates are
# expected to score identically -- confirm before reading anything into the selected value.
params = {'SVM__C': np.linspace(0.001, 10, 20),
          'SVM__degree': [2, 3, 4],
          'SVM__coef0': np.linspace(-2, 4, 5),
          'SVM__decision_function_shape': ['ovo', 'ovr']}
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)
# The search runs on `pipe`, which already holds `svm`; no second SVC is needed here.
gcv = GridSearchCV(pipe, param_grid=params, cv=kfold, verbose=2, scoring='neg_log_loss')
gcv.fit(X, le_y)
print(gcv.best_params_)
print(gcv.best_score_)
print()

{'memory': None, 'steps': [('STD', StandardScaler()), ('SVM', SVC(kernel='poly', probability=True, random_state=2022))], 'verbose': False, 'STD': StandardScaler(), 'SVM': SVC(kernel='poly', probability=True, random_state=2022), 'STD__copy': True, 'STD__with_mean': True, 'STD__with_std': True, 'SVM__C': 1.0, 'SVM__break_ties': False, 'SVM__cache_size': 200, 'SVM__class_weight': None, 'SVM__coef0': 0.0, 'SVM__decision_function_shape': 'ovr', 'SVM__degree': 3, 'SVM__gamma': 'scale', 'SVM__kernel': 'poly', 'SVM__max_iter': -1, 'SVM__probability': True, 'SVM__random_state': 2022, 'SVM__shrinking': True, 'SVM__tol': 0.001, 'SVM__verbose': False}
Fitting 5 folds for each of 600 candidates, totalling 3000 fits
[CV] END SVM__C=0.001, SVM__coef0=-2.0, SVM__decision_function_shape=ovo, SVM__degree=2; total time=   0.1s
[CV] END SVM__C=0.001, SVM__coef0=-2.0, SVM__decision_function_shape=ovo, SVM__degree=2; total time=   0.1s
[CV] END SVM__C=0.001, SVM__coef0=-2.0, SVM__decision_function_shape=ovo

In [60]:
# RBF-kernel SVM on standardized features, multi-class target (le_y).
scaler = StandardScaler()
svm = SVC(kernel='rbf', probability=True, random_state=2022)
pipe = Pipeline(steps=[('STD', scaler), ('SVM', svm)])

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# 20 values of C x 20 values of gamma x 2 decision-function shapes = 800 candidates.
# NOTE(review): scikit-learn documents decision_function_shape as affecting only the shape of
# decision_function's output; with neg_log_loss scoring the 'ovo' / 'ovr' candidates are
# expected to tie -- confirm before interpreting the selected value.
params = {
    'SVM__C': np.linspace(0.001, 10, 20),
    'SVM__gamma': np.linspace(0.001, 10, 20),
    'SVM__decision_function_shape': ['ovo', 'ovr'],
}

gcv = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_log_loss',
    cv=kfold,
    verbose=2,
)
gcv.fit(X, le_y)
print(gcv.best_params_)
print(gcv.best_score_)

Fitting 5 folds for each of 800 candidates, totalling 4000 fits
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.001; total time=   0.0s
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.001; total time=   0.1s
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.001; total time=   0.1s
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.001; total time=   0.0s
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.001; total time=   0.0s
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.5272631578947369; total time=   0.0s
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.5272631578947369; total time=   0.1s
[CV] END SVM__C=0.001, SVM__decision_function_shape=ovo, SVM__gamma=0.5272631578947369; total time=   0.1s
[CV] END SVM__C=0.001, SVM__decision_f

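Comparing the three kernels by mean cross-validated negative log loss (higher, i.e. closer to zero, is better), the polynomial kernel performs best, followed by linear and then radial: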


> Radial: {'SVM__C': 10.0, 'SVM__decision_function_shape': 'ovo', 'SVM__gamma': 0.001} -0.5034442026751786

> Polynomial: {'SVM__C': 4.211105263157895, 'SVM__coef0': 1.0, 'SVM__decision_function_shape': 'ovo', 'SVM__degree': 3}-0.39393170389827276

> Linear: {'SVM__C': 1.0535263157894736, 'SVM__decision_function_shape': 'ovo'}
-0.40457971833739437 




# Satellite Imaging
1. kfold on LDA and QDA
2. give scores (log_loss)
3. GaussianNB

In [62]:
from sklearn.preprocessing import LabelEncoder

# The satellite file is semicolon-delimited.
sat_path = '/content/gdrive/MyDrive/Datasets/Satellite Imaging/Satellite.csv'
sat = pd.read_csv(sat_path, delimiter=';')

# Column overview; stratified k-fold will be used for cross-validation on this data.
sat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6435 entries, 0 to 6434
Data columns (total 37 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   x.1      6435 non-null   int64 
 1   x.2      6435 non-null   int64 
 2   x.3      6435 non-null   int64 
 3   x.4      6435 non-null   int64 
 4   x.5      6435 non-null   int64 
 5   x.6      6435 non-null   int64 
 6   x.7      6435 non-null   int64 
 7   x.8      6435 non-null   int64 
 8   x.9      6435 non-null   int64 
 9   x.10     6435 non-null   int64 
 10  x.11     6435 non-null   int64 
 11  x.12     6435 non-null   int64 
 12  x.13     6435 non-null   int64 
 13  x.14     6435 non-null   int64 
 14  x.15     6435 non-null   int64 
 15  x.16     6435 non-null   int64 
 16  x.17     6435 non-null   int64 
 17  x.18     6435 non-null   int64 
 18  x.19     6435 non-null   int64 
 19  x.20     6435 non-null   int64 
 20  x.21     6435 non-null   int64 
 21  x.22     6435 non-null   int64 
 22  

In [67]:
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# 'classes' is the target column; everything else is a feature.
y = sat['classes']
X = sat.drop(columns='classes')
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, stratify=y, random_state=2022
)

# The cross-validation below runs on the label-encoded target.
le = LabelEncoder()
le_y = le.fit_transform(y)

ld = LinearDiscriminantAnalysis()
qda = QuadraticDiscriminantAnalysis()
gnb = GaussianNB()

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2022)

# 5-fold stratified CV for LDA, QDA and Gaussian NB in that order; each line printed
# is the mean negative log loss of one model.
for clf in (ld, qda, gnb):
    result = cross_val_score(clf, X, le_y, scoring='neg_log_loss', cv=kfold)
    print(result.mean())

# results (LDA, QDA, GNB)
# -0.5847691322125739
# -0.8836889478519823
# -3.944449917120699

-0.5847691322125739
-0.8836889478519823
-3.944449917120699
