In [1]:

# https://stackabuse.com/implementing-svm-and-kernel-svm-with-pythons-scikit-learn/
# https://scikit-learn.org/stable/modules/svm.html
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html


In [2]:
%load_ext autoreload

In [13]:
%autoreload 1

import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.svm import SVC

from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedShuffleSplit

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler

from src.displayResults import DisplayResults

from IPython.display import display


In [14]:
"""""""""""""""""""""""
" Constantes
"""""""""""""""""""""""

# Le fichier CSV contenant les données galaxies
csv_vectors = 'data/processed/galaxy_feature_vectors.csv'

kfold = 5   
ram = 1048   # Taille de la memoire vive
size = 0.2   # Taille des donnees  
nb_exec = 5  # Proportionnel au nb. de coeurs x86 de la machine

# Memoire vive à utiliser
cache = SVC(cache_size = ram)

# Variables et gamma
C = [1e-3, 1e-1, 1, 10]
gamma = C

# Facilite simplement l'affichage du tableau des resultats
variables_c = ['1e-3', '1e-1', '1.0', '10.0']

# DisplayResults possède la code pour afficher les tableaux
dr = DisplayResults()


In [15]:
"""""""""""""""""""""""
" Lecture fichiers CSV
"""""""""""""""""""""""
# Recupere les données galaxies (saute la premiere et derniere colonne --> ID galaxie)
X_galaxy = pd.read_csv(csv_vectors, header=None).values[:,1:-1]

# Recupere la colonne indiquant si c'est une galaxie spiral (1) ou smooth (0)
Y_galaxy = pd.read_csv(csv_vectors, header=None).values[:,-1:].astype(int).flatten()


In [16]:
"""""""""""""""""""""""
" Normalisation
"""""""""""""""""""""""

scaler = StandardScaler()

X_galaxy = scaler.fit_transform(X_galaxy)

In [17]:

def get_train_split_indices(x, test_size=0.2):
    
    x_train, x_test = train_test_split(x, test_size=test_size, shuffle=True)
    
    train_indexes = [x.index(elem) for elem in x_train]
    test_indexes = [x.index(elem) for elem in x_test]

    validator = [(train_indexes, test_indexes)]
    return validator

def svm_grid_search(X, Y, tuned_parameters):

    validator = get_train_split_indices(X.tolist())
    
    tuned_model = GridSearchCV(
        cache, 
        cv=validator, 
        verbose=4, 
        n_jobs=-1, 
        scoring=['accuracy', 'f1'], 
        refit='f1',
        return_train_score=True,
        param_grid=tuned_parameters
    )
    
    tuned_model.fit(X, Y)
    
    return tuned_model
    

"""""""""""""""""""""""
" GridSearch SVM linear
"""""""""""""""""""""""
def linear_grid_search(X, Y):
    """
    Fonction qui effectue un 'Grid Search' linéaire SVM 
    sur le jeu de données d'apprentissage.
    
    args:
        X:  matrice des données galaxies
        Y:  liste des étiquettes (type de galaxie)
    
    returns:
        La matrice SVM lineaire
    """
    
    tuned_parameters = {'kernel': ['linear'], 
                        'C': C,
                        'class_weight': ['balanced'],
                        'gamma': ['scale']}
    
    return svm_grid_search(X, Y, tuned_parameters)



"""""""""""""""""""""""""""
" GridSearch SVM N-linear
"""""""""""""""""""""""""""
def rbf_grid_search(X,Y):
    """
    Fonction qui effectue un 'Grid Search' non-linéaire SVM (rbf)
    sur le jeu de données d'apprentissage.
    
    args:
        X:  matrice des données galaxies
        Y:  liste des étiquettes (type de galaxie)
    
    returns:
        La matrice SVM non-lineaire (rbf)
    """
    
    tuned_parameters = {'kernel': ['rbf'], 
                        'C': C, 
                        'gamma': gamma}
    
    return svm_grid_search(X, Y, tuned_parameters)


In [20]:
"""""""""""""""""""""""""""
" Recupere les matrices SVM
"""""""""""""""""""""""""""

# Recupere les resultats SVM lineaire
linear_matrix = linear_grid_search(X_galaxy, Y_galaxy)

# Recupere les resultats SVM non-lineaire (rbf)
rbf_matrix = rbf_grid_search(X_galaxy, Y_galaxy)


Fitting 1 folds for each of 4 candidates, totalling 4 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed:   28.2s finished


Fitting 1 folds for each of 16 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   6 out of  16 | elapsed:  3.6min remaining:  5.9min
[Parallel(n_jobs=-1)]: Done  11 out of  16 | elapsed:  3.7min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  5.4min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed:  5.4min finished


In [37]:
%aimport src.displayResults

"""""""""""""""""""""""""""""""""""""""
" Affiche les resultats SVM lineaire
"""""""""""""""""""""""""""""""""""""""

print("***************************************************")
print("* SVM LINEAIRE")
print("*\n* Best accuracy: ", linear_matrix.best_score_)
print("* Best params. : ", linear_matrix.best_params_)
print("***************************************************")

train_accuracy = linear_matrix.cv_results_['mean_train_accuracy']
test_accuracy = linear_matrix.cv_results_['mean_test_accuracy']
train_f1 = linear_matrix.cv_results_['mean_train_f1']
test_f1 = linear_matrix.cv_results_['mean_test_f1']
fit_time = linear_matrix.cv_results_['mean_fit_time']

dr.svm_linear_table(
    variables_c, 
    train_accuracy, 
    test_accuracy,
    train_f1,
    test_f1,
    fit_time
)


***************************************************
* SVM LINEAIRE
*
* Best accuracy:  0.9529709940861729
* Best params. :  {'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}
***************************************************


Unnamed: 0,C,Train Accuracy,Test Accuracy,Train F1,Test F1,Fitting Time (sec)
0,0.001,0.93479225,0.93287995,0.93654676,0.93596615,10.879574
1,0.1,0.95061363,0.94914252,0.95194245,0.95149464,7.4746654
2,1.0,0.95246193,0.95062093,0.95377759,0.95294449,10.395484
3,10.0,0.95231406,0.95062093,0.95372031,0.95297099,24.747364


In [44]:
%aimport src.displayResults

"""""""""""""""""""""""""""""""""""""""
" Affiche les resultats SVM N-lineaire
"""""""""""""""""""""""""""""""""""""""

print("***************************************************")
print("* SVM NON-LINEAIRE (RBF)")
print("*\n* Best accuracy: ", rbf_matrix.best_score_)
print("* Best params. : ", rbf_matrix.best_params_)
print("***************************************************")

train_accuracy = rbf_matrix.cv_results_['mean_train_accuracy']
test_accuracy = rbf_matrix.cv_results_['mean_test_accuracy']
train_f1 = rbf_matrix.cv_results_['mean_train_f1']
test_f1 = rbf_matrix.cv_results_['mean_test_f1']
fit_time = rbf_matrix.cv_results_['mean_fit_time']

print("\n *** Sigma Train Accuracy ***")
dr.svm_rbf_table(variables_c , train_accuracy)

print("\n *** Sigma Test Accuracy ***")
dr.svm_rbf_table(variables_c , test_accuracy)

print("\n *** Sigma Train F1 ***")
dr.svm_rbf_table(variables_c , train_f1)

print("\n *** Sigma Test F1 ***")
dr.svm_rbf_table(variables_c , test_f1)

print("\n *** Sigma Fitting Time (sec) ***")
dr.svm_rbf_table(variables_c , fit_time)


***************************************************
* SVM NON-LINEAIRE (RBF)
*
* Best accuracy:  0.956248212753789
* Best params. :  {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
***************************************************

 *** Sigma Train Accuracy ***


Unnamed: 0,C/sigma,1e-3,1e-1,1.0,10.0
0,0.001,0.5193701,0.5193701,0.5193701,0.5193701
1,0.1,0.91431317,0.91083839,0.5193701,0.5193701
2,1.0,0.94928286,0.99208931,1.0,1.0
3,10.0,0.96229484,1.0,1.0,1.0



 *** Sigma Test Accuracy ***


Unnamed: 0,C/sigma,1e-3,1e-1,1.0,10.0
0,0.001,0.51774098,0.51774098,0.51774098,0.51774098
1,0.1,0.91927853,0.89769367,0.51774098,0.51774098
2,1.0,0.94589001,0.94056771,0.60733294,0.51774098
3,10.0,0.9547605,0.94382022,0.62921348,0.51774098



 *** Sigma Train F1 ***


Unnamed: 0,C/sigma,1e-3,1e-1,1.0,10.0
0,0.001,0.68366503,0.68366503,0.68366503,0.68366503
1,0.1,0.9175617,0.91961072,0.68366503,0.68366503
2,1.0,0.9511396,0.99239463,1.0,1.0
3,10.0,0.96368556,1.0,1.0,1.0



 *** Sigma Test F1 ***


Unnamed: 0,C/sigma,1e-3,1e-1,1.0,10.0
0,0.001,0.68225209,0.68225209,0.68225209,0.68225209
1,0.1,0.92143885,0.90797872,0.68225209,0.68225209
2,1.0,0.94751936,0.94405789,0.72493786,0.68225209
3,10.0,0.95624821,0.94692737,0.73611111,0.68225209



 *** Sigma Fitting Time (sec) ***


Unnamed: 0,C/sigma,1e-3,1e-1,1.0,10.0
0,0.001,89.626214,95.767437,70.56345,91.161462
1,0.1,36.567384,39.637987,89.096722,87.614703
2,1.0,19.565106,34.689949,96.673056,85.436692
3,10.0,14.802909,24.690998,52.605923,50.380447


In [22]:

# Exécution du modèle linéaire et avec RBF pour 100 %, 50 % et 25 % des données


linear_params = {
    'kernel': ['linear'], 
    'C': [1],
    'class_weight': ['balanced'],
    'gamma': ['scale']
}

rbf_params = {
    'kernel': ['rbf'], 
    'C': [10], 
    'gamma': [1e-3]
}

nb_x = len(X_galaxy)

linear_model_100p = svm_grid_search(X_galaxy, Y_galaxy, linear_params)
linear_model_50p = svm_grid_search(X_galaxy[:int(nb_x*0.50)], Y_galaxy[:int(nb_x*0.50)], linear_params)
linear_model_25p = svm_grid_search(X_galaxy[:int(nb_x*0.25)], Y_galaxy[:int(nb_x*0.25)], linear_params)

rbf_model_100p = svm_grid_search(X_galaxy, Y_galaxy, rbf_params)
rbf_model_50p = svm_grid_search(X_galaxy[:int(nb_x*0.50)], Y_galaxy[:int(nb_x*0.50)], rbf_params)
rbf_model_25p = svm_grid_search(X_galaxy[:int(nb_x*0.25)], Y_galaxy[:int(nb_x*0.25)], rbf_params)

Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   12.4s finished


Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.1s finished


Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.8s finished


Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   10.4s finished


Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    2.9s finished


Fitting 1 folds for each of 1 candidates, totalling 1 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.8s finished


In [24]:
# Affichage des résultats de l'exécution avec 100 %, 50 % et 25 % des données

def format_results(model, run_name):
    
    return {
        'run name': run_name,
        'train accuracy': model.cv_results_['mean_train_accuracy'][0],
        'test accuracy': model.cv_results_['mean_test_accuracy'][0],
        'train F1': model.cv_results_['mean_train_f1'][0],
        'test F1': model.cv_results_['mean_test_f1'][0],
        'train time': model.cv_results_['mean_fit_time'][0]
    }

results = []

results.append(format_results(linear_model_100p, 'linear 100%'))
results.append(format_results(linear_model_50p, 'linear 50%'))
results.append(format_results(linear_model_25p, 'linear 25%'))
results.append(format_results(rbf_model_100p, 'RBF 100%'))
results.append(format_results(rbf_model_50p, 'RBF 50%'))
results.append(format_results(rbf_model_25p, 'RBF 25%'))

display(pd.DataFrame(results).transpose())

Unnamed: 0,0,1,2,3,4,5
run name,linear 100%,linear 50%,linear 25%,RBF 100%,RBF 50%,RBF 25%
test F1,0.947644,0.947964,0.958378,0.958766,0.956272,0.954955
test accuracy,0.946777,0.945594,0.953901,0.956535,0.955056,0.952719
train F1,0.954396,0.955527,0.949796,0.962466,0.958946,0.955053
train accuracy,0.952684,0.953423,0.949127,0.961112,0.956676,0.953564
train time,7.6606,1.67063,0.425889,4.17133,1.01289,0.287846
