# SVM

In [1]:
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_theme(style = "darkgrid", palette = "colorblind")

from sklearn import tree, ensemble, linear_model, svm, neighbors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import f1_score, classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.svm import SVC

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler,  ClusterCentroids
from imblearn.metrics import classification_report_imbalanced, geometric_mean_score

from joblib import dump, load # to save and load models.

from IPython.display import display_html # to display two DataFrames side by side.

n_coeurs = 4 # Number of CPU cores handed to the calls that accept a parallelism setting (in this notebook it is read by the SMOTE resampling step).

In [2]:
# Load the preprocessed dataset.
df = pd.read_csv('../../../../data/processed/model_weatherAUS.csv')
# Reassign instead of mutating in place, so the cell does not rely on
# in-place mutation of a shared frame.
df = df.drop(columns = 'Unnamed: 0')
df['Date'] = pd.to_datetime(df['Date'])

###### Initial target proportions:
#RainTomorrow
#0    0.778382
#1    0.221618

# Split features / target ('Date' is not used as a feature):
X = df.drop(columns = ['RainTomorrow', 'Date']).copy()
y = df['RainTomorrow'].copy()

# Split into training and test sets:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 123, stratify = y)
# stratify = y keeps the original class proportions of y in both the train and the test samples,
# but it does NOT rebalance the classes!

# Scale X_train, X_test: the scaler is fitted on the training split only and
# then applied to the test split, so no test statistics leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Oversample with SMOTE.
# SMOTE's own `n_jobs` argument is deprecated since imbalanced-learn 0.10 and
# slated for removal; the supported way to parallelise the neighbour search is
# to pass a nearest-neighbours estimator that already carries `n_jobs`.
# n_neighbors = 6 reproduces the default k_neighbors = 5 (SMOTE adds one
# neighbour to account for the sample itself when given an int).
smote_neighbors = neighbors.NearestNeighbors(n_neighbors = 6, n_jobs = n_coeurs)
smote = SMOTE(random_state = 12, k_neighbors = smote_neighbors)
X_sm, y_sm = smote.fit_resample(X_train, y_train)
# NOTE(review): the model-fitting cell visible in this notebook trains on
# X_train / y_train, not on X_sm / y_sm — confirm whether the resampled set
# is meant to be used.



In [4]:
# Support-vector classifier with scikit-learn's default hyper-parameters,
# fitted on the training split.
clf = svm.SVC()
clf.fit(X = X_train, y = y_train)

In [7]:
# Predict on both splits; comparing the two reports shows how much the model
# over-fits the training data.
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# sep = "\n" puts each label on its own line: printing the label and the report
# on the same line pushes the report's header row out of alignment with its
# columns.
print("Report Train:", classification_report(y_train, y_pred_train, digits = 2), sep = "\n")
print("Report Test:", classification_report(y_test, y_pred_test, digits = 2), sep = "\n")

Report Train:               precision    recall  f1-score   support

           0       0.88      0.97      0.92     82189
           1       0.83      0.53      0.64     23401

    accuracy                           0.87    105590
   macro avg       0.85      0.75      0.78    105590
weighted avg       0.87      0.87      0.86    105590

Report Test:               precision    recall  f1-score   support

           0       0.87      0.96      0.92     27397
           1       0.79      0.51      0.62      7800

    accuracy                           0.86     35197
   macro avg       0.83      0.74      0.77     35197
weighted avg       0.86      0.86      0.85     35197



In [6]:
# Persist the fitted classifier to disk.
model_path = 'saves/model_saves/save_svm.joblib'
# Create the target directory first: dump() fails with FileNotFoundError when
# the folder does not exist yet (e.g. on a fresh checkout).
Path(model_path).parent.mkdir(parents = True, exist_ok = True)
# The original string path is passed through unchanged, so the list of written
# files returned (and displayed) by dump() stays the same.
dump(clf, model_path)

['saves/model_saves/save_svm.joblib']