# Méthodes d'Ensembles


In [14]:
from prepdata import *
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC

In [15]:
# Récupérer les données du dataset abalone8
X, y = data_recovery("autompg")

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

print(np.sum(X))
print(np.sum(y))

1331586.5
147


In [16]:
# Initialisation de StratifiedKFold
skf = StratifiedKFold(n_splits=5)

# Liste de différents C (lambda = 1 / C) à tester
C_values = [i for i in range (25,10001,25)]

# Stocker les résultats d'accuracy pour chaque valeur de C
best_C = None
best_accuracy = 0
C_accuracy_scores = {}

# Parcours de chaque valeur de C
for C in C_values:
    print(f"Testing SVM with C = {C}")
    
    accuracy_scores = []  # Stocker les scores d'accuracy pour chaque fold
    
    # Parcours des folds pour la cross-validation
    for train_index, val_index in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
        y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]
        
        # Normalisation sur les k-1 folds (uniquement les données d'entraînement)
        scaler = Normalizer()
        X_train_fold = scaler.fit_transform(X_train_fold)
        
        # Normaliser les données du fold de validation avec les paramètres du train_fold
        X_val_fold = scaler.transform(X_val_fold)
        
        # Initialiser le modèle SVM avec kernel linéaire pour cette valeur de C
        # clf = SVC(C=C, gamma=10, kernel='rbf')
        clf = SVC(C=C, kernel='linear')
        
        # Entraînement du modèle sur le fold courant
        clf.fit(X_train_fold, y_train_fold)
        
        # Prédiction sur les données de validation
        y_pred_fold = clf.predict(X_val_fold)
        
        # Calcul de l'accuracy sur le fold de validation
        accuracy = accuracy_score(y_val_fold, y_pred_fold)
        accuracy_scores.append(accuracy)

    # Calcul de l'accuracy moyenne pour cette valeur de C
    mean_accuracy = np.mean(accuracy_scores)
    C_accuracy_scores[C] = mean_accuracy
    
    print(f"Mean cross-validated accuracy for C={C}: {mean_accuracy:.6f}")
    
    # Mettre à jour le meilleur C en fonction de l'accuracy
    if mean_accuracy > best_accuracy:
        best_accuracy = mean_accuracy
        best_C = C

# Afficher le meilleur C et son accuracy
print(f"\nBest C: {best_C} with cross-validated accuracy: {best_accuracy:.6f}")

Testing SVM with C = 25
Mean cross-validated accuracy for C=25: 0.874167
Testing SVM with C = 50
Mean cross-validated accuracy for C=50: 0.833197
Testing SVM with C = 75
Mean cross-validated accuracy for C=75: 0.833255
Testing SVM with C = 100
Mean cross-validated accuracy for C=100: 0.833197
Testing SVM with C = 125
Mean cross-validated accuracy for C=125: 0.843366
Testing SVM with C = 150
Mean cross-validated accuracy for C=150: 0.843366
Testing SVM with C = 175
Mean cross-validated accuracy for C=175: 0.846815
Testing SVM with C = 200
Mean cross-validated accuracy for C=200: 0.846815
Testing SVM with C = 225
Mean cross-validated accuracy for C=225: 0.846815
Testing SVM with C = 250
Mean cross-validated accuracy for C=250: 0.850263
Testing SVM with C = 275
Mean cross-validated accuracy for C=275: 0.853653
Testing SVM with C = 300
Mean cross-validated accuracy for C=300: 0.853653
Testing SVM with C = 325
Mean cross-validated accuracy for C=325: 0.857043
Testing SVM with C = 350
Mean c

In [17]:
# Entraînement du modèle final avec le meilleur C sur l'ensemble complet d'entraînement
scaler = Normalizer()  # Normalisation pour le modèle final
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

clf_final = SVC(C=best_C, kernel='linear')
clf_final.fit(X_train, y_train)

# Prédiction finale sur l'ensemble de test
y_test_pred = clf_final.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)

print(f"Test accuracy with best C={best_C}: {test_accuracy:.6f}")

Test accuracy with best C=1200: 0.887755


In [18]:
# Affichage de la matrice de confusion
conf_matrix = confusion_matrix(y_test, y_test_pred)
print(f"True Positives : {conf_matrix[0][0]}")
print(f"True Negatives : {conf_matrix[1][1]}")
print(f"False Positives : {conf_matrix[0][1]}")
print(f"False Negatives : {conf_matrix[1][0]}")

print("\nConfusion Matrix :\n", conf_matrix)

True Positives : 51
True Negatives : 36
False Positives : 10
False Negatives : 1

Confusion Matrix :
 [[51 10]
 [ 1 36]]
