# Mame Bou FALL — Master 1 IABD

# In [None]:
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Librairies scikit-learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    precision_recall_curve, 
    roc_curve, 
    auc
)

class MalwareDetector:
    """Decision-tree malware classifier trained from a CSV dataset.

    The CSV must contain a ``legitimate`` column, which is used as the
    target; every other column is used as a feature. Features are
    standardised with a ``StandardScaler`` before training.

    Intended call order: ``load_data`` -> ``preprocess_data`` ->
    ``train_decision_tree`` -> ``evaluate_model`` -> ``save_model``.
    Calling a step before its prerequisites raises ``RuntimeError``.
    """

    # Name of the target column in the CSV; all other columns are features.
    TARGET_COLUMN = 'legitimate'

    def __init__(self, data_path):
        """
        Initialise the malware detector.

        Args:
            data_path (str): Path to the CSV file holding the dataset.
        """
        self.data_path = data_path
        self.data = None
        self.X = None
        self.y = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.scaler = StandardScaler()  # feature standardisation
        self.model = None

    def load_data(self):
        """
        Load the CSV into ``self.data`` and print a short summary.

        Prints the DataFrame info report and the relative frequency of each
        value of the target column.
        """
        self.data = pd.read_csv(self.data_path)
        print("Informations du dataset:")
        # DataFrame.info() writes its report itself and returns None, so it
        # must not be wrapped in print() (that would emit a stray "None").
        self.data.info()

        print("\nRépartition des classes:")
        print(self.data[self.TARGET_COLUMN].value_counts(normalize=True))

    def preprocess_data(self, test_size=0.3, random_state=42):
        """
        Split the dataset into train/test sets and standardise the features.

        The split is stratified on the target. The scaler is fitted on the
        training features only and then applied to the test features, so no
        test-set statistics leak into training.

        Args:
            test_size (float): Fraction of the rows held out for testing.
            random_state (int): Seed making the split reproducible.

        Raises:
            RuntimeError: If ``load_data`` has not been called first.
        """
        if self.data is None:
            raise RuntimeError(
                "No data loaded: call load_data() before preprocess_data()."
            )

        # Separate features and target.
        self.X = self.data.drop(columns=[self.TARGET_COLUMN])
        self.y = self.data[self.TARGET_COLUMN]

        # Split first, on the raw features ...
        X_train_raw, X_test_raw, self.y_train, self.y_test = train_test_split(
            self.X, self.y,
            test_size=test_size,
            random_state=random_state,
            stratify=self.y
        )

        # ... then learn the scaling parameters from the training rows only.
        self.X_train = self.scaler.fit_transform(X_train_raw)
        self.X_test = self.scaler.transform(X_test_raw)

    def train_decision_tree(self, grid_search=True):
        """
        Train a decision tree, optionally tuning its hyper-parameters.

        The fitted estimator is stored in ``self.model``.

        Args:
            grid_search (bool): When true, run a 3-fold ``GridSearchCV``
                scored with macro F1 and keep its best estimator; otherwise
                fit a ``DecisionTreeClassifier`` with default settings.

        Raises:
            RuntimeError: If ``preprocess_data`` has not been called first.
        """
        if self.X_train is None or self.y_train is None:
            raise RuntimeError(
                "No training data: call preprocess_data() before "
                "train_decision_tree()."
            )

        if grid_search:
            # Hyper-parameter values to explore.
            param_grid = {
                'criterion': ['gini', 'entropy'],
                'max_depth': [None, 5, 10, 15],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4]
            }

            # Use a separate local name so the boolean ``grid_search``
            # parameter is not rebound to the search object.
            search = GridSearchCV(
                estimator=DecisionTreeClassifier(random_state=42),
                param_grid=param_grid,
                scoring='f1_macro',
                cv=3,
                verbose=1
            )
            search.fit(self.X_train, self.y_train)

            print("Meilleurs hyperparamètres :", search.best_params_)
            self.model = search.best_estimator_
        else:
            # Default model, no tuning.
            self.model = DecisionTreeClassifier(random_state=42)
            self.model.fit(self.X_train, self.y_train)

    def evaluate_model(self, plot_curves=True):
        """
        Evaluate the trained model on the held-out test set.

        Prints the confusion matrix and the classification report. When
        ``plot_curves`` is true, also shows the precision-recall and ROC
        curves side by side; these use the second ``predict_proba`` column
        as the score, which assumes a binary target.

        Args:
            plot_curves (bool): Whether to draw the performance curves.

        Raises:
            RuntimeError: If the model has not been trained or the test set
                has not been prepared.
        """
        if self.model is None:
            raise RuntimeError(
                "No trained model: call train_decision_tree() before "
                "evaluate_model()."
            )
        if self.X_test is None or self.y_test is None:
            raise RuntimeError(
                "No test data: call preprocess_data() before evaluate_model()."
            )

        # Predictions on the test set.
        y_pred = self.model.predict(self.X_test)

        # Confusion matrix.
        conf_matrix = confusion_matrix(self.y_test, y_pred)
        print("\nMatrice de Confusion:")
        print(conf_matrix)

        # Classification report.
        class_report = classification_report(self.y_test, y_pred)
        print("\nRapport de Classification:")
        print(class_report)

        if plot_curves:
            # Score = predicted probability of the second class.
            y_scores = self.model.predict_proba(self.X_test)[:, 1]

            # Precision-recall curve (left panel).
            precision, recall, _ = precision_recall_curve(self.y_test, y_scores)

            plt.figure(figsize=(10, 5))
            plt.subplot(121)
            plt.plot(recall, precision)
            plt.title('Courbe Precision-Recall')
            plt.xlabel('Recall')
            plt.ylabel('Precision')

            # ROC curve (right panel).
            fpr, tpr, _ = roc_curve(self.y_test, y_scores)
            roc_auc = auc(fpr, tpr)

            plt.subplot(122)
            plt.plot(fpr, tpr, color='darkorange', label=f'ROC curve (AUC = {roc_auc:.2f})')
            plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
            plt.title('Receiver Operating Characteristic')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            # Without a legend the AUC label set above is never displayed.
            plt.legend(loc='lower right')

            plt.tight_layout()
            plt.show()

    def save_model(self, model_path='malware_model.pkl', scaler_path='scaler.pkl'):
        """
        Persist the trained model and the fitted scaler with joblib.

        The default ``model_path`` is a dedicated ``.pkl`` file; it used to
        point at a dataset CSV, so a no-argument call wrote the pickled
        model over a file named like the dataset. The scaler is saved as
        well, because the model expects inputs scaled the same way.

        Args:
            model_path (str): Destination file for the model.
            scaler_path (str): Destination file for the scaler.

        Raises:
            RuntimeError: If no model has been trained yet.
        """
        if self.model is None:
            raise RuntimeError(
                "No trained model: call train_decision_tree() before "
                "save_model()."
            )

        joblib.dump(self.model, model_path)
        joblib.dump(self.scaler, scaler_path)
        print(f"Modèle sauvegardé dans {model_path}")
        print(f"Scaler sauvegardé dans {scaler_path}")

def main():
    """Run the malware-detection pipeline from start to finish.

    Builds a ``MalwareDetector`` on the hard-coded dataset path, then loads
    the data, preprocesses it, trains the decision tree with hyper-parameter
    search, evaluates it with the performance curves, and finally calls
    ``save_model()`` with its default arguments.
    """
    detector = MalwareDetector(
        "/Users/mac/Desktop/Projet1_IA_Mame_Bou_FALLL /Malware_Dataset_Extrait.csv"
    )

    # Each stage relies on state stored on the detector by the previous one,
    # so the order below matters.
    detector.load_data()
    detector.preprocess_data()
    detector.train_decision_tree(grid_search=True)
    detector.evaluate_model(plot_curves=True)
    detector.save_model()

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()