In [1]:
pip install pandas numpy matplotlib seaborn ipython scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Markdown
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Charger le fichier CSV
df = pd.read_csv('signal_features_dataset.csv')

display(Markdown("### Dataset chargé avec succès"))
display(Markdown(f"Nombre de lignes : {len(df)}"))

### Dataset chargé avec succès

Nombre de lignes : 2452

In [4]:
# Informations sur le dataset
display(Markdown("### Informations sur le dataset :"))
# Note : df.info() affiche du texte brut, on contourne avec un résumé
df_info = pd.DataFrame({
    'Column': df.columns,
    'Type': [str(t) for t in df.dtypes],
    'Non-Null Count': df.notnull().sum()
})
display(df_info)

# Statistiques descriptives
display(Markdown("### Statistiques descriptives :"))
display(df.describe())

# Vérifier les valeurs manquantes
display(Markdown("### Valeurs manquantes par colonne :"))
display(df.isnull().sum())

### Informations sur le dataset :

Unnamed: 0,Column,Type,Non-Null Count
Mean,Mean,float64,2452
Std,Std,float64,2452
Max,Max,float64,2452
Min,Min,float64,2452
Kurtosis,Kurtosis,float64,2452
Skew,Skew,float64,2452
Dominant Frequency,Dominant Frequency,float64,2452
Dominant Amplitude,Dominant Amplitude,float64,2452
Spectral Energy,Spectral Energy,float64,2452
Envelope Mean,Envelope Mean,float64,2452


### Statistiques descriptives :

Unnamed: 0,Mean,Std,Max,Min,Kurtosis,Skew,Dominant Frequency,Dominant Amplitude,Spectral Energy,Envelope Mean,Envelope Std,Envelope Dominant Frequency,Envelope Dominant Amplitude,Label
count,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0,2452.0
mean,0.018401,0.207349,2.507323,-2.407303,18.074378,0.028804,6682.083197,1257.152018,19242110.0,0.206028,0.19593,0.0,5274.32148,0.458401
std,0.021847,0.118757,2.622644,2.607864,25.870715,0.27762,5811.59113,1020.833093,14419720.0,0.105304,0.151942,0.0,2695.790198,0.498368
min,-0.003799,0.027759,0.086725,-9.065726,-0.823566,-1.098744,0.0,219.201803,261781.4,0.034039,0.019915,0.0,871.410187,0.0
25%,-0.003661,0.029061,0.110891,-3.664244,-0.792097,-0.125999,453.0,229.763897,285762.8,0.035615,0.020917,0.0,911.750013,0.0
50%,0.013845,0.223252,0.573921,-0.505043,0.300927,-0.006182,11972.0,963.644565,17384120.0,0.256696,0.117836,0.0,6571.417372,0.0
75%,0.040658,0.271391,3.976952,-0.130193,19.638894,0.074189,12032.0,2795.200878,25211240.0,0.284692,0.285304,0.0,7288.111012,1.0
max,0.049828,0.392832,8.531739,-0.106221,84.596324,1.726174,12032.0,2947.31445,50573190.0,0.29934,0.482382,0.0,7663.098812,1.0


### Valeurs manquantes par colonne :

Mean                           0
Std                            0
Max                            0
Min                            0
Kurtosis                       0
Skew                           0
Dominant Frequency             0
Dominant Amplitude             0
Spectral Energy                0
Envelope Mean                  0
Envelope Std                   0
Envelope Dominant Frequency    0
Envelope Dominant Amplitude    0
Label                          0
Signal Name                    0
dtype: int64

In [5]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from IPython.display import display, Markdown

# Sélectionner les colonnes numériques
numeric_cols = df.select_dtypes(include=[np.number]).columns
df_numeric = df[numeric_cols]
corr_matrix = df_numeric.corr()

# Créer une heatmap avec Plotly
fig = go.Figure(data=go.Heatmap(
    z=corr_matrix.values,  # Valeurs de la matrice de corrélation
    x=corr_matrix.columns,  # Noms des colonnes (axe x)
    y=corr_matrix.columns,  # Noms des lignes (axe y)
    colorscale='RdBu',  # Équivalent de 'coolwarm' (rouge-bleu)
    zmin=-1, zmax=1,  # Limites pour l'échelle de couleur
    text=corr_matrix.values,  # Valeurs à afficher dans les cellules
    texttemplate="%{text:.2f}",  # Format des valeurs (2 décimales)
    textfont={"size": 10},  # Taille de la police des annotations
    colorbar=dict(title="Corrélation"),  # Titre de la barre de couleur
))

# Mettre à jour la mise en page
fig.update_layout(
    title="Matrice de corrélation des caractéristiques numériques",
    width=800,  # Largeur de la figure
    height=600,  # Hauteur de la figure
    xaxis_title="Caractéristiques",
    yaxis_title="Caractéristiques",
    xaxis=dict(tickangle=45),  # Rotation des étiquettes sur l'axe x
)

# Afficher la figure
fig.show()

In [172]:
threshold_high = 0.7
high_corr_pairs = []
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        if abs(corr_matrix.iloc[i, j]) > threshold_high:
            high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))

display(Markdown("### Variables très corrélées entre elles (|corr| > 0.7) :"))
if high_corr_pairs:
    display(pd.DataFrame(high_corr_pairs, columns=['Variable 1', 'Variable 2', 'Corrélation']))
else:
    display(Markdown("- Aucune détectée"))

### Variables très corrélées entre elles (|corr| > 0.7) :

Unnamed: 0,Variable 1,Variable 2,Corrélation
0,Mean,Dominant Amplitude,0.740196
1,Std,Max,0.831877
2,Std,Min,-0.817131
3,Std,Kurtosis,0.740425
4,Std,Dominant Frequency,-0.747951
5,Std,Spectral Energy,0.972963
6,Std,Envelope Mean,0.877358
7,Std,Envelope Std,0.920952
8,Std,Envelope Dominant Amplitude,0.877358
9,Std,Label,0.760535


In [173]:
threshold_low = 0.3
label_corr = corr_matrix['Label'].drop('Label')
low_corr_vars = label_corr[abs(label_corr) < threshold_low]

display(Markdown("### Variables peu corrélées avec Label (|corr| < 0.3) :"))
if not low_corr_vars.empty:
    display(pd.DataFrame(low_corr_vars, columns=['Corrélation']))
else:
    display(Markdown("- Aucune détectée"))

### Variables peu corrélées avec Label (|corr| < 0.3) :

Unnamed: 0,Corrélation


In [174]:
# Liste des features initiales
features = [col for col in df.select_dtypes(include=[np.number]).columns if col != 'Label']

# Importance des caractéristiques (basée sur ton tableau)
feature_importance = pd.DataFrame({
    'Feature': ['Mean', 'Std', 'Max', 'Min', 'Kurtosis', 'Skew', 'Dominant Frequency', 
                'Dominant Amplitude', 'Spectral Energy', 'Envelope Mean', 'Envelope Std', 
                'Envelope Dominant Frequency', 'Envelope Dominant Amplitude'],
    'Importance': [0.000000, 0.120000, 0.100000, 0.129307, 0.150020, 0.016148, 0.220000, 
                   0.007627, 0.100000, 0.006899, 0.150000, 0.000000, 0.000000]
})

# Seuil pour garder les features (ajustable)
threshold = 0
features_to_keep = feature_importance[feature_importance['Importance'] >= threshold]['Feature'].tolist()

# Mettre à jour X avec les features retenues
X = df[features_to_keep]
# Affichage pour vérification
from IPython.display import display, Markdown
display(Markdown("### Features conservées après élimination :"))
display(pd.Series(features_to_keep))
display(Markdown(f"### Nouvelle forme de X : {X.shape}"))

### Features conservées après élimination :

0                            Mean
1                             Std
2                             Max
3                             Min
4                        Kurtosis
5                            Skew
6              Dominant Frequency
7              Dominant Amplitude
8                 Spectral Energy
9                   Envelope Mean
10                   Envelope Std
11    Envelope Dominant Frequency
12    Envelope Dominant Amplitude
dtype: object

### Nouvelle forme de X : (2452, 13)

In [175]:
X.drop(columns='Dominant Frequency', inplace=True)

In [176]:
from IPython.display import display, Markdown
import numpy as np

# Calculer la matrice de corrélation
numeric_cols = X.select_dtypes(include=[np.number]).columns
corr_matrix = X[numeric_cols].corr()

# Seuil pour les corrélations fortes
threshold = 0.7

# Identifier les paires de variables très corrélées
cols_to_drop = set()  # Utiliser un set pour éviter les doublons
for i in range(len(corr_matrix.columns)):
    for j in range(i + 1, len(corr_matrix.columns)):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            # Ajouter la deuxième variable de la paire à la liste des colonnes à supprimer
            col_to_drop = corr_matrix.columns[j]
            cols_to_drop.add(col_to_drop)

# Convertir en liste pour affichage
cols_to_drop = list(cols_to_drop)

# Afficher les colonnes à supprimer
display(Markdown("### Variables très corrélées à supprimer (|corr| > 0.7) :"))
if cols_to_drop:
    display(pd.Series(cols_to_drop))
else:
    display(Markdown("- Aucune variable très corrélée détectée"))

# Supprimer les colonnes de X
X = X.drop(columns=cols_to_drop, errors='ignore')
# Afficher les features restantes
display(Markdown("### Features après suppression des variables corrélées :"))
display(pd.Series(X.columns))
display(Markdown(f"### Nouvelle forme de X : {X.shape}"))

### Variables très corrélées à supprimer (|corr| > 0.7) :

0                            Max
1                            Min
2    Envelope Dominant Amplitude
3                Spectral Energy
4             Dominant Amplitude
5                       Kurtosis
6                   Envelope Std
7                  Envelope Mean
dtype: object

### Features après suppression des variables corrélées :

0                           Mean
1                            Std
2                           Skew
3    Envelope Dominant Frequency
dtype: object

### Nouvelle forme de X : (2452, 4)

In [177]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
display(Markdown(f"### Dimensions des ensembles :"))
display(Markdown(f"- Train : {X_train.shape}"))
display(Markdown(f"- Test : {X_test.shape}"))

### Dimensions des ensembles :

- Train : (1961, 4)

- Test : (491, 4)

In [178]:
# Définir la pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Normalisation des features
    ('classifier', RandomForestClassifier(random_state=42))
])

# Entraîner la pipeline
pipeline.fit(X_train, y_train)

display(Markdown("### Pipeline entraînée avec succès"))

### Pipeline entraînée avec succès

In [179]:
import numpy as np
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

def evaluate_model(model, X_train, y_train, X_test, y_test):
    
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    if hasattr(model, "predict_proba"): 
        y_train_proba = model.predict_proba(X_train)[:, 1]
        y_test_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_train_proba = y_train_pred
        y_test_proba = y_test_pred

    # Création des matrices de confusion
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)
    labels_true = ["True 0", "True 1"]
    labels_pred = ["Prédiction 0", "Prédiction 1"]
    
    
    # Création d'une figure avec sous-graphiques
    fig = make_subplots(rows=1, cols=2, subplot_titles=("Matrice de confusion - Train", "Matrice de confusion - Test"),
                        horizontal_spacing=0.2)

    # Ajout des heatmaps
    fig.add_trace(go.Heatmap(z=cm_train, x=labels_pred, y=labels_true, colorscale="blues", showscale=True), row=1, col=1)
    fig.add_trace(go.Heatmap(z=cm_test, x=labels_pred, y=labels_true, colorscale="blues", showscale=True), row=1, col=2)

    # Ajouter manuellement les annotations
    for i in range(cm_train.shape[0]):
        for j in range(cm_train.shape[1]):
            fig.add_annotation(text=str(cm_train[i, j]), x=labels_pred[j], y=labels_true[i],
                               xref=f"x1", yref=f"y1", showarrow=False, font=dict(color="black"))

    for i in range(cm_test.shape[0]):
        for j in range(cm_test.shape[1]):
            fig.add_annotation(text=str(cm_test[i, j]), x=labels_pred[j], y=labels_true[i],
                               xref=f"x2", yref=f"y2", showarrow=False, font=dict(color="black"))

    fig.update_layout(title_text="Matrices de confusion - Train et Test", height=500, width=1000)
    fig.show()
    # Calcul des courbes ROC
    fpr_train, tpr_train, _ = roc_curve(y_train, y_train_proba)
    fpr_test, tpr_test, _ = roc_curve(y_test, y_test_proba)
    auc_train = auc(fpr_train, tpr_train)
    auc_test = auc(fpr_test, tpr_test)
    
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=fpr_train, y=tpr_train, mode="lines", name=f"Train ROC (AUC={auc_train:.2f})", line=dict(color="blue")))
    fig.add_trace(go.Scatter(x=fpr_test, y=tpr_test, mode="lines", name=f"Test ROC (AUC={auc_test:.2f})", line=dict(color="red")))
    fig.add_trace(go.Scatter(x=[0, 1], y=[0, 1], mode="lines", name="Random", line=dict(dash="dash", color="gray")))
    
    fig.update_layout(title="Courbe ROC", xaxis_title="Taux de faux positifs (FPR)", yaxis_title="Taux de vrais positifs (TPR)")
    fig.show()
    
    
    print("\n=== Rapport de classification (Train) ===")
    print(classification_report(y_train, y_train_pred))
    
    print("\n=== Rapport de classification (Test) ===")
    print(classification_report(y_test, y_test_pred))

In [63]:
pip install nbformat

Note: you may need to restart the kernel to use updated packages.


In [180]:
# Évaluer le modèle
display(Markdown("### Évaluation du modèle"))
evaluate_model(pipeline, X_train, y_train, X_test, y_test)

### Évaluation du modèle


=== Rapport de classification (Train) ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1057
           1       1.00      1.00      1.00       904

    accuracy                           1.00      1961
   macro avg       1.00      1.00      1.00      1961
weighted avg       1.00      1.00      1.00      1961


=== Rapport de classification (Test) ===
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       271
           1       1.00      1.00      1.00       220

    accuracy                           1.00       491
   macro avg       1.00      1.00      1.00       491
weighted avg       1.00      1.00      1.00       491



In [166]:
# Extraire l'importance des features
model = pipeline.named_steps['classifier']
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': model.feature_importances_
}).sort_values('Importance', ascending=False)

display(Markdown("### Importance des caractéristiques :"))
display(feature_importance)

### Importance des caractéristiques :

Unnamed: 0,Feature,Importance
0,Mean,1.0
