In [58]:
# Code to test and train a ROCKET model on our data using stratified kfold cross-validation
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sktime.classification.kernel_based import RocketClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

CONTROL = 'TERBINAFINE- (control)'
TREATED = 'TERBINAFINE+'
PROCESSED_DIR = 'data/processed'

In [59]:
def load_data(data_dir):
    """ We load the data with padding to the longest time series to make them all the same length """
    X, y = [], []
    for treatment in [CONTROL, TREATED]:
        treatment_dir = os.path.join(data_dir, treatment)
        for file_name in os.listdir(treatment_dir):
            if file_name.endswith('.csv'):
                file_path = os.path.join(treatment_dir, file_name)
                df = pd.read_csv(file_path)
                time_series = df[['X', 'Y', 'Speed']].values
                X.append(time_series)
                y.append(treatment)
    # Pad sequences to the same length
    max_length = max(len(ts) for ts in X)
    X_padded = []
    for ts in X:
        padding_length = max_length - len(ts)
        if padding_length > 0:
            padding = np.zeros((padding_length, ts.shape[1]))
            ts_padded = np.vstack([ts, padding])
        else:
            ts_padded = ts
        X_padded.append(ts_padded)
    X_array = np.array(X_padded)
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    return X_array, y_encoded

X, y = load_data(PROCESSED_DIR)

In [60]:
# Transpose for ROCKET input shape (n_instances, n_channels, n_timepoints)
X_transposed = X.transpose(0, 2, 1)

In [61]:
class PanelStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        n_instances, n_channels, n_timepoints = X.shape
        X_reshaped = X.reshape(n_instances, -1)
        self.scaler.fit(X_reshaped)
        return self

    def transform(self, X):
        n_instances, n_channels, n_timepoints = X.shape
        X_reshaped = X.reshape(n_instances, -1)
        X_scaled = self.scaler.transform(X_reshaped)
        return X_scaled.reshape(n_instances, n_channels, n_timepoints)

In [63]:
USE_SCALER = True  # Set to False to disable StandardScaler

n_splits = 2
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

accuracies, precisions, recalls, f1s = [], [], [], []

for fold, (train_idx, test_idx) in enumerate(skf.split(X_transposed, y)):
    print(f"Fold {fold+1}/{n_splits}")
    X_train, X_test = X_transposed[train_idx], X_transposed[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    classifier = RocketClassifier(
        num_kernels=10000, 
        random_state=42,
        rocket_transform="minirocket" 
    )
    if USE_SCALER:
        pipeline = make_pipeline(PanelStandardScaler(), classifier)
    else:
        pipeline = make_pipeline(classifier)

    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='weighted')
    rec = recall_score(y_test, y_pred, average='weighted')
    f1 = f1_score(y_test, y_pred, average='weighted')

    accuracies.append(acc)
    precisions.append(prec)
    recalls.append(rec)
    f1s.append(f1)

    print(classification_report(y_test, y_pred, target_names=[CONTROL, TREATED]))
    print(confusion_matrix(y_test, y_pred))
    print("-"*40)

print("\nAverage metrics over all folds:")
print(f"Accuracy: {np.mean(accuracies):.4f}")
print(f"Precision: {np.mean(precisions):.4f}")
print(f"Recall: {np.mean(recalls):.4f}")
print(f"F1-score: {np.mean(f1s):.4f}")

Fold 1/2


  self._deprecate_tag_warn(collected_tags)
  self._deprecate_tag_warn(collected_tags)
  self._deprecate_tag_warn(collected_tags)
  self._deprecate_tag_warn(collected_tags)
  self._deprecate_tag_warn(collected_tags)


                        precision    recall  f1-score   support

TERBINAFINE- (control)       0.68      0.81      0.74        26
          TERBINAFINE+       0.76      0.62      0.68        26

              accuracy                           0.71        52
             macro avg       0.72      0.71      0.71        52
          weighted avg       0.72      0.71      0.71        52

[[21  5]
 [10 16]]
----------------------------------------
Fold 2/2


  self._deprecate_tag_warn(collected_tags)
  self._deprecate_tag_warn(collected_tags)
  self._deprecate_tag_warn(collected_tags)
  self._deprecate_tag_warn(collected_tags)
  self._deprecate_tag_warn(collected_tags)


                        precision    recall  f1-score   support

TERBINAFINE- (control)       0.71      0.85      0.77        26
          TERBINAFINE+       0.81      0.65      0.72        26

              accuracy                           0.75        52
             macro avg       0.76      0.75      0.75        52
          weighted avg       0.76      0.75      0.75        52

[[22  4]
 [ 9 17]]
----------------------------------------

Average metrics over all folds:
Accuracy: 0.7308
Precision: 0.7396
Recall: 0.7308
F1-score: 0.7283


In [64]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
# Nouveaux imports pour la pipeline ROCKET en deux étapes et le classifieur
from sktime.transformations.panel.rocket import MiniRocketMultivariate 
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

CONTROL = 'TERBINAFINE- (control)'
TREATED = 'TERBINAFINE+'
PROCESSED_DIR = 'data/processed'

# --- 1. CLASSE DE PADDING ET CHARGEMENT (Identique, nécessaire) ---
def load_data(data_dir):
    """ We load the data with padding to the longest time series to make them all the same length """
    X, y = [], []
    for treatment in [CONTROL, TREATED]:
        treatment_dir = os.path.join(data_dir, treatment)
        # Vérifiez l'existence du répertoire ici pour éviter l'erreur
        if not os.path.exists(treatment_dir):
            print(f"ATTENTION: Le répertoire {treatment_dir} n'existe pas. Veuillez vérifier le chemin.")
            continue
            
        for file_name in os.listdir(treatment_dir):
            if file_name.endswith('.csv'):
                file_path = os.path.join(treatment_dir, file_name)
                df = pd.read_csv(file_path)
                time_series = df[['X', 'Y', 'Speed']].values
                X.append(time_series)
                y.append(treatment)
    
    if not X:
        print("Aucune donnée chargée.")
        return np.array([]), np.array([])
        
    # Pad sequences to the same length
    max_length = max(len(ts) for ts in X)
    X_padded = []
    for ts in X:
        padding_length = max_length - len(ts)
        if padding_length > 0:
            padding = np.zeros((padding_length, ts.shape[1]), dtype=ts.dtype)
            ts_padded = np.vstack([ts, padding])
        else:
            ts_padded = ts
        X_padded.append(ts_padded)
        
    X_array = np.array(X_padded)
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    return X_array, y_encoded

# --- 2. CLASSE DE NORMALISATION (Utilisée si besoin) ---
class PanelStandardScaler(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.scaler = StandardScaler()

    def fit(self, X, y=None):
        n_instances, n_channels, n_timepoints = X.shape
        # Aplatit pour ajuster un seul scaler sur toutes les frames/canaux
        X_reshaped = X.reshape(n_instances * n_channels, n_timepoints).T
        self.scaler.fit(X_reshaped)
        return self

    def transform(self, X):
        n_instances, n_channels, n_timepoints = X.shape
        X_reshaped = X.reshape(n_instances * n_channels, n_timepoints).T
        X_scaled = self.scaler.transform(X_reshaped)
        # Réorganise au format (n_instances, n_channels, n_timepoints)
        return X_scaled.T.reshape(n_instances, n_channels, n_timepoints)

# --- 3. CHARGEMENT ET PRÉPARATION GLOBALE ---
X_initial, y = load_data(PROCESSED_DIR)
# Transpose for ROCKET input shape (n_instances, n_channels, n_timepoints)
X = X_initial.transpose(0, 2, 1)

# --- 4. PARAMÈTRES ET HYPERPARAMÈTRES ---
USE_SCALER = False  # Maintenu à False si la normalisation détruit le signal
THRESHOLD = 0.40    # NOUVEAU SEUIL pour favoriser le Rappel (Descendu de 0.50 à 0.40)
N_SPLITS = 5        # Augmenté à 5 pour une meilleure validation
NUM_KERNELS = 1000 # Augmenté pour de meilleures performances (si le temps le permet)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# --- 5. BOUGLE D'ENTRAÎNEMENT ET D'ÉVALUATION ---

# Liste pour collecter toutes les prédictions finales (pour l'analyse globale)
y_true_total, y_pred_total_tuned = np.array([]), np.array([])

for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    print(f"\n--- Pli {fold+1}/{N_SPLITS} ---")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # --- DEFINITION DU MODELE ---
    rocket_transformer = MiniRocketMultivariate(
        num_kernels=NUM_KERNELS, 
        random_state=42
    )
    
    # NOUVEAU CLASSIFIEUR : Régression Logistique avec class_weight='balanced'
    # 'balanced' donne plus d'importance à la classe la moins bien prédite (TERBINAFINE+)
    classifier = LogisticRegression(
        solver='liblinear', 
        class_weight='balanced', 
        random_state=42
    )
    
    # Construction de la pipeline
    steps = []
    if USE_SCALER:
        steps.append(PanelStandardScaler())
    steps.extend([rocket_transformer, classifier])
    
    pipeline = make_pipeline(*steps)

    # Entraînement
    pipeline.fit(X_train, y_train)

    # --- PRÉDICTION et AJUSTEMENT du SEUIL (Threshold Tuning) ---
    
    # 1. Obtenir les probabilités de la classe 1 (TERBINAFINE+)
    y_proba = pipeline.predict_proba(X_test)[:, 1] 
    
    # 2. Appliquer le seuil ajusté (favorise le rappel)
    y_pred_tuned = (y_proba > THRESHOLD).astype(int)

    # --- Évaluation du Pli ---
    
    acc = accuracy_score(y_test, y_pred_tuned)
    prec = precision_score(y_test, y_pred_tuned, average='weighted', zero_division=0)
    rec = recall_score(y_test, y_pred_tuned, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred_tuned, average='weighted', zero_division=0)
    
    print(f"Précision (Seuil {THRESHOLD}): {acc:.4f}")
    
    # Collection des résultats pour le rapport final cumulé
    y_true_total = np.concatenate([y_true_total, y_test])
    y_pred_total_tuned = np.concatenate([y_pred_total_tuned, y_pred_tuned])

# --- 6. RÉSULTATS CUMULÉS ---

print("\n" + "="*50)
print(f"RÉSULTATS CUMULÉS (Seuil ajusté à {THRESHOLD})")
print("="*50)

print("\n--- Rapport de Classification Final ---")
print(classification_report(y_true_total, y_pred_total_tuned, target_names=[CONTROL, TREATED], zero_division=0))
print("\n--- Matrice de Confusion Finale ---")
print(confusion_matrix(y_true_total, y_pred_total_tuned))


--- Pli 1/5 ---
Précision (Seuil 0.4): 0.6667

--- Pli 2/5 ---
Précision (Seuil 0.4): 0.6667

--- Pli 2/5 ---
Précision (Seuil 0.4): 0.8571

--- Pli 3/5 ---
Précision (Seuil 0.4): 0.8571

--- Pli 3/5 ---
Précision (Seuil 0.4): 0.7619

--- Pli 4/5 ---
Précision (Seuil 0.4): 0.7619

--- Pli 4/5 ---
Précision (Seuil 0.4): 0.8571

--- Pli 5/5 ---
Précision (Seuil 0.4): 0.8571

--- Pli 5/5 ---
Précision (Seuil 0.4): 0.7500

RÉSULTATS CUMULÉS (Seuil ajusté à 0.4)

--- Rapport de Classification Final ---
                        precision    recall  f1-score   support

TERBINAFINE- (control)       0.87      0.65      0.75        52
          TERBINAFINE+       0.72      0.90      0.80        52

              accuracy                           0.78       104
             macro avg       0.80      0.78      0.78       104
          weighted avg       0.80      0.78      0.78       104


--- Matrice de Confusion Finale ---
[[34 18]
 [ 5 47]]
Précision (Seuil 0.4): 0.7500

RÉSULTATS CUMULÉS (Seu