In [None]:
import pandas as pd
import os
from typing import Iterator, Generator, List, Dict

# --- Constants ---
# Placeholder for the CSV location; no path is configured at this point.
CSV_FILE_PATH = None
# Number of rows read per chunk (here 100,000 lines).
CHUNK_SIZE = 100_000

def process_csv_by_chunks(path: str, chunk_size: int) -> List[pd.DataFrame] | None:
    """
    Charge un fichier CSV par morceaux, traite chaque morceau,
    et retourne une liste des DataFrames trait√©s.

    Args:
        path (str): Le chemin vers le fichier CSV.
        chunk_size (int): Le nombre de lignes √† lire √† la fois.

    Returns:
        List[pd.DataFrame] | None: Une liste des DataFrames trait√©s, ou None en cas d'erreur.
    """

    print(f"Tentative de chargement du fichier : {os.path.abspath(path)}")
    print(f"Chargement par morceaux de taille : {chunk_size} lignes.")

    if not os.path.exists(path):
        print(f"Erreur: Le fichier '{path}' est introuvable. V√©rifiez le chemin d'acc√®s.")
        return None

    processed_chunks = []
    chunk_index = 0

    try:
        # Cr√©er un it√©rateur (TextFileReader) au lieu d'un DataFrame unique
        csv_iterator = pd.read_csv(path, chunksize=chunk_size)

        # Parcourir les morceaux g√©n√©r√©s par l'it√©rateur
        for chunk in csv_iterator:
            chunk_index += 1
            print(f"Traitement du morceau #{chunk_index} (taille: {len(chunk)} lignes)...")

            # üí° --- Zone de Traitement des Donn√©es --- üí°
            # Ici, vous pouvez appliquer des op√©rations qui r√©duisent la taille du morceau,
            # comme le filtrage, l'agr√©gation ou le calcul de statistiques.

            # Exemple : Calculer la moyenne de toutes les colonnes et stocker
            # stats_df = chunk.mean().to_frame().T
            # processed_chunks.append(stats_df)

            # Exemple : Filtrer pour garder uniquement les lignes o√π 'col_A' > 10
            # filtered_chunk = chunk[chunk['col_A'] > 10]
            # processed_chunks.append(filtered_chunk)

            # --- Fin de la Zone de Traitement ---

            # Dans cet exemple, nous stockons le morceau complet filtr√©
            # Si vous avez 500Mo, vous DEVEZ faire un traitement pour r√©duire le morceau avant de l'ajouter
            # √† 'processed_chunks', sinon vous resaturerez votre RAM.
            processed_chunks.append(chunk)


        print(f"\nChargement et traitement termin√©s. {chunk_index} morceaux trait√©s.")

        # ‚ö†Ô∏è ATTENTION : La ligne suivante va CONSOLIDER TOUS les morceaux.
        # Si vous n'avez pas r√©duit la taille des morceaux, vous risquez une saturation RAM.
        # Si vous n'avez besoin que de statistiques, vous pouvez retourner processed_chunks directement.
        final_dataframe = pd.concat(processed_chunks, ignore_index=True)
        print(f"Taille du DataFrame final: {len(final_dataframe)} lignes.")
        return final_dataframe

    except Exception as e:
        print(f"Une erreur s'est produite lors du traitement du fichier CSV : {e}")
        return None

In [None]:
# FF_test = process_csv_by_chunks("/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_FaultFree_Testing.csv",
#                                 chunk_size=CHUNK_SIZE)

In [None]:
# FF_train = process_csv_by_chunks("/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_FaultFree_Training.csv",
#                                 chunk_size=CHUNK_SIZE)

In [None]:
# F_test = process_csv_by_chunks("/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_Faulty_Testing.csv",
#                                 chunk_size=CHUNK_SIZE)

In [None]:
# F_train = process_csv_by_chunks("/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_Faulty_Training.csv",
#                                 chunk_size=CHUNK_SIZE)

In [None]:
# Display the column labels of FF_train.
# NOTE(review): the cell that assigns FF_train is commented out in this notebook,
# so on a fresh kernel this name is undefined unless that cell is re-enabled.
FF_train.columns

In [None]:
def reduce_sim(df, nb_sim):
    """
    Keep only the rows whose 'simulationRun' value is at most nb_sim.

    The shape of the frame is printed before and after filtering.

    Args:
        df: DataFrame with a 'simulationRun' column.
        nb_sim: Highest simulation run number to keep (inclusive).

    Returns:
        The filtered DataFrame; the original index labels are preserved.
    """
    print('\n Shape du df before : ')
    print(df.shape)

    keep_mask = df['simulationRun'].le(nb_sim)
    kept_rows = df.loc[keep_mask]

    print('\n Shape du df apr√®s : ')
    print(kept_rows.shape)

    return kept_rows

In [None]:
# Number of simulation runs kept per dataset. Named once so the four calls below
# always use the same cut-off.
NB_SIM_KEPT = 20

# NOTE(review): FF_train, F_train, FF_test and F_test come from loader cells that
# are commented out in this notebook; they must exist in the kernel beforehand.
FF_train_20sim = reduce_sim(FF_train, NB_SIM_KEPT)
F_train_20sim = reduce_sim(F_train, NB_SIM_KEPT)
FF_test_20sim = reduce_sim(FF_test, NB_SIM_KEPT)
F_test_20sim = reduce_sim(F_test, NB_SIM_KEPT)


In [None]:
# Descriptive aliases for the reduced frames; these are the names the modelling
# cell reads.
faulty_train, faulty_test = F_train_20sim, F_test_20sim
fault_free_train, fault_free_test = FF_train_20sim, FF_test_20sim


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras import layers

# The input frames are expected to be loaded already:
# fault_free_train, faulty_train, fault_free_test, faulty_test

# Section banner: rule, title, rule, each on its own line.
print("="*60, "PR√âPARATION DES DONN√âES", "="*60, sep="\n")



# ====================================
# 1. CONVERSION EN 3D
# ====================================

def dataframe_to_3d(df, has_fault=True):
    """
    Split a long-format frame into one feature matrix per simulation.

    A simulation is identified by the pair (faultNumber, simulationRun) when the
    frame has a 'faultNumber' column, otherwise by 'simulationRun' alone. Grouping
    on 'simulationRun' only would merge runs that share a run id across different
    fault numbers into one over-long sequence carrying a single label.

    Args:
        df: DataFrame with 'simulationRun' and 'sample' columns, plus
            'faultNumber' when has_fault is True. Every other column is treated
            as a feature.
        has_fault: When True the label of a simulation is its 'faultNumber';
            when False every simulation is labelled 0.

    Returns:
        tuple: (simulations, labels). simulations is a list of 2-D arrays (rows
        ordered by 'sample', one column per feature); labels is an int array with
        one entry per simulation. Simulations are ordered by
        (faultNumber, simulationRun).
    """
    cols_to_drop = ['faultNumber', 'simulationRun', 'sample']
    feature_cols = [col for col in df.columns if col not in cols_to_drop]

    # Include the fault number in the grouping key whenever it is available.
    group_keys = ['simulationRun']
    if 'faultNumber' in df.columns:
        group_keys.insert(0, 'faultNumber')

    df_sorted = df.sort_values(group_keys + ['sample']).reset_index(drop=True)

    simulations = []
    labels = []

    # groupby keeps the row order inside each group, so every block stays sorted
    # by 'sample'; it also avoids rescanning the whole frame once per simulation.
    for _, sim_data in df_sorted.groupby(group_keys, sort=True):
        simulations.append(sim_data[feature_cols].values)
        labels.append(sim_data['faultNumber'].iloc[0] if has_fault else 0)

    return simulations, np.array(labels, dtype=int)

def pad_or_truncate(simulations, target_length):
    """
    Bring every simulation to exactly target_length rows and stack them.

    Shorter simulations are extended by repeating their last row; longer ones
    are cut after target_length rows; matching ones are kept unchanged.

    Args:
        simulations: Iterable of 2-D arrays, one per simulation.
        target_length: Number of rows each simulation must have.

    Returns:
        np.ndarray built from the list of adjusted simulations.
    """
    def _fit(run):
        # Positive when rows are missing, negative when there are too many.
        missing = target_length - len(run)
        if missing > 0:
            tail = np.repeat([run[-1]], missing, axis=0)
            return np.vstack([run, tail])
        if missing < 0:
            return run[:target_length]
        return run

    return np.array([_fit(run) for run in simulations])

# Convert each frame into per-simulation feature matrices plus labels.
print("Conversion en 3D...")
X_ff_tr_list, y_ff_tr = dataframe_to_3d(fault_free_train, False)
X_f_tr_list, y_f_tr = dataframe_to_3d(faulty_train, True)
X_ff_te_list, y_ff_te = dataframe_to_3d(fault_free_test, False)
X_f_te_list, y_f_te = dataframe_to_3d(faulty_test, True)

# Common sequence length: the longest simulation across ALL four sets, so that a
# fault-free run longer than every faulty run is not silently truncated.
target_length = max(
    len(sim)
    for sim_list in (X_ff_tr_list, X_f_tr_list, X_ff_te_list, X_f_te_list)
    for sim in sim_list
)
print(f"Longueur cible: {target_length}")

X_ff_tr = pad_or_truncate(X_ff_tr_list, target_length)
X_f_tr = pad_or_truncate(X_f_tr_list, target_length)
X_ff_te = pad_or_truncate(X_ff_te_list, target_length)
X_f_te = pad_or_truncate(X_f_te_list, target_length)

# Combine fault-free and faulty simulations along the simulation axis.
X_train_full = np.concatenate([X_ff_tr, X_f_tr], axis=0)
y_train_full = np.concatenate([y_ff_tr, y_f_tr])
X_test_final = np.concatenate([X_ff_te, X_f_te], axis=0)
y_test_final = np.concatenate([y_ff_te, y_f_te])

# Shuffle features and labels together with a fixed seed.
X_train_full, y_train_full = shuffle(X_train_full, y_train_full, random_state=42)
X_test_final, y_test_final = shuffle(X_test_final, y_test_final, random_state=42)

print(f"\nX_train_full: {X_train_full.shape}")
print(f"X_test_final: {X_test_final.shape}")
print(f"Distribution train: {np.bincount(y_train_full)}")
print(f"Distribution test: {np.bincount(y_test_final)}")

# ====================================
# 2. SPLIT AND NORMALISATION
# ====================================

# Stratified 80/20 split of the training simulations into train and validation.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.2,
    stratify=y_train_full,
    random_state=42,
)

n_train, n_time, n_feat = X_train.shape
scaler = StandardScaler()

# The scaler takes 2-D input, so each array is flattened to one row per time step,
# scaled per feature, then reshaped back to 3-D. The scaler is fitted on the
# training split only and merely applied to the validation and test sets.
X_train = scaler.fit_transform(X_train.reshape(-1, n_feat)).reshape(n_train, n_time, n_feat)

n_val = X_val.shape[0]
X_val = scaler.transform(X_val.reshape(-1, n_feat)).reshape(n_val, n_time, n_feat)

n_test, n_test_time = X_test_final.shape[0], X_test_final.shape[1]
X_test_final = scaler.transform(X_test_final.reshape(-1, n_feat)).reshape(n_test, n_test_time, n_feat)

print(f"\nTrain: {X_train.shape}, Val: {X_val.shape}, Test: {X_test_final.shape}")

# ====================================
# 3. MODEL
# ====================================

print("\n" + "="*60)
print("MOD√àLE LSTM")
print("="*60)

# Seed Python, NumPy and the backend so weight initialisation and dropout masks
# are repeatable, in line with the fixed random_state=42 used for the shuffle
# and the train/validation split.
keras.utils.set_random_seed(42)

# Width of the softmax output layer. sparse_categorical_crossentropy requires
# every integer label to be strictly below this value.
N_CLASSES = 21

# Two stacked LSTM layers followed by a dense classification head.
model = keras.Sequential([
    layers.Input(shape=(n_time, n_feat)),
    layers.LSTM(128, return_sequences=True),  # full sequence feeds the next LSTM
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.LSTM(64),  # only the last output is passed to the dense head
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(N_CLASSES, activation='softmax')
])

model.compile(
    optimizer=keras.optimizers.Adam(0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# ====================================
# 4. TRAINING
# ====================================

# Stop after 10 epochs without improvement and restore the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
# Halve the learning rate after 5 epochs without improvement.
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=5)

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=32,
    callbacks=[early_stopping, lr_on_plateau],
    verbose=1,
)

# ====================================
# 5. EVALUATION - CORRECTED
# ====================================

from sklearn.metrics import classification_report

# Loss and accuracy on the held-out test set.
test_loss, test_acc = model.evaluate(X_test_final, y_test_final, verbose=0)
separator = '=' * 60
print(f"\n{separator}")
print(f"TEST ACCURACY: {test_acc:.4f}")
print(f"{separator}")

# Predicted class = index of the highest softmax probability.
y_pred = model.predict(X_test_final, verbose=0)
y_pred_classes = y_pred.argmax(axis=1)

# Report only the classes that actually occur in the test labels.
unique_labels = sorted(np.unique(y_test_final))
class_names = []
for label in unique_labels:
    class_names.append('Normal' if label == 0 else f'Panne_{label}')

report = classification_report(
    y_test_final,
    y_pred_classes,
    labels=unique_labels,
    target_names=class_names,
    zero_division=0,
)
print("\n" + report)

# Save the trained model to disk.
model.save('lstm_baseline_model.keras')
print("\n‚úì Mod√®le sauvegard√© : lstm_baseline_model.keras")

PR√âPARATION DES DONN√âES
Conversion en 3D...
Longueur cible: 19200

X_train_full: (40, 19200, 52)
X_test_final: (40, 19200, 52)
Distribution train: [20 20]
Distribution test: [20 20]

Train: (32, 19200, 52), Val: (8, 19200, 52), Test: (40, 19200, 52)

MOD√àLE LSTM
Epoch 1/50
