In [None]:
import pandas as pd
import numpy as np
import os

def load_downsize_and_save(
    input_path,
    output_name,
    simulation_runs=None,
    max_samples=None,
    chunk_size=50000,
    use_parquet=True
):
    """
    Charge un CSV par chunks, filtre selon les runs et samples,
    optimise la m√©moire et sauvegarde le r√©sultat.

    Args:
        input_path (str): Chemin du CSV source (TEP complet).
        output_path (str): Chemin pour sauvegarder le fichier trait√©.
        simulation_runs (list): Liste des IDs de simulation √† garder (ex: [1, 2, 3]). Si None, garde tout.
        max_samples (int): Nombre max de samples √† garder par run. Si None, garde tout.
        chunk_size (int): Nombre de lignes √† lire √† la fois en RAM.
        use_parquet (bool): Si True, sauvegarde en .parquet (plus rapide/l√©ger), sinon .csv.
    """
    print(f"üîÑ D√©marrage du traitement de {input_path}...")

    processed_chunks = []

    # Lecture par chunk pour √©viter la saturation RAM
    with pd.read_csv(input_path, chunksize=chunk_size) as reader:
        for i, chunk in enumerate(reader):

            # 1. Filtrage par Simulation Run (si sp√©cifi√©)
            # Assumons que la colonne s'appelle 'simulationRun' ou similaire. Ajustez le nom.
            if simulation_runs is not None:
                if 'simulationRun' in chunk.columns:
                    chunk = chunk[chunk['simulationRun'].isin(simulation_runs)]
                else:
                    print("Attention: Colonne 'simulationRun' introuvable.")

            # 2. Filtrage par nombre de samples (si sp√©cifi√©)
            if 'Testing' in input_path:
                if max_samples is not None:
                    if 'sample' in chunk.columns:
                        chunk = chunk[(7 * 20 < chunk['sample']) & (chunk['sample'] <= (max_samples + 7 * 20))]
            else:
                if max_samples is not None:
                    if 'sample' in chunk.columns:
                        chunk = chunk[chunk['sample'] <= max_samples]

            # Si le filtrage a vid√© le chunk, on passe au suivant
            if chunk.empty:
                continue

            # 3. Downcasting des types (Optimisation RAM critique)
            # Convertit les float64 en float32
            cols_float = chunk.select_dtypes(include=['float64']).columns
            chunk[cols_float] = chunk[cols_float].astype('float32')

            # Convertit les int64 en int32 (ou moins si possible)
            cols_int = chunk.select_dtypes(include=['int64']).columns
            chunk[cols_int] = chunk[cols_int].apply(pd.to_numeric, downcast='integer')

            processed_chunks.append(chunk)
            print(f"   Chunk {i+1} trait√©. Taille actuelle: {chunk.shape}")

    # 4. Concat√©nation finale
    if processed_chunks:
        full_df = pd.concat(processed_chunks, ignore_index=True)
        print(f"‚úÖ Concat√©nation termin√©e. Taille finale : {full_df.shape}")

        # 5. Sauvegarde

        output_path = '/home/bapt/code/Monitor-the-Reactor/raw_data/'

        full_df.to_csv(output_path, index=False)
        print(f"üíæ Sauvegard√© en CSV : {output_path}")

        return full_df
    else:
        print("‚ö†Ô∏è Aucune donn√©e n'a √©t√© retenue apr√®s filtrage.")
        return None

In [None]:
load_downsize_and_save(
    '/home/bapt/code/Monitor-the-Reactor/raw_data/TEP_FaultFree_Testing.csv',
    '/home/bapt/code/Monitor-the-Reactor/raw_data/reduced_data_2h_50sim/FF_T.csv',
    simulation_runs=[i for i in range(50)],
    max_samples=40,
    chunk_size=50000,
    use_parquet=False
)

üîÑ D√©marrage du traitement de /home/bapt/code/Monitor-the-Reactor/raw_data/TEP_FaultFree_Testing.csv...
   Chunk 1 trait√©. Taille actuelle: (1960, 55)
‚úÖ Concat√©nation termin√©e. Taille finale : (1960, 55)
üíæ Sauvegard√© en CSV : /home/bapt/code/Monitor-the-Reactor/raw_data/reduced_data_2h_50sim/FF_T.csv


Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,xmeas_7,...,xmv_2,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11
0,0,1,1,0.25171,3672.399902,4466.299805,9.5122,27.056999,42.473000,2705.600098,...,54.493999,24.527000,59.709999,22.357000,40.148998,40.074001,47.955002,47.299999,42.099998,15.345000
1,0,1,2,0.25234,3642.199951,4568.700195,9.4145,26.999001,42.585999,2705.199951,...,53.269001,24.465000,60.466000,22.413000,39.956001,36.651001,45.037998,47.501999,40.553001,16.063000
2,0,1,3,0.24840,3643.100098,4507.500000,9.2901,26.927000,42.278000,2703.500000,...,54.000000,24.860001,60.641998,22.198999,40.074001,41.868000,44.553001,47.479000,41.341000,20.452000
3,0,1,4,0.25153,3628.300049,4519.299805,9.3347,26.999001,42.330002,2703.899902,...,53.860001,24.552999,61.908001,21.981001,40.140999,40.066002,48.048000,47.439999,40.779999,17.122999
4,0,1,5,0.21763,3655.800049,4571.000000,9.3087,26.900999,42.402000,2707.699951,...,53.306999,21.775000,61.890999,22.412001,37.695999,38.294998,44.678001,47.529999,41.089001,18.681000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1955,0,49,36,0.25017,3684.500000,4527.899902,9.4333,26.632999,42.207001,2691.399902,...,53.386002,24.799999,59.771999,21.240000,39.075001,36.140999,47.709999,48.188000,40.849998,18.090000
1956,0,49,37,0.30064,3626.100098,4477.600098,9.3928,26.829000,42.175999,2690.000000,...,52.516998,29.372999,60.070999,21.171000,40.748001,39.138000,47.459999,48.201000,40.754002,14.893000
1957,0,49,38,0.30029,3637.899902,4398.700195,9.4419,27.037001,42.551998,2693.500000,...,53.460999,29.408001,60.910000,21.773001,40.724998,39.676998,49.346001,48.131001,40.817001,16.514999
1958,0,49,39,0.27630,3658.000000,4398.600098,9.4557,26.642000,42.476002,2696.000000,...,53.957001,26.914000,59.348999,21.847000,40.542999,34.875000,44.130001,48.147999,40.793999,16.493000


In [5]:
import pandas as pd
import numpy as np
import os
import glob

def batch_process_tep_data(
    input_dir,
    output_name,
    simulation_runs=None,
    max_samples=None,
    chunk_size=50000,
    use_parquet=False
):
    """
    Charge 4 CSVs depuis un dossier, filtre, downsize et sauvegarde dans un dossier structur√©.

    Args:
        input_dir (str): Dossier contenant les 4 fichiers CSV bruts.
        output_name (str): Pr√©fixe pour les fichiers de sortie.
        simulation_runs (list): IDs des runs √† garder.
        max_samples (int): Nb de samples max. Sert aussi au nom du dossier (divis√© par 20).
    """

    # --- 1. Cr√©ation du nom de dossier dynamique ---
    # Gestion du nom pour les runs
    if simulation_runs:
        # Si la liste est courte, on les met tous, sinon on met "custom_runs"
        runs_str = "-".join(map(str, simulation_runs)) if len(simulation_runs) < 5 else "custom_runs"
    else:
        runs_str = "all_runs"

    # Gestion du nom pour les heures (samples / 20)
    if max_samples:
        hours_str = f"{int(max_samples / 20)}h"
    else:
        hours_str = "full_history"

    # Nom final du dossier
    output_folder = f"reduced_data_{runs_str}_{hours_str}"

    # Cr√©ation effective du dossier
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
        print(f"üìÇ Dossier cr√©√© : {output_folder}")
    else:
        print(f"üìÇ Utilisation du dossier existant : {output_folder}")

    # --- 2. R√©cup√©ration des fichiers CSV ---
    csv_files = glob.glob(os.path.join(input_dir, "*.csv"))

    if not csv_files:
        print(f"‚ùå Aucun fichier CSV trouv√© dans {input_dir}")
        return

    print(f"üîé {len(csv_files)} fichiers trouv√©s √† traiter.")

    # --- 3. Boucle de traitement sur chaque fichier ---
    for file_path in csv_files:
        filename = os.path.basename(file_path).split('.')[0]
        print(f"\n--- Traitement de : {filename} ---")

        processed_chunks = []

        # Lecture par chunk
        with pd.read_csv(file_path, chunksize=chunk_size) as reader:
            for i, chunk in enumerate(reader):

                # Filtrage Run
                if simulation_runs is not None and 'simulationRun' in chunk.columns:
                    chunk = chunk[chunk['simulationRun'].isin(simulation_runs)]

                # Filtrage Samples
                if 'Testing' in input_dir:
                    if max_samples is not None:
                        if 'sample' in chunk.columns:
                            chunk = chunk[(7 * 20 < chunk['sample']) & (chunk['sample'] <= (max_samples + 7 * 20))]
                else:
                    if max_samples is not None and 'sample' in chunk.columns:
                        chunk = chunk[chunk['sample'] <= max_samples]

                if chunk.empty:
                    continue

                # Downcasting (Optimisation RAM)
                cols_float = chunk.select_dtypes(include=['float64']).columns
                chunk[cols_float] = chunk[cols_float].astype('float32')

                cols_int = chunk.select_dtypes(include=['int64']).columns
                chunk[cols_int] = chunk[cols_int].apply(pd.to_numeric, downcast='integer')

                processed_chunks.append(chunk)

        # --- 4. Sauvegarde ---
        if processed_chunks:
            full_df = pd.concat(processed_chunks, ignore_index=True)

            # Construction du nom de fichier de sortie
            # Ex: reduced_data_.../output_name_TEP_faulty.parquet
            ext = ".parquet" if use_parquet else ".csv"
            save_path = os.path.join(output_folder, f"{output_name}_{filename}{ext}")

            if use_parquet:
                full_df.to_parquet(save_path, index=False)
            else:
                full_df.to_csv(save_path, index=False)

            print(f"‚úÖ Sauvegard√© : {save_path} (Shape: {full_df.shape})")

            # Nettoyage m√©moire explicite
            del full_df
            del processed_chunks
        else:
            print(f"‚ö†Ô∏è Fichier {filename} ignor√© (vide apr√®s filtrage).")

    print("\nüöÄ Traitement global termin√©.")

In [6]:
batch_process_tep_data(
    '/home/bapt/code/Monitor-the-Reactor/Data/raw_data',
    "sampled_data",
    simulation_runs=[i for i in range(20)],
    max_samples=10,
    chunk_size=5000,
    use_parquet=False
)

üìÇ Utilisation du dossier existant : reduced_data_custom_runs_0h
üîé 4 fichiers trouv√©s √† traiter.

--- Traitement de : TEP_FaultFree_Training ---
‚úÖ Sauvegard√© : reduced_data_custom_runs_0h/sampled_data_TEP_FaultFree_Training.csv (Shape: (190, 55))

--- Traitement de : TEP_Faulty_Testing ---
‚úÖ Sauvegard√© : reduced_data_custom_runs_0h/sampled_data_TEP_Faulty_Testing.csv (Shape: (3800, 55))

--- Traitement de : TEP_FaultFree_Testing ---
‚úÖ Sauvegard√© : reduced_data_custom_runs_0h/sampled_data_TEP_FaultFree_Testing.csv (Shape: (190, 55))

--- Traitement de : TEP_Faulty_Training ---
‚úÖ Sauvegard√© : reduced_data_custom_runs_0h/sampled_data_TEP_Faulty_Training.csv (Shape: (3800, 55))

üöÄ Traitement global termin√©.


In [3]:
import os
import pandas as pd

from pathlib import Path

BASE_DIR = Path(__file__).resolve().parent

def load_data_fault_free_test():
    path = os.path.join(BASE_DIR, '..', 'raw_data', 'TEP_FaultFree_Testing.csv')
    return pd.read_csv(path)

def load_data_fault_free_train():
    path = os.path.join(BASE_DIR, '..', 'raw_data', 'TEP_FaultFree_Training.csv')
    return pd.read_csv(path)

def load_data_faulty_test():
    path = os.path.join(BASE_DIR, '..', 'raw_data', 'TEP_Faulty_Testing.csv')
    return pd.read_csv(path)

def load_data_faulty_train():
    path = os.path.join(BASE_DIR, '..', 'raw_data', 'TEP_Faulty_Training.csv')
    return pd.read_csv(path)


print(os.path.join(BASE_DIR, '..', 'raw_data', 'TEP_Faulty_Training.csv'))

NameError: name '__file__' is not defined