In [6]:
# Mount Google Drive at /content/drive; the data-loading cells further down read
# their CSV files from /content/drive/MyDrive/... (requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: 

In [8]:
import pandas as pd
import os
from typing import Iterator, Generator, List, Dict

# --- Constants ---
# Placeholder path; no visible cell assigns or reads it.
CSV_FILE_PATH = None
# Number of CSV rows read per chunk (100,000 rows).
CHUNK_SIZE = 100_000

def process_csv_by_chunks(path: str, chunk_size: int) -> List[pd.DataFrame] | None:
    """
    Charge un fichier CSV par morceaux, traite chaque morceau,
    et retourne une liste des DataFrames trait√©s.

    Args:
        path (str): Le chemin vers le fichier CSV.
        chunk_size (int): Le nombre de lignes √† lire √† la fois.

    Returns:
        List[pd.DataFrame] | None: Une liste des DataFrames trait√©s, ou None en cas d'erreur.
    """

    print(f"Tentative de chargement du fichier : {os.path.abspath(path)}")
    print(f"Chargement par morceaux de taille : {chunk_size} lignes.")

    if not os.path.exists(path):
        print(f"Erreur: Le fichier '{path}' est introuvable. V√©rifiez le chemin d'acc√®s.")
        return None

    processed_chunks = []
    chunk_index = 0

    try:
        # Cr√©er un it√©rateur (TextFileReader) au lieu d'un DataFrame unique
        csv_iterator = pd.read_csv(path, chunksize=chunk_size)

        # Parcourir les morceaux g√©n√©r√©s par l'it√©rateur
        for chunk in csv_iterator:
            chunk_index += 1
            print(f"Traitement du morceau #{chunk_index} (taille: {len(chunk)} lignes)...")

            # üí° --- Zone de Traitement des Donn√©es --- üí°
            # Ici, vous pouvez appliquer des op√©rations qui r√©duisent la taille du morceau,
            # comme le filtrage, l'agr√©gation ou le calcul de statistiques.

            # Exemple : Calculer la moyenne de toutes les colonnes et stocker
            # stats_df = chunk.mean().to_frame().T
            # processed_chunks.append(stats_df)

            # Exemple : Filtrer pour garder uniquement les lignes o√π 'col_A' > 10
            # filtered_chunk = chunk[chunk['col_A'] > 10]
            # processed_chunks.append(filtered_chunk)

            # --- Fin de la Zone de Traitement ---

            # Dans cet exemple, nous stockons le morceau complet filtr√©
            # Si vous avez 500Mo, vous DEVEZ faire un traitement pour r√©duire le morceau avant de l'ajouter
            # √† 'processed_chunks', sinon vous resaturerez votre RAM.
            processed_chunks.append(chunk)


        print(f"\nChargement et traitement termin√©s. {chunk_index} morceaux trait√©s.")

        # ‚ö†Ô∏è ATTENTION : La ligne suivante va CONSOLIDER TOUS les morceaux.
        # Si vous n'avez pas r√©duit la taille des morceaux, vous risquez une saturation RAM.
        # Si vous n'avez besoin que de statistiques, vous pouvez retourner processed_chunks directement.
        final_dataframe = pd.concat(processed_chunks, ignore_index=True)
        print(f"Taille du DataFrame final: {len(final_dataframe)} lignes.")
        return final_dataframe

    except Exception as e:
        print(f"Une erreur s'est produite lors du traitement du fichier CSV : {e}")
        return None

In [11]:
from google.colab import drive
# Mount at /content/drive rather than /content: every data path used in this
# notebook is rooted at /content/drive/MyDrive, which only exists with this mount point.
drive.mount('/content/drive')

KeyboardInterrupt: 

In [None]:
# Load the fault-free testing CSV from Drive, reading CHUNK_SIZE rows at a time.
FF_test = process_csv_by_chunks(
    path="/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_FaultFree_Testing.csv",
    chunk_size=CHUNK_SIZE,
)

Tentative de chargement du fichier : /content/drive/MyDrive/Colab Notebooks/raw_data/TEP_FaultFree_Testing.csv
Chargement par morceaux de taille : 100000 lignes.
Erreur: Le fichier '/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_FaultFree_Testing.csv' est introuvable. V√©rifiez le chemin d'acc√®s.


In [16]:
# Relative-path fallback (presumably for runs outside Colab - confirm).
# Only attempted when FF_test has not been loaded yet: assigning unconditionally
# would replace an already loaded FF_test with None whenever this relative path
# does not exist, which breaks the cells that use FF_test afterwards.
if globals().get('FF_test') is None:
    FF_test = process_csv_by_chunks('../raw_data/TEP_FaultFree_Testing.csv',
                                    chunk_size=CHUNK_SIZE)

Tentative de chargement du fichier : /raw_data/TEP_FaultFree_Testing.csv
Chargement par morceaux de taille : 100000 lignes.
Erreur: Le fichier '../raw_data/TEP_FaultFree_Testing.csv' est introuvable. V√©rifiez le chemin d'acc√®s.


In [4]:
# Preview the first rows of FF_test as a quick check that the load succeeded.
FF_test.head()

Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,xmeas_7,...,xmv_2,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11
0,0,1,1,0.25171,3672.4,4466.3,9.5122,27.057,42.473,2705.6,...,54.494,24.527,59.71,22.357,40.149,40.074,47.955,47.3,42.1,15.345
1,0,1,2,0.25234,3642.2,4568.7,9.4145,26.999,42.586,2705.2,...,53.269,24.465,60.466,22.413,39.956,36.651,45.038,47.502,40.553,16.063
2,0,1,3,0.2484,3643.1,4507.5,9.2901,26.927,42.278,2703.5,...,54.0,24.86,60.642,22.199,40.074,41.868,44.553,47.479,41.341,20.452
3,0,1,4,0.25153,3628.3,4519.3,9.3347,26.999,42.33,2703.9,...,53.86,24.553,61.908,21.981,40.141,40.066,48.048,47.44,40.78,17.123
4,0,1,5,0.21763,3655.8,4571.0,9.3087,26.901,42.402,2707.7,...,53.307,21.775,61.891,22.412,37.696,38.295,44.678,47.53,41.089,18.681


In [None]:
df = FF_test
# Boolean mask selecting rows whose 'sample' value lies in the half-open window [140, 180).
two_hour_mask = df['sample'].ge(140) & df['sample'].lt(180)
FF_test_2h = df.loc[two_hour_mask]
FF_test_2h

Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,xmeas_7,...,xmv_2,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11
139,0,1,140,0.21894,3708.1,4520.0,9.3634,26.851,42.553,2695.8,...,53.936,21.322,60.641,21.214,40.885,39.901,46.303,43.228,41.162,15.862
140,0,1,141,0.32847,3669.8,4511.5,9.3173,26.606,42.343,2696.9,...,53.829,31.902,59.924,20.736,42.104,39.592,47.771,43.489,40.798,18.671
141,0,1,142,0.32584,3659.4,4510.5,9.3382,26.780,42.624,2696.9,...,53.841,32.161,61.024,20.768,42.064,33.923,50.486,43.599,40.615,20.176
142,0,1,143,0.26900,3701.0,4485.9,9.4336,26.409,42.178,2696.0,...,54.005,26.595,59.277,20.931,37.724,40.260,45.132,43.496,40.837,17.731
143,0,1,144,0.26980,3699.8,4477.6,9.4473,27.031,42.393,2696.4,...,54.103,26.516,61.277,21.092,37.692,40.579,45.270,43.464,41.090,19.544
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
479214,0,500,175,0.28574,3649.2,4507.5,9.2500,26.800,42.229,2699.7,...,52.905,27.889,62.153,21.974,39.768,39.454,50.399,45.322,40.874,17.208
479215,0,500,176,0.28362,3626.2,4487.7,9.3004,26.952,42.298,2703.6,...,53.143,28.098,63.467,22.243,39.851,37.979,47.343,45.193,40.945,17.228
479216,0,500,177,0.23878,3673.4,4454.6,9.3937,26.950,42.553,2703.3,...,52.870,23.544,60.977,21.957,39.296,42.488,45.808,45.437,41.373,17.186
479217,0,500,178,0.23929,3654.9,4456.2,9.3055,26.710,42.364,2704.7,...,52.850,23.494,61.623,22.139,39.197,40.903,46.753,45.492,41.502,16.852


In [8]:
# Load the fault-free training CSV from Drive, reading CHUNK_SIZE rows at a time.
FF_train = process_csv_by_chunks(
    path="/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_FaultFree_Training.csv",
    chunk_size=CHUNK_SIZE,
)

Tentative de chargement du fichier : /content/drive/MyDrive/Colab Notebooks/raw_data/TEP_FaultFree_Training.csv
Chargement par morceaux de taille : 100000 lignes.
Traitement du morceau #1 (taille: 100000 lignes)...
Traitement du morceau #2 (taille: 100000 lignes)...
Traitement du morceau #3 (taille: 50000 lignes)...

Chargement et traitement termin√©s. 3 morceaux trait√©s.
Taille du DataFrame final: 250000 lignes.


In [9]:
df = FF_train
# Boolean mask selecting rows whose 'sample' value is below 40.
# NOTE(review): the displayed rows show 'sample' starting at 1, so this keeps
# samples 1-39 (39 per run) while the testing window [140, 180) keeps 40 -
# confirm the asymmetry is intended.
two_hour_mask = df['sample'].lt(40)
FF_train_2h = df.loc[two_hour_mask]
FF_train_2h

Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,xmeas_7,...,xmv_2,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11
0,0,1,1,0.25038,3674.0,4529.0,9.2320,26.889,42.402,2704.3,...,53.744,24.657,62.544,22.137,39.935,42.323,47.757,47.510,41.258,18.447
1,0,1,2,0.25109,3659.4,4556.6,9.4264,26.721,42.576,2705.0,...,53.414,24.588,59.259,22.084,40.176,38.554,43.692,47.427,41.359,17.194
2,0,1,3,0.25038,3660.3,4477.8,9.4426,26.875,42.070,2706.2,...,54.357,24.666,61.275,22.380,40.244,38.990,46.699,47.468,41.199,20.530
3,0,1,4,0.24977,3661.3,4512.1,9.4776,26.758,42.063,2707.2,...,53.946,24.725,59.856,22.277,40.257,38.072,47.541,47.658,41.643,18.089
4,0,1,5,0.29405,3679.0,4497.0,9.3381,26.889,42.650,2705.1,...,53.658,28.797,60.717,21.947,39.144,41.955,47.645,47.346,41.507,18.461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
249534,0,500,35,0.25379,3673.3,4490.1,9.4162,27.019,42.009,2708.7,...,54.610,24.501,60.684,22.838,37.994,41.755,46.042,47.012,41.127,16.648
249535,0,500,36,0.25399,3628.8,4539.6,9.3916,27.064,42.585,2709.8,...,54.018,24.481,61.649,22.715,38.025,39.081,49.243,46.839,42.350,18.674
249536,0,500,37,0.21403,3667.9,4548.8,9.3216,27.103,42.239,2710.3,...,53.648,20.905,62.005,22.855,41.598,37.011,43.438,46.952,41.435,17.428
249537,0,500,38,0.21466,3690.4,4495.3,9.4264,27.148,42.470,2707.0,...,54.289,20.843,61.005,22.709,41.824,37.656,47.679,46.924,41.897,18.013


In [10]:
# Load the faulty testing CSV from Drive, reading CHUNK_SIZE rows at a time.
F_test = process_csv_by_chunks(
    path="/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_Faulty_Testing.csv",
    chunk_size=CHUNK_SIZE,
)

Tentative de chargement du fichier : /content/drive/MyDrive/Colab Notebooks/raw_data/TEP_Faulty_Testing.csv
Chargement par morceaux de taille : 100000 lignes.
Traitement du morceau #1 (taille: 100000 lignes)...
Traitement du morceau #2 (taille: 100000 lignes)...
Traitement du morceau #3 (taille: 100000 lignes)...
Traitement du morceau #4 (taille: 100000 lignes)...
Traitement du morceau #5 (taille: 100000 lignes)...
Traitement du morceau #6 (taille: 100000 lignes)...
Traitement du morceau #7 (taille: 100000 lignes)...
Traitement du morceau #8 (taille: 100000 lignes)...
Traitement du morceau #9 (taille: 100000 lignes)...
Traitement du morceau #10 (taille: 100000 lignes)...
Traitement du morceau #11 (taille: 100000 lignes)...
Traitement du morceau #12 (taille: 100000 lignes)...
Traitement du morceau #13 (taille: 100000 lignes)...
Traitement du morceau #14 (taille: 100000 lignes)...
Traitement du morceau #15 (taille: 100000 lignes)...
Traitement du morceau #16 (taille: 100000 lignes)...
Tr

In [None]:
df = F_test
# Boolean mask selecting rows whose 'sample' value lies in the half-open window [140, 180).
two_hour_mask = df['sample'].ge(140) & df['sample'].lt(180)
F_test_2h = df.loc[two_hour_mask]
F_test_2h

Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,xmeas_7,...,xmv_2,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11
0,1,1,1,0.25171,3672.4,4466.3,9.5122,27.057,42.473,2705.6,...,54.494,24.527,59.710,22.357,40.149,40.074,47.955,47.300,42.100,15.345
1,1,1,2,0.25234,3642.2,4568.7,9.4145,26.999,42.586,2705.2,...,53.269,24.465,60.466,22.413,39.956,36.651,45.038,47.502,40.553,16.063
2,1,1,3,0.24840,3643.1,4507.5,9.2901,26.927,42.278,2703.5,...,54.000,24.860,60.642,22.199,40.074,41.868,44.553,47.479,41.341,20.452
3,1,1,4,0.25153,3628.3,4519.3,9.3347,26.999,42.330,2703.9,...,53.860,24.553,61.908,21.981,40.141,40.066,48.048,47.440,40.780,17.123
4,1,1,5,0.21763,3655.8,4571.0,9.3087,26.901,42.402,2707.7,...,53.307,21.775,61.891,22.412,37.696,38.295,44.678,47.530,41.089,18.681
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9599074,20,500,35,0.26840,3682.1,4545.1,9.4087,27.040,42.389,2705.8,...,54.360,26.278,60.106,22.122,38.829,36.137,47.124,48.040,41.025,17.688
9599075,20,500,36,0.26615,3680.0,4569.8,9.3188,26.568,41.799,2705.0,...,54.064,26.499,59.487,21.893,38.798,39.373,47.468,48.366,41.686,20.098
9599076,20,500,37,0.22165,3717.8,4564.0,9.3475,26.991,42.454,2702.4,...,54.674,21.529,59.768,22.168,39.194,36.705,47.258,48.363,41.073,17.805
9599077,20,500,38,0.22011,3701.4,4599.8,9.4089,26.742,42.376,2701.9,...,54.246,21.681,57.532,22.198,39.437,36.371,42.541,48.473,41.031,18.983


In [12]:
# Load the faulty training CSV from Drive, reading CHUNK_SIZE rows at a time.
F_train = process_csv_by_chunks(
    path="/content/drive/MyDrive/Colab Notebooks/raw_data/TEP_Faulty_Training.csv",
    chunk_size=CHUNK_SIZE,
)

Tentative de chargement du fichier : /content/drive/MyDrive/Colab Notebooks/raw_data/TEP_Faulty_Training.csv
Chargement par morceaux de taille : 100000 lignes.
Traitement du morceau #1 (taille: 100000 lignes)...
Traitement du morceau #2 (taille: 100000 lignes)...
Traitement du morceau #3 (taille: 100000 lignes)...
Traitement du morceau #4 (taille: 100000 lignes)...
Traitement du morceau #5 (taille: 100000 lignes)...
Traitement du morceau #6 (taille: 100000 lignes)...
Traitement du morceau #7 (taille: 100000 lignes)...
Traitement du morceau #8 (taille: 100000 lignes)...
Traitement du morceau #9 (taille: 100000 lignes)...
Traitement du morceau #10 (taille: 100000 lignes)...
Traitement du morceau #11 (taille: 100000 lignes)...
Traitement du morceau #12 (taille: 100000 lignes)...
Traitement du morceau #13 (taille: 100000 lignes)...
Traitement du morceau #14 (taille: 100000 lignes)...
Traitement du morceau #15 (taille: 100000 lignes)...
Traitement du morceau #16 (taille: 100000 lignes)...
T

In [13]:
df = F_train
# Boolean mask selecting rows whose 'sample' value is below 40.
# NOTE(review): the displayed rows show 'sample' starting at 1, so this keeps
# samples 1-39 (39 per run) while the testing window [140, 180) keeps 40 -
# confirm the asymmetry is intended.
two_hour_mask = df['sample'].lt(40)
F_train_2h = df.loc[two_hour_mask]
F_train_2h

Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,xmeas_7,...,xmv_2,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11
0,1,1,1,0.25038,3674.0,4529.0,9.2320,26.889,42.402,2704.3,...,53.744,24.657,62.544,22.137,39.935,42.323,47.757,47.510,41.258,18.447
1,1,1,2,0.25109,3659.4,4556.6,9.4264,26.721,42.576,2705.0,...,53.414,24.588,59.259,22.084,40.176,38.554,43.692,47.427,41.359,17.194
2,1,1,3,0.25038,3660.3,4477.8,9.4426,26.875,42.070,2706.2,...,54.357,24.666,61.275,22.380,40.244,38.990,46.699,47.468,41.199,20.530
3,1,1,4,0.24977,3661.3,4512.1,9.4776,26.758,42.063,2707.2,...,53.946,24.725,59.856,22.277,40.257,38.072,47.541,47.658,41.643,18.089
4,1,1,5,0.29405,3679.0,4497.0,9.3381,26.889,42.650,2705.1,...,53.658,28.797,60.717,21.947,39.144,41.955,47.645,47.346,41.507,18.461
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999534,20,500,35,0.25317,3672.7,4491.3,9.3790,26.989,41.908,2711.9,...,54.625,24.440,60.437,18.130,38.323,41.470,45.770,46.967,41.087,17.061
4999535,20,500,36,0.25337,3628.3,4540.8,9.3464,27.028,42.462,2713.6,...,54.032,24.420,61.349,17.050,38.378,38.737,48.922,46.781,42.258,19.168
4999536,20,500,37,0.21316,3667.0,4550.6,9.2670,27.059,42.086,2714.8,...,53.670,20.820,61.643,16.085,42.132,36.602,43.061,46.885,41.351,18.016
4999537,20,500,38,0.21321,3700.0,4486.4,9.3483,27.173,42.682,2706.3,...,54.439,20.815,60.973,15.212,42.369,37.875,45.582,46.888,41.121,18.573


In [15]:
# The four *_2h subsets gathered in one list (the list itself is not used in this cell).
df_list = [
    F_train_2h,
    F_test_2h,
    FF_train_2h,
    FF_test_2h,
]

# Write the faulty-training subset to Drive. The row index is written as well
# (pandas default). NOTE(review): the frame already shows an 'Unnamed: 0' column,
# so the saved file will carry two index-like columns - confirm that is wanted.
F_train_2h.to_csv(path_or_buf='/content/drive/MyDrive/Colab Notebooks/raw_data/F_train_2h.csv')

In [16]:
# The four *_2h subsets gathered in one list (the list itself is not used in this cell).
df_list = [
    F_train_2h,
    F_test_2h,
    FF_train_2h,
    FF_test_2h,
]

# Write the faulty-testing subset to Drive. The row index is written as well
# (pandas default). NOTE(review): the frame already shows an 'Unnamed: 0' column,
# so the saved file will carry two index-like columns - confirm that is wanted.
F_test_2h.to_csv(path_or_buf='/content/drive/MyDrive/Colab Notebooks/raw_data/F_test_2h.csv')

In [17]:
# The four *_2h subsets gathered in one list (the list itself is not used in this cell).
df_list = [
    F_train_2h,
    F_test_2h,
    FF_train_2h,
    FF_test_2h,
]

# Write the fault-free training subset to Drive. The row index is written as well
# (pandas default). NOTE(review): the frame already shows an 'Unnamed: 0' column,
# so the saved file will carry two index-like columns - confirm that is wanted.
FF_train_2h.to_csv(path_or_buf='/content/drive/MyDrive/Colab Notebooks/raw_data/FF_train_2h.csv')

In [18]:
# The four *_2h subsets gathered in one list (the list itself is not used in this cell).
df_list = [
    F_train_2h,
    F_test_2h,
    FF_train_2h,
    FF_test_2h,
]

# Write the fault-free testing subset to Drive. The row index is written as well
# (pandas default). NOTE(review): the frame already shows an 'Unnamed: 0' column,
# so the saved file will carry two index-like columns - confirm that is wanted.
FF_test_2h.to_csv(path_or_buf='/content/drive/MyDrive/Colab Notebooks/raw_data/FF_test_2h.csv')