In [1]:
import pandas as pd
import os
import json

In [2]:
# Project base directory. Defaults to the original checkout location but can be
# overridden with the TEP_BASE_DIR environment variable, so the notebook can run
# on another machine without editing this cell.
BASE = os.environ.get(
    'TEP_BASE_DIR',
    '/home/locale/code/PROJECT_REACTOR/monitor-the-reactor-app',
)

raw_dir = os.path.join(BASE, 'raw_data')
proc_dir = os.path.join(BASE, 'processed_data')

# Fail before touching the filesystem: without this check, os.makedirs below
# would create BASE/processed_data on a machine where BASE does not exist, and
# only then would read_csv fail on the missing raw files.
if not os.path.isdir(raw_dir):
    raise FileNotFoundError(
        f"Raw data directory not found: {raw_dir!r}. "
        "Set the TEP_BASE_DIR environment variable to the project root."
    )

# Create the processed_data directory (no error if it already exists)
os.makedirs(proc_dir, exist_ok=True)

# Load the four raw CSV files.
# NOTE(review): the frames carry an 'Unnamed: 0' column (see the column listing
# printed further down), which looks like a saved index -- confirm whether it
# should be dropped or read with index_col=0.
df_free_testing   = pd.read_csv(os.path.join(raw_dir, 'TEP_FaultFree_Testing.csv'))
df_faulty_testing = pd.read_csv(os.path.join(raw_dir, 'TEP_Faulty_Testing.csv'))
df_free_training  = pd.read_csv(os.path.join(raw_dir, 'TEP_FaultFree_Training.csv'))
df_faulty_training= pd.read_csv(os.path.join(raw_dir, 'TEP_Faulty_Training.csv'))

# Stack fault-free and faulty rows per split, then both splits together.
# ignore_index=True renumbers the rows 0..n-1 in each combined frame.
df_testing  = pd.concat([df_free_testing, df_faulty_testing], ignore_index=True)
df_training = pd.concat([df_free_training, df_faulty_training], ignore_index=True)
df_all      = pd.concat([df_testing, df_training], ignore_index=True)

# Write the combined frames to processed_data; index=False keeps the pandas
# row index out of the files.
df_testing.to_csv(os.path.join(proc_dir, 'TEP_Testing.csv'), index=False)
df_training.to_csv(os.path.join(proc_dir, 'TEP_Training.csv'), index=False)
df_all.to_csv(os.path.join(proc_dir, 'TEP_All.csv'), index=False)


In [3]:
# Tag every source frame, in place, with a binary "label_fault" column:
# 0 = normal operation (fault-free files), 1 = fault present (faulty files).
for _frame, _flag in (
    (df_free_testing, 0),
    (df_free_training, 0),
    (df_faulty_testing, 1),
    (df_faulty_training, 1),
):
    _frame["label_fault"] = _flag

# Rebuild the combined frames so they include the new column
df_testing = pd.concat(
    [df_free_testing, df_faulty_testing],
    ignore_index=True,
)
df_training = pd.concat(
    [df_free_training, df_faulty_training],
    ignore_index=True,
)
df_all = pd.concat(
    [df_testing, df_training],
    ignore_index=True,
)


In [4]:
# Show the column index of the faulty training frame, followed by the number
# of rows recorded for each faultNumber value.
print(
    df_faulty_training.columns,
    df_faulty_training["faultNumber"].value_counts(),
    sep="\n",
)

Index(['Unnamed: 0', 'faultNumber', 'simulationRun', 'sample', 'xmeas_1',
       'xmeas_2', 'xmeas_3', 'xmeas_4', 'xmeas_5', 'xmeas_6', 'xmeas_7',
       'xmeas_8', 'xmeas_9', 'xmeas_10', 'xmeas_11', 'xmeas_12', 'xmeas_13',
       'xmeas_14', 'xmeas_15', 'xmeas_16', 'xmeas_17', 'xmeas_18', 'xmeas_19',
       'xmeas_20', 'xmeas_21', 'xmeas_22', 'xmeas_23', 'xmeas_24', 'xmeas_25',
       'xmeas_26', 'xmeas_27', 'xmeas_28', 'xmeas_29', 'xmeas_30', 'xmeas_31',
       'xmeas_32', 'xmeas_33', 'xmeas_34', 'xmeas_35', 'xmeas_36', 'xmeas_37',
       'xmeas_38', 'xmeas_39', 'xmeas_40', 'xmeas_41', 'xmv_1', 'xmv_2',
       'xmv_3', 'xmv_4', 'xmv_5', 'xmv_6', 'xmv_7', 'xmv_8', 'xmv_9', 'xmv_10',
       'xmv_11', 'label_fault'],
      dtype='object')
faultNumber
1     25000
2     25000
3     25000
4     25000
5     25000
6     25000
7     25000
8     25000
9     25000
10    25000
11    25000
12    25000
13    25000
14    25000
15    25000
16    25000
17    25000
18    25000
19    25000
20    25000


In [5]:
# Binary fault flag derived from faultNumber: 1 where faultNumber is non-zero,
# 0 otherwise. This overwrites any label_fault column already present on df_all.
df_all["label_fault"] = df_all["faultNumber"].ne(0).astype(int)

# Preview the first five rows
df_all.head(5)

Unnamed: 0.1,Unnamed: 0,faultNumber,simulationRun,sample,xmeas_1,xmeas_2,xmeas_3,xmeas_4,xmeas_5,xmeas_6,...,xmv_3,xmv_4,xmv_5,xmv_6,xmv_7,xmv_8,xmv_9,xmv_10,xmv_11,label_fault
0,0,0,1,1,0.25171,3672.4,4466.3,9.5122,27.057,42.473,...,24.527,59.71,22.357,40.149,40.074,47.955,47.3,42.1,15.345,0
1,1,0,1,11,0.30292,3663.3,4503.4,9.36,27.259,42.377,...,30.049,60.412,22.072,38.65,39.092,44.041,47.334,40.647,15.775,0
2,2,0,1,21,0.29717,3700.2,4486.9,9.4385,26.654,42.247,...,29.109,59.895,21.414,41.923,39.114,48.41,47.489,41.305,20.886,0
3,3,0,1,31,0.2524,3685.4,4490.4,9.4279,27.338,42.206,...,24.469,60.829,22.033,42.021,39.235,41.642,48.81,41.276,19.062,0
4,4,0,1,41,0.26068,3606.0,4466.1,9.2702,27.281,41.913,...,25.786,63.168,22.002,41.666,41.779,49.062,50.584,40.415,17.193,0
