# Construction des features pour les données de télémétrie

In [2]:
import pandas as pd

On importe les données et on convertit le type de la colonne de temps.

In [3]:
# Load the raw telemetry and convert the timestamp column to datetime64 in one chain.
df = pd.read_csv("../data/raw/PdM_telemetry.csv").assign(
    datetime=lambda raw: pd.to_datetime(raw["datetime"])
)
df

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686
1,2015-01-01 07:00:00,1,162.879223,402.747490,95.460525,43.413973
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511
...,...,...,...,...,...,...
876095,2016-01-01 02:00:00,100,179.438162,395.222827,102.290715,50.771941
876096,2016-01-01 03:00:00,100,189.617555,446.207972,98.180607,35.123072
876097,2016-01-01 04:00:00,100,192.483414,447.816524,94.132837,48.314561
876098,2016-01-01 05:00:00,100,165.475310,413.771670,104.081073,44.835259


On divise la dataframe en créant une dataframe pour chaque machine.

In [4]:
# Split the telemetry into one DataFrame per machine.
df_by_machine = df.groupby("machineID")
n_machines = df["machineID"].nunique()
# Iterating a GroupBy yields (machineID, sub-frame) pairs in ascending key order,
# so dfs[0] holds the lowest machineID. Unlike get_group(i) over
# range(1, n_machines + 1), this does not assume the IDs are exactly 1..n.
dfs = [machine_df for _, machine_df in df_by_machine]

In [5]:
dfs[0].head(10) # first ten rows of machine 1 (dfs is filled in machineID order, starting at 1)

Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511
5,2015-01-01 11:00:00,1,172.504839,430.323362,95.927042,35.655017
6,2015-01-01 12:00:00,1,156.556031,499.071623,111.755684,42.75392
7,2015-01-01 13:00:00,1,172.522781,409.624717,101.001083,35.482009
8,2015-01-01 14:00:00,1,175.324524,398.648781,110.624361,45.482287
9,2015-01-01 15:00:00,1,169.218423,460.85067,104.84823,39.901735


## Construction des features pour une machine

Pour chaque machine, on agrège les données télémétriques par tranches de 3 heures (une fenêtre toutes les 3 heures), puis on calcule la moyenne et l'écart-type sur chaque fenêtre.

1ère solution : on fait glisser la fenêtre et on enlève 2 entrées sur 3. -> pas optimisé

In [6]:
# Rolling 3-row window over machine 1, then keep every third row so that the
# retained windows do not overlap.
# - "datetime" is dropped first: recent pandas versions raise on non-numeric
#   columns in rolling aggregations instead of silently skipping them.
# - The aggregations are given by name ("mean", "std"): numpy is not imported
#   in this notebook, and the named forms are pandas' own implementations
#   (sample standard deviation).
df_lag = dfs[0].drop(columns="datetime").rolling(window=3).agg(["mean", "std"])
df_lag[2::3]

Unnamed: 0_level_0,machineID,machineID,volt,volt,rotate,rotate,pressure,pressure,vibration,vibration
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
2,1.0,0.0,170.028993,6.721032,449.533798,67.849599,94.592122,18.934956,40.893502,5.874970
5,1.0,0.0,164.192565,7.596570,403.949857,50.120452,105.687417,8.555032,34.255891,7.662229
8,1.0,0.0,168.134445,10.124584,435.781707,55.084734,107.793709,5.909721,41.239405,5.169304
11,1.0,0.0,165.514453,4.673269,430.472823,42.047278,101.703289,4.554047,40.373739,2.106108
14,1.0,0.0,168.809347,14.752132,437.111120,47.048609,90.911060,4.244158,41.738542,2.207884
...,...,...,...,...,...,...,...,...,...,...
8747,1.0,0.0,165.377347,10.582380,427.904798,37.471381,101.430181,9.199008,39.804625,2.666928
8750,1.0,0.0,173.823921,13.685410,411.557599,40.938316,106.767141,12.635729,38.142952,9.392896
8753,1.0,0.0,159.011758,12.223039,494.917558,39.544315,90.697539,13.358124,39.491593,4.552921
8756,1.0,0.0,161.803419,9.660275,433.438436,58.861046,90.529167,5.214735,44.462013,0.666334


Une meilleure solution consisterait à grouper les valeurs par tranche de 3 heures avec pd.Grouper().

In [7]:
# Bucket machine 1's readings into consecutive 3-hour bins keyed on "datetime"
# and compute mean / sample standard deviation per bin.
# The aggregations are given by name: numpy is not imported in this notebook,
# and passing NumPy callables to agg() is deprecated in recent pandas.
three_hour_bins = pd.Grouper(freq="3h", key="datetime")
df_lag = dfs[0].groupby(three_hour_bins).agg(["mean", "std"])
df_lag

Unnamed: 0_level_0,machineID,machineID,volt,volt,rotate,rotate,pressure,pressure,vibration,vibration
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
datetime,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
2015-01-01 06:00:00,1,0.0,170.028993,6.721032,449.533798,67.849599,94.592122,18.934956,40.893502,5.874970
2015-01-01 09:00:00,1,0.0,164.192565,7.596570,403.949857,50.120452,105.687417,8.555032,34.255891,7.662229
2015-01-01 12:00:00,1,0.0,168.134445,10.124584,435.781707,55.084734,107.793709,5.909721,41.239405,5.169304
2015-01-01 15:00:00,1,0.0,165.514453,4.673269,430.472823,42.047278,101.703289,4.554047,40.373739,2.106108
2015-01-01 18:00:00,1,0.0,168.809347,14.752132,437.111120,47.048609,90.911060,4.244158,41.738542,2.207884
...,...,...,...,...,...,...,...,...,...,...
2015-12-31 18:00:00,1,0.0,173.823921,13.685410,411.557599,40.938316,106.767141,12.635729,38.142952,9.392896
2015-12-31 21:00:00,1,0.0,159.011758,12.223039,494.917558,39.544315,90.697539,13.358124,39.491593,4.552921
2016-01-01 00:00:00,1,0.0,161.803419,9.660275,433.438436,58.861046,90.529167,5.214735,44.462013,0.666334
2016-01-01 03:00:00,1,0.0,177.076824,6.284211,425.010191,44.056871,97.915512,4.138817,40.671583,6.773770


## Construction des features pour toutes les machines

On peut faire ce qu'on a fait précédemment pour toutes les machines en même temps avec :

In [8]:
# Same 3-hour aggregation for every machine at once: group by machine, then by
# right-closed 3-hour bins labelled with the bin's end timestamp.
# The aggregations are given by name: numpy is not imported in this notebook,
# and passing NumPy callables to agg() is deprecated in recent pandas.
lag_df = df.groupby(
    ["machineID", pd.Grouper(freq="3h", key="datetime", closed="right", label="right")]
).agg(["mean", "std"])

In [9]:
# Show the aggregated frame: rows are indexed by (machineID, datetime), columns by (sensor, statistic).
lag_df

Unnamed: 0_level_0,Unnamed: 1_level_0,volt,volt,rotate,rotate,pressure,pressure,vibration,vibration
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,std,mean,std,mean,std,mean,std
machineID,datetime,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
1,2015-01-01 06:00:00,176.217853,,418.504078,,113.077935,,45.087686,
1,2015-01-01 09:00:00,165.443986,4.807415,425.415550,92.702671,93.315664,17.106476,39.571655,4.808836
1,2015-01-01 12:00:00,162.223630,8.919370,454.923953,38.316408,106.523125,9.176711,34.799816,8.414362
1,2015-01-01 15:00:00,172.355243,3.056496,423.041389,33.200513,105.491224,4.843754,40.288677,5.011355
1,2015-01-01 18:00:00,160.226142,6.853823,440.413573,54.501054,95.424693,8.931082,41.776012,2.889493
...,...,...,...,...,...,...,...,...,...
100,2015-12-31 18:00:00,184.227378,17.142785,448.912608,88.867750,102.844531,3.042814,40.035249,8.398433
100,2015-12-31 21:00:00,151.569525,12.249429,488.417420,68.804866,97.080984,16.565632,33.330334,5.275556
100,2016-01-01 00:00:00,163.439806,9.755055,466.546771,70.100423,109.397201,7.300565,43.317103,4.333373
100,2016-01-01 03:00:00,177.266129,13.568463,440.894903,43.260932,100.687019,2.198731,43.425922,7.868190


Chaque fenêtre est datée par la dernière date de la fenêtre : la moyenne pour "2015-01-01 09:00:00" correspond à la moyenne des données prises à 7, 8 et 9h.

```
      |     |     |  <---- LIGNES AGRÉGÉES
6h    7h    8h    9h
|_____|_____|_____|  <---- FENÊTRE CORRESPONDANTE
```

Ici, la première date est "2015-01-01 06:00:00" : elle devrait être agrégée avec les données des deux heures précédentes, mais celles-ci n'existent pas. Pandas a donc pris la seule valeur des capteurs disponible (celle de 6h) pour la moyenne et a mis NaN pour l'écart-type (il n'y a qu'une seule valeur disponible).
*--->* **Je propose de supprimer cette ligne pour chaque machine.**

In [15]:
# Turn the (machineID, datetime) row MultiIndex into ordinary columns.
lag_df_flat = lag_df.reset_index()

# Flatten the two-level column labels: ("volt", "mean") -> "volt_mean".
# Key columns have an empty second level, e.g. ("machineID", ""), and keep
# their bare name.
lag_df_flat.columns = [
    "_".join(col) if col[1] != "" else col[0]
    for col in lag_df_flat.columns.values
]

# Drop the first window of *each* machine. The minimum is computed per
# machineID rather than globally, so a machine whose telemetry starts later
# than the others also loses its own first window.
first_datetime = lag_df_flat.groupby("machineID")["datetime"].transform("min")
lag_df_flat = lag_df_flat[lag_df_flat["datetime"] > first_datetime]

In [17]:
# Inspect the flattened features of machine 100 only.
lag_df_flat.loc[lag_df_flat["machineID"].eq(100)]

Unnamed: 0,machineID,datetime,volt_mean,volt_std,rotate_mean,rotate_std,pressure_mean,pressure_std,vibration_mean,vibration_std
289180,100,2015-01-01 09:00:00,163.970945,10.378482,456.323216,74.137920,98.078753,16.850804,41.602041,4.176891
289181,100,2015-01-01 12:00:00,179.627494,20.174842,387.321388,73.564154,92.771790,5.468086,39.593631,4.196175
289182,100,2015-01-01 15:00:00,170.574222,22.394844,416.414893,57.426255,106.619807,10.652959,40.086233,5.039708
289183,100,2015-01-01 18:00:00,168.805984,20.822660,439.462174,31.295271,96.802974,10.110256,42.137914,1.745125
289184,100,2015-01-01 21:00:00,167.854215,14.091381,494.223985,31.737434,91.536514,7.925994,42.721572,7.912096
...,...,...,...,...,...,...,...,...,...,...
292095,100,2015-12-31 18:00:00,184.227378,17.142785,448.912608,88.867750,102.844531,3.042814,40.035249,8.398433
292096,100,2015-12-31 21:00:00,151.569525,12.249429,488.417420,68.804866,97.080984,16.565632,33.330334,5.275556
292097,100,2016-01-01 00:00:00,163.439806,9.755055,466.546771,70.100423,109.397201,7.300565,43.317103,4.333373
292098,100,2016-01-01 03:00:00,177.266129,13.568463,440.894903,43.260932,100.687019,2.198731,43.425922,7.868190


In [26]:
def transform_telemetry(
    input_path="../data/raw/PdM_telemetry.csv",
    output_path="../data/prepared_data/telemetrie.csv",
    window="3h",
):
    """Build windowed telemetry features per machine and write them to CSV.

    The raw telemetry is grouped by ``machineID`` and by right-closed time
    bins of width ``window`` (each bin is labelled with its end timestamp).
    For every remaining column the mean and the sample standard deviation
    are computed, giving columns named ``<column>_mean`` / ``<column>_std``.
    The first window of each machine is discarded before writing.

    Parameters
    ----------
    input_path : str or path-like
        CSV file with at least ``datetime`` and ``machineID`` columns; every
        other column is aggregated.
    output_path : str or path-like
        Destination CSV. Its parent directory is created if missing.
    window : str
        Pandas frequency string giving the bin width.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    from pathlib import Path

    telemetry = pd.read_csv(input_path)
    telemetry["datetime"] = pd.to_datetime(telemetry["datetime"])

    # Aggregations are given by name: numpy is not imported by this notebook,
    # and passing NumPy callables to agg() is deprecated in recent pandas.
    bins = pd.Grouper(freq=window, key="datetime", closed="right", label="right")
    features = telemetry.groupby(["machineID", bins]).agg(["mean", "std"])

    # Flatten the row index into columns, then the two-level column labels:
    # ("volt", "mean") -> "volt_mean"; key columns such as ("machineID", "")
    # keep their bare name.
    features = features.reset_index()
    features.columns = [
        "_".join(col) if col[1] != "" else col[0]
        for col in features.columns.values
    ]

    # Drop the first window of *each* machine. The minimum is taken per
    # machineID (not globally) so machines whose data starts later are
    # handled as well.
    first_window = features.groupby("machineID")["datetime"].transform("min")
    features = features[features["datetime"] > first_window]

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    features.to_csv(output_path, index=False)

In [27]:
# Run the pipeline for its side effect: it writes ../data/prepared_data/telemetrie.csv.
transform_telemetry()

## Alternative pour le calcul des features avec une fenêtre glissante

Si on veut faire une fenêtre glissante de 3 heures pour toutes les heures (plus long à calculer) :

In [18]:
# Sliding 3-row window evaluated at every hour, computed separately per machine.
# - "datetime" is dropped first: recent pandas versions raise on non-numeric
#   columns in rolling aggregations instead of silently skipping them.
# - The aggregations are given by name ("mean", "std"): numpy is not imported
#   in this notebook, and the named forms are pandas' own implementations
#   (sample standard deviation).
lag_df2 = (
    df.drop(columns="datetime")
    .groupby("machineID")
    .rolling(window=3)
    .agg(["mean", "std"])
)
lag_df2.loc[100]

Unnamed: 0_level_0,machineID,machineID,volt,volt,rotate,rotate,pressure,pressure,vibration,vibration
Unnamed: 0_level_1,mean,std,mean,std,mean,std,mean,std,mean,std
867339,,,,,,,,,,
867340,,,,,,,,,,
867341,100.0,2.264780e-07,159.615566,5.003975,410.116312,22.587648,96.545326,15.380333,39.413594,2.947742
867342,100.0,2.264780e-07,163.970945,10.378482,456.323216,74.137920,98.078753,16.850804,41.602041,4.176891
867343,100.0,2.264780e-07,176.072401,22.888746,478.272366,53.555110,91.740625,16.207489,44.026158,1.231817
...,...,...,...,...,...,...,...,...,...,...
876095,100.0,2.264780e-07,170.306956,8.457322,438.653616,43.021553,102.346227,0.785721,47.481232,3.198928
876096,100.0,2.264780e-07,177.266129,13.568463,440.894903,43.260932,100.687019,2.198731,43.425922,7.868190
876097,100.0,2.264780e-07,187.179710,6.855798,429.749108,29.911451,98.201386,4.078979,44.736524,8.415672
876098,100.0,2.264780e-07,182.525427,14.835200,435.932056,19.208302,98.798172,5.002788,42.757630,6.836757
