## Подготовка датасета для последующей работы


#### Импорт библиотек

In [1]:
import pandas as pd
import numpy as np

#### Источник данных 
[NASA Turbofan Engine Degradation Dataset](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/) 

#### Загрузка данных

In [19]:
# Column names assigned to the raw files: id, cycle, three `setting*`
# columns, then the sensor columns s1..s21.
col_names = (
    ['id', 'cycle', 'setting1', 'setting2', 'setting3']
    + ['s' + str(n) for n in range(1, 22)]
)

# Training data.
df_train = pd.read_csv('data/PM_train.txt', sep=' ', header=None)
# Columns 26 and 27 are discarded before naming the rest
# (presumably empty columns caused by trailing separators — confirm).
df_train = df_train.drop(columns=df_train.columns[[26, 27]])
df_train.columns = col_names

In [20]:
# Preview the first rows of the training frame.
df_train.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [21]:
# Test data: loaded and named exactly like the training file.
df_test = pd.read_csv('data/PM_test.txt', sep=' ', header=None)
# Columns 26 and 27 are discarded before naming the rest
# (presumably empty columns caused by trailing separators — confirm).
df_test = df_test.drop(columns=df_test.columns[[26, 27]])
df_test.columns = col_names
df_test.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [22]:
# Time to failure, read from the truth file.
ttf_label = pd.read_csv('data/PM_truth.txt', sep=' ', header=None)
# Column 1 is discarded (presumably an empty trailing column — confirm).
ttf_label = ttf_label.drop(columns=[1])
ttf_label.columns = ['ttf']
# Shift the 0-based row index so that it starts at 1; later cells look rows
# up by engine id.
ttf_label.index = ttf_label.index + 1
ttf_label.head()

Unnamed: 0,ttf
1,112
2,98
3,69
4,82
5,91


#### Feature Extraction или Генерация новых параметров

In [23]:
def make_features(data, roll_win):
    """Append rolling min/max/std features of the sensor columns, per engine.

    For every distinct ``id`` in ``data`` (in order of first appearance) a
    rolling window is applied to the columns ``s1`` .. ``s21`` of that
    engine's rows only, so windows never span two engines. ``min_periods=1``
    lets the first rows of each engine use a shorter window.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain an ``id`` column and the columns ``s1`` .. ``s21``.
        It is not modified.
    roll_win : int
        Window size passed to ``DataFrame.rolling``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` grouped by engine, with their original index
        labels, followed by the new columns ``s<N>_min``, ``s<N>_max`` and
        ``s<N>_std``. An input without rows yields an empty frame that
        still has all of these columns.
    """
    sensor_cols = ['s' + str(n) for n in range(1, 22)]
    min_cols = [col + '_min' for col in sensor_cols]
    max_cols = [col + '_max' for col in sensor_cols]
    std_cols = [col + '_std' for col in sensor_cols]

    pieces = []
    # sort=False keeps the engines in order of first appearance.
    for _, engine in data.groupby('id', sort=False):
        window = engine[sensor_cols].rolling(roll_win, min_periods=1)

        mindf = window.min()
        mindf.columns = min_cols
        maxdf = window.max()
        maxdf.columns = max_cols
        # The std of a single observation is NaN; report it as 0 instead.
        stddf = window.std().fillna(0)
        stddf.columns = std_cols

        pieces.append(pd.concat([engine, mindf, maxdf, stddf], axis=1))

    if not pieces:
        # Nothing to roll over: return the (empty) input with the feature
        # columns added so that callers always see the same schema.
        all_cols = list(data.columns) + min_cols + max_cols + std_cols
        return data.reindex(columns=all_cols)

    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat(pieces)

In [24]:
# Add rolling min/max/std sensor features (window of 10) to the training set.
df_train = make_features(df_train, 10)

In [25]:
# Add rolling min/max/std sensor features (window of 10) to the test set.
df_test = make_features(df_test, 10)

#### Для задачи регрессии необходимо посчитать RUL для train датасета

In [26]:
def make_RUL(data):
    """Add a ``RUL`` column: cycles left until each engine's last cycle.

    ``RUL`` is the largest ``cycle`` value seen for the row's ``id`` minus
    the row's own ``cycle``, so the last recorded row of every engine gets 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``id`` and ``cycle``. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the ``RUL`` column added.
    """
    last_cycle = data.groupby('id')['cycle'].transform('max')
    # Subtract the cycles of the frame that was passed in, not of a global.
    data['RUL'] = last_cycle - data['cycle']
    return data

In [27]:
# Attach the RUL column to the training set and show its first five values.
df_train = make_RUL(df_train)
df_train['RUL'].head(5)

0    191
1    190
2    189
3    188
4    187
Name: RUL, dtype: int64

#### RUL для тестовых данных

In [28]:
# Failure cycle per test engine: last observed cycle plus the `ttf` value.
ttf_label['max'] = df_test.groupby('id')['cycle'].max() + ttf_label['ttf']
# Look up each row's failure cycle by engine id (unknown ids raise KeyError)
# and subtract the row's current cycle.
df_test['RUL'] = (
    ttf_label['max'].loc[df_test['id']].to_numpy() - df_test['cycle']
)

#### Для задачи классификации сделаем новую разметку RUL. 

Если отказ двигателя произойдет не позднее чем через 60, но позже чем через 15 единиц времени (допустим, 60 единиц - это час) - *класс 1*. 

Если отказ произойдет не позднее чем через 15 единиц времени (допустим, это 15 минут) - *класс 2*.

Иначе - *класс 0*.

In [29]:
def make_3_classes(data, win1, win2):
    """Add a three-class ``label`` column derived from ``RUL``.

    * ``0`` when ``RUL > win1``
    * ``1`` when ``win2 < RUL <= win1``
    * ``2`` otherwise (``RUL <= win2``, or a value that compares false
      against both thresholds, such as NaN)

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``RUL`` column. Modified in place.
    win1 : number
        Upper threshold; rows above it are class 0.
    win2 : number
        Lower threshold; rows at or below it are class 2.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with the ``label`` column added.
    """
    rul = data['RUL']
    # Conditions are tried in order, so the second one only applies to rows
    # with RUL <= win1. The first threshold is `win1`, not a fixed 60.
    data['label'] = np.select([rul > win1, rul > win2], [0, 1], default=2)

    return data

In [30]:
# RUL thresholds for the class labels, passed as win1 and win2.
border_1, border_2 = 60, 15

df_train = make_3_classes(data=df_train, win1=border_1, win2=border_2)

In [31]:
# Label the test set using the thresholds border_1 and border_2.
df_test = make_3_classes(df_test, border_1, border_2)

In [32]:
# Number of training rows (non-null `id` values) per class label.
df_train.groupby('label')['id'].count()

label
0    14531
1     4500
2     1600
Name: id, dtype: int64

In [33]:
# Number of test rows (non-null `RUL` values) per class label.
df_test.groupby('label')['RUL'].count()

label
0    11856
1     1180
2       60
Name: RUL, dtype: int64

In [36]:
# Save the prepared training dataset.
df_train.to_parquet('data/train_data.parquet')

In [34]:
# Save the test dataset.
df_test.to_parquet('data/test_data.parquet')