## Подготовка датасета 

#### Импорт библиотек

In [4]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

#### Источник данных 
[NASA Turbofan Engine Degradation Dataset](https://ti.arc.nasa.gov/tech/dash/groups/pcoe/prognostic-data-repository/) 

#### Загрузка данных

In [5]:
# Column layout of the raw files: id, cycle, three setting columns and sensors s1..s21
col_names = ['id', 'cycle', 'setting1', 'setting2', 'setting3'] + [f's{i}' for i in range(1, 22)]

# Training data
df_train = pd.read_csv('data/PM_train.txt', sep=' ', header=None)
# Positional columns 26 and 27 are removed before the names are assigned
df_train = df_train.drop(columns=df_train.columns[[26, 27]])
df_train.columns = col_names

In [6]:
# Preview the first rows of the training frame
df_train.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [7]:
# Test data: same file layout as the training file
df_test = pd.read_csv('data/PM_test.txt', sep=' ', header=None)
# Positional columns 26 and 27 are removed before the names are assigned
df_test = df_test.drop(columns=df_test.columns[[26, 27]])
df_test.columns = col_names
df_test.head()

Unnamed: 0,id,cycle,setting1,setting2,setting3,s1,s2,s3,s4,s5,...,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,1,1,0.0023,0.0003,100.0,518.67,643.02,1585.29,1398.21,14.62,...,521.72,2388.03,8125.55,8.4052,0.03,392,2388,100.0,38.86,23.3735
1,1,2,-0.0027,-0.0003,100.0,518.67,641.71,1588.45,1395.42,14.62,...,522.16,2388.06,8139.62,8.3803,0.03,393,2388,100.0,39.02,23.3916
2,1,3,0.0003,0.0001,100.0,518.67,642.46,1586.94,1401.34,14.62,...,521.97,2388.03,8130.1,8.4441,0.03,393,2388,100.0,39.08,23.4166
3,1,4,0.0042,0.0,100.0,518.67,642.44,1584.12,1406.42,14.62,...,521.38,2388.05,8132.9,8.3917,0.03,391,2388,100.0,39.0,23.3737
4,1,5,0.0014,0.0,100.0,518.67,642.51,1587.19,1401.92,14.62,...,522.15,2388.03,8129.54,8.4031,0.03,390,2388,100.0,38.99,23.413


In [8]:
# Time to failure, read from the truth file
ttf_label = pd.read_csv('data/PM_truth.txt', sep=' ', header=None)
ttf_label = ttf_label.drop(columns=[1])
ttf_label.columns = ['ttf']
# Re-index starting from 1 (the index is later aligned with df_test ids)
ttf_label.index = ttf_label.index + 1
ttf_label.head()

Unnamed: 0,ttf
1,112
2,98
3,69
4,82
5,91


Проверим наличие пропусков в данных

In [25]:
# True if at least one value anywhere in df_train is NaN
df_train.isna().values.any()

False

In [26]:
# True if at least one value anywhere in df_test is NaN
df_test.isna().values.any()

False

#### Для задачи регрессии необходимо посчитать RUL (Remaining Useful Life — число циклов, оставшихся до отказа)

In [9]:
def make_RUL(data):
    """Add a remaining-useful-life column ``RUL`` to ``data``.

    For every row, ``RUL`` is the largest ``cycle`` recorded for that row's
    ``id`` minus the row's own ``cycle``; the last recorded cycle of each
    ``id`` therefore gets 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``id`` and ``cycle`` columns. It is modified in place
        (the ``RUL`` column is added or overwritten).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    # Subtract the cycles of the frame that was passed in. The previous version
    # subtracted the notebook-global df_train['cycle'], which gave wrong or
    # misaligned results for any other frame.
    last_cycle = data.groupby('id')['cycle'].transform('max')
    data['RUL'] = last_cycle - data['cycle']
    return data

In [10]:
# Add the RUL column to the training frame and show its first values
df_train = make_RUL(df_train)
df_train['RUL'].head()

0    191
1    190
2    189
3    188
4    187
Name: RUL, dtype: int64

In [11]:
# Failure cycle of each id = last cycle recorded in the test data + ttf from the truth file
ttf_label['max'] = df_test.groupby('id')['cycle'].max() + ttf_label['ttf']
# Vectorised lookup of the failure cycle by id (replaces a per-row Python loop)
df_test['RUL'] = df_test['id'].map(ttf_label['max']) - df_test['cycle']
# Fail loudly if some id has no matching truth row, as the indexed lookup used to do
assert df_test['RUL'].notna().all(), 'some test ids have no entry in ttf_label'

#### Для задачи классификации сделаем новую разметку RUL. 

Если до отказа двигателя осталось не более 60, но более 15 единиц времени (допустим, отказ в пределах часа) — *класс 1*. 

Если до отказа осталось не более 15 единиц времени (допустим, отказ в пределах 15 минут) — *класс 2*.

Иначе (до отказа более 60 единиц времени) — *класс 0*.

In [12]:
def make_3_classes(data, win1, win2):
    """Add a three-class ``label`` column derived from ``RUL``.

    Labels:
      * 0 — ``RUL > win1``
      * 1 — ``win2 < RUL <= win1``
      * 2 — everything else (``RUL <= win2``)

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``RUL`` column. It is modified in place
        (the ``label`` column is added or overwritten).
    win1 : number
        Upper boundary; rows with ``RUL`` above it get class 0.
    win2 : number
        Lower boundary; rows with ``RUL`` at or below it get class 2.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, returned for convenient reassignment.
    """
    rul = data['RUL']
    # Conditions are checked in order, like the if/elif chain this replaces.
    # The first boundary is win1; it used to be a hard-coded 60, which
    # mislabelled rows whenever win1 != 60.
    labels = np.select([rul > win1, rul > win2], [0, 1], default=2)
    # Fixed integer width so the column dtype does not depend on the platform
    data['label'] = labels.astype('int64')

    return data

In [13]:
# Class boundaries on RUL, passed to make_3_classes as win1 / win2
border_1 = 60
border_2 = 15

df_train = make_3_classes(df_train, border_1, border_2)

In [14]:
# Label the test frame with the same boundaries as the training frame
df_test = make_3_classes(df_test, border_1, border_2)

In [15]:
# Number of training rows in each class
df_train.groupby('label')['id'].count()

label
0    14531
1     4500
2     1600
Name: id, dtype: int64

In [35]:
# Number of test rows in each class
df_test.groupby('label')['RUL'].count()

label
0    11856
1     1180
2       60
Name: RUL, dtype: int64

In [14]:
# Save the prepared training dataset
df_train.to_parquet('data/train_data.parquet')

In [15]:
# Save the prepared test dataset
df_test.to_parquet('data/test_data.parquet')

#### Обогащение датасета (Feature engineering)

- Используем библиотеку tsfresh для генерации новых фичей

In [16]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import make_forecasting_frame
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import MinimalFCParameters, EfficientFCParameters ,ComprehensiveFCParameters

from tqdm import tqdm_notebook as tqdm

In [17]:
# Feature calculators handed to tsfresh as `default_fc_parameters`.
# Calculators that take no arguments map to None; the others map to a list
# with one keyword-argument dict per variant.
params = {
    **dict.fromkeys([
        'variance_larger_than_standard_deviation',
        'sum_values',
        'abs_energy',
        'mean_abs_change',
        'mean_change',
        'mean_second_derivative_central',
        'median',
        'mean',
        'length',
        'standard_deviation',
        'variance',
        'skewness',
        'kurtosis',
        'absolute_sum_of_changes',
        'longest_strike_below_mean',
        'longest_strike_above_mean',
        'count_above_mean',
        'count_below_mean',
        'last_location_of_maximum',
        'first_location_of_maximum',
        'last_location_of_minimum',
        'first_location_of_minimum',
        'ratio_value_number_to_time_series_length',
        'maximum',
        'minimum',
    ]),
    'c3': [{'lag': lag} for lag in (1, 2, 3)],
    'cid_ce': [{'normalize': flag} for flag in (True, False)],
    # q = 0.5 is not listed here; 'median' is requested separately above
    'quantile': [{'q': q} for q in (0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9)],
    'autocorrelation': [{'lag': lag} for lag in range(10)],
    'partial_autocorrelation': [{'lag': lag} for lag in range(10)],
    'number_cwt_peaks': [{'n': n} for n in (1, 5)],
    'number_peaks': [{'n': n} for n in (5, 10, 50)],
    'spkt_welch_density': [{'coeff': coeff} for coeff in (2, 5, 8)],
    'ar_coefficient': [{'coeff': coeff, 'k': 10} for coeff in range(5)],
    'linear_trend': [{'attr': attr}
                     for attr in ('pvalue', 'rvalue', 'intercept', 'slope', 'stderr')],
    'augmented_dickey_fuller': [{'attr': attr}
                                for attr in ('teststat', 'pvalue', 'usedlag')],
    'ratio_beyond_r_sigma': [{'r': r} for r in (0.5, 1.5, 2, 2.5)],
}

In [18]:
def make_features(df):
    """Append per-``id`` tsfresh features to every row of ``df``.

    The ``RUL`` and ``label`` columns are excluded from extraction. Features
    are computed once per ``id`` (series ordered by ``cycle``) using the
    module-level ``params`` settings, imputed with tsfresh's ``impute``, and
    then attached to each row that has that ``id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``id``, ``cycle``, ``RUL`` and ``label`` columns plus the
        value columns to extract features from. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame: the columns of ``df`` followed by one float column per
        retained feature. The index and row order of ``df`` are preserved.
    """
    # NOTE(review): features are computed over each id's whole series, so every
    # row of an id gets the same values, including information from later
    # cycles (e.g. `length`) — confirm this is intended for the modelling setup.
    series = df.drop(columns=['RUL', 'label'])
    feats = extract_features(series,
                             column_id="id",
                             column_sort="cycle",
                             impute_function=impute,
                             default_fc_parameters=params,
                             n_jobs=4,
                             show_warnings=False)

    # Drop features that are 0 for every id; they carry no information.
    all_zero = feats.columns[(feats == 0).all(axis=0)]
    feats = feats.drop(columns=all_zero).astype(float)

    # One keyed join replaces the per-column / per-id Python loop with
    # pd.concat inside it. That loop was slow and only produced correct rows
    # when `df` had a default RangeIndex and contiguous ids.
    return df.join(feats, on='id')

In [19]:
# Build the feature-enriched training frame (runs tsfresh extraction with n_jobs=4)
df_train_feat = make_features(df_train)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 20/20 [00:45<00:00,  1.90s/it]
 's10__ar_coefficient__k_10__coeff_2' 's10__ar_coefficient__k_10__coeff_3'
 's10__ar_coefficient__k_10__coeff_4'
 's10__augmented_dickey_fuller__attr_"pvalue"'
 's10__augmented_dickey_fuller__attr_"teststat"'
 's10__autocorrelation__lag_0' 's10__autocorrelation__lag_1'
 's10__autocorrelation__lag_2' 's10__autocorrelation__lag_3'
 's10__autocorrelation__lag_4' 's10__autocorrelation__lag_5'
 's10__autocorrelation__lag_6' 's10__autocorrelation__lag_7'
 's10__autocorrelation__lag_8' 's10__autocorrelation__lag_9'
 's10__partial_autocorrelation__lag_2'
 's10__partial_autocorrelation__lag_3'
 's10__partial_autocorrelation__lag_4'
 's10__partial_autocorrelation__lag_5'
 's10__partial_autocorrelation__lag_6'
 's10__partial_autocorrelation__lag_7'
 's10__partial_autocorrelation__lag_8'
 's10__partial_autocorrelation__lag_9'
 's16__ar_coefficient__k_10__coeff_0' 's16__ar_coeffic

HBox(children=(IntProgress(value=0, max=1992), HTML(value='')))




In [20]:
# Save the feature-enriched training dataset
df_train_feat.to_parquet('data/train_data_feat.parquet')

In [21]:
# Build the feature-enriched test frame with the same extraction settings
df_test_feat = make_features(df_test)

Feature Extraction: 100%|██████████████████████████████████████████████████████████████| 20/20 [00:36<00:00,  1.17s/it]
 's10__ar_coefficient__k_10__coeff_2' 's10__ar_coefficient__k_10__coeff_3'
 's10__ar_coefficient__k_10__coeff_4'
 's10__augmented_dickey_fuller__attr_"pvalue"'
 's10__augmented_dickey_fuller__attr_"teststat"'
 's10__autocorrelation__lag_0' 's10__autocorrelation__lag_1'
 's10__autocorrelation__lag_2' 's10__autocorrelation__lag_3'
 's10__autocorrelation__lag_4' 's10__autocorrelation__lag_5'
 's10__autocorrelation__lag_6' 's10__autocorrelation__lag_7'
 's10__autocorrelation__lag_8' 's10__autocorrelation__lag_9'
 's10__partial_autocorrelation__lag_2'
 's10__partial_autocorrelation__lag_3'
 's10__partial_autocorrelation__lag_4'
 's10__partial_autocorrelation__lag_5'
 's10__partial_autocorrelation__lag_6'
 's10__partial_autocorrelation__lag_7'
 's10__partial_autocorrelation__lag_8'
 's10__partial_autocorrelation__lag_9'
 's16__ar_coefficient__k_10__coeff_0' 's16__ar_coeffic

HBox(children=(IntProgress(value=0, max=1992), HTML(value='')))




In [22]:
# Save the feature-enriched test dataset
df_test_feat.to_parquet('data/test_data_feat.parquet')