# tsfresh python library for feature extraction

In [1]:
from tsfresh import extract_features, feature_selection
import pandas as pd
import numpy as np
import os
from csv import reader

- Having in mind the previous feature engineering using the `Helper.py` we will first trim our time series before using `tsfresh`.
## Feature Engineering of the ram position time series

In [2]:
eps = 0.00001  #  minimum value, values below will be discarded (trim value)
data_path = os.getcwd() + "/Data/"

bin_id_rampos = list()
ramposition = list()
with open(data_path + '/ramposition.csv', 'r') as rampos_file:
    raw_buffer = reader(rampos_file)
    row_counter = 0
    for row in raw_buffer:
        row = [float(x) for x in row]
        ramposition.append(row)

ramposition_time = list()
with open(data_path + '/ramposition_time.csv', 'r') as rampostime_file:
    raw_buffer = reader(rampostime_file)
    for row in raw_buffer:
        row = [float(x) for x in row]
        ramposition_time.append(row)

# trimming
ramposition_trimmed = list()
ramposition_time_trimmed = list()
for i in range(len(ramposition)):
    temp_rampos_list = []
    temp_rampos_time_list = []
    for j in range(len(ramposition[i])):
        if ramposition[i][j] > eps:
            temp_rampos_list.append(ramposition[i][j])
            temp_rampos_time_list.append(ramposition_time[i][j])
    ramposition_trimmed.append(temp_rampos_list)
    ramposition_time_trimmed.append(temp_rampos_time_list)

bin_counter = 0
for i in range(len(ramposition_trimmed)):
    temp = [bin_counter for _ in range(len(ramposition_trimmed[i]))]
    bin_id_rampos.append(temp)
    bin_counter += 1

In [3]:
ram_filename = 'ram_ts_fe.csv'
ram_fe_df = pd.DataFrame()  # feature engineered dataframe

if not os.path.exists(data_path + ram_filename):
    print('The ram egineered features do NOT exist! ... using to tsfresh to extract the time series features ...')
    ram_fe_df.drop(ram_fe_df.index, inplace=True)
    for i in range(len(bin_id_rampos)):
        bin_id_ts = pd.DataFrame({'bind_id': bin_id_rampos[i], 'ramposition_time': ramposition_time_trimmed[i], 'ramposition': ramposition_trimmed[i]})
        tmp_df = extract_features(bin_id_ts, column_id='bind_id')
        ram_fe_df = pd.concat([ram_fe_df, tmp_df])
    ram_fe_df.to_csv(data_path + ram_filename)
else:
    print('The ram egineered features DO exist, reading from file ...')
    ram_fe_df = pd.read_csv(data_path + ram_filename)

The ram egineered features DO exist, reading from file ...


In [4]:
# print(ram_fe_df.__len__)
ram_fe_df.head()

Unnamed: 0.1,Unnamed: 0,ramposition__variance_larger_than_standard_deviation,ramposition__has_duplicate_max,ramposition__has_duplicate_min,ramposition__has_duplicate,ramposition__sum_values,ramposition__abs_energy,ramposition__mean_abs_change,ramposition__mean_change,ramposition__mean_second_derivative_central,...,ramposition_time__permutation_entropy__dimension_6__tau_1,ramposition_time__permutation_entropy__dimension_7__tau_1,ramposition_time__query_similarity_count__query_None__threshold_0.0,"ramposition_time__matrix_profile__feature_""min""__threshold_0.98","ramposition_time__matrix_profile__feature_""max""__threshold_0.98","ramposition_time__matrix_profile__feature_""mean""__threshold_0.98","ramposition_time__matrix_profile__feature_""median""__threshold_0.98","ramposition_time__matrix_profile__feature_""25""__threshold_0.98","ramposition_time__matrix_profile__feature_""75""__threshold_0.98",ramposition_time__mean_n_absolute_max__number_of_maxima_7
0,0,0.0,0.0,0.0,1.0,5.145901,0.236719,0.000385,-0.000384,2e-06,...,-0.0,-0.0,,2.418888,5.620935,3.601704,3.1877,2.869416,4.184912,19.048301
1,1,0.0,0.0,0.0,1.0,5.187046,0.237251,0.000377,-0.000376,2e-06,...,-0.0,-0.0,,2.15798,5.620712,3.625426,3.238282,2.909337,4.319253,18.976227
2,2,0.0,0.0,0.0,1.0,5.191982,0.237414,0.000376,-0.000375,2e-06,...,-0.0,-0.0,,2.860849,5.662891,3.893693,3.511084,3.231369,4.545791,18.996207
3,3,0.0,0.0,0.0,1.0,5.1595,0.236889,0.00038,-0.00038,2e-06,...,-0.0,-0.0,,2.690142,5.595624,3.645347,3.286615,2.911088,4.346767,18.985966
4,4,0.0,0.0,0.0,1.0,5.163583,0.236972,0.000381,-0.00038,2e-06,...,-0.0,-0.0,,2.251381,5.567878,3.646612,3.212432,3.012926,4.205877,19.028627


## Feature Engineering of injection pressure time series

In [5]:
bin_id_injec_press = list()
injection_pressure = list()
with open(data_path + '/injection_pressure.csv', 'r') as injection_pressure_file:
    raw_buffer = reader(injection_pressure_file)
    for row in raw_buffer:
        row = [float(x) for x in row]
        injection_pressure.append(row)

injection_pressure_time = list()
with open(data_path + '/injection_pressure_time.csv', 'r') as injection_pressuretime_file:
    raw_buffer = reader(injection_pressuretime_file)
    for row in raw_buffer:
        row = [float(x) for x in row]
        injection_pressure_time.append(row)

# trimming
injection_pressure_trimmed = list()
injection_pressure_time_trimmed = list()

for i in range(len(injection_pressure)):
    temp_injection_pressure_list = []
    temp_injection_pressure_time_list = []
    for j in range(len(injection_pressure[i])):
        if injection_pressure[i][j] > eps:
            temp_injection_pressure_list.append(injection_pressure[i][j])
            temp_injection_pressure_time_list.append(injection_pressure_time[i][j])
    injection_pressure_trimmed.append(temp_injection_pressure_list)
    injection_pressure_time_trimmed.append(temp_injection_pressure_time_list)

bin_counter = 0
for i in range(len(injection_pressure_trimmed)):
    temp = [bin_counter for _ in range(len(injection_pressure_trimmed[i]))]
    bin_id_injec_press.append(temp)
    bin_counter += 1

In [6]:
injection_filename = 'injection_ts_fe.csv'
injection_pressure_fe_df = pd.DataFrame()  # feature engineered dataframe

if not os.path.exists(data_path + injection_filename):
    print('The injection pressure egineered features do NOT exist! ... using to tsfresh to extract the time series features ...')
    injection_pressure_fe_df.drop(injection_pressure_fe_df.index, inplace=True)
    for i in range(len(bin_id_injec_press)):
        bin_id_ts = pd.DataFrame({'bind_id': bin_id_injec_press[i], 'injection_pressure_time': injection_pressure_time_trimmed[i], 'injection_pressure': injection_pressure_trimmed[i]})
        tmp_df = extract_features(bin_id_ts, column_id='bind_id')
        injection_pressure_fe_df = pd.concat([injection_pressure_fe_df, tmp_df])
    injection_pressure_fe_df.to_csv(data_path + injection_filename)
else:
    print('The ram egineered features DO exist, reading from file ...')
    injection_pressure_fe_df = pd.read_csv(data_path + injection_filename)

The ram egineered features DO exist, reading from file ...


In [7]:
# print(injection_pressure_fe_df.__len__)
injection_pressure_fe_df.head()

Unnamed: 0.1,Unnamed: 0,injection_pressure__variance_larger_than_standard_deviation,injection_pressure__has_duplicate_max,injection_pressure__has_duplicate_min,injection_pressure__has_duplicate,injection_pressure__sum_values,injection_pressure__abs_energy,injection_pressure__mean_abs_change,injection_pressure__mean_change,injection_pressure__mean_second_derivative_central,...,injection_pressure_time__permutation_entropy__dimension_6__tau_1,injection_pressure_time__permutation_entropy__dimension_7__tau_1,injection_pressure_time__query_similarity_count__query_None__threshold_0.0,"injection_pressure_time__matrix_profile__feature_""min""__threshold_0.98","injection_pressure_time__matrix_profile__feature_""max""__threshold_0.98","injection_pressure_time__matrix_profile__feature_""mean""__threshold_0.98","injection_pressure_time__matrix_profile__feature_""median""__threshold_0.98","injection_pressure_time__matrix_profile__feature_""25""__threshold_0.98","injection_pressure_time__matrix_profile__feature_""75""__threshold_0.98",injection_pressure_time__mean_n_absolute_max__number_of_maxima_7
0,0,1.0,1.0,0.0,1.0,3450554000.0,9.806648e+16,281340.0,281340.0,-9443.884892,...,-0.0,-0.0,,2.418888,7.567208,6.172275,6.09571,6.02064,6.963578,7.036062
1,1,1.0,1.0,0.0,1.0,3332375000.0,8.748736e+16,383101.398601,111115.384615,-77290.84507,...,-0.0,-0.0,,2.15798,7.533689,6.097282,6.080317,5.918485,7.007993,7.280174
2,2,1.0,1.0,0.0,1.0,3581108000.0,9.756402e+16,311846.527778,176693.75,-44052.097902,...,-0.0,-0.0,,5.778567,7.66223,6.638699,6.368044,6.282017,7.209128,7.301684
3,3,1.0,1.0,0.0,1.0,3201094000.0,8.42029e+16,400145.774648,117540.140845,-79180.141844,...,-0.0,-0.0,,3.428503,7.515098,6.219667,6.065297,5.974919,6.998013,7.288847
4,4,1.0,1.0,0.0,1.0,3468410000.0,9.702014e+16,270164.788732,270164.788732,-9179.432624,...,-0.0,-0.0,,2.251381,7.598214,6.214613,6.234785,6.135188,7.067948,7.057249


## Feature Engineering of sensor pressure time series

In [8]:
bin_id_sens_press = list()
sensor_pressure = list()
with open(data_path + '/sensor_pressure.csv', 'r') as sensor_pressure_file:
    raw_buffer = reader(sensor_pressure_file)
    for row in raw_buffer:
        row = [float(x) for x in row]
        sensor_pressure.append(row)

sensor_pressure_time = list()
with open(data_path + '/injection_pressure_time.csv', 'r') as sensor_pressuretime_file:
    raw_buffer = reader(sensor_pressuretime_file)
    for row in raw_buffer:
        row = [float(x) for x in row]
        sensor_pressure_time.append(row)

# trimming
sensor_pressure_trimmed = list()
sensor_pressure_time_trimmed = list()

for i in range(len(sensor_pressure)):
    temp_sensor_pressure_list = []
    temp_sensor_pressure_time_list = []
    for j in range(len(sensor_pressure[i])):
        if sensor_pressure[i][j] > eps:
            temp_sensor_pressure_list.append(sensor_pressure[i][j])
            temp_sensor_pressure_time_list.append(sensor_pressure_time[i][j])
    sensor_pressure_trimmed.append(temp_sensor_pressure_list)
    sensor_pressure_time_trimmed.append(temp_sensor_pressure_time_list)

bin_counter = 0
for i in range(len(sensor_pressure_trimmed)):
    temp = [bin_counter for _ in range(len(sensor_pressure_trimmed[i]))]
    bin_id_sens_press.append(temp)
    bin_counter += 1

In [9]:
sensor_filename = 'sensor_ts_fe.csv'
sensor_pressure_fe_df = pd.DataFrame()  # feature engineered dataframe

if not os.path.exists(data_path + sensor_filename):
    print('The sensor pressure egineered features do NOT exist! ... using to tsfresh to extract the time series features ...')
    sensor_pressure_fe_df.drop(sensor_pressure_fe_df.index, inplace=True)
    for i in range(len(bin_id_sens_press)):
        bin_id_ts = pd.DataFrame({'bind_id': bin_id_sens_press[i], 'sensor_pressure_time': sensor_pressure_time_trimmed[i], 'sensor_pressure': sensor_pressure_trimmed[i]})
        tmp_df = extract_features(bin_id_ts, column_id='bind_id')
        sensor_pressure_fe_df = pd.concat([sensor_pressure_fe_df, tmp_df])
    sensor_pressure_fe_df.to_csv(data_path + sensor_filename)
else:
    print('The sensor egineered features DO exist, reading from file ...')
    sensor_pressure_fe_df = pd.read_csv(data_path + sensor_filename)

The sensor egineered features DO exist, reading from file ...


In [10]:
# print(sensor_pressure_fe_df.__len__)
sensor_pressure_fe_df.head()

Unnamed: 0.1,Unnamed: 0,sensor_pressure__variance_larger_than_standard_deviation,sensor_pressure__has_duplicate_max,sensor_pressure__has_duplicate_min,sensor_pressure__has_duplicate,sensor_pressure__sum_values,sensor_pressure__abs_energy,sensor_pressure__mean_abs_change,sensor_pressure__mean_change,sensor_pressure__mean_second_derivative_central,...,sensor_pressure_time__permutation_entropy__dimension_6__tau_1,sensor_pressure_time__permutation_entropy__dimension_7__tau_1,sensor_pressure_time__query_similarity_count__query_None__threshold_0.0,"sensor_pressure_time__matrix_profile__feature_""min""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""max""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""mean""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""median""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""25""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""75""__threshold_0.98",sensor_pressure_time__mean_n_absolute_max__number_of_maxima_7
0,0,1.0,0.0,0.0,0.0,1534169000.0,4.162549e+16,558933.333333,87076.190476,-110437.96,...,-0.0,-0.0,,0.059113,6.077445,2.148315,0.160769,0.098277,4.549259,7.036062
1,1,1.0,0.0,0.0,0.0,1467269000.0,3.476584e+16,480892.093023,82271.937984,-64233.398438,...,-0.0,-0.0,,5.212047,7.21371,5.879423,5.501688,5.369396,6.570285,7.280174
2,2,1.0,0.0,0.0,0.0,1500957000.0,3.523793e+16,475824.692308,86455.461538,-83280.232558,...,-0.0,-0.0,,5.27448,7.179703,5.876394,5.505952,5.409981,6.531158,7.301684
3,3,1.0,0.0,0.0,0.0,1482141000.0,3.709718e+16,505653.671875,82450.546875,-67555.629921,...,-0.0,-0.0,,2.392714,7.006439,5.360143,5.170089,5.110092,5.774992,7.288847
4,4,1.0,0.0,0.0,0.0,1542279000.0,4.019727e+16,527577.5,88890.0,-101513.818898,...,-0.0,-0.0,,5.258254,7.264037,6.006905,5.659694,5.494781,6.730929,7.057249


## Joining all the multivariate timeseries

In [27]:
ram_fe_df.head()

Unnamed: 0.1,Unnamed: 0,ramposition__variance_larger_than_standard_deviation,ramposition__has_duplicate_max,ramposition__has_duplicate_min,ramposition__has_duplicate,ramposition__sum_values,ramposition__abs_energy,ramposition__mean_abs_change,ramposition__mean_change,ramposition__mean_second_derivative_central,...,ramposition_time__permutation_entropy__dimension_6__tau_1,ramposition_time__permutation_entropy__dimension_7__tau_1,ramposition_time__query_similarity_count__query_None__threshold_0.0,"ramposition_time__matrix_profile__feature_""min""__threshold_0.98","ramposition_time__matrix_profile__feature_""max""__threshold_0.98","ramposition_time__matrix_profile__feature_""mean""__threshold_0.98","ramposition_time__matrix_profile__feature_""median""__threshold_0.98","ramposition_time__matrix_profile__feature_""25""__threshold_0.98","ramposition_time__matrix_profile__feature_""75""__threshold_0.98",ramposition_time__mean_n_absolute_max__number_of_maxima_7
0,0,0.0,0.0,0.0,1.0,5.145901,0.236719,0.000385,-0.000384,2e-06,...,-0.0,-0.0,,2.418888,5.620935,3.601704,3.1877,2.869416,4.184912,19.048301
1,1,0.0,0.0,0.0,1.0,5.187046,0.237251,0.000377,-0.000376,2e-06,...,-0.0,-0.0,,2.15798,5.620712,3.625426,3.238282,2.909337,4.319253,18.976227
2,2,0.0,0.0,0.0,1.0,5.191982,0.237414,0.000376,-0.000375,2e-06,...,-0.0,-0.0,,2.860849,5.662891,3.893693,3.511084,3.231369,4.545791,18.996207
3,3,0.0,0.0,0.0,1.0,5.1595,0.236889,0.00038,-0.00038,2e-06,...,-0.0,-0.0,,2.690142,5.595624,3.645347,3.286615,2.911088,4.346767,18.985966
4,4,0.0,0.0,0.0,1.0,5.163583,0.236972,0.000381,-0.00038,2e-06,...,-0.0,-0.0,,2.251381,5.567878,3.646612,3.212432,3.012926,4.205877,19.028627


In [36]:
ram_fe_df.index[-1]

1541

In [38]:
bin_fe_df = pd.DataFrame()
bin_fe_df = pd.concat([bin_fe_df, ram_fe_df], axis=1)
bin_fe_df = pd.concat([bin_fe_df, injection_pressure_fe_df], axis=1)
bin_fe_df = pd.concat([bin_fe_df, sensor_pressure_fe_df], axis=1)

In [40]:
bin_fe_df.head()

Unnamed: 0.1,Unnamed: 0,ramposition__variance_larger_than_standard_deviation,ramposition__has_duplicate_max,ramposition__has_duplicate_min,ramposition__has_duplicate,ramposition__sum_values,ramposition__abs_energy,ramposition__mean_abs_change,ramposition__mean_change,ramposition__mean_second_derivative_central,...,sensor_pressure_time__permutation_entropy__dimension_6__tau_1,sensor_pressure_time__permutation_entropy__dimension_7__tau_1,sensor_pressure_time__query_similarity_count__query_None__threshold_0.0,"sensor_pressure_time__matrix_profile__feature_""min""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""max""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""mean""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""median""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""25""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""75""__threshold_0.98",sensor_pressure_time__mean_n_absolute_max__number_of_maxima_7
0,0,0.0,0.0,0.0,1.0,5.145901,0.236719,0.000385,-0.000384,2e-06,...,-0.0,-0.0,,0.059113,6.077445,2.148315,0.160769,0.098277,4.549259,7.036062
1,1,0.0,0.0,0.0,1.0,5.187046,0.237251,0.000377,-0.000376,2e-06,...,-0.0,-0.0,,5.212047,7.21371,5.879423,5.501688,5.369396,6.570285,7.280174
2,2,0.0,0.0,0.0,1.0,5.191982,0.237414,0.000376,-0.000375,2e-06,...,-0.0,-0.0,,5.27448,7.179703,5.876394,5.505952,5.409981,6.531158,7.301684
3,3,0.0,0.0,0.0,1.0,5.1595,0.236889,0.00038,-0.00038,2e-06,...,-0.0,-0.0,,2.392714,7.006439,5.360143,5.170089,5.110092,5.774992,7.288847
4,4,0.0,0.0,0.0,1.0,5.163583,0.236972,0.000381,-0.00038,2e-06,...,-0.0,-0.0,,5.258254,7.264037,6.006905,5.659694,5.494781,6.730929,7.057249


In [41]:
# we extract the labels
valid_or_not = list()
with open(data_path + '/Y2.csv', 'r') as y2_file:
    raw_buffer = y2_file.readlines()
    for row in raw_buffer:
        valid_or_not.append(int(row))
valid_or_not_df = pd.DataFrame(valid_or_not, columns=['valid_or_not'])
bin_fe_df = pd.concat([bin_fe_df, valid_or_not_df], axis=1)

In [42]:
# print(bin_fe_df.__len__)
bin_fe_df.head()

Unnamed: 0.1,Unnamed: 0,ramposition__variance_larger_than_standard_deviation,ramposition__has_duplicate_max,ramposition__has_duplicate_min,ramposition__has_duplicate,ramposition__sum_values,ramposition__abs_energy,ramposition__mean_abs_change,ramposition__mean_change,ramposition__mean_second_derivative_central,...,sensor_pressure_time__permutation_entropy__dimension_7__tau_1,sensor_pressure_time__query_similarity_count__query_None__threshold_0.0,"sensor_pressure_time__matrix_profile__feature_""min""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""max""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""mean""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""median""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""25""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""75""__threshold_0.98",sensor_pressure_time__mean_n_absolute_max__number_of_maxima_7,valid_or_not
0,0,0.0,0.0,0.0,1.0,5.145901,0.236719,0.000385,-0.000384,2e-06,...,-0.0,,0.059113,6.077445,2.148315,0.160769,0.098277,4.549259,7.036062,1
1,1,0.0,0.0,0.0,1.0,5.187046,0.237251,0.000377,-0.000376,2e-06,...,-0.0,,5.212047,7.21371,5.879423,5.501688,5.369396,6.570285,7.280174,1
2,2,0.0,0.0,0.0,1.0,5.191982,0.237414,0.000376,-0.000375,2e-06,...,-0.0,,5.27448,7.179703,5.876394,5.505952,5.409981,6.531158,7.301684,1
3,3,0.0,0.0,0.0,1.0,5.1595,0.236889,0.00038,-0.00038,2e-06,...,-0.0,,2.392714,7.006439,5.360143,5.170089,5.110092,5.774992,7.288847,1
4,4,0.0,0.0,0.0,1.0,5.163583,0.236972,0.000381,-0.00038,2e-06,...,-0.0,,5.258254,7.264037,6.006905,5.659694,5.494781,6.730929,7.057249,1


- We have at this point $4734$ real features and one binary target. Our dataset has in total a size of $1542\times 4735$  rows times columns.
- Because we have more datapoints than features we are strongly prone to overfitting, therefore we need now to implement feature filtering.

In [83]:
Xfeatures = bin_fe_df.iloc[:, 0:4734]
ylabels = bin_fe_df.iloc[:, -1]

In [84]:
Xfeatures.head()

Unnamed: 0,ramposition__variance_larger_than_standard_deviation,ramposition__has_duplicate_max,ramposition__has_duplicate_min,ramposition__has_duplicate,ramposition__sum_values,ramposition__abs_energy,ramposition__mean_abs_change,ramposition__mean_change,ramposition__mean_second_derivative_central,ramposition__median,...,sensor_pressure_time__permutation_entropy__dimension_6__tau_1,sensor_pressure_time__permutation_entropy__dimension_7__tau_1,sensor_pressure_time__query_similarity_count__query_None__threshold_0.0,"sensor_pressure_time__matrix_profile__feature_""min""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""max""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""mean""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""median""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""25""__threshold_0.98","sensor_pressure_time__matrix_profile__feature_""75""__threshold_0.98",sensor_pressure_time__mean_n_absolute_max__number_of_maxima_7
0,0.0,0.0,0.0,1.0,5.145901,0.236719,0.000385,-0.000384,2e-06,0.026569,...,-0.0,-0.0,,0.059113,6.077445,2.148315,0.160769,0.098277,4.549259,7.036062
1,0.0,0.0,0.0,1.0,5.187046,0.237251,0.000377,-0.000376,2e-06,0.025884,...,-0.0,-0.0,,5.212047,7.21371,5.879423,5.501688,5.369396,6.570285,7.280174
2,0.0,0.0,0.0,1.0,5.191982,0.237414,0.000376,-0.000375,2e-06,0.025448,...,-0.0,-0.0,,5.27448,7.179703,5.876394,5.505952,5.409981,6.531158,7.301684
3,0.0,0.0,0.0,1.0,5.1595,0.236889,0.00038,-0.00038,2e-06,0.026017,...,-0.0,-0.0,,2.392714,7.006439,5.360143,5.170089,5.110092,5.774992,7.288847
4,0.0,0.0,0.0,1.0,5.163583,0.236972,0.000381,-0.00038,2e-06,0.02607,...,-0.0,-0.0,,5.258254,7.264037,6.006905,5.659694,5.494781,6.730929,7.057249


In [85]:
ylabels.head()

0    1
1    1
2    1
3    1
4    1
Name: valid_or_not, dtype: int64

In [86]:
print(type(Xfeatures))
print(type(ylabels))

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [87]:
feature_p_value_list = list()
for feature in Xfeatures.columns:
    if not Xfeatures[feature].isnull().values.any():
        p_value = feature_selection.significance_tests.target_binary_feature_real_test(Xfeatures[feature], ylabels, 'mann')
        feature_p_value_list.append((feature, p_value))
    else:
        feature_p_value_list.append((feature, np.NaN))

In [88]:
nan_counter = 0
for feature_p_value in feature_p_value_list:
    if np.isnan(feature_p_value[1]):
        print(f"We have NaN for the value of feature {feature_p_value[0]} with value: {feature_p_value[1]}")
        nan_counter += 1
print(nan_counter)

We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_87 with value: nan
We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_88 with value: nan
We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_89 with value: nan
We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_90 with value: nan
We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_91 with value: nan
We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_92 with value: nan
We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_93 with value: nan
We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_94 with value: nan
We have NaN for the value of feature ramposition__fft_coefficient__attr_"real"__coeff_95 with value: nan
We have NaN for the value of feature ramposition__fft_c

- **p value**: a p-value is the probability that random chance generated the data, or something else that is equal or rarer.
- We will keep features with a p-value lower than 0.05 (5%).
- Rephrasing this: _we will keep features which are part of the most 95% relevant of features_
- the p-value of the feature significance test. Lower p-values indicate a higher feature significance.

In [119]:
min_p_value = 0.05
relevant_features = list()
for feature_p_value in feature_p_value_list:
    if not np.isnan(feature_p_value[1]) and feature_p_value[1] < min_p_value:
        relevant_features.append(feature_p_value[0])
print(len(relevant_features))

453


In [143]:
relevant_df = pd.DataFrame()
counter_relevant = 0
for feature in relevant_features:
    temp_series = Xfeatures[feature]
    relevant_df = pd.concat([relevant_df, temp_series], axis=1)
relevant_df = pd.concat([relevant_df, ylabels], axis=1)

In [144]:
relevant_df.head()

Unnamed: 0,ramposition__symmetry_looking__r_0.05,ramposition__number_cwt_peaks__n_1,ramposition__number_cwt_peaks__n_5,"ramposition__cwt_coefficients__coeff_4__w_2__widths_(2, 5, 10, 20)","ramposition__fft_coefficient__attr_""real""__coeff_26","ramposition__fft_coefficient__attr_""real""__coeff_27","ramposition__fft_coefficient__attr_""real""__coeff_28","ramposition__fft_coefficient__attr_""real""__coeff_34","ramposition__fft_coefficient__attr_""real""__coeff_56","ramposition__fft_coefficient__attr_""imag""__coeff_46",...,sensor_pressure_time__energy_ratio_by_chunks__num_segments_10__segment_focus_1,sensor_pressure_time__energy_ratio_by_chunks__num_segments_10__segment_focus_2,sensor_pressure_time__energy_ratio_by_chunks__num_segments_10__segment_focus_3,sensor_pressure_time__energy_ratio_by_chunks__num_segments_10__segment_focus_4,sensor_pressure_time__energy_ratio_by_chunks__num_segments_10__segment_focus_5,sensor_pressure_time__energy_ratio_by_chunks__num_segments_10__segment_focus_6,sensor_pressure_time__ratio_beyond_r_sigma__r_1,sensor_pressure_time__ratio_beyond_r_sigma__r_1.5,sensor_pressure_time__lempel_ziv_complexity__bins_10,valid_or_not
0,1.0,17.0,13.0,0.023171,0.031487,0.036913,0.03442,0.031473,0.032431,-0.032198,...,0.000819,0.001554,0.002516,0.003723,0.005158,0.00676,0.149606,0.11811,0.251969,1
1,0.0,15.0,15.0,0.023184,0.033079,0.03598,0.033703,0.031803,0.03282,-0.030895,...,0.00084,0.001592,0.002587,0.003822,0.005301,0.006941,0.153846,0.123077,0.246154,1
2,0.0,16.0,16.0,0.023149,0.031477,0.035799,0.035498,0.031927,0.03376,-0.03191,...,0.00094,0.001764,0.002831,0.004164,0.005747,0.007492,0.152672,0.122137,0.244275,1
3,1.0,11.0,11.0,0.023208,0.033024,0.036715,0.032597,0.031617,0.033582,-0.030984,...,0.000867,0.001646,0.002668,0.003941,0.005464,0.007167,0.155039,0.116279,0.248062,1
4,1.0,18.0,15.0,0.023171,0.032825,0.035859,0.034547,0.03193,0.033091,-0.032218,...,0.000862,0.001634,0.00265,0.003914,0.005419,0.00711,0.147287,0.116279,0.248062,1


## Saving the extracted features using tsfresh to disk

In [145]:
filename_extracted_features = data_path + 'extracted_features.csv'
relevant_df.to_csv(filename_extracted_features)

## References:
- We can improved the feature filtering of our timeseries by calculating the pvalue of the new features and then drop those who are below a defined threshold. This is actually what is done when using other libraries such **tsfresh**
- The time steps are the same for all sensor measurements, therefore tsfresh can be applied as truly time series multivariate problem. Otherwise we would need a little bit of more transformations.
- tsfresh documentation: https://tsfresh.readthedocs.io/en/latest/
- Github repository of tsfresh: https://github.com/blue-yonder/tsfresh