In [1]:
import pandas as pd
import dask.dataframe as dd
from tsfresh import extract_features, select_features
from tsfresh.feature_extraction import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute

# Extract Features

In [2]:
df = pd.read_pickle('df_feature.pkl')
df = df.drop(columns=['time_end', 'bearing_id', 'RUL_rolled_class', 'RUL_rolled'])

extracted_features = extract_features(df, column_id='id', column_sort='time', default_fc_parameters= EfficientFCParameters(), impute_function=impute)

extracted_features.to_pickle('df_feature_all.pkl')


Feature Extraction: 100%|██████████| 54/54 [00:44<00:00,  1.21it/s]


In [5]:
n_r, n_c = df.shape
print(f"row:{n_r}")
print(f"column:{n_c}")
e_r, e_c = extracted_features.shape
print(f"row:{e_r}")
print(f"column:{e_c}")
#print(extract_features.shape)

row:456039
column:4
row:135
column:1554


# Feature Selection

In [1]:
import pandas as pd
import dask.dataframe as dd
from tsfresh import extract_features, select_features

In [2]:
df = pd.read_pickle('df_feature.pkl')
X = pd.read_pickle('df_feature_all.pkl')

In [3]:
y_s_train_distinct = df.groupby('id')['RUL_rolled_class'].last()


In [4]:
from tsfresh.feature_selection.relevance import calculate_relevance_table

table = calculate_relevance_table(X, y_s_train_distinct, ml_task='classification', multiclass=True, show_warnings=False)

In [5]:
table

Unnamed: 0_level_0,feature,type,p_value_1,relevant_1,p_value_2,relevant_2,p_value_3,relevant_3,n_significant,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
y__ratio_value_number_to_time_series_length,y__ratio_value_number_to_time_series_length,real,1.282276e-22,True,3.578188e-12,True,5.885360e-09,True,3,True
"y__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""max""","y__agg_linear_trend__attr_""stderr""__chunk_len_...",real,1.667372e-22,True,3.470923e-12,True,7.863789e-09,True,3,True
"y__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""max""","y__agg_linear_trend__attr_""stderr""__chunk_len_...",real,1.936646e-22,True,1.192872e-11,True,2.019266e-09,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""mean""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,2.010431e-22,True,2.218653e-11,True,9.486667e-10,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""mean""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,2.610886e-22,True,1.915306e-11,True,1.495249e-09,True,3,True
...,...,...,...,...,...,...,...,...,...,...
y__large_standard_deviation__r_0.9500000000000001,y__large_standard_deviation__r_0.9500000000000001,constant,,False,,False,,False,0,False
y__partial_autocorrelation__lag_0,y__partial_autocorrelation__lag_0,constant,,False,,False,,False,0,False
"y__fft_coefficient__attr_""imag""__coeff_0","y__fft_coefficient__attr_""imag""__coeff_0",constant,,False,,False,,False,0,False
"y__augmented_dickey_fuller__attr_""pvalue""__autolag_""AIC""","y__augmented_dickey_fuller__attr_""pvalue""__aut...",constant,,False,,False,,False,0,False


In [6]:
lowest_indices_column1 = table['p_value_1'].nsmallest(20).index
lowest_indices_column2 = table['p_value_2'].nsmallest(20).index
lowest_indices_column3 = table['p_value_3'].nsmallest(20).index

# Combine the indices to get the unique rows with the lowest values in any column
lowest_indices_combined = lowest_indices_column1.union(lowest_indices_column2).union(lowest_indices_column3)

# Filter the original DataFrame to keep only the rows with the lowest values
filtered_df = table.loc[lowest_indices_combined]
filtered_df
#lowest_indices_combined

Unnamed: 0_level_0,feature,type,p_value_1,relevant_1,p_value_2,relevant_2,p_value_3,relevant_3,n_significant,relevant
feature,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"x__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""max""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,1.482445e-21,True,4.041446e-12,True,5.235427e-08,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""mean""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,2.6108860000000003e-22,True,1.915306e-11,True,1.495249e-09,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_10__f_agg_""min""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,2.563267e-21,True,1.610777e-12,True,2.345957e-07,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""max""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,3.283209e-19,True,3.920434e-12,True,6.3874e-06,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""mean""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,2.0104310000000001e-22,True,2.218653e-11,True,9.486667e-10,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_50__f_agg_""min""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,4.473805999999999e-19,True,2.061872e-12,True,1.485729e-05,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""max""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,5.288389000000001e-22,True,9.682601e-12,True,7.055963e-09,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""mean""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,2.8129380000000002e-22,True,1.70222e-11,True,1.873559e-09,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""min""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,8.2370380000000005e-22,True,4.997295e-12,True,2.365071e-08,True,3,True
"x__agg_linear_trend__attr_""stderr""__chunk_len_5__f_agg_""var""","x__agg_linear_trend__attr_""stderr""__chunk_len_...",real,7.722085e-16,True,2.284736e-11,True,0.0006009082,True,3,True


In [7]:
lowest_indices_combined

Index(['x__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"max"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"mean"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_10__f_agg_"min"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"max"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"mean"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_50__f_agg_"min"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"max"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"mean"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"min"',
       'x__agg_linear_trend__attr_"stderr"__chunk_len_5__f_agg_"var"',
       'x__cid_ce__normalize_True', 'x__count_below_mean',
       'x__fft_aggregated__aggtype_"centroid"', 'x__length',
       'x__number_cwt_peaks__n_5', 'x__number_peaks__n_10',
       'x__number_peaks__n_3', 'x__number_peaks__n_50',
       'x__permutation_entropy__dimension_7__

In [8]:
X_selected = select_features(X, y_s_train_distinct, ml_task='classification', multiclass=True, show_warnings=True, fdr_level= 0.00001)



In [9]:
X_selected = X.loc[:, X.columns.isin(lowest_indices_combined)]

In [10]:
X_selected.to_pickle('df_feature_selected.pkl')

# Relevant Features S Train

In [11]:
import tsfresh
import pandas as pd

In [12]:
df_s_train = pd.read_pickle('df_s_train_rolled.pkl')
kind_to_fc_parameters = tsfresh.feature_extraction.settings.from_columns(pd.read_pickle('df_feature_selected.pkl'))

In [13]:
df_s_train = df_s_train.drop(columns=['time_end', 'bearing_id', 'RUL_rolled_class', 'RUL_rolled'])
#ddf_s_train = dd.from_pandas(df_s_train, npartitions=20)

In [None]:
del [ddf_helper]
del [df_s_train]

In [14]:
X_s_train   = extract_features(df_s_train, column_id='id', column_sort='time', kind_to_fc_parameters= kind_to_fc_parameters)

Feature Extraction: 100%|██████████| 53/53 [86:35:56<00:00, 5882.19s/it]   


In [15]:
X_s_train.to_pickle('X_s_train.pkl')

In [16]:
df_s_test = pd.read_pickle('df_s_test_rolled.pkl')
df_s_test = df_s_test.drop(columns=['time_end', 'bearing_id', 'RUL_rolled_class', 'RUL_rolled'])
X_s_test   = extract_features(df_s_test, column_id='id', column_sort='time', kind_to_fc_parameters= kind_to_fc_parameters)
X_s_test.to_pickle('X_s_test.pkl')

Feature Extraction: 100%|██████████| 46/46 [28:31:55<00:00, 2232.95s/it]   


In [17]:
df_t_train = pd.read_pickle('df_t_train_rolled.pkl')
df_t_train = df_t_train.drop(columns=['time_end', 'bearing_id', 'RUL_rolled_class', 'RUL_rolled'])
X_t_train = extract_features(df_t_train, column_id='id', column_sort='time', kind_to_fc_parameters= kind_to_fc_parameters)
X_t_train.to_pickle('X_t_train.pkl')

Feature Extraction: 100%|██████████| 42/42 [5:43:13<00:00, 490.31s/it]   


In [18]:
df_t_test = pd.read_pickle('df_t_test_rolled.pkl')
df_t_test = df_t_test.drop(columns=['time_end', 'bearing_id', 'RUL_rolled_class', 'RUL_rolled'])
X_t_test = extract_features(df_t_test, column_id='id', column_sort='time', kind_to_fc_parameters= kind_to_fc_parameters)
X_t_test.to_pickle('X_t_test.pkl')

Feature Extraction: 100%|██████████| 8/8 [12:35<00:00, 94.46s/it] 


In [None]:
y_t_test = pd.read_pickle('df_t_test_rolled.pkl')
y_t_test_distinct = y_t_test.groupby('id')['RUL_rolled_class'].last()
y_t_test_distinct.to_pickle('y_t_test.pkl')

In [None]:
y_t_train = pd.read_pickle('df_t_train_rolled.pkl')
y_t_train_distinct = y_t_train.groupby('id')['RUL_rolled_class'].last()
y_t_train_distinct.to_pickle('y_t_train.pkl')

In [None]:
y_s_test = pd.read_pickle('df_s_test_rolled.pkl')
y_s_test_distinct = y_s_test.groupby('id')['RUL_rolled_class'].last()
y_s_test_distinct.to_pickle('y_s_test.pkl')

In [None]:
y_s_train = pd.read_pickle('df_s_train_rolled.pkl')
y_s_train_distinct = y_s_train.groupby('id')['RUL_rolled_class'].last()
y_s_train_distinct.to_pickle('y_s_train.pkl')

# Playground

In [3]:
df_s_train = pd.read_pickle('df_s_train_rolled.pkl')
df_s_train = df_s_train.drop(columns=['time_end', 'bearing_id', 'RUL_rolled_class', 'RUL_rolled'])
X_s_train = extract_features(df_s_train, column_id='id', column_sort='time', default_fc_parameters=MinimalFCParameters())


Feature Extraction: 100%|██████████| 53/53 [00:35<00:00,  1.48it/s]


In [4]:
X_s_train.to_pickle('X_s_train_min.pkl')

In [5]:
df_s_test = pd.read_pickle('df_s_test_rolled.pkl')
df_s_test = df_s_test.drop(columns=['time_end', 'bearing_id', 'RUL_rolled_class', 'RUL_rolled'])
X_s_test = extract_features(df_s_test, column_id='id', column_sort='time', default_fc_parameters=MinimalFCParameters())

Feature Extraction: 100%|██████████| 46/46 [00:13<00:00,  3.30it/s]


In [6]:
X_s_test.to_pickle('X_s_test_min.pkl')

In [4]:
kind_to_fc_parameters = tsfresh.feature_extraction.settings.from_columns(pd.read_pickle('df_feature_selected.pkl'))

NameError: name 'tsfresh' is not defined

In [9]:
import pandas as pd
test = pd.read_pickle('df_s_train_rolled.pkl')
test

Unnamed: 0,id,time,x,y,bearing_id,time_end,RUL_rolled_class,RUL_rolled
0,"(1, 2023-01-01 10:13:19.585000)",2023-01-01 09:40:44.664,0.832317,-0.260035,1,2023-01-01 10:13:19.585,1,0.930479
1,"(1, 2023-01-01 10:13:19.585000)",2023-01-01 09:40:44.742,0.201220,0.753927,1,2023-01-01 10:13:19.585,1,0.930479
2,"(1, 2023-01-01 10:13:19.585000)",2023-01-01 09:40:44.820,-1.231707,0.029668,1,2023-01-01 10:13:19.585,1,0.930479
3,"(1, 2023-01-01 10:13:19.585000)",2023-01-01 09:40:44.898,0.487805,0.511344,1,2023-01-01 10:13:19.585,1,0.930479
4,"(1, 2023-01-01 10:13:19.585000)",2023-01-01 09:40:44.976,1.339939,-0.649215,1,2023-01-01 10:13:19.585,1,0.930479
...,...,...,...,...,...,...,...,...
2470395,"(10, 2023-01-01 09:15:49.960000)",2023-01-01 09:15:49.960,0.468220,-0.887795,10,2023-01-01 09:15:49.960,1,1.053155
2470396,"(10, 2023-01-01 09:15:49.960000)",2023-01-01 09:15:49.960,0.385593,-0.818898,10,2023-01-01 09:15:49.960,1,1.053155
2470397,"(10, 2023-01-01 09:15:49.960000)",2023-01-01 09:15:49.960,0.048729,-1.403543,10,2023-01-01 09:15:49.960,1,1.053155
2470398,"(10, 2023-01-01 09:15:49.960000)",2023-01-01 09:15:49.960,0.855932,-1.220472,10,2023-01-01 09:15:49.960,1,1.053155


In [10]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Index: 235810560 entries, 0 to 2470399
Data columns (total 8 columns):
 #   Column            Dtype         
---  ------            -----         
 0   id                object        
 1   time              datetime64[ns]
 2   x                 float64       
 3   y                 float64       
 4   bearing_id        int64         
 5   time_end          datetime64[ns]
 6   RUL_rolled_class  int64         
 7   RUL_rolled        float64       
dtypes: datetime64[ns](2), float64(3), int64(2), object(1)
memory usage: 15.8+ GB
