In [6]:
import numpy as np
import pandas as pd
import tsfresh
from tsfresh.utilities.dataframe_functions import impute

import problem
from problem import turn_prediction_to_event_list

data = problem.get_train_data(path="../")
X_train: pd.DataFrame = data[0]
y_train: pd.DataFrame = data[1]

events = turn_prediction_to_event_list(y_train)


def extract_located_area(event_index, delta=70):
    start = pd.to_datetime(events[event_index].begin)
    end = pd.to_datetime(events[event_index].end)
    X_df = X_train[(start - pd.Timedelta(hours=delta)):(end + pd.Timedelta(hours=delta))]
    y = y_train[(start - pd.Timedelta(hours=delta)):(end + pd.Timedelta(hours=delta))]
    X_df['time'] = X_df.index.values.astype(float)
    X_df['id'] = np.arange(X_df.shape[0])
    return X_df, y


def search_features(event_index, delta=70):
    print(f"[Feature Extraction] Looking for features at event {event_index} with delta {delta}")
    X_df, y = extract_located_area(event_index, delta)
    print("                      * Entries shape: ", X_df.shape, y.shape)
    print(f"                     * Window size is {X_df.shape[0]}")
    print(f"                     * Extracting features...")
    extracted_features = tsfresh.extract_features(X_df, column_id='id', column_sort='time')
    print(f"                     * Imputing values...")
    impute(extracted_features)
    y = y.to_numpy()
    print(f"                     * Selecting features with shape: {extracted_features.shape}, {y.shape}")
    features_filtered = tsfresh.select_features(extracted_features, y)
    return features_filtered

def search_all_features():
    indexes = np.arange(X_train.shape[0])

    X_df = X_train.copy(deep=True)
    X_df['time'] = X_df.index.values.astype(float)
    X_df['id'] = indexes
    y = y_train.copy(deep=True)
    y = y.reindex(indexes)
    return tsfresh.extract_relevant_features(X_df, y, column_id='id', column_sort='time')

In [7]:
features = search_features(20, delta=80)
features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['time'] = X_df.index.values.astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['id'] = np.arange(X_df.shape[0])


[Feature Extraction] Looking for features at event 20 with delta 80
                      * Entries shape:  (1178, 35) (1178,)
                     * Window size is 1178
                     * Extracting features...


Feature Extraction: 100%|██████████| 20/20 [03:38<00:00, 10.91s/it]


                     * Imputing values...


 'Vx__mean_second_derivative_central' ...
 'Vth__permutation_entropy__dimension_7__tau_1'
 'Vth__query_similarity_count__query_None__threshold_0.0'
 'Vth__mean_n_absolute_max__number_of_maxima_7'] did not have any finite values. Filling with zeros.


                     * Selecting features with shape: (1178, 26004), (1178,)


Unnamed: 0,Beta__root_mean_square,"Beta__fft_coefficient__attr_""abs""__coeff_0","Beta__fft_coefficient__attr_""real""__coeff_0","Beta__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20)","Beta__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)","Beta__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)","Beta__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)",Beta__quantile__q_0.9,Beta__quantile__q_0.8,Beta__quantile__q_0.7,...,Bz__benford_correlation,Bz__count_above__t_0,Bz__range_count__max_1000000000000.0__min_0,Bz__range_count__max_0__min_-1000000000000.0,"Bz__fft_coefficient__attr_""angle""__coeff_0",Bz__count_below__t_0,By__range_count__max_1__min_-1,Bx__benford_correlation,Vz__range_count__max_1__min_-1,Beta__benford_correlation
0,0.111992,0.111992,0.111992,0.021720,0.030716,0.043439,0.068683,0.111992,0.111992,0.111992,...,-0.200946,0.0,0.0,1.0,180.0,1.0,0.0,0.295657,0.0,0.864123
1,0.137842,0.137842,0.137842,0.026733,0.037806,0.053466,0.084537,0.137842,0.137842,0.137842,...,-0.200946,0.0,0.0,1.0,180.0,1.0,0.0,0.295657,1.0,0.864123
2,0.102126,0.102126,0.102126,0.019806,0.028010,0.039613,0.062633,0.102126,0.102126,0.102126,...,-0.200946,0.0,0.0,1.0,180.0,1.0,0.0,0.295657,0.0,0.864123
3,0.134096,0.134096,0.134096,0.026006,0.036779,0.052013,0.082240,0.134096,0.134096,0.134096,...,-0.200946,0.0,0.0,1.0,180.0,1.0,0.0,0.062915,0.0,0.864123
4,0.166510,0.166510,0.166510,0.032293,0.045669,0.064586,0.102119,0.166510,0.166510,0.166510,...,-0.200946,0.0,0.0,1.0,180.0,1.0,0.0,0.062915,0.0,0.864123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1173,10.444804,10.444804,10.444804,2.025663,2.864720,4.051326,6.405709,10.444804,10.444804,10.444804,...,0.864123,0.0,0.0,1.0,180.0,1.0,1.0,0.062915,0.0,0.864123
1174,5.638346,5.638346,5.638346,1.093500,1.546442,2.186999,3.457949,5.638346,5.638346,5.638346,...,0.864123,0.0,0.0,1.0,180.0,1.0,1.0,0.062915,0.0,-0.145280
1175,7.346657,7.346657,7.346657,1.424809,2.014984,2.849618,4.505642,7.346657,7.346657,7.346657,...,-0.297356,0.0,0.0,1.0,180.0,1.0,1.0,0.295657,0.0,-0.241690
1176,13.650851,13.650851,13.650851,2.647443,3.744050,5.294886,8.371950,13.650851,13.650851,13.650851,...,-0.064614,0.0,0.0,1.0,180.0,1.0,1.0,-0.064614,0.0,0.864123


In [8]:
features = search_features(100, delta=120)
features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['time'] = X_df.index.values.astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['id'] = np.arange(X_df.shape[0])


[Feature Extraction] Looking for features at event 100 with delta 120
                      * Entries shape:  (1188, 35) (1188,)
                     * Window size is 1188
                     * Extracting features...


Feature Extraction: 100%|██████████| 20/20 [03:49<00:00, 11.47s/it]


                     * Imputing values...


 ... 'RmsBob__permutation_entropy__dimension_7__tau_1'
 'RmsBob__query_similarity_count__query_None__threshold_0.0'
 'RmsBob__mean_n_absolute_max__number_of_maxima_7'] did not have any finite values. Filling with zeros.


                     * Selecting features with shape: (1188, 26004), (1188,)


Unnamed: 0,Vth__quantile__q_0.7,"Vth__fft_coefficient__attr_""abs""__coeff_0","Vth__fft_coefficient__attr_""real""__coeff_0","Vth__cwt_coefficients__coeff_0__w_20__widths_(2, 5, 10, 20)","Vth__cwt_coefficients__coeff_0__w_10__widths_(2, 5, 10, 20)","Vth__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)","Vth__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)",Vth__quantile__q_0.9,Vth__sum_values,Vth__abs_energy,...,V__quantile__q_0.2,V__quantile__q_0.3,V__quantile__q_0.1,V__quantile__q_0.6,V__quantile__q_0.7,V__quantile__q_0.8,V__quantile__q_0.9,V__quantile__q_0.4,Range F 13__benford_correlation,Vy__benford_correlation
0,44.461941,44.461941,44.461941,8.622939,12.194677,17.245878,27.268128,44.461941,44.461941,1976.864136,...,442.988647,442.988647,442.988647,442.988647,442.988647,442.988647,442.988647,442.988647,0.864123,-0.200946
1,44.519333,44.519333,44.519333,8.634070,12.210418,17.268139,27.303326,44.519333,44.519333,1981.970947,...,433.219421,433.219421,433.219421,433.219421,433.219421,433.219421,433.219421,433.219421,0.864123,-0.064614
2,44.155334,44.155334,44.155334,8.563476,12.110584,17.126952,27.080089,44.155334,44.155334,1949.693604,...,439.941803,439.941803,439.941803,439.941803,439.941803,439.941803,439.941803,439.941803,0.864123,0.864123
3,49.327442,49.327442,49.327442,9.566553,13.529150,19.133107,30.252098,49.327442,49.327442,2433.196533,...,463.838531,463.838531,463.838531,463.838531,463.838531,463.838531,463.838531,463.838531,0.864123,0.864123
4,47.389927,47.389927,47.389927,9.190792,12.997743,18.381584,29.063836,47.389927,47.389927,2245.805176,...,451.173004,451.173004,451.173004,451.173004,451.173004,451.173004,451.173004,451.173004,0.864123,0.295657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1183,80.076904,80.076904,80.076904,15.530097,21.962874,31.060195,49.110480,80.076904,80.076904,6412.310547,...,538.879089,538.879089,538.879089,538.879089,538.879089,538.879089,538.879089,538.879089,0.295657,0.062915
1184,73.262001,73.262001,73.262001,14.208416,20.093735,28.416833,44.930958,73.262001,73.262001,5367.320801,...,530.761658,530.761658,530.761658,530.761658,530.761658,530.761658,530.761658,530.761658,0.295657,-0.064614
1185,78.103531,78.103531,78.103531,15.147382,21.421633,30.294763,47.900227,78.103531,78.103531,6100.161621,...,538.238892,538.238892,538.238892,538.238892,538.238892,538.238892,538.238892,538.238892,0.295657,0.062915
1186,75.804077,75.804077,75.804077,14.701426,20.790956,29.402852,46.489992,75.804077,75.804077,5746.258301,...,549.867981,549.867981,549.867981,549.867981,549.867981,549.867981,549.867981,549.867981,0.062915,0.295657


In [9]:
features = search_features(400, delta=120)
features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['time'] = X_df.index.values.astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_df['id'] = np.arange(X_df.shape[0])


[Feature Extraction] Looking for features at event 400 with delta 120
                      * Entries shape:  (1548, 35) (1548,)
                     * Window size is 1548
                     * Extracting features...


Feature Extraction: 100%|██████████| 20/20 [04:55<00:00, 14.79s/it]


                     * Imputing values...


 'Range F 12__mean_second_derivative_central' ...
 'Range F 11__permutation_entropy__dimension_7__tau_1'
 'Range F 11__query_similarity_count__query_None__threshold_0.0'
 'Range F 11__mean_n_absolute_max__number_of_maxima_7'] did not have any finite values. Filling with zeros.


                     * Selecting features with shape: (1548, 26004), (1548,)


Unnamed: 0,B__median,B__sum_values,B__abs_energy,B__mean,B__root_mean_square,B__maximum,B__absolute_maximum,B__quantile__q_0.1,B__quantile__q_0.2,B__quantile__q_0.3,...,"Bz__cwt_coefficients__coeff_0__w_2__widths_(2, 5, 10, 20)",Bz__quantile__q_0.9,Bz__quantile__q_0.8,Bz__quantile__q_0.7,Bz__quantile__q_0.6,Bz__quantile__q_0.2,Bz__mean,Bz__quantile__q_0.1,"Bz__cwt_coefficients__coeff_0__w_5__widths_(2, 5, 10, 20)",Bz_rms__range_count__max_1__min_-1
0,6.536698,6.536698,42.728420,6.536698,6.536698,6.536698,6.536698,6.536698,6.536698,6.536698,...,-2.232242,-3.639773,-3.639773,-3.639773,-3.639773,-3.639773,-3.639773,-3.639773,-1.411794,1.0
1,6.370321,6.370321,40.580986,6.370321,6.370321,6.370321,6.370321,6.370321,6.370321,6.370321,...,-1.739001,-2.835521,-2.835521,-2.835521,-2.835521,-2.835521,-2.835521,-2.835521,-1.099841,1.0
2,6.229449,6.229449,38.806038,6.229449,6.229449,6.229449,6.229449,6.229449,6.229449,6.229449,...,-1.580474,-2.577036,-2.577036,-2.577036,-2.577036,-2.577036,-2.577036,-2.577036,-0.999580,1.0
3,6.367143,6.367143,40.540504,6.367143,6.367143,6.367143,6.367143,6.367143,6.367143,6.367143,...,-1.023580,-1.668994,-1.668994,-1.668994,-1.668994,-1.668994,-1.668994,-1.668994,-0.647369,1.0
4,6.425351,6.425351,41.285137,6.425351,6.425351,6.425351,6.425351,6.425351,6.425351,6.425351,...,-1.156613,-1.885910,-1.885910,-1.885910,-1.885910,-1.885910,-1.885910,-1.885910,-0.731506,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1543,4.049331,4.049331,16.397083,4.049331,4.049331,4.049331,4.049331,4.049331,4.049331,4.049331,...,1.979549,3.227747,3.227747,3.227747,3.227747,3.227747,3.227747,3.227747,1.251977,1.0
1544,4.047310,4.047310,16.380720,4.047310,4.047310,4.047310,4.047310,4.047310,4.047310,4.047310,...,1.696819,2.766742,2.766742,2.766742,2.766742,2.766742,2.766742,2.766742,1.073163,1.0
1545,3.940410,3.940410,15.526829,3.940410,3.940410,3.940410,3.940410,3.940410,3.940410,3.940410,...,1.789221,2.917408,2.917408,2.917408,2.917408,2.917408,2.917408,2.917408,1.131603,1.0
1546,4.124222,4.124222,17.009209,4.124222,4.124222,4.124222,4.124222,4.124222,4.124222,4.124222,...,1.627174,2.653182,2.653182,2.653182,2.653182,2.653182,2.653182,2.653182,1.029115,1.0


# Feature selection

In [3]:
import problem
import utils
import pandas as pd
from sklearn.feature_selection import RFECV
from sklearn.model_selection import TimeSeriesSplit
import sklearn.preprocessing as preprocessing
from sklearn.pipeline import make_pipeline

from sklearn.ensemble import RandomForestClassifier

def get_preprocessing():
    return preprocessing.QuantileTransformer(n_quantiles=100, output_distribution='normal', random_state=1).set_output(transform='pandas'), \
           preprocessing.RobustScaler().set_output(transform='pandas'), \
           preprocessing.MinMaxScaler().set_output(transform='pandas')

def pipeline_preprocessing(X):
    X_df = X.copy()
    pipe = make_pipeline(*get_preprocessing())
    pipe.fit(X_df)
    return pipe.transform(X_df)

data = problem.get_train_data(path="../")
X_train: pd.DataFrame = data[0]
y_train: pd.DataFrame = data[1]
X_train.ffill().bfill()

fe = utils.FeatureExtractor()
X_transformed = fe.transform(pipeline_preprocessing(X_train))

min_features_to_select = 1  # Minimum number of features to consider
classifier = RandomForestClassifier(n_estimators=50, class_weight='balanced')
cv = TimeSeriesSplit(5)

rfecv = RFECV(
    estimator=classifier,
    step=1,
    cv=cv,
    scoring="accuracy",
    min_features_to_select=min_features_to_select,
    n_jobs=2,
)

rfecv.fit(X_transformed, y_train)

print(f"Optimal number of features: {rfecv.n_features_}")

               [*] Preprocessing data
               - Counting peaks and height
               - Rolling variance
               - Rolling min
               - Rolling max
               - FFT
               - CWT
               - Rolling quantile
               - Rolling energy
               - Rolling median
               - Rolling entropy


  pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)


               - Time lags


  X_df[name] = shifted
  X_df[name] = shifted
  X_df[name] = shifted


ValueError: Input X contains NaN.
RFECV does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values