tsfresh returns a great number of features. Depending on the dynamics of the inspected time series, some of them may be highly correlated. A common technique for dealing with highly correlated features is to apply a transformation such as PCA.

In [1]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [5]:
class PCAForPandas(PCA):
    """Small wrapper around the PCA estimator of sklearn that works on pandas DataFrames.

    The input columns are sorted by name and standardized with a ``StandardScaler`` before they are handed to
    the PCA base class as a numpy array. ``transform`` returns a new DataFrame whose columns are named
    ``pca_0``, ``pca_1``, ... and whose index is a fresh 0-based range (the index of the input is not kept).

    NOTE(review): ``__init__`` only accepts ``**kwargs``, so sklearn's parameter introspection
    (``get_params`` / ``clone``) presumably cannot see the PCA parameters - confirm before using this class
    inside pipelines or grid searches.
    """

    def __init__(self, **kwargs):
        """Create the scaler and forward all keyword arguments unchanged to ``PCA.__init__``."""
        self._z_scaler = StandardScaler()
        # Name the class explicitly: super(self.__class__, self) recurses forever once this class is subclassed.
        super(PCAForPandas, self).__init__(**kwargs)

        # Sorted column names of the data used for fitting; None until they have been recorded.
        self._X_columns = None

    def fit(self, X, y=None):
        """Standardize X and fit the PCA base class on the resulting numpy array.

        Fitting (re)records the sorted column names of X, so an instance can be refitted on a different
        set of columns.

        :param X: the DataFrame to fit on
        :param y: forwarded to ``PCA.fit``
        :return: whatever ``PCA.fit`` returns
        :raise AttributeError: if X is not a pandas DataFrame
        """
        X = self._prepare(X, reset=True)

        # The scaler only needs X. The second positional argument of StandardScaler.transform is ``copy``,
        # so y must not be passed along there.
        self._z_scaler.fit(X.values)
        z_data = self._z_scaler.transform(X.values)

        return super(PCAForPandas, self).fit(z_data, y)

    def fit_transform(self, X, y=None):
        """Call the fit and then the transform method of this class on the same X.

        :return: the DataFrame produced by ``transform``
        :raise AttributeError: if X is not a pandas DataFrame
        """
        self.fit(X, y)
        return self.transform(X, y)

    def transform(self, X, y=None):
        """Standardize X with the fitted scaler and project it with the fitted PCA.

        :param X: DataFrame with the same column names as the data used for fitting
        :param y: unused; only kept so existing callers can keep passing it
        :return: a new DataFrame with columns ``pca_0``, ``pca_1``, ... and a 0-based range index
        :raise AttributeError: if X is not a pandas DataFrame or its columns differ from the recorded ones
        """
        X = self._prepare(X)

        z_data = self._z_scaler.transform(X.values)
        transformed_ndarray = super(PCAForPandas, self).transform(z_data)

        column_names = ["pca_{}".format(i) for i in range(transformed_ndarray.shape[1])]
        return pd.DataFrame(transformed_ndarray, columns=column_names)

    def _prepare(self, X, reset=False):
        """Check that X is a pandas DataFrame and return a copy with the columns sorted by name.

        The caller's DataFrame is left untouched. The sorted column names are recorded when none are stored
        yet or when ``reset`` is true; otherwise they are compared against the stored ones.

        :param X: the data to check
        :param reset: if True, overwrite the stored column names with those of X instead of comparing
        :return: X with its columns sorted by name
        :raise AttributeError: if pandas is not a DataFrame or the columns of the new X is not compatible with the
                               columns from the previous X data
        """
        if not isinstance(X, pd.DataFrame):
            raise AttributeError("X is not a pandas DataFrame")

        # Sort into a new frame; an in-place sort would silently reorder the caller's columns.
        X = X.sort_index(axis=1)
        columns = list(X.columns)

        if reset or self._X_columns is None:
            self._X_columns = columns
        elif self._X_columns != columns:
            raise AttributeError("The columns of the new X is not compatible with the columns from the previous X data")

        return X

In short, the code above combines a standard scaler with a PCA transformer, ensuring the data is standardized before PCA is applied, all in one transformation.

**Load robot failure example:**
- Split the data set into a train set (1 <= id <= 87) and a test set (87 <= id <= 88). It is assumed that the feature selection was done in the past on the train set, and that the same features now have to be determined for future test data sets.

In [6]:
from tsfresh.examples.robot_execution_failures import download_robot_execution_failures, load_robot_execution_failures
from tsfresh.feature_extraction import extract_features
from tsfresh.feature_selection import select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters, MinimalFCParameters, settings

In [8]:
# Fetch the robot execution failures example data set.
download_robot_execution_failures()

# df holds the time series rows (id, time and the F_*/T_* columns shown by df.head() below);
# y is later passed to select_features as the target.
df, y = load_robot_execution_failures()

In [11]:
# Training data: every row whose id is at most 87.
df_train = df.iloc[(df.id <= 87).values] 
# NOTE(review): positional slice that drops the last entry of y; assumes y is ordered by id
# with id 88 last - confirm.
y_train = y[0:-1]

# Test data: every row whose id is at least 87, so id 87 is part of both train and test.
df_test = df.iloc[(df.id >= 87).values]
# NOTE(review): last two entries of y, presumably the targets of ids 87 and 88 - confirm.
y_test = y[-2:]

df.head()

Unnamed: 0,id,time,F_x,F_y,F_z,T_x,T_y,T_z
0,1,0,-1,-1,63,-3,-1,0
1,1,1,0,0,62,-3,-1,0
2,1,2,-1,-1,61,-3,0,0
3,1,3,-1,-1,63,-2,-1,0
4,1,4,-1,-1,63,-3,-1,0


**Train:**
- Extract train features:

In [12]:
# Extract the MinimalFCParameters feature set for each id of the training data, using the
# "time" column for sorting and impute as the impute function.
X_train = extract_features(df_train, column_id = "id", column_sort="time", default_fc_parameters = MinimalFCParameters(),
                            impute_function=impute)

Feature Extraction: 100%|██████████| 38/38 [00:02<00:00, 16.81it/s]


In [13]:
X_train.head()

Unnamed: 0,F_x__sum_values,F_x__median,F_x__mean,F_x__length,F_x__standard_deviation,F_x__variance,F_x__root_mean_square,F_x__maximum,F_x__absolute_maximum,F_x__minimum,F_y__sum_values,F_y__median,F_y__mean,F_y__length,F_y__standard_deviation,F_y__variance,F_y__root_mean_square,F_y__maximum,F_y__absolute_maximum,F_y__minimum,F_z__sum_values,F_z__median,F_z__mean,F_z__length,F_z__standard_deviation,F_z__variance,F_z__root_mean_square,F_z__maximum,F_z__absolute_maximum,F_z__minimum,T_x__sum_values,T_x__median,T_x__mean,T_x__length,T_x__standard_deviation,T_x__variance,T_x__root_mean_square,T_x__maximum,T_x__absolute_maximum,T_x__minimum,T_y__sum_values,T_y__median,T_y__mean,T_y__length,T_y__standard_deviation,T_y__variance,T_y__root_mean_square,T_y__maximum,T_y__absolute_maximum,T_y__minimum,T_z__sum_values,T_z__median,T_z__mean,T_z__length,T_z__standard_deviation,T_z__variance,T_z__root_mean_square,T_z__maximum,T_z__absolute_maximum,T_z__minimum
1,-14.0,-1.0,-0.933333,15.0,0.249444,0.062222,0.966092,0.0,1.0,-1.0,-13.0,-1.0,-0.866667,15.0,0.339935,0.115556,0.930949,0.0,1.0,-1.0,938.0,63.0,62.533333,15.0,1.203698,1.448889,62.544917,64.0,64.0,60.0,-43.0,-3.0,-2.866667,15.0,0.339935,0.115556,2.886751,-2.0,3.0,-3.0,-10.0,-1.0,-0.666667,15.0,0.471405,0.222222,0.816497,0.0,1.0,-1.0,0.0,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-13.0,-1.0,-0.866667,15.0,0.956847,0.915556,1.290994,0.0,3.0,-3.0,-10.0,-1.0,-0.666667,15.0,2.149935,4.622222,2.250926,3.0,4.0,-4.0,932.0,63.0,62.133333,15.0,4.333846,18.782222,62.284294,70.0,70.0,53.0,-53.0,-3.0,-3.533333,15.0,3.422799,11.715556,4.91935,1.0,10.0,-10.0,-20.0,-1.0,-1.333333,15.0,2.054805,4.222222,2.44949,4.0,5.0,-5.0,-4.0,0.0,-0.266667,15.0,0.442217,0.195556,0.516398,0.0,1.0,-1.0
3,-10.0,-1.0,-0.666667,15.0,0.596285,0.355556,0.894427,1.0,1.0,-1.0,-8.0,0.0,-0.533333,15.0,1.543445,2.382222,1.632993,2.0,3.0,-3.0,917.0,61.0,61.133333,15.0,4.616877,21.315556,61.307422,68.0,68.0,51.0,-60.0,-5.0,-4.0,15.0,2.633122,6.933333,4.788876,3.0,7.0,-7.0,-29.0,-2.0,-1.933333,15.0,1.768867,3.128889,2.620433,1.0,5.0,-5.0,-4.0,0.0,-0.266667,15.0,0.442217,0.195556,0.516398,0.0,1.0,-1.0
4,-6.0,0.0,-0.4,15.0,0.95219,0.906667,1.032796,1.0,2.0,-2.0,2.0,1.0,0.133333,15.0,1.995551,3.982222,2.0,5.0,5.0,-3.0,933.0,63.0,62.2,15.0,3.833188,14.693333,62.318002,70.0,70.0,56.0,-93.0,-6.0,-6.2,15.0,3.525148,12.426667,7.132087,-1.0,15.0,-15.0,-16.0,-1.0,-1.066667,15.0,2.669998,7.128889,2.875181,4.0,6.0,-6.0,-5.0,0.0,-0.333333,15.0,0.596285,0.355556,0.68313,1.0,1.0,-1.0
5,-9.0,-1.0,-0.6,15.0,0.879394,0.773333,1.064581,2.0,2.0,-2.0,-4.0,0.0,-0.266667,15.0,1.730767,2.995556,1.75119,3.0,3.0,-3.0,909.0,59.0,60.6,15.0,4.841487,23.44,60.793092,73.0,73.0,56.0,-105.0,-8.0,-7.0,15.0,2.75681,7.6,7.523297,-2.0,12.0,-12.0,-42.0,-3.0,-2.8,15.0,2.039608,4.16,3.464102,3.0,5.0,-5.0,-2.0,0.0,-0.133333,15.0,0.618241,0.382222,0.632456,1.0,1.0,-1.0


Select the training features:

In [14]:
# Keep only the training features that select_features retains for the target y_train.
X_train_filtered = select_features(X_train, y_train)
X_train_filtered.tail()

Unnamed: 0,F_x__root_mean_square,F_y__root_mean_square,T_y__standard_deviation,T_y__variance,T_y__absolute_maximum,F_y__absolute_maximum,T_x__absolute_maximum,F_x__absolute_maximum,T_y__root_mean_square,F_z__standard_deviation,F_z__variance,F_x__variance,F_x__standard_deviation,T_x__variance,T_x__standard_deviation,F_y__standard_deviation,F_y__variance,T_x__root_mean_square,T_z__root_mean_square,T_z__variance,T_z__standard_deviation,F_z__minimum,T_z__absolute_maximum,F_z__sum_values,F_z__mean,F_z__median,F_y__maximum,F_z__absolute_maximum,F_x__minimum,F_x__maximum,T_x__minimum,T_z__minimum,T_y__minimum,T_z__maximum,F_z__maximum,F_z__root_mean_square
83,19.733221,10.957494,7.190735,51.706667,23.0,15.0,172.0,28.0,16.095548,51.26645,2628.248889,28.4,5.329165,1058.728889,32.538114,2.205045,4.862222,110.180761,12.772366,4.373333,2.091252,-181.0,16.0,-1103.0,-73.533333,-53.0,-8.0,181.0,-28.0,-14.0,70.0,-16.0,-23.0,-10.0,-24.0,89.640393
84,80.346334,53.40412,39.541483,1563.528889,167.0,83.0,410.0,110.0,106.852234,291.988082,85257.04,1338.515556,36.585729,6875.848889,82.920739,33.816498,1143.555556,348.864539,18.235497,93.315556,9.659998,-1018.0,28.0,-10671.0,-711.4,-912.0,83.0,1018.0,-110.0,-25.0,180.0,-28.0,12.0,0.0,-208.0,768.990897
85,10.59245,10.076375,3.841296,14.755556,14.0,15.0,46.0,19.0,5.790797,14.501494,210.293333,21.315556,4.616877,40.995556,6.402777,2.844097,8.088889,34.663141,4.082483,4.648889,2.156128,2.0,7.0,423.0,28.2,32.0,15.0,50.0,4.0,19.0,-46.0,-7.0,-1.0,0.0,50.0,31.710146
86,74.608757,37.473546,52.807154,2788.595556,191.0,69.0,95.0,148.0,88.699117,121.420189,14742.862222,1461.928889,38.235179,202.426667,14.227673,16.041058,257.315556,67.320626,7.681146,29.84,5.4626,-411.0,10.0,-2216.0,-147.733333,-110.0,69.0,411.0,21.0,148.0,-95.0,-10.0,14.0,8.0,-14.0,191.227613
87,306.097697,143.447551,80.098162,6415.715556,471.0,162.0,142.0,342.0,402.516666,204.966621,42011.315556,3335.44,57.753268,70.995556,8.425886,23.75673,564.382222,128.343549,33.179311,98.088889,9.903983,-1145.0,44.0,-14137.0,-942.466667,-1036.0,162.0,1145.0,171.0,342.0,-142.0,13.0,222.0,44.0,-486.0,964.49714


**Principal Component Analysis on train features:**

In [15]:
# Fit scaler and PCA on the selected training features and project them onto 4 components.
pca_train = PCAForPandas(n_components = 4)
X_train_pca = pca_train.fit_transform(X_train_filtered)

In [17]:
X_train_pca

Unnamed: 0,pca_0,pca_1,pca_2,pca_3
0,-3.794150,-0.058952,0.111094,-0.483540
1,-3.657043,-0.018750,0.086160,-0.436398
2,-3.691986,-0.016013,0.096910,-0.458632
3,-3.628503,0.005231,0.081372,-0.447598
4,-3.652309,0.002591,0.080404,-0.455145
...,...,...,...,...
82,-2.037236,-0.525795,0.066420,-0.546658
83,5.078151,-3.366748,0.258651,0.941587
84,-3.065568,-0.129087,-0.255740,-0.398759
85,0.579472,-1.003404,-2.773311,1.490172


In [18]:
# The PCA output is indexed from 0; shift the index by one so that it matches the
# 1-based ids of the robot example again.
X_train_pca.index += 1
X_train_pca.tail()

Unnamed: 0,pca_0,pca_1,pca_2,pca_3
83,-2.037236,-0.525795,0.06642,-0.546658
84,5.078151,-3.366748,0.258651,0.941587
85,-3.065568,-0.129087,-0.25574,-0.398759
86,0.579472,-1.003404,-2.773311,1.490172
87,8.520847,-5.992067,-6.476718,0.886232


**Test:**
- Extract test features
- Only the selected features from the train data are extracted

In [19]:
# Build the extraction settings from the column names of the selected training features,
# so that only those features are computed for the test data.
X_test_filtered = extract_features(df_test, column_id = "id", column_sort="time",
                                    kind_to_fc_parameters = settings.from_columns(X_train_filtered.columns),
                                    impute_function=impute)

Feature Extraction: 100%|██████████| 12/12 [00:02<00:00,  5.23it/s]


In [20]:
X_test_filtered

Unnamed: 0,F_x__absolute_maximum,F_x__maximum,F_x__minimum,F_x__root_mean_square,F_x__standard_deviation,F_x__variance,F_y__absolute_maximum,F_y__maximum,F_y__root_mean_square,F_y__standard_deviation,F_y__variance,T_x__absolute_maximum,T_x__minimum,T_x__root_mean_square,T_x__standard_deviation,T_x__variance,F_z__absolute_maximum,F_z__maximum,F_z__mean,F_z__median,F_z__minimum,F_z__root_mean_square,F_z__standard_deviation,F_z__sum_values,F_z__variance,T_y__absolute_maximum,T_y__minimum,T_y__root_mean_square,T_y__standard_deviation,T_y__variance,T_z__absolute_maximum,T_z__maximum,T_z__minimum,T_z__root_mean_square,T_z__standard_deviation,T_z__variance
87,342.0,342.0,171.0,306.097697,57.753268,3335.44,162.0,162.0,143.447551,23.75673,564.382222,142.0,-142.0,128.343549,8.425886,70.995556,1145.0,-486.0,-942.466667,-1036.0,-1145.0,964.49714,204.966621,-14137.0,42011.315556,471.0,222.0,402.516666,80.098162,6415.715556,44.0,44.0,13.0,33.179311,9.903983,98.088889
88,13.0,-6.0,-13.0,9.753632,2.061283,4.248889,5.0,5.0,2.744692,1.203698,1.448889,29.0,-29.0,20.668817,4.057366,16.462222,53.0,53.0,40.0,42.0,15.0,41.387599,10.62701,600.0,112.933333,27.0,-27.0,22.55364,2.628054,6.906667,6.0,6.0,3.0,4.946379,0.884433,0.782222


PCA on test features:

In [21]:
# Reuse the scaler and PCA fitted on the training features; only transform is called here,
# nothing is refitted on the test data.
X_test_pca = pca_train.transform(X_test_filtered)

In [22]:
# transform returns a 0-based index; relabel the two rows with the ids of the test data.
X_test_pca.index = [87,88]

In [23]:
X_test_pca

Unnamed: 0,pca_0,pca_1,pca_2,pca_3
87,8.520847,-5.992067,-6.476718,0.886232
88,-3.246681,-0.023826,-0.028257,-0.696021
