# Feature extraction with `tsfresh` transformer

From: https://www.sktime.org/en/latest/examples/feature_extraction_with_tsfresh.html

## Overview

In this tutorial, we show how to use sktime with `tsfresh` to first extract features from time series, so that any `scikit-learn` estimator can then be applied to the extracted features.

### Preliminaries

You have to install `tsfresh` if you haven’t already. To install it, run the command below in a notebook cell:

```python
!pip install --upgrade tsfresh
```

In [1]:
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

from sktime.datasets import load_arrow_head, load_basic_motions

from sktime.transformations.panel.tsfresh import TSFreshFeatureExtractor

### Univariate time series classification data

For more details on the data set, see the [univariate time series classification notebook](https://github.com/alan-turing-institute/sktime/blob/main/examples/02_classification_univariate.ipynb).

In [2]:
# Load the univariate ArrowHead panel data and hold out a test set.
# A fixed random_state makes the random split reproducible, so the cells
# below give the same results on every "Restart & Run All".
X, y = load_arrow_head(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(158, 1) (158,) (53, 1) (53,)


In [3]:
# Preview the first rows of the training panel; each cell of `dim_0` holds a whole series.
X_train.head()

Unnamed: 0,dim_0
4,0 -1.9591 1 -1.9749 2 -1.9714 3 ...
76,0 -1.8888 1 -1.8850 2 -1.8562 3 ...
16,0 -0.79626 1 -0.77368 2 -0.66440 3...
39,0 -2.0226 1 -1.9880 2 -1.9542 3 ...
132,0 -1.8902 1 -1.9055 2 -1.8857 3 ...


In [4]:
#  multi-class classification task: the labels take three distinct values
np.unique(y_train)

array(['0', '1', '2'], dtype=object)

### Using `tsfresh` to extract features

In [5]:
# Configure the tsfresh transformer with the "efficient" feature-calculation
# setting and warnings switched off, then fit it on the training panel and
# preview the resulting feature table.
t = TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False)
Xt = t.fit_transform(X_train)
Xt.head()

  warn(
Feature Extraction: 100%|██████████| 5/5 [00:08<00:00,  1.64s/it]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_0__permutation_entropy__dimension_5__tau_1,dim_0__permutation_entropy__dimension_6__tau_1,dim_0__permutation_entropy__dimension_7__tau_1,dim_0__query_similarity_count__query_None__threshold_0.0,"dim_0__matrix_profile__feature_""min""__threshold_0.98","dim_0__matrix_profile__feature_""max""__threshold_0.98","dim_0__matrix_profile__feature_""mean""__threshold_0.98","dim_0__matrix_profile__feature_""median""__threshold_0.98","dim_0__matrix_profile__feature_""25""__threshold_0.98","dim_0__matrix_profile__feature_""75""__threshold_0.98"
0,0.0,0.0,0.0,1.0,0.000186,250.000892,0.065638,0.000113,2e-05,0.027825,...,1.92475,2.245908,2.527543,0.0,1.975874,11.787011,8.242918,10.17154,4.843454,10.785606
1,0.0,1.0,0.0,1.0,8.9e-05,249.999746,0.05116,3e-06,-9e-06,-0.043601,...,2.111513,2.477136,2.752474,0.0,2.162107,10.697445,5.407818,6.103876,3.249555,6.880292
2,0.0,0.0,0.0,1.0,0.000194,250.000369,0.080719,-0.000285,8.9e-05,-0.17086,...,1.911292,2.243508,2.545105,0.0,1.69561,5.51331,2.692293,2.640465,2.226668,3.049132
3,0.0,0.0,0.0,1.0,-0.000234,249.99909,0.06785,-0.000106,-0.000221,0.089442,...,2.692314,3.090202,3.348541,0.0,1.85966,7.461691,4.893533,4.848071,3.441674,6.764592
4,0.0,0.0,0.0,1.0,0.000449,250.0001,0.049964,-6.9e-05,2.6e-05,0.27091,...,2.165529,2.462872,2.697619,0.0,1.927516,11.606845,5.5413,5.485729,3.08439,7.281704


In [6]:
# (number of training instances, number of extracted feature columns)
Xt.shape

(158, 781)

### Using `tsfresh` with `sktime`

In [7]:
# Chain feature extraction and a tabular classifier into a single estimator:
# the tsfresh step turns each series into a feature row, and the random forest
# is trained on those rows. The forest is seeded so repeated runs build the
# same trees and therefore report the same score.
classifier = make_pipeline(
    TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False),
    RandomForestClassifier(random_state=42),
)

In [8]:
# Train the pipeline on the training split and report its score on the
# held-out split; `fit` returns the fitted pipeline, so the calls can be chained.
classifier.fit(X_train, y_train).score(X_test, y_test)

  warn(
Feature Extraction: 100%|██████████| 5/5 [00:08<00:00,  1.68s/it]
  warn(
Feature Extraction: 100%|██████████| 5/5 [00:02<00:00,  1.72it/s]


0.8867924528301887

### Multivariate time series classification data

In [9]:
# Load the multivariate BasicMotions panel data and hold out a test set.
# A fixed random_state makes the random split reproducible across runs.
X, y = load_basic_motions(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

(60, 6) (60,) (20, 6) (20,)


In [10]:
#  multivariate input data: one column per dimension (dim_0 ... dim_5),
#  each cell holding a whole series
X_train.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
36,0 -1.801504 1 -1.801504 2 -0.480725 3...,0 2.344990 1 2.344990 2 -0.994385 3...,0 0.281253 1 0.281253 2 0.378807 3...,0 0.716447 1 0.716447 2 -0.870923 3...,0 0.162466 1 0.162466 2 0.095881 3...,0 0.921527 1 0.921527 2 -0.474080 3...
17,0 0.324449 1 0.324449 2 9.29442...,0 -0.977516 1 -0.977516 2 -6.96322...,0 -1.260218 1 -1.260218 2 -2.498493 3...,0 -0.788358 1 -0.788358 2 2.434323 3...,0 0.316941 1 0.316941 2 -0.079901 3...,0 0.588605 1 0.588605 2 6.535916 3...
34,0 0.140313 1 0.140313 2 0.903629 3...,0 -0.604627 1 -0.604627 2 1.621493 3...,0 -0.221660 1 -0.221660 2 0.486719 3...,0 0.079901 1 0.079901 2 0.420813 3...,0 -0.085228 1 -0.085228 2 -0.428803 3...,0 -0.010653 1 -0.010653 2 1.171884 3...
9,0 0.126160 1 0.126160 2 1.771871 3...,0 0.102733 1 0.102733 2 -3.798484 3...,0 0.308964 1 0.308964 2 0.141369 3...,0 0.002663 1 0.002663 2 -1.427568 3...,0 0.000000 1 0.000000 2 -0.167792 3...,0 -0.007990 1 -0.007990 2 -1.643301 3...
0,0 -0.740653 1 -0.740653 2 10.20844...,0 0.756509 1 0.756509 2 -9.216970 3...,0 -0.275809 1 -0.275809 2 -12.37890...,0 -0.423476 1 -0.423476 2 -14.69915...,0 0.013317 1 0.013317 2 4.578337 3...,0 0.013317 1 0.013317 2 -5.055081 3...


In [11]:
# Same transformer configuration as for the univariate data ("efficient"
# feature-calculation setting, warnings off), now fitted on the multivariate
# training panel; the resulting columns are prefixed with the source dimension.
t = TSFreshFeatureExtractor(default_fc_parameters="efficient", show_warnings=False)
Xt = t.fit_transform(X_train)
Xt.head()

  warn(
Feature Extraction: 100%|██████████| 5/5 [00:14<00:00,  2.91s/it]


Unnamed: 0,dim_0__variance_larger_than_standard_deviation,dim_0__has_duplicate_max,dim_0__has_duplicate_min,dim_0__has_duplicate,dim_0__sum_values,dim_0__abs_energy,dim_0__mean_abs_change,dim_0__mean_change,dim_0__mean_second_derivative_central,dim_0__median,...,dim_5__permutation_entropy__dimension_5__tau_1,dim_5__permutation_entropy__dimension_6__tau_1,dim_5__permutation_entropy__dimension_7__tau_1,dim_5__query_similarity_count__query_None__threshold_0.0,"dim_5__matrix_profile__feature_""min""__threshold_0.98","dim_5__matrix_profile__feature_""max""__threshold_0.98","dim_5__matrix_profile__feature_""mean""__threshold_0.98","dim_5__matrix_profile__feature_""median""__threshold_0.98","dim_5__matrix_profile__feature_""25""__threshold_0.98","dim_5__matrix_profile__feature_""75""__threshold_0.98"
0,1.0,0.0,0.0,1.0,412.62596,5716.535296,3.523823,0.022801,0.0,2.097393,...,3.55118,4.101222,4.395817,0.0,0.736316,2.514378,1.432725,1.437321,1.159282,1.602725
1,1.0,0.0,0.0,1.0,505.902373,13876.020277,7.217855,-0.174782,-0.087916,9.463268,...,2.85599,3.382986,3.763514,0.0,1.103726,2.122494,1.653955,1.657515,1.502991,1.849793
2,1.0,0.0,0.0,1.0,448.430964,6047.333821,3.368293,-0.061216,-0.051385,2.845903,...,3.672477,4.201724,4.434494,0.0,0.751989,2.72303,1.734892,1.799675,1.453356,2.120879
3,0.0,0.0,0.0,1.0,-19.802918,9.735453,0.134203,-0.003656,-0.000147,-0.248964,...,2.858259,3.484553,3.998553,0.0,1.14752,2.101457,1.570164,1.56595,1.415515,1.695546
4,1.0,0.0,1.0,1.0,-2.214234,117.736948,0.326184,0.00458,0.0,-0.161695,...,3.301469,3.951719,4.288679,0.0,1.172621,2.961059,1.78464,1.738079,1.481027,2.02428


In [12]:
# (number of training instances, number of extracted feature columns across all dimensions)
Xt.shape

(60, 4686)

### Using `tsfresh` for forecasting

You can also use `tsfresh` to do univariate forecasting. To find out more about forecasting, check out our forecasting tutorial notebook.

In [13]:
from sklearn.ensemble import RandomForestRegressor

from sktime.datasets import load_airline

from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.compose import make_reduction
from sktime.forecasting.model_selection import temporal_train_test_split

In [14]:
# Load the airline series and split it into a training part and a held-out test part.
y = load_airline()
y_train, y_test = temporal_train_test_split(y)

In [15]:
# Time-series regressor: tsfresh features of each window feed a random forest.
# The forest is seeded so that repeated runs produce the same forecasts.
regressor = make_pipeline(
    TSFreshFeatureExtractor(
        show_warnings=False,
        disable_progressbar=True,
    ),
    RandomForestRegressor(random_state=42),
)
# Reduce forecasting to time-series regression over windows of 12 observations.
forecaster = make_reduction(
    regressor,
    scitype="time-series-regressor",
    window_length=12,
)

In [16]:
# Fit the reduction forecaster on the training series.
forecaster.fit(y_train)

RecursiveTimeSeriesRegressionForecaster(estimator=Pipeline(steps=[('tsfreshfeatureextractor',
                                                                   TSFreshFeatureExtractor(disable_progressbar=True,
                                                                  ('randomforestregressor',
                                                                   RandomForestRegressor())]),
                                        window_length=12)

In [17]:
# Forecast exactly the time points of the held-out series: the horizon is built
# from the test index and marked as absolute (not relative) time points.
fh = ForecastingHorizon(values=y_test.index, is_relative=False)
y_pred = forecaster.predict(fh=fh)