In [54]:
%matplotlib inline
import typing

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn.decomposition
import sklearn.pipeline
import sklearn.preprocessing

import pymfe.mfe
import tspymfe._embed

In [2]:
# Note: restricted to the meta-feature groups that have at least one
# meta-feature that can be extracted from an unsupervised dataset
groups = (
    "general",
    "statistical",
    "info-theory",
    "complexity",
    "itemset",
    "concept",
)
summary = "all"

# Note: a single extractor instance, configured with every feature of the groups above
extractor = pymfe.mfe.MFE(groups=groups, summary=summary, features="all")

In [3]:
# Note: both subsamples use the first row as header and are indexed by the
# "timeseries_id" column
data_train = pd.read_csv(
    "../2_exploring_subsample/subsample_train.csv",
    index_col="timeseries_id",
    header=0,
)
data_test = pd.read_csv(
    "../2_exploring_subsample/subsample_test.csv",
    index_col="timeseries_id",
    header=0,
)

In [4]:
# Note: the training subsample is expected to be strictly larger than the test one.
# NOTE(review): presumably required because the embed_dims / embed_lags buffers are
# sized by the number of training rows and are also indexed while processing the
# test set - confirm.
assert data_test.shape[0] < data_train.shape[0]

data_train.head()

Unnamed: 0_level_0,category,inst_ind,datapoints
timeseries_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
e0b36e39-3872-11e8-8680-0242ac120002,Beta noise,25254,"0.73617,0.99008,0.71331,0.87094,0.75527,0.9912..."
81db0cf2-3883-11e8-8680-0242ac120002,Relative humidity,14878,"95.5,79,86.75,8.75,62.75,98.75,79.74,44.75,92...."
380eb353-387a-11e8-8680-0242ac120002,RR,6577,"0.6328,0.6328,0.625,0.6328,0.625,0.625,0.6172,..."
f33f461c-3871-11e8-8680-0242ac120002,Tremor,27821,"-0.6,1.5,1.5,0.1,0.9,0.6,0.3,-0.2,0.7,1,0.1,1...."
7bcad309-3874-11e8-8680-0242ac120002,Noisy sinusoids,14226,"0.38553,0.2014,1.8705,0.47883,0.33958,0.009558..."


In [5]:
# Note: only the last 1024 observations (at most) of each time-series are used
size_threshold = 1024

# Note: one embedding dimension / embedding lag slot per training time-series
embed_dims, embed_lags = np.zeros((2, len(data_train)), dtype=np.uint16)

# Note: partial results are saved to .csv once every `to_csv_it_num` iterations
to_csv_it_num = 16

# Note: file paths where the results are stored
filename_train = "metafeatures_pymfe_train.csv"
filename_test = "metafeatures_pymfe_test.csv"

# Note: the meta-feature names are obtained by running the extractor on dummy data
mtf_names = extractor.fit(
    np.arange(16).reshape(8, 2),
    suppress_warnings=True,
).extract(suppress_warnings=True)[0]

def recover_data(filepath: str,
                 index: typing.Collection[str],
                 def_shape: typing.Tuple[int, int],
                 columns: typing.Optional[typing.Collection[str]] = None,
                 ) -> typing.Tuple[pd.DataFrame, int]:
    """Recover data from the previous experiment run.

    Parameters
    ----------
    filepath : str
        Path of the .csv checkpoint written by a previous run. Its first
        column is read as the index.

    index : collection of str
        Row labels of the fresh (empty) table created when no usable
        checkpoint is found.

    def_shape : tuple of int
        Shape (rows, columns) the recovered table must have. A checkpoint
        with any other shape is discarded.

    columns : collection of str, optional
        Column labels of the fresh table. If None, the notebook-level
        ``mtf_names`` is used.

    Returns
    -------
    tuple (pandas.DataFrame, int)
        The recovered (or fresh) table, and the position of the first row
        whose values are all missing, i.e. the row where the previous run
        was interrupted. The position is 0 for a fresh table, and the
        number of rows if every row already has at least one value.
    """
    try:
        recovered = pd.read_csv(filepath, index_col=0)

    except (FileNotFoundError, pd.errors.EmptyDataError):
        # Note: no checkpoint yet, or a checkpoint file without any content
        recovered = None

    # Note: explicit check rather than an 'assert', which is stripped when Python
    # runs with optimizations enabled
    if recovered is None or recovered.shape != def_shape:
        fresh_columns = mtf_names if columns is None else columns
        return pd.DataFrame(index=index, columns=fresh_columns), 0

    # Note: find the index where the previous run was interrupted, which is the
    # first row where every value is missing
    row_is_empty = recovered.isnull().all(axis=1).to_numpy()

    if row_is_empty.any():
        filled_len = int(row_is_empty.argmax())
    else:
        filled_len = recovered.shape[0]

    return recovered, filled_len


# Note: each split resumes from its own checkpoint file (when a usable one exists)
results_train, start_ind_train = recover_data(
    filepath=filename_train,
    index=data_train.index,
    def_shape=(len(data_train), len(mtf_names)),
)

results_test, start_ind_test = recover_data(
    filepath=filename_test,
    index=data_test.index,
    def_shape=(len(data_test), len(mtf_names)),
)

In [6]:
# Note: each results table must have one row per time-series and one column
# per meta-feature
assert results_train.shape == (len(data_train), len(mtf_names))
assert results_test.shape == (len(data_test), len(mtf_names))

# Note: row positions where the extraction will start for each split
print("Train start index:", start_ind_train)
print("Test start index:", start_ind_test)

Train start index: 1
Test start index: 0


In [7]:
# Note: report how many meta-features are extracted per dataset, followed by
# the full list of their names
print("Number of meta-features per dataset:", len(mtf_names))
print(mtf_names)

Number of meta-features per dataset: 1407
['attr_conc.count', 'attr_conc.histogram.0', 'attr_conc.histogram.1', 'attr_conc.histogram.2', 'attr_conc.histogram.3', 'attr_conc.histogram.4', 'attr_conc.histogram.5', 'attr_conc.histogram.6', 'attr_conc.histogram.7', 'attr_conc.histogram.8', 'attr_conc.histogram.9', 'attr_conc.iq_range', 'attr_conc.kurtosis', 'attr_conc.max', 'attr_conc.mean', 'attr_conc.median', 'attr_conc.min', 'attr_conc.nancount', 'attr_conc.nanhistogram.0', 'attr_conc.nanhistogram.1', 'attr_conc.nanhistogram.2', 'attr_conc.nanhistogram.3', 'attr_conc.nanhistogram.4', 'attr_conc.nanhistogram.5', 'attr_conc.nanhistogram.6', 'attr_conc.nanhistogram.7', 'attr_conc.nanhistogram.8', 'attr_conc.nanhistogram.9', 'attr_conc.naniq_range', 'attr_conc.nankurtosis', 'attr_conc.nanmax', 'attr_conc.nanmean', 'attr_conc.nanmedian', 'attr_conc.nanmin', 'attr_conc.nanpnorm', 'attr_conc.nanpowersum', 'attr_conc.nanquantiles.0', 'attr_conc.nanquantiles.1', 'attr_conc.nanquantiles.2', 'attr

In [8]:
def extract_metafeatures(data: pd.DataFrame, results: pd.DataFrame, start_ind: int, output_file: str) -> None:
    """Extract the meta-features of every time-series in `data`, from `start_ind` on.

    For each row, the series is parsed from its comma-separated string, cut
    to its last `size_threshold` observations, embedded with `tspymfe`, and
    the embedding is given to the notebook-level pymfe `extractor`. The
    extracted values are written into the matching row of `results`.

    Besides its arguments, the function relies on the notebook-level names
    `size_threshold`, `to_csv_it_num`, `extractor`, `embed_lags` and
    `embed_dims`. The last two are written at position `i` for every
    processed row.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with exactly three columns per row. The last one holds the
        comma-separated observations of the time-series.

    results : pandas.DataFrame
        Table filled in place, one row per row of `data`.

    start_ind : int
        Position of the first row to process. Previous rows are skipped.

    output_file : str
        Path of the .csv file where `results` is saved, both periodically
        and once more after the last row.
    """
    print(f"Starting extraction from index {start_ind}...")

    pending_rows = data.iloc[start_ind:, :].values

    for i, (_category, _inst_ind, vals) in enumerate(pending_rows, start_ind):
        ts = np.asarray(vals.split(",")[-size_threshold:], dtype=float)

        # Note: the lag and dimension are kept as plain Python ints. Reading them
        # back from the uint16 buffers gives NumPy unsigned scalars, whose
        # arithmetic wraps around instead of going negative.
        lag = int(tspymfe._embed.embed_lag(ts=ts, max_nlags=16))

        # Note: the embedding dimension is never smaller than 2
        dim = int(max(2, tspymfe._embed.ft_emb_dim_cao(ts=ts,
                                                       lag=lag,
                                                       dims=16,
                                                       tol_threshold=0.2)))

        embed_lags[i] = lag
        embed_dims[i] = dim

        ts_embed = tspymfe._embed.embed_ts(ts=ts, dim=dim, lag=lag)

        extractor.fit(ts_embed, suppress_warnings=True)
        res = extractor.extract(suppress_warnings=True)

        # Note: res[0] holds the meta-feature names and res[1] their values
        results.iloc[i, :] = res[1]

        # Note: periodic checkpoint, so that an interrupted run can be resumed
        if i % to_csv_it_num == 0:
            results.to_csv(output_file)
            print(f"Saved results at index {i} in file {output_file}.")

    # Note: final save, so rows processed after the last checkpoint are kept
    results.to_csv(output_file)

In [9]:
# Note: the training subsample is processed first, then the test subsample.
# Each call resumes from its own start index and saves to its own file.
extract_metafeatures(data_train, results_train, start_ind_train, filename_train)

extract_metafeatures(data_test, results_test, start_ind_test, filename_test)

Starting extraction from index 1...




Saved results at index 16 in file metafeatures_pymfe_train.csv.




Saved results at index 32 in file metafeatures_pymfe_train.csv.




Saved results at index 48 in file metafeatures_pymfe_train.csv.




Saved results at index 64 in file metafeatures_pymfe_train.csv.




Saved results at index 80 in file metafeatures_pymfe_train.csv.




Saved results at index 96 in file metafeatures_pymfe_train.csv.




Saved results at index 112 in file metafeatures_pymfe_train.csv.




Saved results at index 128 in file metafeatures_pymfe_train.csv.




Saved results at index 144 in file metafeatures_pymfe_train.csv.




Saved results at index 160 in file metafeatures_pymfe_train.csv.




Saved results at index 176 in file metafeatures_pymfe_train.csv.




Saved results at index 192 in file metafeatures_pymfe_train.csv.




Saved results at index 208 in file metafeatures_pymfe_train.csv.




Saved results at index 224 in file metafeatures_pymfe_train.csv.




Saved results at index 240 in file metafeatures_pymfe_train.csv.




Saved results at index 256 in file metafeatures_pymfe_train.csv.




Saved results at index 272 in file metafeatures_pymfe_train.csv.




Saved results at index 288 in file metafeatures_pymfe_train.csv.




Saved results at index 304 in file metafeatures_pymfe_train.csv.




Saved results at index 320 in file metafeatures_pymfe_train.csv.




Saved results at index 336 in file metafeatures_pymfe_train.csv.




Saved results at index 352 in file metafeatures_pymfe_train.csv.




Saved results at index 368 in file metafeatures_pymfe_train.csv.




Saved results at index 384 in file metafeatures_pymfe_train.csv.




Saved results at index 400 in file metafeatures_pymfe_train.csv.




Saved results at index 416 in file metafeatures_pymfe_train.csv.




Saved results at index 432 in file metafeatures_pymfe_train.csv.




Saved results at index 448 in file metafeatures_pymfe_train.csv.




Saved results at index 464 in file metafeatures_pymfe_train.csv.




Saved results at index 480 in file metafeatures_pymfe_train.csv.




Saved results at index 496 in file metafeatures_pymfe_train.csv.




Saved results at index 512 in file metafeatures_pymfe_train.csv.




Saved results at index 528 in file metafeatures_pymfe_train.csv.




Saved results at index 544 in file metafeatures_pymfe_train.csv.




Saved results at index 560 in file metafeatures_pymfe_train.csv.




Saved results at index 576 in file metafeatures_pymfe_train.csv.




Saved results at index 592 in file metafeatures_pymfe_train.csv.




Saved results at index 608 in file metafeatures_pymfe_train.csv.




Saved results at index 624 in file metafeatures_pymfe_train.csv.




Saved results at index 640 in file metafeatures_pymfe_train.csv.




Saved results at index 656 in file metafeatures_pymfe_train.csv.




Saved results at index 672 in file metafeatures_pymfe_train.csv.




Saved results at index 688 in file metafeatures_pymfe_train.csv.




Saved results at index 704 in file metafeatures_pymfe_train.csv.




Saved results at index 720 in file metafeatures_pymfe_train.csv.




Starting extraction from index 0...




Saved results at index 0 in file metafeatures_pymfe_test.csv.




Saved results at index 16 in file metafeatures_pymfe_test.csv.




Saved results at index 32 in file metafeatures_pymfe_test.csv.




Saved results at index 48 in file metafeatures_pymfe_test.csv.




Saved results at index 64 in file metafeatures_pymfe_test.csv.




Saved results at index 80 in file metafeatures_pymfe_test.csv.




Saved results at index 96 in file metafeatures_pymfe_test.csv.




Saved results at index 112 in file metafeatures_pymfe_test.csv.




Saved results at index 128 in file metafeatures_pymfe_test.csv.




Saved results at index 144 in file metafeatures_pymfe_test.csv.




Saved results at index 160 in file metafeatures_pymfe_test.csv.




Saved results at index 176 in file metafeatures_pymfe_test.csv.




In [39]:
# Note: analysing the NaN count.
nan_count = results_train.isnull().sum()
nan_count.iloc[nan_count.to_numpy().nonzero()].value_counts() / results_train.shape[0]

310    0.152174
584    0.010870
736    0.001359
dtype: float64

In [55]:
# Note: every meta-feature (column) with at least one missing value is dropped,
# modifying `results_train` in place
results_train.dropna(axis="columns", how="any", inplace=True)
results_train.shape

(736, 1286)

In [64]:
# Note: the meta-features are standardized and then projected with PCA. Per the
# scikit-learn documentation, a fractional `n_components` keeps as many
# components as needed to explain that fraction (here, 95%) of the variance.
# The steps are given as a list of (name, estimator) tuples, which is the type
# documented by the Pipeline API.
pca_pipeline = sklearn.pipeline.Pipeline([
    ("zscore", sklearn.preprocessing.StandardScaler()),
    ("pca", sklearn.decomposition.PCA(n_components=0.95, random_state=16)),
])

pca_pipeline.fit(results_train)

Pipeline(steps=[('zscore', StandardScaler()),
                ('pca', PCA(n_components=0.95, random_state=16))])

In [65]:
# Note: the DataFrame itself is given to the pipeline (rather than its raw
# values), matching how the pipeline was fitted. This way the column names
# seen during fit and transform stay consistent.
results_subset_train = pca_pipeline.transform(results_train)
results_subset_train.shape

(736, 105)