In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sktime.classification.compose import (
    ColumnEnsembleClassifier,
    TimeSeriesForestClassifier,
)

from sktime.classification.dictionary_based import (
    #IndividualBOSS,
    BOSSEnsemble,
    #ContractableBOSS,
    TemporalDictionaryEnsemble,
    IndividualTDE,
    WEASEL,
    #MUSE,
)

from sktime.classification.shapelet_based import (
    #MrSEQLClassifier,
    ShapeletTransformClassifier,
)

from sktime.classification.interval_based import TimeSeriesForest


from sktime.classification.distance_based import (
    ElasticEnsemble,
    ProximityTree,
    ProximityForest,
    ProximityStump,
)

from sktime.classification.frequency_based import RandomIntervalSpectralForest

from sklearn.metrics import roc_curve, auc, roc_auc_score

# from sktime.classification.distance_based import KNeighborsTimeSeriesClassifier
from sktime.datasets import load_basic_motions
#from sktime.datasets import load_uschange
#from sktime.transformers.panel.compose import ColumnConcatenator
import timeit, time

In [2]:
# BasicMotions comes from a student project: four students wore a smart watch
# while performing four activities. The watch records a 3D accelerometer and
# a 3D gyroscope. The four classes are walking, resting (labelled 'standing'
# in the target), running and badminton. Each participant recorded every
# motion five times, sampled once every tenth of a second over a ten second
# period.

X, y = load_basic_motions(return_X_y=True)
#X, y = load_uschange(return_X_y=True)

# Hold out a test set; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Show the shape of each split, in the order train X, train y, test X, test y.
print(*(part.shape for part in (X_train, y_train, X_test, y_test)))

(60, 6) (60,) (20, 6) (20,)


In [3]:
# Multivariate input data: preview the first rows of the training frame.
# Each dim_* column holds one nested series per row (see the rendered output).
X_train.head()

Unnamed: 0,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5
9,0 -0.407421 1 -0.407421 2 2.355158 3...,0 1.413374 1 1.413374 2 -3.928032 3...,0 0.092782 1 0.092782 2 -0.211622 3...,0 -0.066584 1 -0.066584 2 -3.630177 3...,0 0.223723 1 0.223723 2 -0.026634 3...,0 0.135832 1 0.135832 2 -1.946925 3...
24,0 0.383922 1 0.383922 2 -0.272575 3...,0 0.302612 1 0.302612 2 -1.381236 3...,0 -0.398075 1 -0.398075 2 -0.681258 3...,0 0.071911 1 0.071911 2 -0.761725 3...,0 0.175783 1 0.175783 2 -0.114525 3...,0 -0.087891 1 -0.087891 2 -0.503377 3...
5,0 -0.357300 1 -0.357300 2 -0.005055 3...,0 -0.584885 1 -0.584885 2 0.295037 3...,0 -0.792751 1 -0.792751 2 0.213664 3...,0 0.074574 1 0.074574 2 -0.157139 3...,0 0.159802 1 0.159802 2 -0.306288 3...,0 0.023970 1 0.023970 2 1.230478 3...
7,0 -0.352746 1 -0.352746 2 -1.354561 3...,0 0.316845 1 0.316845 2 0.490525 3...,0 -0.473779 1 -0.473779 2 1.454261 3...,0 -0.327595 1 -0.327595 2 -0.269001 3...,0 0.106535 1 0.106535 2 0.021307 3...,0 0.197090 1 0.197090 2 0.460763 3...
34,0 0.052231 1 0.052231 2 -0.54804...,0 -0.730486 1 -0.730486 2 0.70700...,0 -0.518104 1 -0.518104 2 -1.179430 3...,0 -0.159802 1 -0.159802 2 -0.239704 3...,0 -0.045277 1 -0.045277 2 0.023970 3...,0 -0.029297 1 -0.029297 2 0.29829...


In [4]:
# Multi-class target variable: list the distinct labels present in y_train.
np.unique(y_train)

# Multivariate classification
# sktime offers three main ways of solving multivariate time series classification problems:

# 1. Concatenation of time series columns into a single long time series column via ColumnConcatenator,
# then applying a classifier to the concatenated data,
# 2. Column-wise ensembling via ColumnEnsembleClassifier in which one classifier is fitted for each time series column 
# and their predictions aggregated,
# 3. Bespoke estimator-specific methods for handling multivariate time series data, e.g. 
# finding shapelets in multidimensional spaces.

array(['badminton', 'running', 'standing', 'walking'], dtype=object)

In [5]:
# Column ensembling
# We can also fit one classifier for each time series column and then aggregated their predictions. 
# The interface is similar to the familiar ColumnTransformer from sklearn.


# 14! = 87,178,291,200 => 87,178,291,200 * 18 / 3600  / 24 => 18162144 days => parallize => 18162144 / 8 => 2270268 days


clf = ColumnEnsembleClassifier(
    estimators=[
        ("TSF0", TimeSeriesForestClassifier(n_estimators=100), [0]),
        ("BOSSEnsemble1", BOSSEnsemble(max_ensemble_size=5), [1]),
    ]
)
%time clf.fit(X_train, y_train)
clf.score(X_test, y_test)
#clf.predict(X_test)
#print(roc_auc_score(y_test, clf.predict(X_test)))

#from sklearn.preprocessing import label_binarize
#y_test_b = label_binarize(y_test, classes=[0, 1, 2, 4])

#false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test_b, clf.predict(X_test))
#auc(false_positive_rate, true_positive_rate)

Wall time: 17.2 s


1.0

In [6]:
clf = ColumnEnsembleClassifier(
    estimators=[
        ("TSF0", TimeSeriesForestClassifier(n_estimators=100), [0]),
    ]
)

%time clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Wall time: 1.06 s


1.0

In [7]:
clf = ColumnEnsembleClassifier(
    estimators=[
        ("BOSSEnsemble0", BOSSEnsemble(max_ensemble_size=5), [0]),
    ]
)
%time clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Wall time: 16.2 s


0.95

In [8]:
# 8! = 40320 => 40320 * 18 / 3600 / 24 => 8.4 days => parallelize => 8.4 / 8 => 1 day
# 9! = 362880 => 362880 * 18 / 3600 / 24 => 75.6 days => parallize => 75.6 / 8 => 9.45 days
# 10! => 90.45 days


clf = ColumnEnsembleClassifier(
    estimators=[
        ("TSF0", TimeSeriesForestClassifier(n_estimators=100), [0]),
        #("BOSSEnsemble1", BOSSEnsemble(max_ensemble_size=5), [1]),
        ("BOSSEnsemble1", BOSSEnsemble(random_state=47), [1]),
        ("WEASEL2", WEASEL(binning_strategy="equi-depth", anova=False, random_state=47), [2]),
        ("TemporalDictionaryEnsemble3", TemporalDictionaryEnsemble(n_parameter_samples=250,max_ensemble_size=50,
                                                                   randomly_selected_params=50,random_state=47,), [3]),
        #("MUSE4", MUSE(), [4]),
    ]
)
%time clf.fit(X_train, y_train)
clf.score(X_test, y_test)

Wall time: 46.5 s


1.0

In [9]:
import multiprocessing as mp
# Report the CPU count that multiprocessing sees on this machine.
print("Number of processors: ", mp.cpu_count())

Number of processors:  8


In [10]:
import numpy as np
from time import time

# Prepare data: 200,000 rows of 5 random integers, each drawn from 0..9.
# Draw from a dedicated seeded generator. Merely constructing
# np.random.RandomState(100) and discarding it does not seed the global
# np.random functions, so the data would differ on every run.
rng = np.random.RandomState(100)
arr = rng.randint(0, 10, size=[200000, 5])
data = arr.tolist()  # nested Python lists, one inner list per row


# Solution without parallelization

def howmany_within_range(row, minimum, maximum):
    """Count the entries of `row` that lie in the closed interval [`minimum`, `maximum`]."""
    # Each qualifying entry contributes 1; an empty row sums to 0.
    return sum(1 for value in row if minimum <= value <= maximum)


# Serial baseline: evaluate every row in turn, in the current process.
results = [howmany_within_range(row, minimum=4, maximum=8) for row in data]

print(results[:10])

[3, 2, 3, 2, 1, 3, 1, 2, 2, 2]


In [None]:
# PROBLEM in NOTEBOOK: move function howmany_within_range to a .py file and import it.
# NOTE(review): presumably the worker processes cannot load a function that is defined
# only in the notebook session (typical when processes are spawned) -- confirm.

# Parallelizing using Pool.starmap()
# Pool.apply() blocks until its single call has returned, so calling it once per row
# processes the rows one after another. starmap() submits the whole batch to the
# workers and returns the results in input order.

import multiprocessing as mp

# Step 1: Init multiprocessing.Pool(); the `with` block shuts the pool down
# even if a task raises, so no worker processes are left behind.
with mp.Pool(mp.cpu_count()) as pool:
    # Step 2: `pool.starmap` the `howmany_within_range()` over (row, minimum, maximum) tuples
    results = pool.starmap(howmany_within_range, [(row, 4, 8) for row in data])

print(results[:10])