In [118]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../')

In [119]:
from src.dataset import get_ucr_dataset

In [120]:
def load_data(dataset_dir, train=True):
    """Load the train or test array from a preprocessed ``.npz`` archive.

    Args:
        dataset_dir: Directory containing ``X_train.npz`` and ``X_test.npz``.
        train: If True read ``X_train.npz``, otherwise ``X_test.npz``.

    Returns:
        The array stored under the key ``'X'`` of the selected archive.
    """
    filename = 'X_train.npz' if train else 'X_test.npz'
    data_path = os.path.join(dataset_dir, filename)
    # np.load on an .npz returns a lazily-read NpzFile that keeps the file
    # open; the context manager closes it once the array has been extracted.
    with np.load(data_path) as data:
        return data['X']
    

In [121]:
def load_train_and_test(dataset_dir):
    """Return the ``(train, test)`` arrays stored in ``dataset_dir``.

    Both arrays are read through ``load_data``; the train split is read first.
    """
    return tuple(load_data(dataset_dir, train=flag) for flag in (True, False))

In [156]:
# Experiment configuration; all paths are relative to this notebook.
data_dir = "../data/processed" # root of the preprocessed .npz datasets
dataset = "CricketX" # dataset name; "ECGFiveDays" was also tried
used_format = 'ts' # forwarded to get_ucr_dataset -- TODO confirm accepted values
input_path = "../data/UEA_UCR" # original dataset, used for the labels

In [123]:
interpolations = ['GP', 'linear']
thresholds = [0.05, 0.1, 0.3, 0.5]

In [157]:
# TODO: loop over all interpolations and thresholds.
# For now only the first entry of each list is evaluated.
i = interpolations[0]
j = thresholds[0]

In [158]:
dataset_dir = os.path.join(data_dir, dataset, i, f'dropped_{j}')

In [159]:
dataset_dir

'../data/processed/CricketX/GP/dropped_0.05'

In [160]:
X_train, X_test = load_train_and_test(dataset_dir)

In [198]:
Z_train, y_train, Z_test, y_test = get_ucr_dataset(input_path, dataset, used_format)

# Split dataset

In [199]:
# Seed NumPy's global RNG for any later code that draws from it.
np.random.seed(42)
from sklearn.model_selection import StratifiedShuffleSplit
# One stratified split that holds out 33% of the training data for validation;
# the splitter is seeded separately through random_state=0.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.33, random_state=0)
sss.get_n_splits(Z_train, y_train)

1

In [200]:
# The splitter is configured with n_splits=1, so it yields exactly one
# (train, validation) pair of index arrays.
# NOTE(review): the features split here come from Z_train (get_ucr_dataset),
# not from the X_train loaded out of the processed directory -- confirm intended.
train_index, test_index = next(sss.split(Z_train, y_train))
X_tr, y_tr = Z_train[train_index], y_train[train_index]
X_val, y_val = Z_train[test_index], y_train[test_index]

# Signature Transform

In [131]:
import iisignature

In [132]:
def ts2path(X):
    """Turn a univariate time series into a path by prepending a time axis.

    X: np array of shape (n time steps,) or (n time steps, 1)
    Returns an array with one row per time step: column 0 is the step
    index 0..n-1, column 1 holds the series values.
    Raises ValueError when X has more than one column.
    """
    # Promote a flat series to a single-column 2-d array.
    series = X.reshape(-1, 1) if len(X.shape) == 1 else X
    if series.shape[1] > 1:
        raise ValueError("Multivariate Time series not implemented yet!")
    time_axis = np.arange(series.shape[0]).reshape(-1, 1)
    return np.concatenate((time_axis, series), axis=1)

In [133]:
def compute_signatures(data, trunc=6):
    """Compute the truncated path signature of every time series in a dataset.

    Each sample is turned into a path with ``ts2path`` (time axis added) and
    its signature is computed with ``iisignature.sig`` at level ``trunc``.

    Input:
    -data (n samples, d time steps)
    -trunc: truncation level handed to ``iisignature.sig``
    Output:
    -signatures (n_samples, n_signature_components)
    """
    # One 1-d signature vector per sample.
    signatures = [iisignature.sig(ts2path(sample), trunc) for sample in data]
    # Stacking along axis 0 gives the (n_samples, n_components) layout directly,
    # without reshaping each vector to a column and transposing afterwards.
    return np.stack(signatures, axis=0)

    

In [134]:
def to_signatures(X_train, X_test, trunc=6):
    """Signature-transform a train and a test set with the same truncation level.

    Returns the pair ``(S_train, S_test)`` produced by ``compute_signatures``.
    """
    S_train, S_test = (
        compute_signatures(split, trunc=trunc) for split in (X_train, X_test)
    )
    return S_train, S_test


In [233]:
S_tr, S_val = to_signatures(X_tr, X_val, 8) 

In [189]:
S_train, S_test = to_signatures(Z_train, Z_test, 8) 

# Classifier

In [164]:
from sklearn.linear_model import LogisticRegression as LR

In [165]:
est = LR(class_weight='balanced')

In [136]:
import lightgbm as lgb

In [192]:
# LightGBM classifier with a multiclass objective and balanced class weights.
est = lgb.LGBMClassifier(
    # boosting setup
    boosting_type='gbdt',
    objective='multiclass',
    n_estimators=400,
    learning_rate=0.1,
    # tree shape
    num_leaves=31,
    max_depth=-1,
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=20,
    # row / column sampling
    subsample_for_bin=200000,
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    # regularisation and class balance
    reg_alpha=0.0,
    reg_lambda=0.0,
    class_weight='balanced',
    # reproducibility / parallelism
    random_state=42,
    n_jobs=4,
)

In [171]:
est.fit(S_tr,y_tr)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=400, n_jobs=4, num_leaves=31,
               objective='binary', random_state=42, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [167]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import balanced_accuracy_score

In [172]:
y_pred = est.predict(S_val)

In [173]:
# Validation-set metrics for the classifier fitted above.
acc = accuracy_score(y_val, y_pred)
bal = balanced_accuracy_score(y_val, y_pred)

print(f'Accuracy: {acc}, Balanced Accuracy: {bal}')

# Manually recorded results from earlier runs:
#   lgbm, all obs:  0.56 acc
#   lgbm, 0.05 obs: 0.17 acc
#
# CricketX
#   sig-lgbm linear, thres 0.05: 0.209 acc
#   sig-lgbm GP,     thres 0.05: 0.178 acc
#
# ECGFiveDays
#   sig-lgbm linear, thres 0.05: Accuracy: 0.375, Balanced Accuracy: 0.5
#   sig-lgbm GP,     thres 0.05: Accuracy: 0.375, Balanced Accuracy: 0.5


Accuracy: 0.5348837209302325, Balanced Accuracy: 0.5422801735301734


In [214]:
def evaluation(y_true, y_pred):
    """Print plain and balanced accuracy of ``y_pred`` against ``y_true``.

    Nothing is returned; both scores are written to stdout on one line.
    """
    acc, bal = (
        metric(y_true, y_pred)
        for metric in (accuracy_score, balanced_accuracy_score)
    )
    print(f'Accuracy: {acc}, Balanced Accuracy: {bal}')

# DTW-kNN Baseline

In [197]:
from tslearn.metrics import dtw, cdist_dtw

In [209]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Pairwise DTW distances among the training samples; used to fit the
# precomputed-metric kNN below.
# NOTE(review): DTW is computed on the signature features S_tr here, while the
# score recorded at the end of this section is labelled "with X" -- confirm
# which input is intended.
D = cdist_dtw(S_tr)

In [None]:
# DTW distances from each validation sample (rows) to each training sample (columns).
D_val = cdist_dtw(S_val, S_tr)

In [213]:
D_val.shape

(129, 261)

In [None]:
# 1-nearest-neighbour classifier that consumes precomputed distance matrices.
knn = KNeighborsClassifier(n_neighbors=1, metric='precomputed', n_jobs=4)


In [None]:
knn.fit(D, y_tr)

In [None]:
y_pred = knn.predict(D_val)

In [None]:
evaluation(y_val, y_pred)
# with X: Acc = 0.759, BAcc: 0.764

# Grid Search

In [94]:
# Candidate values explored exhaustively by the grid search.
parameters = dict(
    n_estimators=[100],
    learning_rate=[0.1, 0.05],
    boosting_type=['gbdt', 'dart'],
    num_leaves=[15, 30],
    max_depth=[10, -1],
)

In [95]:
from sklearn.model_selection import GridSearchCV


In [96]:
cv = GridSearchCV(est, parameters, scoring='accuracy', cv=3)

In [97]:
# NOTE(review): the grid search is fitted on Z_train as returned by
# get_ucr_dataset, whereas the randomized search further down is fitted on the
# signature features S_train -- confirm this difference is intended.
cv.fit(Z_train, y_train)



GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LGBMClassifier(boosting_type='gbdt',
                                      class_weight='balanced',
                                      colsample_bytree=1.0,
                                      importance_type='split',
                                      learning_rate=0.1, max_depth=-1,
                                      min_child_samples=20,
                                      min_child_weight=0.001,
                                      min_split_gain=0.0, n_estimators=400,
                                      n_jobs=4, num_leaves=31,
                                      objective='binary', random_state=42,
                                      reg_alpha=0.0, reg_lambda=0.0,
                                      silent=True, subsample=1.0,
                                      subsample_for_bin=200000,
                                      subsample_freq=0),
             iid='warn', n_jobs=None,
 

In [75]:
# Score the best grid-search model on the test split. The shared evaluation()
# helper prints accuracy and balanced accuracy in the same format as before.
y_pred = cv.best_estimator_.predict(Z_test)
evaluation(y_test, y_pred)

Accuracy: 0.49709639953542395, Balanced Accuracy: 0.5


# Randomized Search

In [182]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uniform
from sklearn.model_selection import RandomizedSearchCV
from time import time

In [179]:
# Sampling space for the randomized search: list entries are candidate values,
# scipy.stats objects are distributions to draw from.
# NOTE(review): est is built with subsample_freq=0; LightGBM documents bagging
# as active only for a non-zero frequency, so sampling 'subsample' probably has
# no effect -- confirm.
param_dist ={'boosting_type': ['gbdt', 'dart'],
            'learning_rate':[0.001, 0.1, 0.05, 0.01],
            'num_leaves': sp_randint(6, 50), # integers in [6, 50)
            'min_child_samples': sp_randint(2, 100), # integers in [2, 100)
            'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
            'subsample': sp_uniform(loc=0.2, scale=0.8), # uniform on [0.2, 1.0]
            'colsample_bytree': sp_uniform(loc=0.4, scale=0.6), # uniform on [0.4, 1.0]
            'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
            'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [185]:
# Utility function to report best scores
def report(results, n_top=3):
    """Print the ``n_top`` best-ranked candidates of a parameter search.

    ``results`` is indexed like ``cv_results_``: the keys 'rank_test_score',
    'mean_test_score', 'std_test_score' and 'params' are read. Candidates that
    share a rank are all printed, in index order.
    """
    for rank in range(1, n_top + 1):
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            print("Model with rank: {0}".format(rank))
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [193]:
# run randomized search
n_iter_search = 20
# random_state pins which candidates are sampled from param_dist, so the
# search (and the ranking reported below) is repeatable across runs.
# NOTE: `iid` only exists in older scikit-learn releases (removed in 0.24);
# drop the argument when upgrading.
rs = RandomizedSearchCV(est, param_distributions=param_dist,
                        n_iter=n_iter_search, cv=3, iid=False,
                        random_state=42)

In [194]:
# Fit the randomized search on the signature features, report the wall-clock
# time, then list the top-ranked candidates.
start = time()
rs.fit(S_train, y_train)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(rs.cv_results_)

RandomizedSearchCV took 98.90 seconds for 20 candidates parameter settings.
Model with rank: 1
Mean validation score: 0.645 (std: 0.044)
Parameters: {'boosting_type': 'gbdt', 'colsample_bytree': 0.9029972228266827, 'learning_rate': 0.1, 'min_child_samples': 2, 'min_child_weight': 1, 'num_leaves': 11, 'reg_alpha': 0, 'reg_lambda': 10, 'subsample': 0.9997741386289045}

Model with rank: 2
Mean validation score: 0.619 (std: 0.051)
Parameters: {'boosting_type': 'dart', 'colsample_bytree': 0.820581478875472, 'learning_rate': 0.1, 'min_child_samples': 10, 'min_child_weight': 0.001, 'num_leaves': 32, 'reg_alpha': 0.1, 'reg_lambda': 10, 'subsample': 0.6106609349539143}

Model with rank: 3
Mean validation score: 0.529 (std: 0.018)
Parameters: {'boosting_type': 'gbdt', 'colsample_bytree': 0.8108387035323276, 'learning_rate': 0.001, 'min_child_samples': 14, 'min_child_weight': 0.01, 'num_leaves': 11, 'reg_alpha': 0.1, 'reg_lambda': 1, 'subsample': 0.7805756067106879}



In [195]:
# Score the best randomized-search model on the test-set signatures. The shared
# evaluation() helper prints accuracy and balanced accuracy in the same format
# as before.
y_pred = rs.best_estimator_.predict(S_test)
evaluation(y_test, y_pred)

Accuracy: 0.6230769230769231, Balanced Accuracy: 0.6325601330102878
