In [7]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('../')

In [8]:
from src.dataset import get_ucr_dataset

In [9]:
def load_data(dataset_dir, train=True):
    """Load the preprocessed train or test array from an ``.npz`` archive.

    Args:
        dataset_dir: Directory containing ``X_train.npz`` / ``X_test.npz``.
        train: If True read ``X_train.npz``, otherwise ``X_test.npz``.

    Returns:
        The array stored under the key ``'X'`` of the selected archive.
    """
    filename = 'X_train.npz' if train else 'X_test.npz'
    data_path = os.path.join(dataset_dir, filename)
    # np.load on an .npz returns a lazy NpzFile that keeps the underlying
    # file open; the context manager closes it once 'X' has been read.
    with np.load(data_path) as data:
        return data['X']
    

In [10]:
def load_train_and_test(dataset_dir):
    """Return the preprocessed ``(X_train, X_test)`` pair found in ``dataset_dir``."""
    # Train split is loaded first (train=True), then the test split.
    return tuple(load_data(dataset_dir, train=is_train) for is_train in (True, False))

In [11]:
# Which dataset to use and in which format it is read.
dataset = "CricketX"
used_format = 'ts'

# Locations (relative paths).
data_dir = "../data/processed"  # preprocessed npz arrays
input_path = "../data/UEA_UCR"  # original dataset, read for the labels

In [12]:
# Candidate settings; one value of each selects a processed-data directory.
thresholds = [0.05, 0.1, 0.3, 0.5]
interpolations = ['GP', 'linear']

In [43]:
# TODO: loop over all interpolations and thresholds; for now a single
# (interpolation, threshold) pair is picked by hand.
i, j = interpolations[1], thresholds[0]

In [44]:
# Processed data is laid out as <data_dir>/<dataset>/<interpolation>/dropped_<threshold>.
dataset_dir = os.path.join(data_dir, dataset, i, f'dropped_{j}')

In [45]:
dataset_dir

'../data/processed/CricketX/linear/dropped_0.05'

In [46]:
# Load the preprocessed train and test arrays for the selected configuration.
X_train, X_test = load_train_and_test(dataset_dir)

In [47]:
# Read the original (unprocessed) dataset; it is loaded here for its labels.
# NOTE(review): get_ucr_dataset lives in src.dataset and is not visible here --
# presumably it returns (train X, train y, test X, test y); confirm.
Z_train, y_train, Z_test, y_test = get_ucr_dataset(input_path, dataset, used_format)

# Split dataset

In [48]:
from sklearn.model_selection import StratifiedShuffleSplit

# One stratified shuffle split that holds out 33% of the training samples;
# the fixed random_state keeps the split the same across runs.
sss = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.33,
    random_state=0,
)
# Shown as the cell output: the number of splits the splitter will produce.
sss.get_n_splits(X_train, y_train)

1

In [49]:
# The splitter is configured with a single split, so take its only
# (train, validation) index pair directly instead of looping.
# The indices are generated from X_train -- the same array they are applied
# to below -- rather than from Z_train, so a sample-count mismatch between
# X_train and y_train is reported by the splitter instead of silently
# mis-indexing. The split itself depends only on y and the number of samples.
train_index, test_index = next(sss.split(X_train, y_train))
X_tr, X_val = X_train[train_index], X_train[test_index]
y_tr, y_val = y_train[train_index], y_train[test_index]

# Signature Transform

In [55]:
import iisignature

In [80]:
def ts2path(X):
    """ Turn a single univariate time series into a path by prepending a time axis.
        X: np array (n time steps,) or (n time steps, 1)
        Returns: np array whose first column holds the step indices 0..n-1
                 and whose remaining column holds the series values.
        Raises: ValueError if X has more than one value column.
    """
    # A flat series becomes a single-column 2d array.
    series = X.reshape(-1, 1) if X.ndim == 1 else X
    n_values = series.shape[1]
    if n_values > 1:
        raise ValueError("Multivariate Time series not implemented yet!")
    # Step indices as a column vector, glued in front of the values.
    time_axis = np.arange(len(series)).reshape(-1, 1)
    return np.concatenate((time_axis, series), axis=1)

In [115]:
def compute_signatures(data, trunc=4):
    """ Compute signatures of dataset of time series

        Each sample is converted to a path with ``ts2path`` (which adds a
        time axis) and passed to ``iisignature.sig``.

        data: (n samples, d time steps)
        trunc: level argument handed to ``iisignature.sig``

        Returns: list with one entry per sample, each a column vector of
                 shape (signature length, 1), so that
                 ``np.concatenate(result, axis=1)`` gives an array of shape
                 (signature length, n samples).
    """
    # Each signature is reshaped to a 2d column so the caller can stack the
    # results along axis 1.
    return [
        iisignature.sig(ts2path(sample), trunc).reshape(-1, 1)
        for sample in data
    ]

    

In [113]:
# Signatures of the training series, using the default trunc=4.
sigs = compute_signatures(X_train)

In [114]:
# Stack the per-sample column vectors side by side -> (signature length, n samples).
np.concatenate(sigs, axis=1).shape

(30, 390)

# Classifier

array([-1.04551366e+00, -1.04551366e+00, -1.04551366e+00, -1.04551366e+00,
       -1.04551366e+00, -1.04551366e+00, -9.51581179e-01, -8.57648695e-01,
       -7.63716210e-01, -6.70302589e-01, -5.93665575e-01, -5.17028561e-01,
       -4.40391547e-01, -3.63754532e-01, -2.87117518e-01, -2.10480504e-01,
       -1.33843490e-01, -5.72064755e-02,  1.94305387e-02,  9.60675529e-02,
        1.72704567e-01,  2.49341581e-01,  3.25978596e-01,  4.02615610e-01,
        4.79252624e-01,  5.55889638e-01,  6.32526653e-01,  7.09163667e-01,
        7.85800681e-01,  8.62437695e-01,  9.39074709e-01,  1.01571172e+00,
        1.09234874e+00,  1.16898575e+00,  1.24562277e+00,  1.32225978e+00,
        1.39889679e+00,  1.47553381e+00,  1.55217082e+00,  1.62880784e+00,
        1.70544485e+00,  1.78208187e+00,  1.85871888e+00,  1.93535589e+00,
        2.01199291e+00,  2.05905904e+00,  1.93855682e+00,  1.81805459e+00,
        1.69755237e+00,  1.57705015e+00,  1.45654793e+00,  1.33604571e+00,
        1.21554348e+00,  

array([[ 0, 20],
       [ 1, 21],
       [ 2, 22],
       [ 3, 23],
       [ 4, 24],
       [ 5, 25],
       [ 6, 26],
       [ 7, 27],
       [ 8, 28],
       [ 9, 29]])

In [31]:
import lightgbm as lgb

In [50]:
# LightGBM multiclass classifier with balanced class weights; one keyword per
# line so individual settings are easy to scan and change.
est = lgb.LGBMClassifier(
    boosting_type='gbdt',
    num_leaves=31,
    max_depth=-1,
    learning_rate=0.1,
    n_estimators=300,
    subsample_for_bin=200000,
    objective='multiclass',
    class_weight='balanced',
    min_split_gain=0.0,
    min_child_weight=0.001,
    min_child_samples=20,
    subsample=1.0,
    subsample_freq=0,
    colsample_bytree=1.0,
    reg_alpha=0.0,
    reg_lambda=0.0,
    random_state=42,
    n_jobs=4,
)

In [51]:
# Fit the classifier on the training part of the split.
# NOTE(review): X_tr holds rows of the processed series X_train; the signature
# features computed earlier (sigs) are not used here -- confirm this is intended.
est.fit(X_tr,y_tr)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=1.0, importance_type='split', learning_rate=0.1,
               max_depth=-1, min_child_samples=20, min_child_weight=0.001,
               min_split_gain=0.0, n_estimators=300, n_jobs=4, num_leaves=31,
               objective='multiclass', random_state=42, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [25]:
from sklearn.metrics import accuracy_score

In [52]:
# Predict class labels for the held-out validation split.
y_pred = est.predict(X_val)

In [53]:
# Accuracy of the predictions on the held-out validation split.
acc = accuracy_score(y_val, y_pred)
acc
# Results noted from earlier runs:
# lgm all obs: 0.56 acc
# lgm 0.05 obs: 0.17 acc
# NOTE(review): the saved output of this cell is 0.3256, which matches neither
# figure above -- these notes look stale; re-run and update them.


0.32558139534883723

# Gridsearch

In [51]:
# Candidate values per hyper-parameter for the grid search
# (the search call itself is not part of this cell).
parameters = dict(
    n_estimators=[100],
    learning_rate=[0.1, 0.05],
    boosting_type=['gbdt', 'dart'],
    num_leaves=[15, 30],
    max_depth=[10, -1],
)