In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sktime # !pip install sktime
import sklearn
from pyts.approximation import PiecewiseAggregateApproximation
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RepeatedStratifiedKFold
from tslearn import metrics 
from datetime import date, time
from importlib import reload
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial.distance import pdist,squareform
import itertools 
import os

In [2]:
def euclid_dist(t1,t2):
    return np.sqrt(np.sum((t1-t2)**2))

In [3]:
metric_dict = {
    "lcss"  : metrics.lcss,
    "gak" : metrics.gak,
    "dtw" : metrics.dtw,
    "euclidian" : euclid_dist
}

In [4]:
Trace_TRAIN = pd.read_csv("Trace/Trace_TRAIN.txt",delim_whitespace=True,header=None)
Trace_TRAIN.columns = ["target"] + [i for i in np.arange(1,Trace_TRAIN.shape[1])]

In [5]:
Trace_TEST = pd.read_csv("Trace/Trace_TEST.txt",delim_whitespace=True,header=None)
Trace_TEST.columns = ["target"] + [i for i in np.arange(1,Trace_TEST.shape[1])]

In [6]:
global_constraints = ["sakoe_chiba","itakura"]
sakoe_chiba_radius_values = [10]
itakura_max_slope_values = [10]
lcss_eps = [3,10]
sigma_values = [1,2]
k_neighbor = [1,3,5]
n_split = 10
n_repeat = 5
combinations = list(itertools.product(np.arange(n_repeat), np.arange(n_split)))
rskf = RepeatedStratifiedKFold(n_splits=n_split, n_repeats=n_repeat,
    random_state=42)
data_dict = ["Original","DT","Diff"]

In [7]:
y = Trace_TRAIN["target"].values
X =  Trace_TRAIN.drop("target",1).values

In [8]:
folder_name =  "indices_for_trace"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

counter = 0
for train_index, test_index in rskf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)

    np.save(f"{folder_name}/{combinations[counter][0]}_{combinations[counter][1]}_train_indices.npy",train_index)
    np.save(f"{folder_name}/{combinations[counter][0]}_{combinations[counter][1]}_val_indices.npy",test_index)

    counter += 1

TRAIN: [ 0  1  2  3  4  5  6  8  9 10 11 12 14 15 16 18 19 20 21 22 23 24 26 28
 29 30 33 34 35 36 37 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
 56 57 58 59 60 61 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
 81 82 83 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99] TEST: [ 7 13 17 25 27 31 32 38 62 84]
TRAIN: [ 0  1  2  4  5  7  8  9 10 11 12 13 15 16 17 18 19 20 21 22 23 24 25 26
 27 28 29 30 31 32 33 34 35 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
 52 54 56 57 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79
 80 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 98 99] TEST: [ 3  6 14 36 53 55 58 59 81 97]
TRAIN: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 17 18 19 20 21 22 24 25 26
 27 28 29 30 31 32 33 34 35 36 38 39 40 41 42 43 45 47 49 50 51 52 53 54
 55 56 57 58 59 60 61 62 63 64 65 66 67 68 70 71 72 73 74 75 76 77 78 79
 80 81 82 83 84 85 86 88 89 90 91 92 93 94 95 96 97 99] TEST: [ 0 16 23 37 44 46 48 69 87 98]
TRAIN: [ 0  1  2  3  4  5  6  7  8 10 11

In [9]:

result_all = []
hyper_all = []
    
for params in [["lcss","eps"],["dtw","itakura"],["dtw","sakoe_chiba"],["gak","sigma"],["euclidian","No"]]:
    distance_metric = params[0]
    constraint = params[1]
    #print(distance_metric,constraint)
    if constraint == "eps":
        constraint_value_array = lcss_eps
    
    if constraint == "sigma":
        constraint_value_array = sigma_values
    elif constraint == "itakura":
        constraint_value_array = itakura_max_slope_values
    elif constraint == "sakoe_chiba":
        constraint_value_array = sakoe_chiba_radius_values
    elif constraint == "No":
        constraint_value_array = ["Euclidian"]

    for constraint_value in constraint_value_array:
        
        for neighbor in k_neighbor:
            fea_columns = list(Trace_TRAIN.columns[1:])
            train_ts_data = Trace_TRAIN
            reg_data = Trace_TRAIN[fea_columns].T.reset_index(drop=True)
            dt_data = []
            for col in reg_data.columns:
                reg = DecisionTreeRegressor(min_samples_split = 3, min_samples_leaf = 2, max_depth= 3,ccp_alpha=0)
                reg_data_ex = reg_data[col].reset_index()
                reg_data_ex.columns = ["time","ts"]
                dt = reg.fit(reg_data_ex[["time"]],reg_data_ex.ts).predict(reg_data_ex[["time"]])
                dt_data.append(dt)
           
            train_ts_dt_data = pd.DataFrame(np.stack(dt_data),columns = fea_columns,index=Trace_TRAIN[fea_columns].index)
            train_ts_diff_data = Trace_TRAIN[fea_columns].diff(axis=1).dropna(axis=1)

            for i,ts_data_type in enumerate([train_ts_data,train_ts_dt_data,train_ts_diff_data]):

                hyper_all.append([distance_metric,constraint,constraint_value,neighbor,data_dict[i]])
                print("Data Type", data_dict[i], " Constraint: " ,constraint," Constraint Value: ",constraint_value,"# of Neighbors: ",neighbor)

                if i!=2:
                    X = ts_data_type[fea_columns]
                else:
                    X = ts_data_type[fea_columns[1:]]
                y = train_ts_data["target"].replace(-1,2)
                accuracy_list = []
                if constraint=="sigma":
                    metric_args = {"sigma": constraint_value}
                elif constraint == "eps":
                    metric_args = {"eps": constraint_value}
                elif constraint == "itakura":
                    metric_args = {"global_constraint": constraint,"itakura_max_slope":constraint_value}

                elif constraint == "sakoe_chiba":
                    metric_args = {"global_constraint": constraint,"sakoe_chiba_radius":constraint_value}
                else:
                    metric_args = {}

                dm = pdist(X, lambda x,y:  metric_dict[distance_metric](x,y,**metric_args))
                distance_matrix = squareform(dm)
                np.fill_diagonal(distance_matrix,np.inf)

                for comb in combinations:
                    train_index = np.load(f"{folder_name}/{comb[0]}_{comb[1]}_train_indices.npy")
                    test_index = np.load(f"{folder_name}/{comb[0]}_{comb[1]}_val_indices.npy")
                    X_train = X.iloc[train_index]
                    X_val = X.iloc[test_index]
                    y_train = y.iloc[train_index].values.astype(np.int64)
                    y_test = y.iloc[test_index].values.astype(np.int64)
                     
                    preds = []
                    for test_idx in test_index:
                        sorted_indices = np.argsort(distance_matrix[test_idx][train_index])
                        if neighbor == 1:
                            pred = y_train[sorted_indices[:neighbor]][0]
                        else:
                            pred = np.bincount((y_train).astype(np.int64)[sorted_indices[:neighbor]]).argmax()
                        preds.append(pred)

                    accuracy = sklearn.metrics.accuracy_score(y_test,preds)
                    accuracy_list.append([comb[0],comb[1],accuracy])
                res_df = pd.DataFrame(accuracy_list,columns = ["Repeat Id","Split Id","Accuracy"])
                #res_df.Accuracy.plot.box()
                #plt.title("Accuracy Distribution for Different Folds\n" + f" distance metric: {distance_metric}, K (# of neighbors) : {neighbor}, Data Type : {data_dict[i]}")
                #plt.show()

                print("Mean Accuracy is : ", round(100*res_df.Accuracy.mean(),2))
                print("Std Accuracy is : ", round(100*res_df.Accuracy.std(),2))
                print("="*50)
                result_all.append([round(100*res_df.Accuracy.mean(),2),round(100*res_df.Accuracy.std(),2)])



Data Type Original  Constraint:  eps  Constraint Value:  3 # of Neighbors:  1
Mean Accuracy is :  16.0
Std Accuracy is :  8.57
Data Type DT  Constraint:  eps  Constraint Value:  3 # of Neighbors:  1
Mean Accuracy is :  25.0
Std Accuracy is :  6.14
Data Type Diff  Constraint:  eps  Constraint Value:  3 # of Neighbors:  1
Mean Accuracy is :  26.0
Std Accuracy is :  4.95
Data Type Original  Constraint:  eps  Constraint Value:  3 # of Neighbors:  3
Mean Accuracy is :  10.2
Std Accuracy is :  7.69
Data Type DT  Constraint:  eps  Constraint Value:  3 # of Neighbors:  3
Mean Accuracy is :  23.6
Std Accuracy is :  6.63
Data Type Diff  Constraint:  eps  Constraint Value:  3 # of Neighbors:  3
Mean Accuracy is :  26.4
Std Accuracy is :  6.93
Data Type Original  Constraint:  eps  Constraint Value:  3 # of Neighbors:  5
Mean Accuracy is :  11.8
Std Accuracy is :  7.74
Data Type DT  Constraint:  eps  Constraint Value:  3 # of Neighbors:  5
Mean Accuracy is :  22.2
Std Accuracy is :  7.08
Data Type 

Mean Accuracy is :  25.4
Std Accuracy is :  5.03
Data Type Diff  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  1
Mean Accuracy is :  25.4
Std Accuracy is :  5.03
Data Type Original  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  3
Mean Accuracy is :  25.6
Std Accuracy is :  5.01
Data Type DT  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  3
Mean Accuracy is :  25.6
Std Accuracy is :  5.01
Data Type Diff  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  3
Mean Accuracy is :  25.6
Std Accuracy is :  5.01
Data Type Original  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  5
Mean Accuracy is :  25.0
Std Accuracy is :  5.44
Data Type DT  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  5
Mean Accuracy is :  25.0
Std Accuracy is :  5.44
Data Type Diff  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  5
Mean Accuracy is :  25.0
Std Accuracy is :  5.44
Data Type Original  Constraint:  No  Constraint Value:  Euclidian # of N

In [12]:
results = pd.concat([pd.DataFrame(hyper_all,columns = ["Metric","DTW Type","Constraint Value","K","Data Type"]),pd.DataFrame(result_all,columns = ["Mean Acc","Std Acc"])],1)


In [13]:
results.sort_values(["Mean Acc","Std Acc"],ascending=[False,True]).head(10)

Unnamed: 0,Metric,DTW Type,Constraint Value,K,Data Type,Mean Acc,Std Acc
18,dtw,itakura,10,1,Original,100.0,0.0
21,dtw,itakura,10,3,Original,100.0,0.0
27,dtw,sakoe_chiba,10,1,Original,100.0,0.0
23,dtw,itakura,10,3,Diff,99.8,1.41
24,dtw,itakura,10,5,Original,99.6,2.83
26,dtw,itakura,10,5,Diff,99.2,2.74
20,dtw,itakura,10,1,Diff,98.8,3.28
30,dtw,sakoe_chiba,10,3,Original,98.6,3.51
29,dtw,sakoe_chiba,10,1,Diff,98.2,3.88
32,dtw,sakoe_chiba,10,3,Diff,95.8,6.73


In [14]:
best_params = results.head(1).values[0][:-2]

In [15]:
best_params

array(['lcss', 'eps', 3, 1, 'Original'], dtype=object)

In [16]:
params_dict = dict(zip(["Metric","DTW Type","Constraint Value","K","Data Type"],best_params))


In [17]:
params_dict

{'Metric': 'lcss',
 'DTW Type': 'eps',
 'Constraint Value': 3,
 'K': 1,
 'Data Type': 'Original'}

In [18]:
### test data prediction with best parameters
print("="*40)
X_test = Trace_TEST[fea_columns] 
y_test = Trace_TEST["target"].values
X  = Trace_TRAIN[fea_columns] 
y  = Trace_TRAIN["target"].values



In [19]:
reg_data = Trace_TRAIN[fea_columns].T.reset_index(drop=True)

In [20]:
dt_data = []
for col in reg_data.columns:
    reg = DecisionTreeRegressor(min_samples_split = 3, min_samples_leaf = 2, max_depth= 3,ccp_alpha=0)
    reg_data_ex = reg_data[col].reset_index()
    reg_data_ex.columns = ["time","ts"]
    dt = reg.fit(reg_data_ex[["time"]],reg_data_ex.ts).predict(reg_data_ex[["time"]])
    dt_data.append(dt)
train_ts_dt_data = pd.DataFrame(np.stack(dt_data),columns = fea_columns,index=train_ts_data.index)
train_ts_diff_data = train_ts_data[fea_columns].diff(axis=1).dropna(axis=1)


In [21]:
constraint = params_dict["Constraint Value"]
distance_metric = params_dict["Metric"]
neighbor = params_dict["K"]
data_type = params_dict["Data Type"]

if constraint=="gamma":
    metric_args = {"g": constraint_value}
elif constraint == "eps":
    metric_args = {"eps": constraint_value}
elif constraint == "itakura":
    metric_args = {"global_constraint": constraint,"itakura_max_slope":constraint_value}

elif constraint == "sakoe_chiba":
    metric_args = {"global_constraint": constraint,"sakoe_chiba_radius":constraint_value}
else:
    metric_args = {}

In [22]:
preds = []

if data_type == "Original":
    train_X = X
elif data_type == "DT":
    train_X = train_ts_dt_data
elif data_type == "Diff":
    train_X = train_ts_diff_data
for i in range(X_test.shape[0]):    
    if data_type in ["Original","DT"]:
        distances = [metric_dict[distance_metric](x.values,X_test.values[i,:],**metric_args) for idx,x in train_X.iterrows()]
    else: # Diff data have one less value
        distances = [metric_dict[distance_metric](x.values,X_test.values[i,1:],**metric_args) for idx,x in train_X.iterrows()]

    sorted_indices = np.argsort(distances)
    if neighbor == 1:
        pred = y[sorted_indices[:neighbor]][0]
    else:
        pred = np.bincount((y).astype(np.int64)[sorted_indices[:neighbor]]).argmax()
    preds.append(pred)    

In [23]:
print("Best params : ",params_dict)
print("Accuracy from best params :", sklearn.metrics.accuracy_score(y_test,preds))


Best params :  {'Metric': 'lcss', 'DTW Type': 'eps', 'Constraint Value': 3, 'K': 1, 'Data Type': 'Original'}
Accuracy from best params : 0.02
