In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sktime # !pip install sktime
import sklearn
from pyts.approximation import PiecewiseAggregateApproximation
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RepeatedStratifiedKFold
from tslearn import metrics 
from datetime import date, time
from importlib import reload
from sklearn.neighbors import KNeighborsClassifier
from scipy.spatial.distance import pdist,squareform
import itertools 
import os

In [2]:
def euclid_dist(t1,t2):
    return np.sqrt(np.sum((t1-t2)**2))

In [3]:
metric_dict = {
    "lcss"  : metrics.lcss,
    "gak" : metrics.gak,
    "dtw" : metrics.dtw,
    "euclidian" : euclid_dist
}

In [4]:
PowerCons_TRAIN = pd.read_csv("PowerCons/PowerCons_TRAIN.txt",delim_whitespace=True,header=None)
PowerCons_TRAIN.columns = ["target"] + [i for i in np.arange(1,PowerCons_TRAIN.shape[1])]

In [5]:
PowerCons_TEST = pd.read_csv("PowerCons/PowerCons_TEST.txt",delim_whitespace=True,header=None)
PowerCons_TEST.columns = ["target"] + [i for i in np.arange(1,PowerCons_TEST.shape[1])]

In [6]:
global_constraints = ["sakoe_chiba","itakura"]
sakoe_chiba_radius_values = [10]
itakura_max_slope_values = [10]
lcss_eps = [3,10]
sigma_values = [1,2]
k_neighbor = [1,3,5]
n_split = 10
n_repeat = 5
combinations = list(itertools.product(np.arange(n_repeat), np.arange(n_split)))
rskf = RepeatedStratifiedKFold(n_splits=n_split, n_repeats=n_repeat,
    random_state=42)
data_dict = ["Original","DT","Diff"]

In [7]:
y = PowerCons_TRAIN["target"].values
X =  PowerCons_TRAIN.drop("target",1).values

In [8]:
folder_name =  "indices_for_PowerCons"
if not os.path.exists(folder_name):
    os.makedirs(folder_name)

counter = 0
for train_index, test_index in rskf.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)

    np.save(f"{folder_name}/{combinations[counter][0]}_{combinations[counter][1]}_train_indices.npy",train_index)
    np.save(f"{folder_name}/{combinations[counter][0]}_{combinations[counter][1]}_val_indices.npy",test_index)

    counter += 1

TRAIN: [  0   1   2   3   5   6   7   8   9  10  11  12  13  14  16  17  18  19
  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37
  38  39  40  41  42  43  44  45  46  47  48  49  50  51  53  54  55  56
  58  59  60  61  63  64  65  66  67  69  70  71  72  73  74  75  76  78
  79  81  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98
  99 100 101 103 104 105 106 107 108 109 110 112 113 114 115 116 117 118
 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 138
 139 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
 158 159 160 161 162 163 164 165 167 168 170 171 172 173 174 175 176 179] TEST: [  4  15  52  57  62  68  77  80  82 102 111 119 137 140 166 169 177 178]
TRAIN: [  0   1   2   3   4   5   6   7   9  10  11  12  13  15  16  17  18  19
  20  21  22  23  24  26  27  28  29  30  31  32  33  34  36  37  39  40
  41  42  43  44  46  47  48  49  50  51  52  53  54  55  57  58  59  61
  62  63  64  66  67  68  69 

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  23  24  25  27  28  29  31  32  33  34  35  36  37  38
  39  40  41  42  43  44  45  46  47  48  50  51  52  53  55  56  57  58
  59  61  62  63  64  66  67  68  69  70  71  72  73  74  75  76  77  78
  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  97
  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
 116 117 118 119 120 121 123 124 125 126 127 128 129 130 131 132 133 134
 135 136 138 139 140 141 142 143 146 148 149 150 151 154 155 157 158 159
 160 161 163 164 165 166 167 169 170 171 172 173 174 175 176 177 178 179] TEST: [ 22  26  30  49  54  60  65  96 122 137 144 145 147 152 153 156 162 168]
TRAIN: [  0   1   2   3   4   6   7   8   9  10  11  12  13  15  16  17  18  19
  20  21  22  23  24  26  27  28  29  30  31  32  33  34  35  36  37  38
  39  40  41  42  43  44  47  48  49  50  51  52  53  54  55  56  57  58
  59  60  61  63  64  65  66 

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  12  13  14  15  16  17  18
  19  20  21  22  23  24  26  27  28  29  30  31  33  34  35  36  38  39
  40  41  42  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58
  59  61  62  63  64  65  66  67  68  69  70  72  74  75  76  77  78  79
  81  82  83  84  85  86  87  88  91  92  93  94  95  96  97 100 101 102
 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158
 159 160 161 162 163 164 165 166 167 168 169 172 173 175 176 177 178 179] TEST: [ 11  25  32  37  43  60  71  73  80  89  90  98  99 121 122 170 171 174]
TRAIN: [  1   2   3   4   5   6   7   8  10  11  12  13  14  15  16  17  19  20
  21  22  23  24  25  27  28  29  30  31  32  33  34  35  36  37  38  39
  41  42  43  44  45  46  48  49  50  51  52  53  54  55  56  57  58  60
  61  62  63  64  65  66  67 

TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  20  21  23  24  26  27  28  29  30  31  32  33  34  35  36  37  38
  39  40  41  42  43  44  45  46  47  48  49  51  52  53  54  55  56  57
  58  59  60  61  62  64  65  66  67  68  69  70  71  72  73  74  75  76
  77  78  79  80  81  82  84  85  87  88  89  90  91  92  93  95  97  98
  99 100 101 102 103 104 105 106 107 110 111 112 113 114 116 118 119 120
 121 122 123 124 125 126 127 128 129 131 132 133 134 135 136 137 139 140
 141 142 143 144 145 146 147 148 150 151 152 153 154 155 156 157 158 159
 160 162 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179] TEST: [ 19  22  25  50  63  83  86  94  96 108 109 115 117 130 138 149 161 163]
TRAIN: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  15  16  17  19
  21  22  23  24  25  26  27  28  29  30  31  32  35  36  37  38  39  40
  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60
  61  62  63  64  65  67  69 

In [9]:

result_all = []
hyper_all = []
    
for params in [["lcss","eps"],["dtw","itakura"],["dtw","sakoe_chiba"],["gak","sigma"],["euclidian","No"]]:
    distance_metric = params[0]
    constraint = params[1]
    #print(distance_metric,constraint)
    if constraint == "eps":
        constraint_value_array = lcss_eps
    
    if constraint == "sigma":
        constraint_value_array = sigma_values
    elif constraint == "itakura":
        constraint_value_array = itakura_max_slope_values
    elif constraint == "sakoe_chiba":
        constraint_value_array = sakoe_chiba_radius_values
    elif constraint == "No":
        constraint_value_array = ["Euclidian"]

    for constraint_value in constraint_value_array:
        
        for neighbor in k_neighbor:
            fea_columns = list(PowerCons_TRAIN.columns[1:])
            train_ts_data = PowerCons_TRAIN
            reg_data = PowerCons_TRAIN[fea_columns].T.reset_index(drop=True)
            dt_data = []
            for col in reg_data.columns:
                reg = DecisionTreeRegressor(min_samples_split = 3, min_samples_leaf = 2, max_depth= 3,ccp_alpha=0)
                reg_data_ex = reg_data[col].reset_index()
                reg_data_ex.columns = ["time","ts"]
                dt = reg.fit(reg_data_ex[["time"]],reg_data_ex.ts).predict(reg_data_ex[["time"]])
                dt_data.append(dt)
           
            train_ts_dt_data = pd.DataFrame(np.stack(dt_data),columns = fea_columns,index=PowerCons_TRAIN[fea_columns].index)
            train_ts_diff_data = PowerCons_TRAIN[fea_columns].diff(axis=1).dropna(axis=1)

            for i,ts_data_type in enumerate([train_ts_data,train_ts_dt_data,train_ts_diff_data]):

                hyper_all.append([distance_metric,constraint,constraint_value,neighbor,data_dict[i]])
                print("Data Type", data_dict[i], " Constraint: " ,constraint," Constraint Value: ",constraint_value,"# of Neighbors: ",neighbor)

                if i!=2:
                    X = ts_data_type[fea_columns]
                else:
                    X = ts_data_type[fea_columns[1:]]
                y = train_ts_data["target"].replace(-1,2)
                accuracy_list = []
                if constraint=="sigma":
                    metric_args = {"sigma": constraint_value}
                elif constraint == "eps":
                    metric_args = {"eps": constraint_value}
                elif constraint == "itakura":
                    metric_args = {"global_constraint": constraint,"itakura_max_slope":constraint_value}

                elif constraint == "sakoe_chiba":
                    metric_args = {"global_constraint": constraint,"sakoe_chiba_radius":constraint_value}
                else:
                    metric_args = {}

                dm = pdist(X, lambda x,y:  metric_dict[distance_metric](x,y,**metric_args))
                distance_matrix = squareform(dm)
                np.fill_diagonal(distance_matrix,np.inf)

                for comb in combinations:
                    train_index = np.load(f"{folder_name}/{comb[0]}_{comb[1]}_train_indices.npy")
                    test_index = np.load(f"{folder_name}/{comb[0]}_{comb[1]}_val_indices.npy")
                    X_train = X.iloc[train_index]
                    X_val = X.iloc[test_index]
                    y_train = y.iloc[train_index].values.astype(np.int64)
                    y_test = y.iloc[test_index].values.astype(np.int64)
                     
                    preds = []
                    for test_idx in test_index:
                        sorted_indices = np.argsort(distance_matrix[test_idx][train_index])
                        if neighbor == 1:
                            pred = y_train[sorted_indices[:neighbor]][0]
                        else:
                            pred = np.bincount((y_train).astype(np.int64)[sorted_indices[:neighbor]]).argmax()
                        preds.append(pred)

                    accuracy = sklearn.metrics.accuracy_score(y_test,preds)
                    accuracy_list.append([comb[0],comb[1],accuracy])
                res_df = pd.DataFrame(accuracy_list,columns = ["Repeat Id","Split Id","Accuracy"])
                #res_df.Accuracy.plot.box()
                #plt.title("Accuracy Distribution for Different Folds\n" + f" distance metric: {distance_metric}, K (# of neighbors) : {neighbor}, Data Type : {data_dict[i]}")
                #plt.show()

                print("Mean Accuracy is : ", round(100*res_df.Accuracy.mean(),2))
                print("Std Accuracy is : ", round(100*res_df.Accuracy.std(),2))
                print("="*50)
                result_all.append([round(100*res_df.Accuracy.mean(),2),round(100*res_df.Accuracy.std(),2)])



Data Type Original  Constraint:  eps  Constraint Value:  3 # of Neighbors:  1
Mean Accuracy is :  33.78
Std Accuracy is :  7.6
Data Type DT  Constraint:  eps  Constraint Value:  3 # of Neighbors:  1
Mean Accuracy is :  36.44
Std Accuracy is :  7.29
Data Type Diff  Constraint:  eps  Constraint Value:  3 # of Neighbors:  1
Mean Accuracy is :  48.56
Std Accuracy is :  8.97
Data Type Original  Constraint:  eps  Constraint Value:  3 # of Neighbors:  3
Mean Accuracy is :  31.0
Std Accuracy is :  8.55
Data Type DT  Constraint:  eps  Constraint Value:  3 # of Neighbors:  3
Mean Accuracy is :  34.67
Std Accuracy is :  7.82
Data Type Diff  Constraint:  eps  Constraint Value:  3 # of Neighbors:  3
Mean Accuracy is :  48.22
Std Accuracy is :  9.75
Data Type Original  Constraint:  eps  Constraint Value:  3 # of Neighbors:  5
Mean Accuracy is :  27.22
Std Accuracy is :  10.42
Data Type DT  Constraint:  eps  Constraint Value:  3 # of Neighbors:  5
Mean Accuracy is :  31.44
Std Accuracy is :  7.66
Dat

Mean Accuracy is :  26.44
Std Accuracy is :  8.37
Data Type Diff  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  1
Mean Accuracy is :  50.0
Std Accuracy is :  0.0
Data Type Original  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  3
Mean Accuracy is :  32.89
Std Accuracy is :  8.08
Data Type DT  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  3
Mean Accuracy is :  30.0
Std Accuracy is :  8.09
Data Type Diff  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  3
Mean Accuracy is :  49.78
Std Accuracy is :  1.1
Data Type Original  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  5
Mean Accuracy is :  29.33
Std Accuracy is :  8.17
Data Type DT  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  5
Mean Accuracy is :  26.78
Std Accuracy is :  8.82
Data Type Diff  Constraint:  sigma  Constraint Value:  2 # of Neighbors:  5
Mean Accuracy is :  49.11
Std Accuracy is :  2.34
Data Type Original  Constraint:  No  Constraint Value:  Euclidian # 

In [17]:
results = pd.concat([pd.DataFrame(hyper_all,columns = ["Metric","DTW Type","Constraint Value","K","Data Type"]),pd.DataFrame(result_all,columns = ["Mean Acc","Std Acc"])],1)


In [18]:
results.sort_values(["Mean Acc","Std Acc"],ascending=[False,True]).head(10)

Unnamed: 0,Metric,DTW Type,Constraint Value,K,Data Type,Mean Acc,Std Acc
57,euclidian,No,Euclidian,3,Original,97.22,3.41
60,euclidian,No,Euclidian,5,Original,97.22,3.59
58,euclidian,No,Euclidian,3,DT,95.78,3.82
54,euclidian,No,Euclidian,1,Original,95.44,4.3
61,euclidian,No,Euclidian,5,DT,95.44,4.45
33,dtw,sakoe_chiba,10,5,Original,95.33,3.78
28,dtw,sakoe_chiba,10,1,DT,95.33,5.19
30,dtw,sakoe_chiba,10,3,Original,95.11,4.71
27,dtw,sakoe_chiba,10,1,Original,95.11,4.97
55,euclidian,No,Euclidian,1,DT,95.0,5.41


In [19]:
best_params = results.head(1).values[0][:-2]

In [20]:
best_params

array(['lcss', 'eps', 3, 1, 'Original'], dtype=object)

In [21]:
params_dict = dict(zip(["Metric","DTW Type","Constraint Value","K","Data Type"],best_params))


In [22]:
params_dict

{'Metric': 'lcss',
 'DTW Type': 'eps',
 'Constraint Value': 3,
 'K': 1,
 'Data Type': 'Original'}

In [23]:
### test data prediction with best parameters
print("="*40)
X_test = PowerCons_TEST[fea_columns] 
y_test = PowerCons_TEST["target"].values
X  = PowerCons_TRAIN[fea_columns] 
y  = PowerCons_TRAIN["target"].values



In [24]:
reg_data = PowerCons_TRAIN[fea_columns] .T.reset_index(drop=True)

In [25]:
dt_data = []
for col in reg_data.columns:
    reg = DecisionTreeRegressor(min_samples_split = 3, min_samples_leaf = 2, max_depth= 3,ccp_alpha=0)
    reg_data_ex = reg_data[col].reset_index()
    reg_data_ex.columns = ["time","ts"]
    dt = reg.fit(reg_data_ex[["time"]],reg_data_ex.ts).predict(reg_data_ex[["time"]])
    dt_data.append(dt)
train_ts_dt_data = pd.DataFrame(np.stack(dt_data),columns = fea_columns,index=train_ts_data.index)
train_ts_diff_data = train_ts_data[fea_columns].diff(axis=1).dropna(axis=1)


In [26]:
constraint = params_dict["Constraint Value"]
distance_metric = params_dict["Metric"]
neighbor = params_dict["K"]
data_type = params_dict["Data Type"]

if constraint=="gamma":
    metric_args = {"g": constraint_value}
elif constraint == "eps":
    metric_args = {"eps": constraint_value}
elif constraint == "itakura":
    metric_args = {"global_constraint": constraint,"itakura_max_slope":constraint_value}

elif constraint == "sakoe_chiba":
    metric_args = {"global_constraint": constraint,"sakoe_chiba_radius":constraint_value}
else:
    metric_args = {}

In [27]:
preds = []

if data_type == "Original":
    train_X = X
elif data_type == "DT":
    train_X = train_ts_dt_data
elif data_type == "Diff":
    train_X = train_ts_diff_data
for i in range(X_test.shape[0]):    
    if data_type in ["Original","DT"]:
        distances = [metric_dict[distance_metric](x.values,X_test.values[i,:],**metric_args) for idx,x in train_X.iterrows()]
    else: # Diff data have one less value
        distances = [metric_dict[distance_metric](x.values,X_test.values[i,1:],**metric_args) for idx,x in train_X.iterrows()]

    sorted_indices = np.argsort(distances)
    if neighbor == 1:
        pred = y[sorted_indices[:neighbor]][0]
    else:
        pred = np.bincount((y).astype(np.int64)[sorted_indices[:neighbor]]).argmax()
    preds.append(pred)    

In [28]:
print("Best params : ",params_dict)
print("Accuracy from best params :", sklearn.metrics.accuracy_score(y_test,preds))


Best params :  {'Metric': 'lcss', 'DTW Type': 'eps', 'Constraint Value': 3, 'K': 1, 'Data Type': 'Original'}
Accuracy from best params : 0.13333333333333333
