Ordinal Patterns implementation

In [35]:
import numpy as np
import pandas as pd 
import ordpy as ord
import os
import math
import itertools
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tqdm.notebook import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import ParameterGrid
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, confusion_matrix

EVAL_DICT = {'accuracy ' : accuracy_score, "recall" : recall_score, "f1 score" : f1_score}

plt.style.use('ggplot')

In [4]:

def evaluate_model(preds, labels):
    res = {}
    for mname, metric in EVAL_DICT.items():

        if 'f1' in mname:
            res[mname] = metric(labels, preds, average='binary')

        else:
            res[mname] = metric(labels, preds)

    return res


def display_metrics(metrics):

    for mname, value in metrics.items():
        if mname != 'confusion_matrix':
            print(f"{mname}:{value:.4f}", end="\t")
        else:
            print()
            print(mname)
            print(value)

In [62]:
def recover_data_from_df(csv_path= "../data/processed/final/train_val_test", filter= False):

    data_splits = {t.split('.')[0] : t for t in os.listdir(csv_path)}
    final_data = {key : [] for key in data_splits}
    final_targets = {key : [] for key in data_splits}
    final_names = {key : [] for key in data_splits}


    for dt, dpath in data_splits.items(): 
        final_path = os.path.join(csv_path, dpath)

        for sample, df in pd.read_csv(final_path).groupby('sample'):
            
            label = df['label'].iloc[0]
            workout = df['Workout'].iloc[0]
            df.drop(columns=['sample', 'label', 'Workout'], inplace=True)

            apn_obj = df.T.values

            final_data[dt].append(apn_obj)
            final_targets[dt].append(label)
            final_names[dt].append(workout)


        final_targets[dt] = np.array(final_targets[dt])

    return final_data, final_targets, final_names

ft, fy, fn = recover_data_from_df()

In [6]:
x = ft['train_data'][0] # uma unica amostra da serie, nao tem como fazer com todas
 

def get_op_from_ts(x, dx, dy=1, taux=10):
    
    """
    Applies the Bandt and Pompe symbolization approach to obtain 
    a probability distribution of ordinal patterns (permutations) from data, adapted from ordpy.
    
    Parameters
    ----------
    data : array 
           Array object in the shape (channels, len)
    dx : int
         Embedding dimension (horizontal axis) (default: 3).
    dy : int
         Embedding dimension (vertical axis); it must be 1 for time series 
         (default: 1).
    taux : int
           Embedding delay (horizontal axis) (default: 1).
    tauy : int
           Embedding delay (vertical axis) (default: 1).

    """

    symbols = ord.ordinal_sequence(x, dx=dx, dy= dy, taux=taux)

    if np.shape(symbols).__len__() == 3:
        symbols = symbols.reshape(-1, dx*dy) 
    else:
        pass
    
    all_symbols            = np.asarray(list(itertools.permutations(range(dx*dy))), dtype='int')
    symbols, symbols_count = np.unique(symbols, return_counts=True, axis=0)
    probabilities          = symbols_count/symbols_count.sum()

    all_symbols_str       = np.apply_along_axis(np.char.strip, 0, 
                                                np.apply_along_axis(np.array2string, 1, all_symbols, separator=''),
                                                chars="[]")

    if len(probabilities)==math.factorial(dx*dy):
        # return MinMaxScaler().fit_transform(probabilities)
        return dict(zip(all_symbols_str, probabilities))
        # return (probabilities - probabilities.min()) / (probabilities.max() - probabilities.min())
    

    all_probs             = np.full(math.factorial(dx*dy), 0.)
    dict_probs = dict(zip(all_symbols_str, all_probs))

    for symbol, probability in zip(all_symbols_str, probabilities):
        dict_probs[symbol] = probability
    
    # normalizing 

    # v = MinMaxScaler().fit_transform(np.array(dict_probs.values()).reshape(-1, 1))
    # v = (v - np.min(v)) / (np.max(v) - np.min(v))
    return dict_probs

def get_op(datalist, dx, dy=1, taux =10):
    try:
        return np.array([(list(get_op_from_ts(x, dx, dy, taux=taux).values())) for x in datalist])
    except:
        return np.array([(list(get_op_from_ts(x, dx, dy, taux=taux))) for x in datalist])



dprob = get_op_from_ts(x, 5, 1)

# plt.bar(range(len(dprob)), dprob.values())


In [7]:
xtrain, xval, xtest = get_op(ft['train_data'], 2), get_op(ft['validation_data'], 2), get_op(ft['test_data'], 2)

print(xtrain.shape)
print(xval.shape)
print(xtest.shape)

(383, 2)
(107, 2)
(55, 2)


In [13]:

def loop_clfs(xtrain, xval):

    classifiers = {"5-nn" : KNeighborsClassifier(), "SVM" : SVC(), "LogReg" : LogisticRegression(), "RF" : RandomForestClassifier()}
    res = {}


    for clf, cfoo in classifiers.items():
        
        pipe = Pipeline([('scaler', MinMaxScaler()), (clf, cfoo)])

        pipe.fit(xtrain, fy['train_data'])
        pred_val = pipe.predict(xval)

        dval = evaluate_model(pred_val, labels=fy['validation_data']) 
        res[clf] = dval

    return res


def grid_search_on_OP(params):

    ans = {"5-nn" : [], "SVM" : [], "LogReg" : [], "RF" : []}
    for p in tqdm(params):
        dx, dy, taux = p['dx'], p['dy'], p['taux']
        
        xtrain = MinMaxScaler().fit_transform(get_op(ft['train_data'], dx, dy, taux))
        xval = MinMaxScaler().fit_transform(get_op(ft['validation_data'], dx, dy, taux))
        xtest = MinMaxScaler().fit_transform(get_op(ft['test_data'], dx, dy, taux))

        t = loop_clfs(xtrain, xval) # * --> dict := (clf, clf_metrics) 
        
                
        for clf, metrics in t.items():

            metrics['dx'] = dx
            metrics['dy'] = dy
            metrics['taux'] = taux

            ans[clf].append(metrics)


        # return ans
    return {key: pd.DataFrame.from_dict(v) for key, v in ans.items()}

params = ParameterGrid(
    {'dx' : [3, 4], 'dy' : [1, 2], 'taux' : [1, 10, 20] }
    # {'dx' : [2], 'dy' : [1], 'taux' : [1, 10] }

    
)


dict_ans = grid_search_on_OP(params)


        

  0%|          | 0/12 [00:00<?, ?it/s]

In [14]:
for key, value in dict_ans.items():
    print(key)
    display(value)


5-nn


Unnamed: 0,accuracy,recall,f1 score,dx,dy,taux
0,0.71028,0.520833,0.617284,3,1,1
1,0.672897,0.520833,0.588235,3,1,10
2,0.588785,0.5625,0.55102,3,1,20
3,0.71028,0.520833,0.617284,3,2,1
4,0.775701,0.833333,0.769231,3,2,10
5,0.738318,0.875,0.75,3,2,20
6,0.794393,0.645833,0.738095,4,1,1
7,0.785047,0.8125,0.772277,4,1,10
8,0.672897,0.479167,0.567901,4,1,20
9,0.719626,0.395833,0.558824,4,2,1


SVM


Unnamed: 0,accuracy,recall,f1 score,dx,dy,taux
0,0.71028,0.479167,0.597403,3,1,1
1,0.71028,0.479167,0.597403,3,1,10
2,0.635514,0.708333,0.635514,3,1,20
3,0.607477,0.979167,0.691176,3,2,1
4,0.925234,0.9375,0.918367,3,2,10
5,0.897196,0.916667,0.888889,3,2,20
6,0.82243,0.666667,0.771084,4,1,1
7,0.747664,0.791667,0.737864,4,1,10
8,0.766355,0.520833,0.666667,4,1,20
9,0.448598,1.0,0.619355,4,2,1


LogReg


Unnamed: 0,accuracy,recall,f1 score,dx,dy,taux
0,0.691589,0.520833,0.60241,3,1,1
1,0.588785,0.4375,0.488372,3,1,10
2,0.616822,0.333333,0.438356,3,1,20
3,0.803738,0.75,0.774194,3,2,1
4,0.915888,0.916667,0.907216,3,2,10
5,0.850467,0.75,0.818182,3,2,20
6,0.82243,0.645833,0.765432,4,1,1
7,0.663551,0.916667,0.709677,4,1,10
8,0.728972,0.4375,0.591549,4,1,20
9,0.738318,0.833333,0.740741,4,2,1


RF


Unnamed: 0,accuracy,recall,f1 score,dx,dy,taux
0,0.71028,0.520833,0.617284,3,1,1
1,0.663551,0.520833,0.581395,3,1,10
2,0.654206,0.5625,0.593407,3,1,20
3,0.757009,0.6875,0.717391,3,2,1
4,0.775701,0.645833,0.72093,3,2,10
5,0.82243,0.854167,0.811881,3,2,20
6,0.841121,0.6875,0.795181,4,1,1
7,0.747664,0.666667,0.703297,4,1,10
8,0.691589,0.333333,0.492308,4,1,20
9,0.728972,0.666667,0.688172,4,2,1


In [16]:
xtrain[0]

array([0.50424279, 0.49575721])

In [77]:
train_ce = []

for dt in ft['train_data']:

    # entropy, complexity
    train_ce.append(ord.complexity_entropy(dt, dx=4, dy=2, taux=10))


train_ce = np.array(train_ce)

In [78]:
train_ce.shape

(383, 2)

In [None]:
colors = ['tab:green' if k == 1 else 'tab:red' for k in fy['train_data']]

#theoretical curves
hc_max_curve = ord.maximum_complexity_entropy(dx=4, dy=2).T
hc_min_curve = ord.minimum_complexity_entropy(dx=4, dy=2).T

fig, axs = plt.subplots(1, 2, figsize= (10,5))

#theoretical curves
hmin, cmin = hc_min_curve  #(this variable is defined in the cell above)
hmax, cmax = hc_max_curve  #(this variable is defined in the cell above)
plt.plot(hmin, cmin, linewidth=1.5, color="#161414", zorder=0)
plt.plot(hmax, cmax, linewidth=1.5, color="#161414", zorder=0)
plt.scatter(train_ce.T[0,:], train_ce.T[1,:], c=colors)
axs[0].scatter(train_ce.T[0,:], train_ce.T[1,:], c=colors)

