# Implement Dynamic Time Warping Baseline

https://tslearn.readthedocs.io/en/stable/variablelength.html#clustering

https://tslearn.readthedocs.io/en/stable/user_guide/clustering.html#clustering

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
#CB: Pickle is used for serializing and de-serializing Python object structures
import pickle


def clean_plot():
    """Set up a single subplot with no visible frame.

    Creates subplot 111 (1 row, 1 column, first slot), hides all four
    spines, restricts tick marks to the bottom and left sides, and then
    calls ``plt.grid()`` on the current axes.
    """
    axes = plt.subplot(111)

    # Hide the box drawn around the plotting area.
    for side in ("top", "bottom", "right", "left"):
        axes.spines[side].set_visible(False)

    # Ticks only on the bottom (x) and left (y) edges.
    axes.get_xaxis().tick_bottom()
    axes.get_yaxis().tick_left()

    plt.grid()

#CB: Backend Management, to change the default values and styling
import matplotlib.pylab as pylab
# Every text element listed below is rendered with the same 'x-large' size.
params = dict.fromkeys(
    [
        'legend.fontsize',
        'axes.labelsize',
        'axes.titlesize',
        'xtick.labelsize',
        'ytick.labelsize',
    ],
    'x-large',
)
pylab.rcParams.update(params)

from tslearn.clustering import TimeSeriesKMeans, KernelKMeans
from tslearn.utils import to_time_series_dataset
from sklearn.metrics import adjusted_rand_score

  "Scikit-learn <0.24 will be deprecated in a "


In [2]:
pip install pyro-ppl

Note: you may need to restart the kernel to use updated packages.


In [3]:
# `pyro-ppl` (installed in the previous cell) exposes the module `pyro`.
# Pyro4 is an unrelated remote-objects package and must not be aliased to it.
import pyro

In [4]:
import logging

#CB: PyTorch is an open source machine learning framework based on the Torch library, 
#used for applications such as computer vision and natural language processing, primarily developed 
#by Meta AI.
import torch
import torch.nn as nn
import torch.nn.functional as F

from pyro.distributions import MultivariateNormal, Normal, Independent

from sklearn.cluster import KMeans, SpectralClustering
from sklearn.metrics import adjusted_rand_score

#CB: SciPy is a free and open-source Python library used for scientific computing and technical computing.
import scipy
from scipy.sparse import csgraph
from scipy.sparse.linalg import eigsh


In [5]:
import argparse
import os
import sys
sys.path.append('../data')
sys.path.append('../plot')
from load import sigmoid, quadratic, chf, parkinsons, load_data_format, load_synthetic_data, load_sigmoid_data
from data_utils import parse_data, change_missing

from plot_utils import plot_subtypes, plot_latent

# --- Experiment configuration -------------------------------------------
data_format_num = 11
max_visits, num_output_dims = 17, 3
use_sigmoid = True
epochs = 100

# Change configurations here.
# Tuple layout: (C, ds, dh, drnn, reg_type, lr)
#   DOES NOT work: (0.0, 20, 30, 150, 'l1', 0.1)
#   DOES work:     (0.0, 20, 30, 150, 'l1', 0.001)
configs = (0.0, 20, 30, 150, 'l1', 0.001)
C, ds, dh, drnn, reg_type, lr = configs

# --- Data loading -------------------------------------------------------
data = load_data_format(data_format_num, trial_num=0, cache=True)
shuffle = False

# Split call: with test_per / valid_per given, parse_data yields nine values
# (the third and fourth are discarded here).
(train_data_loader, train_data_dict, _, _,
 test_data_loader, test_data_dict,
 valid_pid, test_pid, unique_pid) = parse_data(
    data.values,
    max_visits=max_visits,
    test_per=0.2, valid_per=0.2,
    shuffle=shuffle,
)

# Un-split call over the same values; note that it rebinds `unique_pid`.
data_loader, collect_dict, unique_pid = parse_data(data.values, max_visits=max_visits)


Max visits: 17
Max visits: 17


## Bring in dynamic time warping!

In [6]:
import sys
sys.path.append('../model')
from utils import interpolate

In [7]:
%%time

# for trial_num in range(5):
data_format_num = 12

# Settings that do not change between trials are defined once, before the
# loop, because the summary print after the loop reads them as well.
metric = 'softdtw'
# NOTE(review): `how_impute` is only used as a label in the summary line;
# nothing in this cell performs imputation -- confirm the label is accurate.
how_impute = 'mrnn'
shuffle = False

# Held-out ARI of each trial.
results = []

for trial_num in range(1, 6):
    data = load_data_format(data_format_num, trial_num=trial_num)

    (train_data_loader, train_data_dict, _, _,
     test_data_loader, test_data_dict,
     valid_pid, test_pid, unique_pid) = parse_data(
        data.values,
        max_visits=max_visits,
        test_per=0.2, valid_per=0.2,
        shuffle=shuffle,
    )

    # Only the time-series k-means model is evaluated. (A KernelKMeans
    # instance used to be created here and was immediately overwritten.)
    km = TimeSeriesKMeans(n_clusters=2, metric=metric)

    # Masks and observation times are kept as notebook variables for inspection.
    M_train = train_data_dict['mask_collect']
    T_train = train_data_dict['obs_t_collect']

    M_test  = test_data_dict['mask_collect']
    T_test  = test_data_dict['obs_t_collect']

    # Features = observed values with observation times appended along axis 2.
    X_train = np.concatenate([train_data_dict['Y_collect'], T_train], axis=2)
    y_train = train_data_dict['s_collect']

    X_test = np.concatenate([test_data_dict['Y_collect'], T_test], axis=2)
    y_test = test_data_dict['s_collect']

    km.fit(X_train)
    labels_test = km.predict(X_test)

    test_ari = adjusted_rand_score(np.squeeze(y_test), labels_test)
    print('Test ARI: %.4f' % test_ari)
    results.append(test_ari)

print('Data %d, %s, %s: %.3f $\\pm$ %.3f' % (data_format_num, metric, how_impute, np.mean(results), np.std(results)))

Max visits: 17
Test ARI: -0.0032
Max visits: 17
Test ARI: -0.0001
Max visits: 17
Test ARI: -0.0034
Max visits: 17
Test ARI: 0.0207
Max visits: 17
Test ARI: 0.0031
Data 12, softdtw, mrnn: 0.003 $\pm$ 0.009
CPU times: user 2min 16s, sys: 695 ms, total: 2min 16s
Wall time: 2min 18s


In [None]:
#CB: Where do these numbers come from, and why are hard-coded values used here instead of the `results` computed in the previous cell?
results = np.array([[0.960, 0.098, 0.457],
 [0.980, -0.096, 0.535],
 [1.000, 0.170, 0.447 ]])

In [None]:
# Print one "mean ± std" line per column of `results`.
column_means = results.mean(axis=0)
column_stds = results.std(axis=0)
for i, j in zip(column_means, column_stds):
    print('%.3f $\\pm$ %.3f' % (i, j))

In [None]:
#CB: check if there are any empty values
np.isnan(X_train).any()

In [None]:
#CB: fill empty values with the column mean
def fill_nan_with_mean(a, missing_value=-1000):
    """Return a float copy of ``a`` with missing entries replaced by means.

    An entry counts as missing when it is NaN or equal to ``missing_value``.
    Each missing entry is replaced by the mean, taken along axis 0, of the
    non-missing entries at the same position.

    Parameters
    ----------
    a : array-like
        Input data. It is never modified; the work happens on a float copy,
        so integer input is accepted as well.
    missing_value : scalar or None, default -1000
        Sentinel that marks a missing entry. Pass ``None`` to treat only NaN
        as missing.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``a``. Positions where every
        entry along axis 0 is missing stay NaN.
    """
    # np.array(..., dtype=float) copies, so the caller's array stays intact
    # and NaN can be stored even when the input was integer-typed.
    values = np.array(a, dtype=float)
    if missing_value is not None:
        values[values == missing_value] = np.nan

    missing = np.isnan(values)
    # Masked mean ignores the missing entries; fully-masked slots become NaN.
    means = np.ma.filled(np.ma.array(values, mask=missing).mean(axis=0), np.nan)
    return np.where(missing, means, values)

In [None]:
#CB: execute last function
fill_nan_with_mean(X_train)

In [None]:
np.linspace(3,100,20)

In [None]:
np.zeros(20)

In [None]:
# time series kmeans
import time

start = time.time()

#CB: import class Sublign
sys.path.append('../model')
from models import Sublign

trial_num = 0
epochs          = 1000

N_visit_options = [2,4,6,8,10,12,14,16,18,20]
# One row per visit count; columns 0-2 hold the ARI of the three k-means
# metrics (softdtw, dtw, euclidean) and column 3 holds the Sublign ARI.
results_data = np.zeros((len(N_visit_options), 4))

for visits_ix, N_visits_float in enumerate(N_visit_options):
    # Progress output: index of the current setting and elapsed seconds.
    end = time.time()
    print(visits_ix,end - start)

    N_visits = int(N_visits_float)

    data = load_sigmoid_data(subtypes=2, F=3, N=1000, M=N_visits, noise=0.25)

    shuffle = False

    (train_data_loader, train_data_dict, _, _,
     test_data_loader, test_data_dict,
     valid_pid, test_pid, unique_pid) = parse_data(
        data.values,
        max_visits=N_visits,
        test_per=0.2, valid_per=0.2,
        shuffle=shuffle,
    )

    model = Sublign(ds, dh, drnn, C, num_output_dims, sigmoid=use_sigmoid, reg_type=reg_type, auto_delta=False,
                max_delta=5, learn_time=True)

    # NOTE(review): `data_format_num` comes from an earlier cell although the
    # data here is synthetic sigmoid data -- confirm the file name is intended.
    model.fit(train_data_loader, test_data_loader, epochs, lr, verbose=False,
              fname='data%d.pt' % (data_format_num), eval_freq=25, epoch_debug=False,
              plot_debug=False)

    subtypes = model.get_subtypes(train_data_dict['obs_t_collect'], train_data_dict['Y_collect'], K=2)

    test_results  = model.score(train_data_dict, test_data_dict)
    results_data[visits_ix,3] = test_results['ari']

    # The feature matrices depend only on the current split, not on the
    # metric, so they are built once per visit count.
    X_train = np.concatenate([train_data_dict['Y_collect'],train_data_dict['obs_t_collect']],axis=2)
    y_train = train_data_dict['s_collect']

    X_test = np.concatenate([test_data_dict['Y_collect'],test_data_dict['obs_t_collect']],axis=2)
    y_test = test_data_dict['s_collect']

    for metric_num, metric in enumerate(['softdtw', 'dtw', 'euclidean']):

        km = TimeSeriesKMeans(n_clusters=2, metric=metric)

        km.fit(X_train)
        labels_test = km.predict(X_test)

        test_ari = adjusted_rand_score(np.squeeze(y_test), labels_test)

        results_data[visits_ix,metric_num] = test_ari

In [None]:
N_visit_options

In [None]:
#CB: Visualisation -- one ARI curve per method against the number of visits
clean_plot()

xs = N_visit_options

# Column order of `results_data` matches this list of method names.
method_names = ['softdtw', 'dtw', 'euclidean', 'sublign']
for metric_num, metric in enumerate(method_names):
    ys = results_data[:, metric_num]
    # Only the first seven visit settings are drawn.
    plt.plot(xs[:7], ys[:7], label=metric)

plt.xlabel('Visits Per Person')
plt.ylabel('Held-out Cluster Performance (ARI)')
plt.legend()
plt.show()

In [None]:
#CB: @Q Why these seemingly arbitrary values, and where are they from? They are never used again, so maybe this is just a check of sorts?
kmeans_scores = [0.0205, 0.0072, 0.0050, -0.0046, -0.0025]
kmeans_mean, kmeans_std = np.mean(kmeans_scores), np.std(kmeans_scores)

print('%.4f $\\pm$ %.2f' % (kmeans_mean, kmeans_std))

In [None]:
labels_test.shape

## Compare for higher percentage of missingness

In [None]:

data = load_data_format(13, trial_num=1)
shuffle = False

# The original call was cut off after `max_visits=` (unclosed parenthesis);
# it is completed with the same split arguments used by the other cells.
(train_data_loader, train_data_dict, _, _,
 test_data_loader, test_data_dict,
 valid_pid, test_pid, unique_pid) = parse_data(
    data.values,
    max_visits=max_visits,
    test_per=0.2, valid_per=0.2,
    shuffle=shuffle,
)


In [None]:


# time series kmeans
import time

start = time.time()

trial_num = 0
epochs          = 1000

#CB: different options here
N_visit_options = [1,1,1,1]
# Columns 0-2 receive the ARI of softdtw / dtw / euclidean; column 3 is not
# written in this cell and therefore stays at zero.
results_data = np.zeros((len(N_visit_options), 4))

for visits_ix, N_visits_float in enumerate(N_visit_options):
    # Progress output: index of the current setting and elapsed seconds.
    end = time.time()
    print(visits_ix,end - start)

    N_visits = int(N_visits_float)

    data    = load_data_format(data_format_num=11, trial_num=trial_num, cache=True)
    shuffle = False

    (train_data_loader, train_data_dict, _, _,
     test_data_loader, test_data_dict,
     valid_pid, test_pid, unique_pid) = parse_data(
        data.values,
        max_visits=N_visits,
        test_per=0.2, valid_per=0.2,
        shuffle=shuffle,
    )

    # The feature matrices do not depend on the metric, so they are built
    # once per split instead of once per metric.
    X_train = np.concatenate([train_data_dict['Y_collect'],train_data_dict['obs_t_collect']],axis=2)
    y_train = train_data_dict['s_collect']

    X_test = np.concatenate([test_data_dict['Y_collect'],test_data_dict['obs_t_collect']],axis=2)
    y_test = test_data_dict['s_collect']

    for metric_num, metric in enumerate(['softdtw', 'dtw', 'euclidean']):

        km = TimeSeriesKMeans(n_clusters=2, metric=metric)

        km.fit(X_train)
        labels_test = km.predict(X_test)

        test_ari = adjusted_rand_score(np.squeeze(y_test), labels_test)

        results_data[visits_ix,metric_num] = test_ari

In [None]:
results_data

In [None]:
y_train.shape

## PPMI DTW results?

In [None]:
#CB: import parkinson dataset
sys.path.append('../data')
from load import parkinsons

In [None]:
#CB: repeat methods, but this time on the parkinson dataset

# time series kmeans
import time

start = time.time()

trial_num = 0
epochs          = 1000


N_visit_options = [1,1,1,1]
# Columns 0-2 receive the ARI of softdtw / dtw / euclidean; column 3 is not
# written in this cell and therefore stays at zero.
results_data = np.zeros((len(N_visit_options), 4))

data = parkinsons()
shuffle = False

# `N_visits` used to be read here before this cell assigned it, so the cell
# only ran when an earlier cell had left a value behind. It is now taken
# from this cell's own options.
# NOTE(review): all options are 1, so max_visits=1 -- confirm this is the
# intended cap for the PPMI data.
N_visits = int(N_visit_options[0])

(train_data_loader, train_data_dict, _, _,
 test_data_loader, test_data_dict,
 valid_pid, test_pid, unique_pid) = parse_data(
    data.values,
    max_visits=N_visits,
    test_per=0.2, valid_per=0.2,
    shuffle=shuffle,
)

# The split is computed once above, so the feature matrices are identical for
# every repetition and metric and are built a single time.
X_train = np.concatenate([train_data_dict['Y_collect'],train_data_dict['obs_t_collect']],axis=2)
y_train = train_data_dict['s_collect']

X_test = np.concatenate([test_data_dict['Y_collect'],test_data_dict['obs_t_collect']],axis=2)
y_test = test_data_dict['s_collect']

for visits_ix, N_visits_float in enumerate(N_visit_options):
    # Progress output: index of the current repetition and elapsed seconds.
    end = time.time()
    print(visits_ix,end - start)

    N_visits = int(N_visits_float)

    for metric_num, metric in enumerate(['softdtw', 'dtw', 'euclidean']):

        km = TimeSeriesKMeans(n_clusters=2, metric=metric)

        km.fit(X_train)
        labels_test = km.predict(X_test)

        test_ari = adjusted_rand_score(np.squeeze(y_test), labels_test)

        results_data[visits_ix,metric_num] = test_ari
print(results_data)

## Run DTW that works

In [None]:

import numpy
import matplotlib.pyplot as plt
import matplotlib.colors

from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.barycenters import softdtw_barycenter
from tslearn.datasets import CachedDatasets


def row_col(position, n_cols=5):
    """Convert a 1-based grid position into 0-based (row, column) indices.

    Positions are counted row by row in a grid that is ``n_cols`` wide.
    """
    # Shift to 0-based, then split into whole rows and the remainder.
    return divmod(position - 1, n_cols)


def get_color(weights):
    """Mix the base colours red, green, blue and yellow.

    ``weights`` must contain exactly four values, because it is reshaped to
    1x4. The result is the flattened product of the weights with the RGB
    triples of "r", "g", "b" and "y", i.e. an array of three components.
    """
    converter = matplotlib.colors.ColorConverter()
    # 4x3 matrix: one RGB row per base colour.
    palette = numpy.array([converter.to_rgb(name) for name in ("r", "g", "b", "y")])
    mix = numpy.array(weights).reshape(1, 4)
    return numpy.dot(mix, palette).ravel()


# Fix numpy's global seed, then load the cached "Trace" dataset.
numpy.random.seed(0)
X_train, y_train, X_test, y_test = CachedDatasets().load_dataset("Trace")

#CB: Visualisation
plt.figure()

# Take the first training series of each label 1..4, then min-max scale them.
X_out = numpy.empty((4, *X_train.shape[1:3]))
for i in range(4):
    X_out[i] = X_train[y_train == (i + 1)][0]
X_out = TimeSeriesScalerMinMax().fit_transform(X_out)

# Draw the four series in the corner cells of a 5x5 subplot grid.
for i, pos in enumerate((1, 5, 21, 25)):
    plt.subplot(5, 5, pos)

    # One-hot weight vector selecting the i-th base colour.
    w = [1. if k == i else 0. for k in range(4)]
    line_color = matplotlib.colors.rgb2hex(get_color(w))

    plt.plot(X_out[i].ravel(), color=line_color, linewidth=2)
    plt.text(
        X_out[i].shape[0], 0., "$X_%d$" % i,
        horizontalalignment="right",
        verticalalignment="baseline",
        fontsize=24,
    )
    plt.xticks([])
    plt.yticks([])


plt.tight_layout()
plt.show()


In [None]:
X_out.shape

In [None]:
y_train.shape

In [None]:
REDACTED = softdtw_barycenter(X=X_out, weights=w).ravel()

In [None]:
REDACTED.shape