In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm

from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances

Some old clustering code for functions (time series), tested on the train_clinical data.
It might be useful for future time-series analysis.

## Load Data

In [2]:
# Folder (relative path) that holds the input CSV files.
file_location = "amp-parkinsons-disease-progression-prediction"

# Clinical records; later cells read the columns visit_month, patient_id and updrs_1.
train_clinical = pd.read_csv(file_location + '/train_clinical_data.csv')

## Clustering

In [3]:
def COSSIM(f,g, eps=10**-20):
    """Cosine similarity between the first differences of two sampled functions.

    Discrete analogue of Integral(f'g') / sqrt(Integral(f'**2) * Integral(g'**2)),
    see https://en.wikipedia.org/wiki/Cosine_similarity

    Parameters
    ----------
    f, g : array-like supporting `[..., 1:]` slicing
        Sampled function values along the last axis.
    eps : float
        Small constant added to every element-wise product before summing, so
        that flat (zero-difference) inputs do not lead to a division by zero.

    Returns
    -------
    Similarity value(s); the last axis of the inputs is reduced away.
    """
    # Finite differences along the last axis stand in for the derivatives.
    slope_f = f[...,1:]-f[...,:-1]
    slope_g = g[...,1:]-g[...,:-1]

    # eps is added per element (before the sum), for numerator and norms alike.
    numerator = np.sum(slope_f*slope_g+eps, axis=-1)
    norm_f = np.sqrt(np.sum(np.square(slope_f)+eps, axis=-1))
    norm_g = np.sqrt(np.sum(np.square(slope_g)+eps, axis=-1))
    return numerator/(norm_f*norm_g)

In [4]:
def metric_COSSIM(f,g):
    """Distance derived from COSSIM: sqrt(|1 - COSSIM(f, g)|).

    The result is 0 when the similarity is exactly 1. The absolute value keeps
    the square-root argument non-negative even if the similarity exceeds 1.
    """
    similarity_gap = 1-COSSIM(f,g)
    return np.sqrt(np.abs(similarity_gap))

In [5]:
def sim_affinity(X):
    """Return the pairwise distances of X, measured with metric_COSSIM.

    X is handed unchanged to sklearn's pairwise_distances, which evaluates
    metric_COSSIM as a callable metric.
    """
    distance_matrix = pairwise_distances(X, metric=metric_COSSIM)
    return distance_matrix

## Clustering Data

In [6]:
# Wide table: index = visit_month, one column per patient_id, cells = updrs_1.
df_data = train_clinical.pivot(index='visit_month', columns='patient_id', values='updrs_1')

# Transpose so that every row is the series of a single patient.
data = np.array(df_data).T

# Fill NaN values by linear interpolation between the observed entries.
# NOTE: the interpolation runs over the row position (0, 1, 2, ...), not over
# the visit_month values; this can probably be done better ^^
positions = np.arange(data.shape[1])
filled_rows = []
for series in data:
    observed = ~np.isnan(series)
    filled_rows.append(np.interp(positions, positions[observed], series[observed]))
data = np.array(filled_rows)

data.shape

In [7]:
# Clustering configuration.
n_clusters = 40            # number of clusters to extract
distance_threshold = None  # scikit-learn expects exactly one of n_clusters / distance_threshold to be set
samples = 1500             # NOTE(review): not referenced in the visible cells

# Arguments that are spelled the same in every scikit-learn release.
cluster_kwargs = dict(n_clusters=n_clusters,
                      distance_threshold=distance_threshold,
                      linkage='average')

# scikit-learn 1.2 renamed the `affinity` parameter to `metric` and 1.4 removed
# `affinity` altogether, so prefer the new keyword and only fall back to the
# old one when the installed release does not know `metric` yet.
try:
    cluster = AgglomerativeClustering(metric=sim_affinity, **cluster_kwargs)
except TypeError:
    cluster = AgglomerativeClustering(affinity=sim_affinity, **cluster_kwargs)

cluster.fit(data);

In [8]:
# One panel per cluster; every member series is shifted so its minimum is 0.
n_found = int(max(cluster.labels_))+1
n_cols = 4
# Enough rows to hold every cluster (ceil division); 40 clusters -> 10 rows,
# so the grid no longer breaks when the number of clusters changes.
n_rows = (n_found+n_cols-1)//n_cols

plt.figure(figsize=(20,20))
plt.suptitle("Clusters ")
for clus in range(n_found):
    # Panel selection and tick formatting do not depend on the individual
    # series, so they are done once per cluster instead of once per line.
    plt.subplot(n_rows,n_cols,clus+1)
    plt.ticklabel_format(axis="y", style="sci", scilimits=(0,0))
    for index in np.argwhere(cluster.labels_==clus)[:,0]:
        series = data[index]
        plt.plot(df_data.index, series-np.min(series),color="red", alpha = .5);
        