In [None]:
# Author: Romain Tavenard
# License: BSD 3 clause
!pip install tslearn

In [None]:
import numpy
import pandas as pd
import matplotlib.pyplot as plt

from tslearn.clustering import TimeSeriesKMeans
from tslearn.datasets import CachedDatasets
from tslearn.preprocessing import TimeSeriesScalerMeanVariance, \
    TimeSeriesResampler

In [None]:
# Read the parquet file into a DataFrame; every later cell starts from `df`.
parquet_path = "./data/sofa_hourly.parquet"
df = pd.read_parquet(parquet_path)

In [None]:
# Histogram of the `sofa_24hours` column. The axes are labelled so the figure
# can be read on its own; the trailing `;` hides the return-value repr.
ax = df['sofa_24hours'].hist()
ax.set(title="Distribution of sofa_24hours",
       xlabel="sofa_24hours",
       ylabel="Number of rows");

In [None]:
from tslearn.utils import to_time_series_dataset

# Keep only rows with hr <= 72, then reshape to one row per stay_id and one
# column per hr, holding the sofa_24hours value.
df = df[df['hr'] <= 72]
df_t = df.pivot(index='stay_id', columns='hr', values='sofa_24hours')

# Drop every stay that has a missing value at any hour, so all remaining
# series are complete.
df_t = df_t.dropna()

# Sanity-check that tslearn can convert the table; display only the shape
# instead of dumping the whole array into the cell output.
to_time_series_dataset(df_t).shape



In [None]:
# Fit a 3-cluster k-means model with the DTW metric on the pivoted series.
# TimeSeriesKMeans is already imported in the imports cell at the top.
# random_state is fixed so the clustering is reproducible across re-runs.
km3 = TimeSeriesKMeans(n_clusters=3, metric="dtw", random_state=0)
km3.fit(df_t)

In [None]:
# Predict once and reuse the labels instead of re-running predict on the
# whole table for every cluster.
labels_3 = km3.predict(df_t)

# Column A/B/C holds the per-hour mean of the series assigned to cluster 0/1/2.
result = pd.DataFrame({
    name: df_t[labels_3 == cluster].mean(axis=0)
    for cluster, name in enumerate(["A", "B", "C"])
})
result

In [None]:
import matplotlib.pyplot as mp
%config InlineBackend.figure_format = 'svg'

# plot multiple columns such as population and year from dataframe
result.plot(y=["A", "B","C"],
        kind="line", figsize=(10, 10))
# display plot

mp.savefig("kmeans_72hrs_3cluster.svg")
mp.show()

In [None]:
# Fit a 4-cluster k-means model with the DTW metric on the pivoted series.
# random_state is fixed so the clustering is reproducible across re-runs.
km4 = TimeSeriesKMeans(n_clusters=4, metric="dtw", random_state=0)
km4.fit(df_t)

In [None]:
# Predict once and reuse the labels instead of re-running predict on the
# whole table for every cluster.
labels_4 = km4.predict(df_t)

# Column A..D holds the per-hour mean of the series assigned to cluster 0..3.
result = pd.DataFrame({
    name: df_t[labels_4 == cluster].mean(axis=0)
    for cluster, name in enumerate(["A", "B", "C", "D"])
})
result

In [None]:
mp.clf()

%config InlineBackend.figure_format = 'svg'


# plot multiple columns such as population and year from dataframe
result.plot(y=["A", "B","C", "D"],
        kind="line", figsize=(10, 10))
# display plot
mp.savefig("kmeans_72hrs_4cluster.svg")
mp.show()


In [None]:
# Display the inertia_ attribute of the fitted 4-cluster model as the cell output.
km4.inertia_

In [None]:
# Inertia of a DTW k-means fit for each cluster count from 1 to 10,
# collected in order so inertia[i] belongs to k = i + 1.
inertia = []

# Fit on a fixed-seed subsample of at most 1000 series. The size is capped
# so the cell still runs when df_t has fewer than 1000 rows.
elbow_sample = df_t.sample(n=min(1000, len(df_t)), random_state=0)

for k in range(1, 11):
    km = TimeSeriesKMeans(n_clusters=k, metric="dtw", random_state=0)
    km.fit(elbow_sample)
    inertia.append(km.inertia_)
    # One short progress line per k instead of re-printing the whole list.
    print(f"k={k}: inertia={km.inertia_}")

In [None]:
# Build the x values from the inertia list itself so x and y always have the
# same length. A hard-coded 1..9 list does not match the ten values produced
# by a sweep over range(1, 11) and makes plot() raise a ValueError.
n = list(range(1, len(inertia) + 1))

# `plt` comes from the imports cell at the top of the notebook.
fig, ax = plt.subplots()
ax.plot(n, inertia, marker="o")
ax.set(title="Inertia by number of clusters",
       xlabel="Number of clusters",
       ylabel="Inertia")
plt.show()

In [None]:
# Fit a 5-cluster k-means model with the DTW metric on the pivoted series.
# random_state is fixed so the clustering is reproducible across re-runs.
km5 = TimeSeriesKMeans(n_clusters=5, metric="dtw", random_state=0)
km5.fit(df_t)

In [None]:
# Predict once and reuse the labels instead of re-running predict on the
# whole table for every cluster.
labels_5 = km5.predict(df_t)

# Column A..E holds the per-hour mean of the series assigned to cluster 0..4.
result = pd.DataFrame({
    name: df_t[labels_5 == cluster].mean(axis=0)
    for cluster, name in enumerate(["A", "B", "C", "D", "E"])
})
result

In [None]:
mp.clf()

%config InlineBackend.figure_format = 'svg'


# plot multiple columns such as population and year from dataframe
result.plot(y=["A", "B","C", "D", "E"],
        kind="line", figsize=(10, 10))
# display plot
mp.savefig("kmeans_72hrs_5cluster.svg")
mp.show()