# Content

Time series clustering using k-means with Euclidean and DTW distances.

In [None]:
%matplotlib inline
from tslearn.clustering import silhouette_score

import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [25, 8]

from tslearn.clustering import TimeSeriesKMeans
import pandas as pd
import matplotlib
import json


# Preparing the data

In [None]:
# Load the input table; its 'points' column is parsed into numeric arrays below.
data = pd.read_csv('./data/data_long.csv')

def listify(row):
    """Parse a bracketed, comma-separated list of numbers into a float array.

    Parameters
    ----------
    row : object
        Value whose ``str()`` form looks like ``"[1.0, 2.5, 3]"``. Every
        ``[`` and ``]`` character is dropped before splitting on commas.

    Returns
    -------
    numpy.ndarray
        1-D float array; empty when the input holds no values (e.g. ``"[]"``).

    Raises
    ------
    ValueError
        If any comma-separated token is not a valid float.
    """
    text = str(row).replace('[', '').replace(']', '').strip()
    if not text:
        # "[]" previously reached float('') and raised; return an empty series instead.
        return np.asarray([], dtype=float)

    return np.asarray([float(token) for token in text.split(',')], dtype=float)

# Replace each 'points' cell with the float array produced by listify.
data.loc[:, 'points'] = data.loc[:, 'points'].apply(listify)
# Keep a handle on the parsed column for the cells that follow.
points = data['points']

In [None]:

# Stack the parsed series into one array of shape (n_series, series_length, 1).
#
# Built from data['points'] rather than from `points` itself so the cell can be
# re-run: `points` is rebound to an ndarray here and would no longer have
# `.values` on a second execution.
#
# reshape(..., -1, 1) infers the series length from the data. The previous
# in-place `resize(n, 900, 1)` silently truncated or zero-padded the flat buffer
# (misaligning rows) whenever the series were not exactly 900 samples long, and
# could raise when other references to the array existed.
points = np.array(data['points'].tolist(), dtype=float)
points = points.reshape(points.shape[0], -1, 1)
print(points.shape)

In [None]:
# Seed NumPy's global random generator.
seed = 10
np.random.seed(seed)

# Training array, plus the size of its second axis (used later as the x-limit).
X_train = points
sz = np.shape(X_train)[1]

In [None]:

# Quick sanity check: mean and standard deviation of the series at index 1.
second_series = X_train[1, :]
print(second_series.mean())
print(second_series.std())

#### Looking at some of the patterns

In [None]:
# Draw the first twelve series, one per panel, on a 4 x 3 grid.
for slot in range(12):
    trace = X_train[slot].ravel()
    plt.subplot(4, 3, slot + 1)
    plt.plot(trace, "k-", alpha=.2)

## Training the model and saving the results (saved in /analysis/plots/cluster/).
  * Using Euclidean distance

In [None]:
# Euclidean k-means
#
# For every candidate cluster count in `n_clusterss` this cell:
#   1. fits a TimeSeriesKMeans model on X_train and predicts a label per series,
#   2. writes the cluster centers (one column per cluster) to ./data/cluster_centers.csv,
#   3. prints the Euclidean silhouette score,
#   4. saves one axis-free image per cluster center,
#   5. saves an overview grid (member series in black, center in red),
#   6. stores the labels on `data` and writes the table to CSV.

seed = 10
np.random.seed(seed)


n_clusterss = [27]
grid_cols = 3  # panels per row in the overview figure

for n_clusters in n_clusterss:
    print("Euclidean k-means ---- {}".format(n_clusters))
    km = TimeSeriesKMeans(n_clusters=n_clusters, verbose=False, random_state=seed, n_jobs=-1)
    cluster_labels = km.fit_predict(X_train)

    # One flattened center per cluster, taken from the fitted model itself. The
    # loop used to run over a literal range(0, 27), which broke (or dropped
    # centers) as soon as n_clusterss held any value other than 27.
    # NOTE: the file name does not include n_clusters, so with several
    # candidates only the last one's centers remain on disk.
    centers = [list(center.flatten()) for center in km.cluster_centers_]

    pd.DataFrame(np.asarray(centers).transpose()).to_csv("./data/cluster_centers.csv")

    print("Euclidean silhoutte: {:.2f}".format(silhouette_score(X_train, cluster_labels, metric="euclidean")))

    # Plot each average curve independently:
    for yi in range(n_clusters):
        fig, ax = plt.subplots()
        ax.plot(km.cluster_centers_[yi].ravel(), "r-", linewidth=40)
        # Call order (layout, limits, axis off, resize) is kept as it was so the
        # saved images keep their existing framing.
        fig.tight_layout()

        ax.set_xlim(0, sz)
        ax.set_ylim(0, 100)
        ax.axis('off')

        fig.set_size_inches(18.5, 18.5)

        fig.savefig("./plots/cluster/cluster_{}_{}.png".format(yi, n_clusters))
        # Close explicitly so figures do not pile up in memory across clusters.
        plt.close(fig)

    # Overview grid. The row count follows n_clusters (ceiling division) instead
    # of a fixed 9 x 3 layout, and the height scales with it: 27 clusters still
    # give the original 10 x 20 inch canvas.
    grid_rows = -(-n_clusters // grid_cols)
    fig = plt.figure(figsize=(10, 20 * grid_rows / 9))
    for yi in range(n_clusters):
        ax = fig.add_subplot(grid_rows, grid_cols, yi + 1)
        for xx in X_train[cluster_labels == yi]:
            ax.plot(xx.ravel(), "k-", alpha=.2)
        ax.plot(km.cluster_centers_[yi].ravel(), "r-")

        ax.set_xlim(0, sz)
        ax.set_ylim(0, 100)
        ax.text(0.37, 1.05, 'Cluster %d' % (yi + 1),
                transform=ax.transAxes, fontsize=20, fontweight='bold')

        print("Cluster {} done!".format(yi))

    # NOTE(review): data['points'] holds NumPy arrays, which to_csv writes in
    # their text form (space-separated, no commas) - confirm that downstream
    # readers of this file expect that format.
    data['cluster_labels'] = cluster_labels
    data.to_csv("./data/data_long_cluster_{}.csv".format(str(n_clusters)))

    # The figure already has its final size, so the spacing computed here is the
    # spacing that gets saved (previously the layout was computed for the
    # default figure size and the canvas was resized afterwards).
    fig.tight_layout(pad=1.2, h_pad=1.5)

    fig.savefig("./plots/cluster/k_means_{}.png".format(str(n_clusters)), dpi=500)
    plt.close(fig)

#### Convert cluster centers to polynomials

In [None]:
def get_poly(row, i, degree=68, x_max=90):
    """Fit a polynomial to one series and return its coefficients as JSON.

    The samples in ``row`` are placed on an evenly spaced grid from 0 to
    ``x_max`` (inclusive) with one grid point per sample, and fitted with
    ``np.polyfit``.

    Parameters
    ----------
    row : sequence of float
        The series to fit. Its length sets the number of grid points, so
        series of any length are accepted (900 samples reproduces the
        previously hard-coded grid).
    i : int
        Index of the series. Not used by the fit; kept so existing callers
        that pass it keep working.
    degree : int, optional
        Polynomial degree handed to ``np.polyfit``. Defaults to 68.
    x_max : float, optional
        Upper end of the x grid. Defaults to 90.

    Returns
    -------
    str
        JSON array of ``degree + 1`` coefficients, highest power first.
    """
    # NOTE(review): a degree-68 fit is likely to be poorly conditioned and may
    # make NumPy emit a RankWarning - confirm the default is intentional.
    xs = np.linspace(0, x_max, num=len(row))
    return json.dumps(np.polyfit(xs, row, degree).tolist())

# Fit one polynomial per saved cluster center and write the results to CSV.
cluster_centers = pd.read_csv('./data/cluster_centers.csv')
cluster_polynomials = {}

# Center columns are labelled "0", "1", ...; any other column (such as the
# index column that to_csv writes by default) is skipped. Iterating over the
# columns actually present replaces the hard-coded range(0, 27), so this cell
# follows whatever number of centers was saved.
center_columns = [name for name in cluster_centers.columns if str(name).isdigit()]

for name in center_columns:
    i = int(name)
    cluster_polynomials[i] = get_poly(np.asarray(cluster_centers[name]), i)
    print(i)

# One row per cluster index; the single value column holds the JSON coefficient string.
cp_pd = pd.DataFrame.from_dict(cluster_polynomials, orient='index')
cp_pd.to_csv('./data/cluster_polynomials.csv')