# Clustering on Time Series

In [1]:
import pandas as pd
from statistics import mean,stdev
import numpy as np
from scipy.spatial import distance
import math
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import adjusted_rand_score

## Loading the data

In [2]:
# ts_df =pd.read_csv(r"D:\Dr. Sheng Li\Datasets\UCRArchive_2018\GunPoint\sample.tsv",sep="\t")# Line 2 in table 2
# ts_df.shape
# true_labels=ts_df['0']
# true_labels=np.array(true_labels)
# true_labels

## Implementing the Algorithm

## Algorithm 1: Extracts the uShapelets from the Data

#### D: Dataset 
#### slen: Shapelet lengths(Hyperparameter)
#### Returns the uShapelets

In [3]:
def extractUShapelets(D,slen):
    """Greedily extract unsupervised shapelets (uShapelets) from a dataset.

    Starting from the first time series, every subsequence whose length is
    listed in ``slen`` is scored with ``computeGap`` and the best-scoring one
    is kept as a uShapelet.  Series whose distance to that uShapelet is below
    ``mean(Da) + std(Da)`` are removed, the series farthest from it becomes
    the next source of candidates, and the search repeats.

    The search stops as soon as an iteration cannot make progress: no
    candidate exists, at most one series lies below the chosen threshold,
    nothing would be removed, or no series remain.

    Parameters
    ----------
    D : pandas.DataFrame
        One time series per row.  The caller's frame is left untouched.
    slen : iterable of int
        Candidate shapelet lengths.

    Returns
    -------
    set of tuple
        The extracted uShapelets, each stored as a tuple of values.
    """
    # Work on a re-indexed copy: the distance computation addresses rows 0..n-1.
    D=D.reset_index(drop=True)
    ts=D.iloc[0]#first time series in D seeds the search
    shapeletlist=[]#List to store uShapelets
    while len(D)>0:
        sub_list=[] # list of subsequences Line 5 in Table 2
        gap=[]
        dt=[]
        for sl in slen:
            for j in range(len(ts)-sl+1):#line 7 in table 2
                candidate=list(ts.iloc[j:j+sl])
                # Score the candidate itself; sub_list, gap and dt stay
                # aligned position by position across all lengths.
                temp_gap,temp_dt=computeGap(candidate,D)
                sub_list.append(candidate)#Line 8 in table 2
                gap.append(temp_gap)
                dt.append(temp_dt)
        if not gap:
            # every requested length is longer than the series
            break
        best=int(np.argmax(gap))#first candidate with the largest gap
        shapeletlist.append(sub_list[best])
        distance_array=computeDistance(sub_list[best],D) #Line 12 in table 2

        #finding Da-Line 13 in Table 2
        Da=distance_array[distance_array<dt[best]]
        if len(Da)<=1:#Line 14 in Table 2
            # An empty Da would give a NaN threshold and remove nothing,
            # so treat it as a stopping condition as well.
            break
        farthest=int(np.argmax(distance_array))#Line 16 in Table 2
        criteria=np.mean(Da)+np.std(Da)
        # Boolean mask instead of value look-ups: duplicate distances are
        # all removed, not just the first occurrence.
        remove=distance_array<criteria
        if not remove.any():
            # D would stay the same size; stop instead of looping forever
            break
        ts=D.iloc[farthest]
        D=D.loc[~remove].reset_index(drop=True)
    print("uShapelets Identified")
    S=set(tuple(row) for row in shapeletlist)
    return S

## Algorithm 2: Computes the maximum gap and dt

#### s: candidate uShapelet
#### D: Dataset
#### Returns maximum gap score and dt

In [4]:
#s is the subsequence 
#D is the Dataset
def computeGap(s,D,k=10):
    """Compute the best gap score of candidate ``s`` and its threshold.

    The distances from ``s`` to every series in ``D`` are sorted and the
    midpoint of each adjacent pair is tried as a threshold ``d``.  A threshold
    splits the distances into ``Da`` (below ``d``) and ``Db`` (the rest); it is
    only considered when ``1/k < |Da|/|Db| < 1 - 1/k``.  Its score is
    ``mean(Db) - std(Db) - (mean(Da) + std(Da))``.

    Parameters
    ----------
    s : sequence of float
        Candidate uShapelet.
    D : pandas.DataFrame
        Dataset, one time series per row.
    k : number, optional
        Controls the admissible size ratio of the two groups.  Defaults to
        10, the value previously hard-coded.  The admissible interval is
        empty unless ``k > 2``.

    Returns
    -------
    tuple
        ``(maxGap, dt)``: the largest score found and the threshold that
        produced it; ``(0.0, 0.0)`` when no threshold qualifies.
    """
    dist=computeDistance(s,D)
    sorted_dist=np.sort(dist)#Line 2 in Table 3
    maxGap=float(0)
    dt=float(0)
    lower=1/k
    upper=1-(1/k)
    for l in range(dist.shape[0]-1):
        d=(sorted_dist[l]+sorted_dist[l+1])/2
        Da,Db=find(d,dist)
        # Db always holds the largest distance, so it is never empty here.
        r=Da.shape[0]/Db.shape[0]
        if lower<r<upper:
            gap=(np.mean(Db)-np.std(Db))-(np.mean(Da)+np.std(Da))
            if gap>maxGap:
                maxGap=gap
                dt=d
    return maxGap,dt

## Algorithm 3: Computes the Distance between candidate shapelet and dataset

#### s: Candidate uShapelet
#### D: Dataset
#### Returns the distances of all the time series with candidate shapelet

In [5]:
#s is the subsequence
#D is the dataset
def computeDistance(s,D):
    """Distance from candidate ``s`` to every time series in ``D``.

    For each row of ``D`` the z-normalised candidate is compared, by
    Euclidean distance, with every z-normalised window of the same length;
    the smallest distance is kept and finally divided by ``sqrt(len(s))``.

    Parameters
    ----------
    s : sequence of float
        Candidate uShapelet.
    D : pandas.DataFrame
        Dataset, one time series per row.  Rows are read by position, so any
        index is accepted.

    Returns
    -------
    numpy.ndarray
        One float64 distance per row of ``D``.  A row shorter than ``s`` has
        no window and keeps the initial value 10000 (before scaling).
    """
    shapelet_len=len(s)
    s_norm=znorm(list(s))
    n_series=D.shape[0]
    dis=np.full(n_series,10000,dtype='float64')#Line 1 in Table 4
    for i in range(n_series):
        timeseries=D.iloc[i]#positional: independent of the index labels
        best=float(dis[i])
        for j in range(len(timeseries)-shapelet_len+1):
            z=znorm(timeseries.iloc[j:j+shapelet_len])# Line 6 in Table 4
            d=float(distance.euclidean(z,s_norm))
            # plain comparison: a NaN distance never replaces the minimum
            if d<best:
                best=d
        dis[i]=best
    return dis/math.sqrt(shapelet_len)

### Function to perform normalization

#### ser: Series to perform normalization on
#### Returns the normalized series

In [6]:
# function to perform z-normalization
def znorm(ser):
    """Z-normalise a series: subtract its mean, divide by its standard deviation.

    Parameters
    ----------
    ser : sequence of numbers
        Values to normalise (list, numpy array or pandas Series).

    Returns
    -------
    numpy.ndarray
        float64 array of the same length.  The population standard deviation
        (``ddof=0``) is used.  A constant series has zero deviation and is
        returned as all zeros instead of dividing by zero; an empty input
        yields an empty array.
    """
    values=np.asarray(ser,dtype='float64')
    if values.size==0:
        return values
    sigma=values.std()
    if sigma==0:
        # no variation: avoid 0/0 producing NaN
        return np.zeros_like(values)
    return (values-values.mean())/sigma

### Function to find subsets distinguished by a point

#### d:point
#### dis: distance array computed by computeDistance function
#### Returns the two subsets

In [7]:
def find(d,dis):
    """Partition the distances in ``dis`` around the point ``d``.

    Parameters
    ----------
    d : float
        Splitting point.
    dis : numpy.ndarray
        One-dimensional array of distances.

    Returns
    -------
    tuple of numpy.ndarray
        ``(DA, DB)``: the values strictly below ``d`` and all remaining
        values, each in their original order.
    """
    values=[dis[pos] for pos in range(dis.shape[0])]
    below=np.array([v for v in values if v<d])
    # "not v<d" rather than "v>=d" so anything failing the test lands here
    remainder=np.array([v for v in values if not v<d])
    return below,remainder

### Algorithm 4: Implements the kmeans algorithm

#### D: Dataset
#### S: set of ushapelets
#### k: number of clusters
#### Returns the cluster labels for each time series

In [8]:
# REMEMBER TO SEND A NEW COPY OF THE DATASET
def clusterData(D,S,k,true_labels):
    """Cluster the series in ``D`` using their distances to the uShapelets.

    uShapelets are added one at a time.  After each addition the series are
    described by their distances to the uShapelets used so far and k-means is
    restarted ``len(S)`` times.  Every restart that improves on the lowest
    inertia seen in that round is recorded, and the recorded labelling with
    the highest adjusted Rand score against ``true_labels`` is returned.

    Parameters
    ----------
    D : pandas.DataFrame
        Dataset, one time series per row.
    S : sequence
        uShapelets; ``S[i]`` must be a sequence of values.
    k : int
        Number of clusters.
    true_labels : array-like
        Reference labels, one per row of ``D``, used to pick the labelling.

    Returns
    -------
    numpy.ndarray
        Cluster label for each time series.

    Raises
    ------
    ValueError
        If ``S`` is empty, so no clustering could be produced.
    """
    rowsize=len(S)#number of ushapelets
    colsize=D.shape[0]#number of timeseries
    DIS=np.zeros(shape=(rowsize,colsize))
    label=[]#candidate labellings (cls in paper)
    rand_score=[]#adjusted Rand score of each candidate
    kmeans=KMeans(n_clusters=k)
    for i in range(rowsize):
        DIS[i]=computeDistance(S[i],D)
        # only the rows filled so far; the rest of DIS is still zero
        features=DIS[:i+1].T
        best_inertia=float('inf')
        for _ in range(rowsize):
            kmeans.fit(features)
            # compare this fit alone against the best fit of the round
            if kmeans.inertia_<best_inertia:
                best_inertia=kmeans.inertia_
                labels_pred=kmeans.labels_.copy()
                label.append(labels_pred)
                rand_score.append(adjusted_rand_score(true_labels,labels_pred))
    if not rand_score:
        raise ValueError("clusterData needs at least one uShapelet in S")
    a=rand_score.index(max(rand_score))
    return label[a]

In [9]:
# no_of_class=max(true_labels)
# shape=[3,2]
# cluster_list=[]
# row_index=[]
# for k in range(0,no_of_class):
#     D_org=ts_df[ts_df['0']==(k+1)]
#     print(D_org)
#     row_index=list(D_org.index)
#     print(row_index)
#     D_org=D_org.reset_index(drop=True)
#     labels=D_org['0']
#     D_org=D_org.drop(D_org.columns[0],axis=1)
#     D=D_org.copy()
#     shapelist=extractUShapelets(D,shape)
#     S=np.array(list(shapelist))
#     D1=D_org.copy()
#     cluster=clusterData(D1,S,2,labels)
#     cluster_list.append(cluster)
#     break

### The three functions below identify and return the representatives of various classes of timeseries

In [10]:
def representative_selection(cluster_list,row_indices,ts_df):
    """Pick representative series for every clustered class.

    ``representative`` is called once per entry of ``cluster_list`` and the
    resulting index lists are handed to ``selection`` together with
    ``row_indices`` and ``ts_df``.

    Returns whatever ``selection`` returns.
    """
    count=5#int((len(cluster_list[0])/4))#HAVE TO CHANGE THIS
    indices_list=[representative(lis,count) for lis in cluster_list]
    return selection(row_indices,ts_df,indices_list)

In [11]:
def selection(row_indices,ts_df,indices_list):
    """Collect the representative rows of ``ts_df`` into a new DataFrame.

    Parameters
    ----------
    row_indices : sequence of sequences
        ``row_indices[g]`` lists the rows of ``ts_df`` that belong to group ``g``.
    ts_df : pandas.DataFrame
        Source data.
    indices_list : sequence of sequences of int
        ``indices_list[g]`` holds positions into ``row_indices[g]`` naming the
        representatives of group ``g``.

    Returns
    -------
    pandas.DataFrame
        One row per representative, in group order, with a fresh default
        index and integer column labels.  Empty when ``indices_list`` is empty.

    NOTE(review): rows are fetched with ``iloc``, so the entries of
    ``row_indices`` are treated as positions.  Index labels only coincide
    with positions on a default RangeIndex - confirm against the caller.
    """
    rep_ts=[
        row_indices[group][j]
        for group,picks in enumerate(indices_list)
        for j in picks
    ]
    rep=[list(ts_df.iloc[i]) for i in rep_ts]
    rep_df=pd.DataFrame(rep)
    return rep_df

In [12]:
def representative(lis,count):
    """Return the positions of the representatives within one class.

    Currently a placeholder: both arguments are ignored and the first two
    positions, ``[0, 1]``, are always returned as a new list.
    """
    # Disabled draft, kept for reference:
    #     ind=np.where(lis==1)[0]
    # #     count=int((len(cluster_list[0])/4))#HAVE TO CHANGE THIS
    #     index_list=[]
    #     counter=0
    #     temp_list=[]
    #     while counter<count:
    #         temp_list.append(ind[counter])
    #         counter=counter+1
    #     ind=np.where(lis==0)[0]
    #     counter=0
    #     index_list.append(temp_list)
    #     temp_list=[]
    #     while counter<count:
    #         temp_list.append(ind[counter])
    #         counter=counter+1
    #     index_list.append(temp_list)
    return [0,1]

### This function is the entry point and drives the other functions
### Returns a dataframe containing the representative time series

In [13]:
def cluster(shape,ts_df,true_labels):
    """Cluster each class of ``ts_df`` separately and return representatives.

    For every class label ``1 .. max(true_labels)`` the rows whose ``'0'``
    column equals that label are selected, uShapelets are extracted from
    them, the rows are split into two clusters, and finally representative
    rows are gathered through ``representative_selection``.

    Parameters
    ----------
    shape : iterable of int
        Candidate uShapelet lengths, passed to ``extractUShapelets``.
    ts_df : pandas.DataFrame
        Data whose column ``'0'`` holds the class label; that column is
        expected to be the first one, since the first column is dropped
        before clustering.
    true_labels : sequence of int
        Class labels; only the maximum is used, as the number of classes.

    Returns
    -------
    pandas.DataFrame
        The representative time series.
    """
    no_of_class=max(true_labels)
    cluster_list=[]
    row_index=[]
    for k in range(0,no_of_class):
        class_label=k+1
        D_org=ts_df[ts_df['0']==class_label]
        row_index.append(list(D_org.index))
        D_org=D_org.reset_index(drop=True)
        labels=D_org['0']
        D_org=D_org.drop(D_org.columns[0],axis=1)
        shapelist=extractUShapelets(D_org.copy(),shape)
        # uShapelets may differ in length; a 1-D object array holds them
        # without asking NumPy to build a ragged 2-D array.
        S=np.empty(len(shapelist),dtype=object)
        for pos,shapelet in enumerate(shapelist):
            S[pos]=shapelet
        # separate name so the function itself is not shadowed
        class_clusters=clusterData(D_org.copy(),S,2,labels)
        cluster_list.append(class_clusters)
        print("clustering of class",class_label,"complete")
    rep_df=representative_selection(cluster_list,row_index,ts_df)
    return rep_df