In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import pyplot as plt
import matplotlib
import seaborn as sns
import re
import os
from os import listdir
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings("ignore")
 

## Initialise the dataframe with the extracted features

This collects the extracted features from all CSV files in the directory `../data/processed/unlabeled/ts/` into one dataframe.

init value determines if we will start with reading the dataframe<br>
'init value=1' read dataframe<br>
'init value=0' use the one in memory<br>

In [None]:
init=1 # 1: (re)read the feature files from disk, 0: keep the df_full already in memory

if init==1:
    datapath=os.path.join('../data/processed/unlabeled/ts/')
    file_nr=int(0)
    total_files=len(listdir(datapath))
    nr_of_frames=int(0)   # running offset that keeps 'frame' unique across files
    frames_per_file=[]    # one dataframe per CSV, concatenated once after the loop

    # Sorted so that file_nr and the frame offsets are identical on every run
    for file in sorted(listdir(datapath)):
        # Skip anything that is not a feature CSV instead of re-appending the previous file
        if '.csv' not in file:
            continue

        df=pd.read_csv(os.path.join(datapath,file))
        df['file']=file
        df['file_nr']=file_nr+1

        # Shift the per-file frame numbers by the number of rows read so far
        df['frame']=df['frame'].to_numpy()+nr_of_frames

        # Every row of this file belongs to file_nr+1, so the row count is the increment
        nr_of_frames+=len(df)
        file_nr+=1 # Counter
        frames_per_file.append(df)

    # Single concat; an empty directory still yields an (empty) dataframe
    df_full=pd.concat(frames_per_file) if frames_per_file else pd.DataFrame()

In [None]:
# The complete feature table is the training set; renumber its rows 0..n-1 first
df_full.reset_index(drop=True,inplace=True)
df_train=df_full

# Bookkeeping columns are not features, so they are left out of the model input
df_train_features = df_train.drop(
    columns=['file_nr','file','frame','Unnamed: 0'],
)
df_train_features

### TSFRESH extracts all these features

In [None]:
# Candidate feature-name patterns; further down each one is matched as a
# substring of the feature column names to decide which columns to drop.
list_of_patterns=[...]

#### We select only a few of them

In [None]:
# Feature patterns that are kept; columns matching any other known pattern are dropped
list_of_features=[
    'variance_larger_than_standard_deviation',
    'median',
    'mean',
    'sum_values',
    'abs_energy',
    'acc_x_n_length',
    'standard_deviation',
    'mean_change',
    'variation_coefficient',
    'variance',
    'maximum',
    'absolute_maximum',
    'minimum',
    'linear_trend__attr_"slope"',
    'agg_linear_trend__attr_"slope"__chunk_len_50__f_agg_"mean"',
]

#### Select the signals we will use for clustering

And here we also drop the columns of the unused features from the training data


In [None]:
complete_list_of_signals=[
    'acc_x_n','acc_y_n','acc_z_n','acc_x_r','acc_y_r','acc_z_r',
    'gyr_x_n','gyr_y_n','gyr_z_n','gyr_x_r','gyr_y_r','gyr_z_r',
    'alpha_r','beta_r',
]
# Reduced selection; it is overridden by the complete list on the next line
list_of_signals=['acc_x_n','acc_y_n','acc_z_n','acc_x_r','gyr_y_n','alpha_r','beta_r']
list_of_signals=complete_list_of_signals

# Pass 1: columns whose name contains a feature pattern that was not selected
drop_these_columns=[
    column
    for column in df_train_features.columns
    for pattern in list_of_patterns
    if pattern in column and pattern not in list_of_features
]
df_train_features.drop(columns=drop_these_columns,inplace=True)

# Pass 2: columns that belong to a signal that was not selected
drop_these_columns=[
    column
    for column in df_train_features.columns
    for signal in complete_list_of_signals
    if signal in column and signal not in list_of_signals
]
df_train_features.drop(columns=drop_these_columns,inplace=True)


In [None]:
# Inspect the feature table after the unselected features and signals were dropped
df_train_features

### Scale the features for use in PCA

In [None]:
# Standardise every feature column; the fitted scaler stays available as `ss`
ss = StandardScaler()
scaled_values = ss.fit_transform(df_train_features)
df_train_features = pd.DataFrame(scaled_values, columns=df_train_features.columns)

In [None]:
# Display the remaining (now standardised) features
df_train_features

#### Make the labels

In [None]:
# Keep the frame number of every row; it is joined with the cluster labels further down
df_train_labels = df_train['frame']

## Dimension reduction

Determine how many dimensions will be used for the clustering model by defining `n_components`.

In [None]:
# Number of principal components the PCA below keeps
n_components=8

In [None]:
# Project the scaled features onto the first n_components principal axes (sklearn PCA)

# Column names PCA1 .. PCA<n_components>
pca_columns=[f'PCA{component_nr}' for component_nr in range(1,n_components+1)]

pca = PCA(n_components=n_components)
principal_components = pca.fit_transform(df_train_features)

pca_df = pd.DataFrame(data=principal_components, columns=pca_columns)

### Determine ideal number of clusters (Elbow method)

Determine with the elbow method how many clusters would be ideal for this method

In [None]:
possible_k_values = range(3, 25)

sum_of_squared_distances = []

for k in possible_k_values:
    # K-Means starts from random centroids; a fixed seed makes the curve reproducible
    k_means = KMeans(n_clusters=k, random_state=42).fit(pca_df)

    # inertia_ is the within-cluster sum of squared distances for this k
    sum_of_squared_distances.append(k_means.inertia_)

fig, ax = plt.subplots(figsize=(16, 6))

# Draw on the axes created above rather than on pyplot's implicit current axes
ax.plot(possible_k_values, sum_of_squared_distances, 'rx-')
ax.set_xticks(list(possible_k_values))
ax.set_xlabel('k')
ax.set_ylabel('Sum of squared distances')

ax.set_title('Elbow plot for optimal number of clusters')
plt.show()

### Set the desired number of clusters

In [None]:
# Number of clusters used for the final K-Means model
n_clusters=10

In [None]:
# Fit the final model with the default iteration budget: max_iter=1 would stop after a
# single iteration, before the centroids have converged.
# The seed keeps cluster ids stable between runs, since validate(...) refers to clusters by id.
model = KMeans(n_clusters=n_clusters, random_state=42).fit(pca_df)

In [None]:
# Principal components plus, as last column, the cluster id assigned to each row
visualization_kmeans = pca_df.assign(cluster=model.labels_)

### PCA1 vs PCA2 Plot

In [None]:
# First two principal components, coloured by cluster
fig, ax = plt.subplots(figsize=(6,6))
palette= sns.color_palette()
sns.scatterplot(
    x="PCA1",
    y="PCA2",
    hue="cluster",
    data=visualization_kmeans[['PCA1','PCA2','cluster']],
    palette=palette,
    ax=ax,
)
ax.set_title("Clustering using K-Means Algorithm")
plt.show()

In [None]:
# Put the frame number next to each row's PCA coordinates and cluster id (aligned on the index)
cluster_frame_kmeans= pd.concat([df_train_labels,visualization_kmeans],axis=1) 

### Frames distribution

In [None]:
# Which frame numbers ended up in which cluster
fig, ax = plt.subplots(figsize=(5,5))
palette= sns.color_palette()
sns.scatterplot(
    x="cluster",
    y="frame",
    data=cluster_frame_kmeans,
    palette=palette,
    ax=ax,
)
ax.set_title("Frames per cluster using K-Means Algorithm")
plt.show()

In [None]:
# Cluster ids and the number of rows assigned to each of them
np.unique(model.labels_, return_counts=True)

In [None]:
fig, ax = plt.subplots(figsize=(6,6))
# 'frame' is a running number, so colour it on a continuous scale. The default
# qualitative palette is a short list of colours that seaborn would treat as
# categories and cycle over the distinct frame values.
sns.scatterplot(
    x="PCA1",
    y="PCA2",
    hue="frame",
    data=cluster_frame_kmeans,
    palette="viridis",
    legend=False,
    ax=ax,
)
ax.set_title("FRAMES distribution. PCA")
plt.show()

### Silhouette score

In [None]:
# Stored under its own name: `ss` is the fitted StandardScaler and must not be overwritten
silhouette = metrics.silhouette_score(pca_df, model.labels_)
print(f'Parameter: {n_clusters} clusters',  'Score: ', silhouette)

## Validation

Necessary functions to get frame data from the used files and plot the validation frames

In [None]:
# Function to extract data from a frame

def get_frame(df):
    """Return the signal columns of a frame as NumPy arrays.

    Parameters
    ----------
    df : DataFrame holding the columns acc_x_n, acc_y_n, acc_z_n,
        gyr_x_n, gyr_y_n, gyr_z_n, alpha_r and beta_r.

    Returns
    -------
    tuple of eight arrays, in the order
    (acc_x_n, acc_y_n, acc_z_n, gyr_x_n, gyr_y_n, gyr_z_n, alpha_r, beta_r).
    """
    signal_names=(
        'acc_x_n','acc_y_n','acc_z_n',
        'gyr_x_n','gyr_y_n','gyr_z_n',
        'alpha_r','beta_r',
    )
    return tuple(np.array(df[signal_name]) for signal_name in signal_names)


# Function to plot a graph of a frame
def plot_frame(acc_x_n,acc_y_n,acc_z_n,gyr_x_n,gyr_y_n,gyr_z_n,alpha_r,beta_r,cluster,i):
    """Plot the signals of one frame in a single figure.

    The accelerometer and gyroscope series share the left y-axis; alpha_r and
    beta_r are drawn on a twin axis on the right. The x-axis is fixed to 0..500.

    Parameters
    ----------
    acc_x_n, acc_y_n, acc_z_n : accelerometer series (solid, faint lines)
    gyr_x_n, gyr_y_n, gyr_z_n : gyroscope series (dotted lines)
    alpha_r, beta_r : series drawn on the right-hand axis
    cluster : cluster id shown in the title
    i : zero-based counter; the title shows i+1
    """
    fig,ax=plt.subplots(figsize=(16, 4))

    ax.set_title(f'Cluster {cluster} validation: frame {i+1}')
    ax.set_xlim(0,500)
    axb=ax.twinx()

    # One colour per axis (x, y, z), shared by accelerometer and gyroscope
    axis_names=('x_n','y_n','z_n')
    axis_colours=('tab:red','tab:orange','tab:cyan')

    for series,colour,axis_name in zip((acc_x_n,acc_y_n,acc_z_n),axis_colours,axis_names):
        ax.plot(series,color=colour,linewidth=.9,label=f'acc ({axis_name})',alpha=0.4)

    for series,colour,axis_name in zip((gyr_x_n,gyr_y_n,gyr_z_n),axis_colours,axis_names):
        ax.plot(series,color=colour,linewidth=.9,label=f'gyr ({axis_name})',linestyle=':',alpha=0.66)

    axb.plot(alpha_r,color='tab:green',linewidth=1.6,label='alpha_r')
    axb.plot(beta_r,color='tab:blue',linewidth=1.6,label='beta_r')

    ax.legend(loc='upper left', bbox_to_anchor=(0, 1))
    axb.legend(loc='upper right', bbox_to_anchor=(1, 1))

    plt.show()

#### Visual validation method

In [None]:

# Function to plot graphs for frames in a certain cluster

def validate(validate_cluster,nr_frames):
    """Plot the raw signals of the first ``nr_frames`` frames assigned to a cluster.

    For each selected row of ``cluster_frame_kmeans`` the feature file it came
    from is looked up in ``df_train``, the matching ``frames_*`` file is read,
    and the signals of that row's frame are plotted with ``plot_frame``.

    Parameters
    ----------
    validate_cluster : cluster id to inspect
    nr_frames : maximum number of frames to plot; fewer are plotted when the
        cluster holds fewer rows
    """
    # Row labels of all feature rows in the requested cluster (same for every iteration)
    target_rows=cluster_frame_kmeans.index[cluster_frame_kmeans.cluster==validate_cluster]

    # Never index past the end of a small cluster
    for i in range(min(nr_frames,len(target_rows))):
        row=target_rows[i]

        feature_file=str(df_train.loc[row,'file'])
        target_file=re.sub('ts_feat_','frames_',feature_file)
        df_target_file=pd.read_csv(f'../data/processed/unlabeled/frames/{target_file}')

        # Select the frame this row describes, not the loop counter.
        # df_train['frame'] may carry an offset accumulated over the files read before
        # this one, so express it relative to the first frame of its own file.
        # NOTE(review): assumes the first feature row of a file describes the first
        # frame of the matching frames file - confirm against the extraction step.
        first_frame_in_train=df_train.loc[df_train['file']==feature_file,'frame'].min()
        frame_in_file=df_train.loc[row,'frame']-first_frame_in_train+df_target_file['frame'].min()

        df_target_frame=df_target_file[df_target_file['frame']==frame_in_file]

        acc_x_n,acc_y_n,acc_z_n,gyr_x_n,gyr_y_n,gyr_z_n,alpha_r,beta_r=get_frame(df_target_frame)
        plot_frame(acc_x_n,acc_y_n,acc_z_n,gyr_x_n,gyr_y_n,gyr_z_n,alpha_r,beta_r,validate_cluster,i)


#### Validate on cluster number with (n) number of frames

In [None]:
# Run the visual validation for cluster 4 with 3 frames
validate(4,3)