In [1]:
# _importing required libraries
import os
import collections

import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from scipy import stats 
import statistics

import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# _location of the preprocessed csv (relative to the working directory) and the
# _labels given to its columns when it is loaded
input_file_path = os.getcwd() + '/../data/output_csv/processed_data.csv'
col_names = [
    'timestamp (s)', 'activityID',
    'X1', 'Y1', 'Z1',
    'X2', 'Y2', 'Z2',
]

In [3]:
# _read the preprocessed csv into the main dataframe, labelling its columns
# _with col_names, then display it
main_df = pd.read_csv(
    input_file_path,
    names=col_names,
)
main_df

Unnamed: 0,timestamp (s),activityID,X1,Y1,Z1,X2,Y2,Z2
0,37.66,1,2.21530,8.27915,5.58753,-0.004750,0.037579,-0.011145
1,37.67,1,2.29196,7.67288,5.74467,-0.171710,0.025479,-0.009538
2,37.68,1,2.29090,7.14240,5.82342,-0.238241,0.011214,0.000831
3,37.69,1,2.21800,7.14365,5.89930,-0.192912,0.019053,0.013374
4,37.70,1,2.30106,7.25857,6.09259,-0.069961,-0.018328,0.004582
...,...,...,...,...,...,...,...,...
1564137,3409.07,7,-1.68428,-8.97338,3.43203,-0.231392,-0.391747,0.180935
1564138,3409.08,7,-1.72527,-9.04964,3.35469,-0.252115,-0.338597,0.180709
1564139,3409.09,7,-1.53312,-8.97455,3.43429,-0.176675,-0.311570,0.172539
1564140,3409.10,7,-1.50362,-9.01479,3.20395,-0.173602,-0.291495,0.170721


# Generating subsequences for each sequence of the data

In [4]:
# _sliding-window settings: rows per window, and rows the window start advances
# _on every step of the windowing loop
window_length = 10
window_overlap = 5

# _every column after the timestamp is cut into windows; one bucket per column
sequence_names = col_names[1:]
num_of_subsequences = len(sequence_names)
sub_sequences = [[] for _ in sequence_names]

# _number of rows available for windowing
max_window_index = len(main_df)

In [5]:
# _start from empty buckets so that re-running this cell does not append the
# _same windows a second time
sub_sequences = [[] for _ in range(num_of_subsequences)]

# _a non-positive step would never advance the window and loop forever
if window_overlap <= 0:
    raise ValueError('window_overlap must be a positive number of rows')

# _pull each sequence out of the dataframe once; slicing plain arrays avoids
# _building a new pandas Series for every window of every column
sequence_values = [main_df[name].to_numpy() for name in sequence_names]

window_index = 0

while window_index <= (max_window_index - window_length):

    window_end = window_index + window_length
    activity_sequence = sequence_values[0][window_index:window_end].tolist()

    # _keep a window only when all of its rows carry the same activity value
    if len(set(activity_sequence)) == 1:
        sub_sequences[0].append(activity_sequence[0])

        for idx in range(1, num_of_subsequences):
            sub_sequences[idx].append(sequence_values[idx][window_index:window_end].tolist())

    window_index += window_overlap

# _converting into numpy arrays (the activity bucket at position 0 is left out)
np_sequences = np.asarray(sub_sequences[1:])
print(np_sequences.shape)



(6, 312714, 10)


# Computing the statistics of the subsequences (mean, standard deviation, skewness, IQR, min, max, median, range, quartiles, coefficient of variation, kurtosis)


In [6]:
def subsequence_statistics(n):
    """Compute descriptive statistics for every subsequence of one sequence.

    Reads ``np_sequences[n]`` (one row per subsequence) and computes each
    statistic along the rows in a single vectorised pass instead of looping
    over the subsequences in Python.

    Parameters
    ----------
    n : int
        Position of the sequence inside the module-level ``np_sequences``.

    Returns
    -------
    pandas.DataFrame
        One row per subsequence with the columns ``Mean``,
        ``Standard_deviation``, ``Skewness``, ``IQR``, ``Min``, ``Max``,
        ``Median``, ``Range``, ``Lower_quartile``, ``Middle_quartile``,
        ``Upper_quartile``, ``Coefficient_of_variation`` and ``Kurtosis``.

    Notes
    -----
    * ``Standard_deviation`` is the sample standard deviation (``ddof=1``),
      matching ``statistics.stdev``; a subsequence with a single value gives
      NaN rather than an exception.
    * ``Coefficient_of_variation`` is ``std / mean`` and is therefore
      infinite or NaN for a subsequence whose mean is zero.
    * ``Middle_quartile`` (50th percentile) equals ``Median``.
    """
    columns = ['Mean', 'Standard_deviation', 'Skewness', 'IQR', 'Min', 'Max',
               'Median', 'Range', 'Lower_quartile', 'Middle_quartile',
               'Upper_quartile', 'Coefficient_of_variation', 'Kurtosis']

    subsequences = np.asarray(np_sequences[n], dtype=float)

    # _no subsequences at all: return an empty frame with the expected columns
    if subsequences.size == 0:
        return pd.DataFrame(columns=columns)

    mean = subsequences.mean(axis=1)
    std = subsequences.std(axis=1, ddof=1)
    minimum = subsequences.min(axis=1)
    maximum = subsequences.max(axis=1)

    # _percentiles come back stacked along the first axis, one row per percentile
    q3, q2, q1 = np.percentile(subsequences, [75, 50, 25], axis=1)

    statistic_feature_df = pd.DataFrame(
        {
            'Mean': mean,
            'Standard_deviation': std,
            'Skewness': stats.skew(subsequences, axis=1),
            'IQR': q3 - q1,
            'Min': minimum,
            'Max': maximum,
            'Median': np.median(subsequences, axis=1),
            'Range': maximum - minimum,
            'Lower_quartile': q1,
            'Middle_quartile': q2,
            'Upper_quartile': q3,
            'Coefficient_of_variation': std / mean,
            'Kurtosis': stats.kurtosis(subsequences, axis=1),
        },
        columns=columns,
    )

    return statistic_feature_df

In [7]:
# _one statistics dataframe per sequence held in np_sequences; the sequence
# _position is printed after each one finishes as a progress marker
statistics_list = []
for idx in range(len(np_sequences)):
    statistic_df_axis = subsequence_statistics(idx)
    statistics_list += [statistic_df_axis]
    print(idx)
print(statistics_list)

0
1
2
3
4
5
[            Mean  Standard_deviation  Skewness       IQR      Min      Max  \
0       2.252053            0.096658 -0.336519  0.082810  2.07165  2.41148   
1       2.275643            0.115570 -0.274613  0.133798  2.07165  2.43617   
2       2.255907            0.091127  0.640422  0.057278  2.13466  2.43617   
3       2.209783            0.082043  0.898379  0.096855  2.11880  2.38759   
4       2.214889            0.094494  0.179849  0.106895  2.06309  2.38759   
...          ...                 ...       ...       ...      ...      ...   
312709 -1.183654            0.200320 -0.337586  0.277469 -1.52620 -0.92127   
312710 -1.225540            0.165837  0.742694  0.211905 -1.40156 -0.92127   
312711 -1.427101            0.140833 -0.946551  0.152780 -1.72474 -1.26271   
312712 -1.569999            0.118771  0.620479  0.112880 -1.72474 -1.34578   
312713 -1.639572            0.063257  0.472361  0.096680 -1.72527 -1.53312   

          Median    Range  Lower_quartile  Middle_

In [9]:
#assigning words for each cluster
def get_assigned_words(seq_clusters, cluster_words):
    """Translate integer cluster IDs into their cluster words.

    Every ID ``k`` in ``seq_clusters`` is replaced by ``cluster_words[k]``
    with a single array lookup, instead of one full ``np.where`` pass over
    the data per cluster word.

    Parameters
    ----------
    seq_clusters : array-like of int
        Cluster IDs. The shape is kept, so it has to be 1-D or a
        single-column 2-D array to fit the one-column result.
    cluster_words : sequence of str
        Word for each cluster ID, indexed by that ID.

    Returns
    -------
    pandas.DataFrame
        A single column named ``cluster_word`` holding the word of each ID.

    Raises
    ------
    ValueError
        If an ID is negative or has no entry in ``cluster_words``. Such IDs
        previously slipped through as stringified numbers in the output.
    """
    cluster_ids = np.asarray(seq_clusters)
    word_lookup = np.asarray(cluster_words, dtype=str)

    # _refuse IDs that cannot be mapped rather than emitting a mislabelled row
    if cluster_ids.size and (cluster_ids.min() < 0 or cluster_ids.max() >= word_lookup.size):
        raise ValueError(
            'cluster IDs must lie in [0, %d); got values between %s and %s'
            % (word_lookup.size, cluster_ids.min(), cluster_ids.max())
        )

    # _integer-array indexing keeps the shape of cluster_ids
    assigned_words = word_lookup[cluster_ids.astype(np.intp)]

    assigned_clusterWord = pd.DataFrame(data=assigned_words, columns=['cluster_word'])

    return assigned_clusterWord

In [10]:
#generating names for cluster count
def generate_cluster_names(sequence_names, cluster_cnt=100):
    """Build the list of cluster words for each sequence name.

    Parameters
    ----------
    sequence_names : iterable of str
        Names used both as dictionary keys and as word prefixes.
    cluster_cnt : int, optional
        Number of words generated per name (default 100).

    Returns
    -------
    dict
        Maps each name to ``['<name>_0', ..., '<name>_<cluster_cnt - 1>']``.
    """
    return {
        seq: [seq + '_' + str(cluster_id) for cluster_id in range(cluster_cnt)]
        for seq in sequence_names
    }

In [11]:
# _number of clusters requested for every sequence
cluster_cnts = 100

# _cluster words for every name in sequence_names after the first one
words_dict = generate_cluster_names(sequence_names[1:], cluster_cnt=cluster_cnts)

# _cluster count recorded per name (the same value for all of them)
sequence_cluster_cnts = {name: cluster_cnts for name in words_dict}

In [14]:
def clustering(statistic_df, axis, random_state=0):
    """Cluster one sequence's statistics with k-means and label each centroid.

    Fits ``KMeans`` with ``cluster_cnts`` clusters on ``statistic_df``, maps
    every row's cluster ID to its word from ``words_dict[axis]`` and pairs it
    with the centroid of that cluster. Duplicate rows are then dropped, so
    the result keeps one row per cluster that received at least one sample,
    indexed by the position of the first sample assigned to it.

    Parameters
    ----------
    statistic_df : pandas.DataFrame or array-like
        Feature matrix, one row per subsequence.
    axis : str
        Key into the module-level ``words_dict`` selecting the cluster words.
    random_state : int or None, optional
        Seed handed to ``KMeans`` so that the same data yields the same
        clusters (and therefore the same word per cluster) on every run.
        Pass ``None`` for the previous unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        Column ``cluster_word`` followed by one ``<feature>_c`` column per
        input feature holding the centroid coordinates.
    """
    model = KMeans(n_clusters=cluster_cnts, random_state=random_state)
    cluster_ids = pd.DataFrame(model.fit_predict(statistic_df), columns=['cluster ID'])
    cluster_words = words_dict[axis][:cluster_cnts]
    seq_clusters = cluster_ids.to_numpy()
    assigned_clusterWord = get_assigned_words(seq_clusters, cluster_words)

    # _name the centroid columns after the input features so both stay in sync;
    # _fall back to the fixed statistic names when the input carries no labels
    if isinstance(statistic_df, pd.DataFrame):
        centroid_columns = [f'{name}_c' for name in statistic_df.columns]
    else:
        centroid_columns = ['Mean_c', 'Standard_deviation_c', 'Skewness_c', 'IQR_c',
                            'Min_c', 'Max_c', 'Median_c', 'Range_c',
                            'Lower_quartile_c', 'Middle_quartile_c', 'Upper_quartile_c',
                            'Coefficient_of_variation_c', 'Kurtosis_c']

    # _look up the centroid of every row's cluster
    centroids_of_clusters = pd.DataFrame(model.cluster_centers_[cluster_ids['cluster ID']],
                                         columns=centroid_columns)
    result = pd.concat([assigned_clusterWord, centroids_of_clusters], axis=1)

    # _rows of the same cluster are identical, so this leaves one row per cluster
    result = result.drop_duplicates()

    return result

In [15]:
def cluster_word_sort(axis_clusters, cluster_names):
    """Select the rows of one cluster word and drop the first column.

    Despite the name nothing is sorted: the rows whose ``cluster_word``
    equals ``cluster_names`` are kept in their existing order.

    Parameters
    ----------
    axis_clusters : pandas.DataFrame
        Frame with a ``cluster_word`` column; its first column is removed
        from the result.
    cluster_names : str
        The single cluster word to select.

    Returns
    -------
    pandas.DataFrame
        Matching rows with every column except the first.
    """
    is_requested_word = axis_clusters['cluster_word'] == cluster_names
    matching_rows = axis_clusters.loc[is_requested_word]
    return matching_rows.iloc[:, 1:]
    

In [16]:
# _clusters_centroid: one labelled-centroid dataframe per sequence
# _centroid_statistic: the centroid rows, one dataframe per cluster word
clusters_centroid = []
centroid_statistic = []

for statistic_df, axis in zip(statistics_list, col_names[2:]):

    axis_clusters = clustering(statistic_df, axis)
    clusters_centroid.append(axis_clusters)

    # _collect the centroid rows in the order of this sequence's cluster words
    cluster_names = words_dict[axis]
    for cluster_name in cluster_names:
        cluster_stats = cluster_word_sort(axis_clusters, cluster_name)
        centroid_statistic.append(cluster_stats)

print(centroid_statistic)

[       Mean_c  Standard_deviation_c  Skewness_c     IQR_c     Min_c     Max_c  \
112 -3.353995              1.098578   -0.120716  1.486295 -5.035102 -1.809235   

     Median_c   Range_c  Lower_quartile_c  Middle_quartile_c  \
112 -3.305166  3.225867         -4.082083          -3.305166   

     Upper_quartile_c  Coefficient_of_variation_c  Kurtosis_c  
112         -2.595788                   -0.332224   -0.823226  ,           Mean_c  Standard_deviation_c  Skewness_c     IQR_c    Min_c  \
161991 -0.000354              1.215748   -0.335283  1.571633 -2.15788   

          Max_c  Median_c  Range_c  Lower_quartile_c  Middle_quartile_c  \
161991  1.51868  0.038884  3.67656         -0.634861           0.038884   

        Upper_quartile_c  Coefficient_of_variation_c  Kurtosis_c  
161991          0.936772                -3430.438859    -0.90877  ,         Mean_c  Standard_deviation_c  Skewness_c     IQR_c     Min_c  \
6194 -7.394775              1.122108   -0.110025  1.524818 -9.123793   



In [17]:
# _stack all collected centroid rows and write them as plain csv rows,
# _without a header line and without the index column
embeddings_filepath = os.getcwd() + '/../data/sub_sequence_output/word_embeddings_from_clusters.txt'
(
    pd.concat(centroid_statistic)
    .to_csv(embeddings_filepath, header=False, index=False)
)