### Imports and setup

In [None]:
# imports
import pickle
import pandas as pd
import time

# setup for logging
import logging
from datetime import datetime

# write logs with time to log folder; the file name encodes the start time
# as hour_minute_second_day_month_year
LOG_FILENAME = datetime.now().strftime('/home/wgrambozambo/log/logfile_%H_%M_%S_%d_%m_%Y.log')

# basicConfig() does nothing when the root logger already has handlers,
# so detach any existing ones first. Iterate over a copy because the list is
# modified while looping, and close each handler so a previously opened log
# file is released instead of leaking.
for handler in logging.root.handlers[:]:
    logging.root.removeHandler(handler)
    handler.close()

logging.basicConfig(filename=LOG_FILENAME,level=logging.DEBUG)


# open file
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load frames.pkl if it comes from a trusted source.
with open('frames.pkl', 'rb') as f:
    frames = pickle.load(f)
# the `with` block has already closed the file; no explicit close is needed

### Clustering on slot level

In [None]:
# hierarchical clustering for looking at data
import fastcluster
import time
import numpy as np
from scipy.cluster.hierarchy import cut_tree

# indices of the time slots to process: 0 up to and including 311
ranger = list(range(312))

def solution(lizt, array_dir='/home/wgrambozambo/arrays1', cutoff_scale=3.5):
    """Cluster the saved array of every time slot and cut each tree.

    For each item in ``lizt`` the file ``array<item>.npy`` is loaded from
    ``array_dir``, clustered hierarchically with fastcluster, and the
    resulting tree is cut at a height that shrinks with the number of rows
    in the array.

    Args:
        lizt: iterable of time slot identifiers; each one is converted with
            ``str()`` to build the file name ``array<item>.npy``.
        array_dir: directory that holds the ``.npy`` files. Defaults to the
            location that was previously hard-coded.
        cutoff_scale: numerator of the cut height; the height used is
            ``round(cutoff_scale / len(array), 5)``.

    Returns:
        A list with one ``cut_tree`` result per item, in the order of
        ``lizt``. An empty ``lizt`` gives an empty list.
    """
    import os  # local import so the block does not depend on a file-level one

    output = []
    for item in lizt:
        # load array by time slot
        array = np.load(os.path.join(array_dir, 'array' + str(item) + '.npy'))

        # cluster
        # NOTE(review): centroid linkage is normally defined for Euclidean
        # distances -- confirm that pairing it with the cosine metric is
        # intended.
        X_clustered = fastcluster.linkage(array, method='centroid', metric='cosine')

        # cut height depends on the number of rows, rounded to 5 decimals
        cutoff = round(cutoff_scale / len(array), 5)

        # cut tree
        cutree = cut_tree(X_clustered, height=cutoff)

        # add to output list
        output.append(cutree)
    return output

# start timer; perf_counter() is a monotonic clock, so the measured duration
# is not affected by system clock adjustments during the run
t0 = time.perf_counter()

# execute: cluster every time slot listed in `ranger`
clusters = solution(ranger)

# record the time and report the elapsed seconds
t1 = time.perf_counter()
print("Time: {}".format(t1-t0))

logging.info('Clustering success')

### Tag the dataframe with cluster ids

In [None]:
def clustertagger(lizt1, lizt2):
    """Write cluster ids into a 'clusterid' column of each dataframe.

    The two inputs are walked in lockstep with ``zip``, so pairing stops at
    the shorter of the two. Each dataframe is modified in place and the very
    same objects are returned.

    Args:
        lizt1: iterable of dataframes to tag.
        lizt2: iterable of 2-D arrays of cluster ids, one array per
            dataframe; each array is flattened row by row.

    Returns:
        A list of the tagged dataframes, in input order.
    """
    tagged = []
    for frame, labels in zip(lizt1, lizt2):
        # flatten the nested rows into one flat list of ids
        flat_ids = []
        for row in labels.tolist():
            flat_ids.extend(row)
        # .values hands over a plain array, so the ids are assigned by
        # position rather than aligned on the dataframe's index
        frame['clusterid'] = pd.Series(flat_ids).values
        tagged.append(frame)
    return tagged

# apply: tag every frame with the cluster ids computed for it
result = clustertagger(frames, clusters)

# make df: stack all tagged frames into one dataframe
# NOTE(review): concat keeps each frame's own index, so index values may
# repeat across frames -- confirm that is acceptable downstream.
data_clustered = pd.concat(result)

# file dump
with open('data_clustered.pkl', 'wb') as f:
    pickle.dump(data_clustered, f)
# the `with` block has already closed the file; no explicit close is needed

logging.info('Dataframe with clusterids saved as data_clustered.pkl')