In [None]:
import os
import ast
import math
import numpy as np
import pandas as pd # use pandas for more functionality
from dtaidistance import dtw

# # if dtaidistance does not work with C, use dask for parallelization
# from dask import delayed, compute
# from dask.distributed import Client, default_client
# # Close existing client if any
# try:
#     client = default_client()
#     client.close()
# except ValueError:
#     pass
# client = Client() # Start a new Dask client before importing modin pandas
# import modin.pandas as pd # use modin to speed things up (don't use modin with Dask)

In [None]:
# Use the parent of the current working directory as the project root
cwd = os.getcwd()
pwd = os.path.split(cwd)[0]

In [None]:
def create_binary_stream(row):
    """Build a 0/1 event vector for one row.

    Parameters
    ----------
    row : mapping
        Must provide ``row['event_timestamps']`` (a sized collection of
        indices) and ``row['event_length']`` (the stream length, passed
        through ``int()``).

    Returns
    -------
    numpy.ndarray
        Integer array of length ``int(row['event_length'])`` holding 1 at
        every valid index from ``event_timestamps`` and 0 everywhere else.
        Entries that are not real numbers, are NaN/inf, or fall outside
        ``[0, length)`` are ignored.
    """
    # Unpack indices and stream length from the row
    indices, stream_length = row['event_timestamps'], row['event_length']
    n_samples = int(stream_length)

    # Start from an all-zero stream
    binary_stream = np.zeros(n_samples, dtype=int)

    if len(indices) != 0:
        # Keep only finite numeric entries (Python or numpy scalars) and
        # truncate them to integers
        numeric_types = (int, float, np.integer, np.floating)
        cleaned = [
            int(i) for i in indices
            if isinstance(i, numeric_types) and np.isfinite(i)
        ]

        # Force an integer dtype: if every entry was filtered out,
        # np.array([]) would default to float64, and a float array cannot be
        # used as an index (IndexError), even when it is empty.
        index_array = np.array(cleaned, dtype=int)

        # Drop negative and out-of-range positions
        in_bounds = (0 <= index_array) & (index_array < n_samples)
        valid_indices = index_array[in_bounds]

        # Set the specified indices to 1
        binary_stream[valid_indices] = 1

    return binary_stream

In [None]:
def scale_zeros(binary_vector, scaling_factor=10):
    """Shorten every run of non-1 entries in a binary vector.

    Each maximal run of entries that are not equal to 1 is replaced by
    ``max(1, ceil(run_length / scaling_factor))`` zeros; entries equal to 1
    are copied through unchanged.

    Parameters
    ----------
    binary_vector : iterable
        Sequence of values; anything not equal to 1 is counted as a zero.
    scaling_factor : number, default 10
        Divisor applied to the length of each zero run. Must be > 0.

    Returns
    -------
    numpy.ndarray
        The compressed vector with dtype ``np.double``.

    Raises
    ------
    ValueError
        If ``scaling_factor`` is less than or equal to 0.
    """
    if scaling_factor <= 0:
        raise ValueError("Scaling factor must be greater than 0")

    def _shrunk_run(run_length):
        # A run never collapses completely: at least one zero is kept
        return [0] * max(1, int(math.ceil(run_length / scaling_factor)))

    compressed = []
    pending_zeros = 0

    for value in binary_vector:
        if not (value == 1):
            pending_zeros += 1
            continue
        # A 1 closes the current run of zeros (if any) before being emitted
        if pending_zeros > 0:
            compressed.extend(_shrunk_run(pending_zeros))
            pending_zeros = 0
        compressed.append(1)

    # Flush the zeros that follow the last 1
    if pending_zeros > 0:
        compressed.extend(_shrunk_run(pending_zeros))

    return np.array(compressed, dtype=np.double)

In [None]:
# Load the recordings table from the clean-data folder
data_df = pd.read_csv(pwd + "/02_Clean_data/01_recording_event_times_labels_binary.csv")

# 'event_timestamps' is stored as text in the CSV; parse each cell back into a Python literal
data_df['event_timestamps'] = data_df['event_timestamps'].map(ast.literal_eval)

# Build the 0/1 event vector for every row
data_df['binary_stream'] = data_df.apply(create_binary_stream, axis=1)

# Create the compressed binary representation.
# Each run of zeros is shortened to ceil(run_length / scale_factor) entries,
# so a LARGER scale factor gives a MORE compressed series.
scale_factor = 100
data_df['scaled_arrays'] = data_df['binary_stream'].apply(scale_zeros, scaling_factor=scale_factor)

data_df

# Distance/Similarity Measurement of data

In [None]:
# Collect the compressed series into a plain list for the DTW routine
# (the whole 'scaled_arrays' column is used here, not a subset)
scaled_timeseries_lst = data_df['scaled_arrays'].to_list()

# Compute the pairwise DTW distance matrix over all series
# function docs: https://dtaidistance.readthedocs.io/en/latest/modules/dtw.html?highlight=parallel#dtaidistance.dtw.distance_matrix_fast
dtw_distance_matrix = dtw.distance_matrix_fast(scaled_timeseries_lst)

In [None]:
# Persist the DTW distance matrix as a .npy file so the clustering analysis can reuse it
np.save(file=pwd + "/02_Clean_data/02_dtw_distance_matrix.npy", arr=dtw_distance_matrix)

In [None]:
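# Fallback: use this Dask-based pairwise computation if the dtaidistance C extension does not work.
# Requires the commented-out Dask imports/client from the first cell and a `data_df_subset`
# DataFrame (not defined in this notebook) that holds the 'scaled_arrays' column.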

# @delayed
# def calculate_dtw(i, j):
#     distance = dtw.distance(data_df_subset['scaled_arrays'].iloc[i], data_df_subset['scaled_arrays'].iloc[j])
#     return i, j, distance

# pairs = [(i, j) for i in range(len(data_df_subset)) for j in range(i + 1, len(data_df_subset))]

# delayed_results = [calculate_dtw(i, j) for i, j in pairs]

# results = compute(*delayed_results)

# dtw_matrix = np.full((len(data_df_subset), len(data_df_subset)), None, dtype=float)

# for i, j, distance in results:
#     dtw_matrix[i][j] = distance
#     dtw_matrix[j][i] = distance

# np.fill_diagonal(dtw_matrix, 0)

# # Remember to close the client when done
# client.close()

# # dtw_matrix now contains all the pairwise DTW distances
# pd.DataFrame(dtw_matrix)