In [68]:
import os
import sys
import ast
import numpy as np
import pandas as pd # use pandas for more functionality
from dtaidistance import dtw
from dask import delayed, compute
from dask.distributed import Client, default_client
# Shut down any Dask client that is still registered (e.g. from an earlier run
# of this cell). The ValueError handler covers the case where no client exists,
# in which case there is nothing to close.
try:
    client = default_client()
    client.close()
except ValueError:
    pass
client = Client() # Start a fresh Dask client; this has to happen before any modin.pandas import
# import modin.pandas as pd # modin could speed things up, but do not use modin together with Dask here

In [69]:
# Project root: the parent directory of the notebook's current working directory
cwd = os.getcwd()
pwd = os.path.split(cwd)[0]

In [70]:
def create_binary_stream(row):
    """Build a 0/1 event stream for one recording row.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'event_timestamps'`` (an iterable of sample
        indices) and ``'event_length'`` (the stream length), for example a
        DataFrame row passed in by ``DataFrame.apply(..., axis=1)``.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``int(row['event_length'])`` holding 1 at
        every valid event index and 0 everywhere else.

    Notes
    -----
    Entries that are not real numbers, that are NaN/inf, or that fall outside
    ``[0, stream_length)`` are ignored rather than raising.
    """
    # Unpack indices and stream length from the row
    indices, stream_length = row['event_timestamps'], row['event_length']
    stream_length = int(stream_length)

    # Start from an all-zero stream
    binary_stream = np.zeros(stream_length, dtype=int)

    # Accept built-in and NumPy scalars alike: np.int64 is not a subclass of
    # the built-in int, so checking only (int, float) would drop such indices.
    numeric_types = (int, float, np.integer, np.floating)

    # Force an integer dtype: if every entry is filtered out, a bare
    # np.array([]) would be float64 and could not be used as an index array.
    indices = np.array(
        [int(i) for i in indices if isinstance(i, numeric_types) and np.isfinite(i)],
        dtype=int,
    )

    # Discard out-of-bound indices, then mark the remaining events
    valid_indices = indices[(0 <= indices) & (indices < stream_length)]
    binary_stream[valid_indices] = 1

    return binary_stream

In [71]:
# Load the cleaned recordings, parse the stringified timestamp lists into real
# lists, then derive one binary event stream per row.
data_df = (
    pd.read_csv(pwd + "/02_Clean_data/01_recording_event_times_labels_binary.csv")
    .assign(event_timestamps=lambda frame: frame['event_timestamps'].apply(ast.literal_eval))
    .assign(binary_stream=lambda frame: frame.apply(create_binary_stream, axis=1))
)

data_df

Unnamed: 0,order,start_time,end_time,collection_key,subject,behavior_label,units,timestamps,event_timestamps,zero_index,event_length,binary_stream
0,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,2,[ 133 359 761 ... 42755165 42755...,"[133.0, 359.0, 761.0, 841.0, 1042.0, 1142.0, 1...",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,26,[ 149 2407 2955 ... 42759906 42761...,"[149.0, 2407.0, 2955.0, 6394.0, 6924.0, 11713....",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,196,[ 394 1740 2021 ... 42760901 42761...,"[394.0, 1740.0, 2021.0, 2752.0, 3636.0, 5356.0...",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,113,[ 436 7630 9762 ... 42492963 42528...,"[436.0, 7630.0, 9762.0, 11461.0, 13316.0, 1414...",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,91,[ 449 4067 11824 ... 42749653 42753...,"[449.0, 4067.0, 11824.0, 22087.0]",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...
25593,exposure 3,1985700.0,1986200.0,20230818_133620_1_merged.rec,1.4,novel,73,[ 352534 1489090 3711465 3936724 4482099 ...,[],1965700.0,20500.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25594,exposure 3,1985700.0,1986200.0,20230818_133620_1_merged.rec,1.4,novel,141,[ 413994 636033 764028 1171529 1250682 ...,[],1965700.0,20500.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25595,exposure 3,1985700.0,1986200.0,20230818_133620_1_merged.rec,1.4,novel,36,[ 449953 451680 455769 458347 462015 ...,[],1965700.0,20500.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
25596,exposure 3,1985700.0,1986200.0,20230818_133620_1_merged.rec,1.4,novel,95,[ 472131 492784 527617 761281 788850 ...,[],1965700.0,20500.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [72]:
# dtw.distance(data_df['binary_stream'].iloc[0], data_df['binary_stream'].iloc[1])

In [73]:
# dtw.distance(data_df['event_timestamps'].iloc[0], data_df['event_timestamps'].iloc[1])

# Distance/Similarity Measurement of data

In [74]:
# Small working sample: the first five rows of the full table
data_df_subset = data_df.head(5)

data_df_subset

Unnamed: 0,order,start_time,end_time,collection_key,subject,behavior_label,units,timestamps,event_timestamps,zero_index,event_length,binary_stream
0,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,2,[ 133 359 761 ... 42755165 42755...,"[133.0, 359.0, 761.0, 841.0, 1042.0, 1142.0, 1...",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,26,[ 149 2407 2955 ... 42759906 42761...,"[149.0, 2407.0, 2955.0, 6394.0, 6924.0, 11713....",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,196,[ 394 1740 2021 ... 42760901 42761...,"[394.0, 1740.0, 2021.0, 2752.0, 3636.0, 5356.0...",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,113,[ 436 7630 9762 ... 42492963 42528...,"[436.0, 7630.0, 9762.0, 11461.0, 13316.0, 1414...",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,,15167.0,24733.0,20230803_101331_1_merged.rec,1.1,acquisition,91,[ 449 4067 11824 ... 42749653 42753...,"[449.0, 4067.0, 11824.0, 22087.0]",0.0,24733.0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [75]:
@delayed
def calculate_dtw(i, j):
    """Return ``(i, j, distance)`` for rows ``i`` and ``j`` of the subset.

    Both rows are looked up by position in
    ``data_df_subset['event_timestamps']`` and compared with ``dtw.distance``.
    The function is wrapped in ``delayed``; the values are produced by the
    ``compute`` call below.
    """
    timestamps = data_df_subset['event_timestamps']
    return i, j, dtw.distance(timestamps.iloc[i], timestamps.iloc[j])

# Every unordered pair of rows exactly once (upper triangle, diagonal excluded)
pairs = [
    (i, j)
    for i in range(len(data_df_subset))
    for j in range(i + 1, len(data_df_subset))
]

delayed_results = [calculate_dtw(*pair) for pair in pairs]

results = compute(*delayed_results)

# Square matrix pre-filled with NaN so any missing entry would stand out
dtw_matrix = np.full((len(data_df_subset), len(data_df_subset)), np.nan, dtype=float)

# Write each distance into both triangles
for i, j, distance in results:
    dtw_matrix[i, j] = distance
    dtw_matrix[j, i] = distance

# The diagonal (each row against itself) is set to 0
np.fill_diagonal(dtw_matrix, 0)

# Remember to close the client when done
client.close()

# dtw_matrix now contains all the pairwise DTW distances
pd.DataFrame(dtw_matrix)

Unnamed: 0,0,1,2,3,4
0,0.0,5673.218839,5412.589676,7928.569921,14879.734742
1,5673.218839,0.0,6279.836622,7396.888332,12351.419837
2,5412.589676,6279.836622,0.0,8654.888792,13529.655613
3,7928.569921,7396.888332,8654.888792,0.0,5342.726364
4,14879.734742,12351.419837,13529.655613,5342.726364,0.0


In [76]:
# Thirteen zeros against thirteen ones
dtw.distance([0] * 13, [1] * 13)

3.605551275463989

In [77]:
# Two short binary sequences of different lengths (7 vs 8 samples)
dtw.distance([1, 0] + [1] * 5, [0, 1] * 4)

1.7320508075688772

In [78]:
# Binary stream of the first row
data_df['binary_stream'].iloc[0]

array([0, 0, 0, ..., 0, 0, 0])

In [79]:
# Define the rescale function
def rescale_binary_array(arr, scale_factor):
    """Rescale every run of zeros in a binary sequence while keeping each 1.

    Each maximal run of ``z`` zeros (before the first 1, between two 1s, or
    after the last 1) is replaced by ``max(1, round(z * scale_factor))``
    zeros, so a run that existed never disappears completely. Ones are copied
    through unchanged and adjacent ones stay adjacent.

    Parameters
    ----------
    arr : iterable
        Sequence of 0/1 values. Elements equal to neither 0 nor 1 are ignored.
    scale_factor : float
        Multiplier applied to the length of each zero run; values below 1
        compress the sequence.

    Returns
    -------
    list of int
        The rescaled sequence. An empty input gives an empty list; an
        all-zero input gives its single zero run, rescaled.
    """
    new_arr = []
    zero_run = 0  # length of the zero run currently being read

    for value in arr:
        if value == 0:
            zero_run += 1
        elif value == 1:
            # Emit the run that just ended *before* this 1, so every run stays
            # attached to the position it came from (including a leading run).
            if zero_run > 0:
                new_arr.extend([0] * max(1, int(round(zero_run * scale_factor))))
                zero_run = 0
            new_arr.append(1)

    # Zeros after the last 1 (or an all-zero input) are still pending here
    if zero_run > 0:
        new_arr.extend([0] * max(1, int(round(zero_run * scale_factor))))

    return new_arr



In [91]:
def rescale_binary_array_vectorized(arr, scale_factor):
    """Rescale every run of zeros in a binary sequence using NumPy.

    Each maximal run of ``z`` zeros (before the first 1, between two 1s, or
    after the last 1) is replaced by ``max(1, round(z * scale_factor))``
    zeros. Positions where there were no zeros (adjacent ones, or a sequence
    that starts or ends with a 1) get no zeros inserted. Any non-zero element
    is treated as a 1.

    Parameters
    ----------
    arr : array-like
        One-dimensional sequence of 0/1 values.
    scale_factor : float
        Multiplier applied to the length of each zero run; values below 1
        compress the sequence.

    Returns
    -------
    list of int
        The rescaled sequence. An empty input gives an empty list; an
        all-zero input gives its single zero run, rescaled.
    """
    arr_np = np.array(arr, dtype=int)
    total_length = len(arr_np)
    if total_length == 0:
        return []

    ones_indices = np.flatnonzero(arr_np)  # Indices of ones
    ones_count = ones_indices.size

    # No ones at all: the whole input is one zero run
    if ones_count == 0:
        return [0] * max(1, int(round(total_length * scale_factor)))

    # Lengths of all zero runs in input order:
    # [leading, gap after 1st one, ..., gap after (n-1)th one, trailing]
    zero_runs = np.concatenate((
        ones_indices[:1],
        np.diff(ones_indices) - 1,
        [total_length - ones_indices[-1] - 1],
    ))

    # Scale each run, keeping at least one zero where a run existed and
    # none where it did not.
    scaled_runs = np.where(
        zero_runs > 0,
        np.maximum(1, np.round(zero_runs * scale_factor)),
        0,
    ).astype(int)

    # The k-th one lands after the first k+1 scaled runs plus the k ones
    # that precede it.
    one_positions = np.cumsum(scaled_runs[:-1]) + np.arange(ones_count)

    new_arr = np.zeros(ones_count + int(scaled_runs.sum()), dtype=int)
    new_arr[one_positions] = 1

    return new_arr.tolist()

In [81]:
# Compression factor for the zero runs: the smaller the value, the shorter the stream
scale_factor = 0.1

# Rescale the binary streams of the rows labelled 0 through 5000
data_df['binary_stream_reduced'] = data_df.loc[0:5000, 'binary_stream'].apply(
    rescale_binary_array, args=(scale_factor,)
)

In [95]:
# Compression factor for the zero runs: the smaller the value, the shorter the stream
scale_factor = 0.1

# Same rescaling of the rows labelled 0 through 5000, using the NumPy-based implementation
data_df['binary_stream_reduced'] = data_df.loc[0:5000, 'binary_stream'].apply(
    rescale_binary_array_vectorized, args=(scale_factor,)
)

In [94]:
# Three hand-written binary sequences for a quick check of the rescaling
df = pd.DataFrame({
    'binary_arrays': [
        [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
    ]
})

# Shrink every zero run to roughly 40% of its length
scale_factor = 0.4

# Rescale each sequence with the loop-based implementation
df['scaled_arrays'] = df['binary_arrays'].apply(rescale_binary_array, args=(scale_factor,))

print(df)

                                      binary_arrays  \
0              [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]   
1                          [1, 0, 0, 0, 0, 0, 0, 1]   
2  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]   

                 scaled_arrays  
0        [1, 0, 0, 1, 0, 0, 1]  
1                 [1, 0, 0, 1]  
2  [1, 0, 0, 0, 0, 1, 0, 0, 1]  


In [93]:
# Three hand-written binary sequences for a quick check of the rescaling
df = pd.DataFrame({
    'binary_arrays': [
        [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 1],
        [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1],
    ]
})

# Shrink every zero run to roughly 40% of its length
scale_factor = 0.4

# Rescale each sequence with the NumPy-based implementation
df['scaled_arrays'] = df['binary_arrays'].apply(rescale_binary_array_vectorized, args=(scale_factor,))

print(df)

                                      binary_arrays  \
0              [1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]   
1                          [1, 0, 0, 0, 0, 0, 0, 1]   
2  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1]   

                 scaled_arrays  
0        [1, 0, 0, 1, 0, 0, 1]  
1                 [1, 0, 0, 1]  
2  [1, 0, 0, 0, 0, 1, 0, 0, 1]  
