## Import necessary Python packages

In [1]:
import os
import pickle
import multirecording_spikeanalysis as spike
import numpy as np
# import pandas as pd # use pandas for more functionality
import modin.pandas as pd # use modin to speed things up

## Define working directory relative to repository

In [2]:
# Resolve the base directory for all data paths:
# the parent directory of the notebook's current working directory.
cwd = os.getcwd()
pwd = os.path.split(cwd)[0]

## Import .pkl files

In [3]:
# Load the phase 2 collection from the pickle file provided by the lab.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
phase2_path = pwd + '/01_Raw_data/phase2_collection.pkl'
with open(phase2_path, 'rb') as pkl_file:
    phase2 = pickle.load(pkl_file)

# Phase 3 data is not used in this notebook
# with open(pwd + '/01_Raw_data/phase3_collection.pkl', 'rb') as f:
#     phase3 = pickle.load(f)

## Notes and reminders about data

In [4]:
# shows the subject each recording belongs to
# .subject

# shows the type of unit/neuron (we only use good units in unit_timestamps)
# .labels_dict

# all the timestamps for all units
# .timestamps_var

# all the timestamps for each unit
# .unit_timestamps 

# the behaviour labels for each timestamp with a starting and ending time
# .event_dict

# Rework Data from .pkl files

### Behavioral labels and event ranges dataframe

In [5]:
# Build the events dataframe: one row per labelled event, carrying the behavior
# label, the event's start/end time, and the recording (collection key) and
# subject it was read from.
temp_df_lst = []  # one dataframe per recording, concatenated at the end
for collection_key, recording in phase2.collection.items():
    behavior_lab_lst, start_time_lst, end_time_lst = [], [], []
    for label, event_ranges in recording.event_dict.items():
        # column 0 is read as the start time and column 1 as the end time of each event
        behavior_lab_lst.extend([label] * len(event_ranges))
        start_time_lst.extend(event_ranges[:, 0])
        end_time_lst.extend(event_ranges[:, 1])
    temp_df = pd.DataFrame({
        'behavior_label': behavior_lab_lst,
        'start_time': start_time_lst,
        'end_time': end_time_lst,
        'collection_key': collection_key,  # same recording name on every row
        'subject': recording.subject,  # same subject on every row
    })
    temp_df_lst.append(temp_df)
# stack the per-recording frames into a single long dataframe
metadata_df = pd.concat(temp_df_lst)


    from distributed import Client

    client = Client()

Perhaps you already have a cluster running?
Hosting the HTTP server on port 55937 instead
Please refer to https://modin.readthedocs.io/en/stable/supported_apis/defaulting_to_pandas.html for explanation.


In [6]:
# Split the exposure-order labels out of `behavior_label` into a separate `order` column.
order_labels = ['exposure 1', 'exposure 2', 'exposure 3']
merge_keys = ['start_time', 'end_time', 'collection_key', 'subject']

# This cell overwrites `metadata_df` with its own result. Dropping any `order`
# column left by a previous run keeps the cell re-runnable: without this, a
# second run would merge against a frame that already has `order` and produce
# `order_x` / `order_y` columns. On a first run the drop is a no-op.
events_df = metadata_df.drop(columns=['order'], errors='ignore')

# Rows whose label is one of the exposure-order labels, with that label renamed to `order`
order_df = (
    events_df[events_df['behavior_label'].isin(order_labels)]
    .reset_index(drop=True)
    .rename(columns={'behavior_label': 'order'})
)

# Right-merge so every event row is kept; `order` is filled only where all merge keys match.
# NOTE(review): the match is on exact start_time/end_time equality, so an event only
# receives an order if it shares both times with an exposure row - confirm this is intended.
merged_order_df = pd.merge(order_df, events_df, on=merge_keys, how='right')

metadata_df = merged_order_df  # downstream cells keep using the `metadata_df` name



### Units dataframe & unit timestamps list
Save a dataframe of the units used for each subject at each recording session.
Extract the timestamps for each event for each unit from the data.
Merge the metadata dataframes into a single long dataframe so that metadata can be easily related to each recording session.

In [7]:
# Build the units dataframe: one row per unit (neuron) per recording, holding the
# unit key, that unit's timestamps, and the recording/subject it belongs to.
temp_df_lst = []  # one dataframe per recording, concatenated at the end
for collection_key, recording in phase2.collection.items():
    unit_timestamps = recording.unit_timestamps
    temp_df = pd.DataFrame({
        # materialise the dict views as lists so the columns do not depend on
        # pandas/modin implicitly coercing view objects
        'units': list(unit_timestamps.keys()),
        'timestamps': list(unit_timestamps.values()),
        'collection_key': collection_key,  # same recording name on every row
        'subject': recording.subject,  # same subject on every row
    })
    temp_df_lst.append(temp_df)
# stack the per-recording frames into a single long dataframe
data_df = pd.concat(temp_df_lst)



In [8]:
# Pair every event row with every unit row from the same recording and subject
# (inner join on the shared keys).
merged_data_df = metadata_df.merge(
    data_df,
    on=['collection_key', 'subject'],
    how='inner',
)

In [9]:
# Control how much time before and after each event is included with the event.
# Buffers are expressed in timestamp ticks; 20 000 ticks correspond to one second
# (presumably the recording sampling rate - confirm against the acquisition setup).
TICKS_PER_SECOND = 20000
BEFORE_EVENT_SECONDS = 1  # seconds to include before the event start
AFTER_EVENT_SECONDS = 0  # seconds to include after the event end
BEFORE_EVENT_BUFFER = BEFORE_EVENT_SECONDS * TICKS_PER_SECOND
AFTER_EVENT_BUFFER = AFTER_EVENT_SECONDS * TICKS_PER_SECOND

In [10]:
# Pack the per-row timestamp arrays into one NaN-padded 2-D array so that the
# event windows can be applied to all rows at once with broadcasting.
timestamps_lst = merged_data_df['timestamps'].tolist()
# The longest array sets the number of columns; default=0 avoids a ValueError on an empty table
max_length = max((arr.size for arr in timestamps_lst), default=0)
# One row per dataframe row. The row count is taken from the data instead of the
# previously hard-coded 40866, which only fitted one specific dataset size.
timestamps_array = np.full((len(timestamps_lst), max_length), np.nan)
# Copy each array into the start of its row; the remainder of the row stays NaN
for i, arr in enumerate(timestamps_lst):
    timestamps_array[i, :arr.size] = arr



In [11]:
# Per-row event window: the start is moved earlier by BEFORE_EVENT_BUFFER and the
# end is moved later by AFTER_EVENT_BUFFER. Reshaping to column vectors lets each
# row's bounds broadcast across that row of timestamps_array.
min_thresholds = (np.array(merged_data_df['start_time']) - BEFORE_EVENT_BUFFER).reshape(-1, 1)
max_thresholds = (np.array(merged_data_df['end_time']) + AFTER_EVENT_BUFFER).reshape(-1, 1)
# True wherever a timestamp lies before the window start or after the window end
outside_window = (timestamps_array < min_thresholds) | (timestamps_array > max_thresholds)
# Blank out everything outside the window (modifies timestamps_array in place)
timestamps_array[outside_window] = np.nan

In [12]:
# import pandas as pd # use pandas if modin doesn't work (should be done automatically)
# Strip the NaN entries from each row, leaving only the timestamps inside that row's event window
event_ts_lst = [row[~np.isnan(row)] for row in timestamps_array]
merged_data_df['event_timestamps'] = event_ts_lst  # add event timestamps to dataframe
# Remove duplicate rows, comparing only the scalar columns. The array-valued columns
# hold numpy arrays, which are unhashable and make a full-row drop_duplicates() fail
# in plain pandas. They are derived from the scalar columns (timestamps per
# recording/unit, event_timestamps from timestamps plus the event window), so
# comparing the scalar columns identifies the same duplicates.
array_cols = ['timestamps', 'event_timestamps']
key_cols = [col for col in merged_data_df.columns if col not in array_cols]
merged_data_df = merged_data_df.drop_duplicates(subset=key_cols).reset_index(drop=True)
# Drop all rows that have exposure 1, 2, or 3 as the behaviour label
merged_data_df = merged_data_df[~merged_data_df['behavior_label'].isin([
    'exposure 1',
    'exposure 2',
    'exposure 3'])].reset_index(drop=True)
# Convert each float timestamp in the per-row sequences to an integer
merged_data_df['event_timestamps'] = merged_data_df['event_timestamps'].apply(lambda lst: [int(x) for x in lst])

In [15]:
# Write the cleaned table to the clean-data folder as CSV, without the row index.
# NOTE(review): `timestamps` presumably holds numpy arrays, which CSV stores as their
# string repr (numpy abbreviates long arrays with '...') - confirm downstream readers
# rely only on `event_timestamps`.
output_csv = pwd + "/02_Clean_data/00_recording_event_times_labels.csv"
merged_data_df.to_csv(output_csv, index=False)

