## Open-field backward locomotion analysis (DLC-based)

This notebook analyzes DeepLabCut tracking data to classify locomotion direction (forward, backward, leftward, rightward) on a frame-by-frame basis.

Movement direction is computed from the angle between the movement vector (Back displacement) and the body-axis vector (Tailbase→Back), with a speed threshold applied to exclude low-motion frames.

Peri-stimulus movement proportions are calculated in fixed time bins and used for linear mixed-effects modeling across mice and trials.

Expected input location: `data/raw/`

Required inputs (one pair per mouse): a DeepLabCut `.h5` tracking file named `<MouseID>.h5` and a stimulus timestamp file named `stim_<MouseID>.csv`.


In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import statsmodels.formula.api as smf

# Set the directory where your DLC H5 data files are located
data_directory = r'data/raw/'

# Parameters
sampling_rate = 30  # frames per second
bin_size_frames = 6  # Number of frames per bin (200 ms per bin)
pre_stim_frames = 30   # Frames before stimulus to analyze (1 second)
post_stim_frames = 120  # Frames after stimulus to analyze (4 seconds)

# Convert frames to time (seconds)
bin_size_time = bin_size_frames / sampling_rate  # Bin size in seconds
pre_stim_time = pre_stim_frames / sampling_rate  # Pre-stimulus time in seconds
post_stim_time = post_stim_frames / sampling_rate  # Post-stimulus time in seconds

# Angle thresholds (degrees between the movement vector and the body axis)
forward_threshold = 60
backward_threshold = 120

# Speed threshold: minimum per-frame displacement of the smoothed 'Back' point,
# in the coordinate units of the tracking file (presumably pixels - confirm).
speed_threshold = 0.2  # Adjust as appropriate

# Moving-average window (frames) used to smooth the positional data
window_size = 6  # Adjust window size as needed

# Peri-stimulus bins, defined on integer frame offsets relative to the stimulus.
# Bin k covers offsets [bin_edges_frames[k], bin_edges_frames[k + 1]).
num_bins = int(np.ceil((pre_stim_frames + post_stim_frames) / bin_size_frames))
bin_edges_frames = -pre_stim_frames + bin_size_frames * np.arange(num_bins + 1)
bin_edges_time = bin_edges_frames / sampling_rate  # Bin edges in seconds
time_points = bin_edges_time[1:]  # Each bin is reported at its END time

# Movement categories, in the order their rows are appended for every bin
MOVEMENT_TYPES = ('Backward', 'Forward', 'Leftward', 'Rightward')

# List to store data from all animals
all_data = []


def moving_average(a, n=window_size):
    """Return a moving average of *a* with the same length as *a*.

    ``np.convolve(..., mode='same')`` pads the signal with zeros, which drags
    the first and last few samples toward 0 and creates artificial
    displacement at the start and end of a recording. Dividing the windowed
    sum by the number of real samples inside each window removes that bias;
    interior samples are a plain n-point mean.

    Parameters
    ----------
    a : array-like of float
        1-D signal to smooth.
    n : int
        Window length in samples; clipped to ``len(a)`` for short signals so
        the output length always equals the input length.
    """
    a = np.asarray(a, dtype=float)
    if a.size == 0:
        return a
    n = max(1, min(int(n), a.size))
    kernel = np.ones(n)
    window_sum = np.convolve(a, kernel, mode='same')
    window_count = np.convolve(np.ones(a.size), kernel, mode='same')
    return window_sum / window_count


def classify_movement(back_x, back_y, tailbase_x, tailbase_y,
                      speed_min=speed_threshold,
                      forward_max_deg=forward_threshold,
                      backward_min_deg=backward_threshold):
    """Classify every frame-to-frame step as backward/forward/left/right.

    The movement vector is the displacement of the 'Back' point between
    consecutive frames; the body axis is the Tailbase->Back vector at the
    first frame of each step. A step is classified only when its displacement
    is at least ``speed_min``.

    Returns
    -------
    dict
        Maps each name in ``MOVEMENT_TYPES`` to a boolean array of length
        ``len(back_x) - 1``. Steps with an undefined angle (zero displacement
        or zero body-axis length) are False in every category.
    """
    back = np.column_stack((back_x, back_y)).astype(float)
    tailbase = np.column_stack((tailbase_x, tailbase_y)).astype(float)

    movement = back[1:] - back[:-1]
    orientation = back[:-1] - tailbase[:-1]

    speed = np.linalg.norm(movement, axis=1)
    body_length = np.linalg.norm(orientation, axis=1)
    moving = speed >= speed_min

    # Zero-length vectors have no direction; NaN makes every comparison False.
    speed_safe = np.where(speed == 0, np.nan, speed)
    body_safe = np.where(body_length == 0, np.nan, body_length)

    with np.errstate(invalid='ignore', divide='ignore'):
        dot = np.einsum('ij,ij->i', movement, orientation)
        # Clip guards against |cos| marginally exceeding 1 through rounding.
        cos_theta = np.clip(dot / (speed_safe * body_safe), -1.0, 1.0)
        angles_deg = np.degrees(np.arccos(cos_theta))

        # z-component of movement x orientation separates the two sides.
        # NOTE(review): positive == leftward holds for image coordinates
        # (y pointing down) filmed from above - confirm the camera setup.
        cross_z = (movement[:, 0] * orientation[:, 1]
                   - movement[:, 1] * orientation[:, 0])

        sideways = ((angles_deg >= forward_max_deg)
                    & (angles_deg <= backward_min_deg) & moving)
        return {
            'Backward': (angles_deg > backward_min_deg) & moving,
            'Forward': (angles_deg < forward_max_deg) & moving,
            'Leftward': sideways & (cross_z > 0),
            'Rightward': sideways & (cross_z < 0),
        }


def iter_peri_stimulus_bins(stim_frame, n_frames, pre_frames=pre_stim_frames,
                            post_frames=post_stim_frames,
                            bin_frames=bin_size_frames):
    """Yield ``(bin_idx, start, stop)`` for every non-empty peri-stimulus bin.

    Bins are laid out on integer frame offsets, so bin ``k`` always covers
    offsets ``[-pre_frames + k * bin_frames, -pre_frames + (k + 1) * bin_frames)``
    relative to ``stim_frame``. Comparing float times against accumulated
    float edges (the previous approach) moved boundary frames into the
    neighbouring bin, e.g. the stimulus frame itself was counted in the last
    pre-stimulus bin.

    ``start``/``stop`` are absolute frame indices clipped to ``[0, n_frames)``;
    bins lying completely outside the recording are skipped, bins partly
    outside are returned truncated.
    """
    window_start = stim_frame - pre_frames
    window_end = stim_frame + post_frames
    n_bins = -(-(pre_frames + post_frames) // bin_frames)  # ceil division
    for bin_idx in range(n_bins):
        lo = window_start + bin_idx * bin_frames
        hi = min(lo + bin_frames, window_end)
        lo_clipped = max(lo, 0)
        hi_clipped = min(hi, n_frames)
        if lo_clipped < hi_clipped:
            yield bin_idx, lo_clipped, hi_clipped


# Loop over animals (MouseID from 1 to 8)
for mouse_id in range(1, 9):
    # Define file paths
    h5_file = os.path.join(data_directory, f'{mouse_id}.h5')
    stim_file = os.path.join(data_directory, f'stim_{mouse_id}.csv')

    # Check if files exist
    if not (os.path.isfile(h5_file) and os.path.isfile(stim_file)):
        print(f'Files for MouseID {mouse_id} not found.')
        continue

    print(f'Processing data for MouseID {mouse_id}...')

    # Load the H5 file
    try:
        data = pd.read_hdf(h5_file)
    except Exception as e:
        print(f'Error loading {h5_file}: {e}')
        continue

    # Load the stimulus frames (first column of the CSV).
    # NOTE(review): read_csv treats the first line as a header - confirm the
    # stimulus files have one, otherwise the first stimulus is silently lost.
    try:
        stimulus_times = pd.read_csv(stim_file)
        # Frame numbers may be stored as floats or contain blanks; slicing
        # needs integers, so drop missing values and round to whole frames.
        stimulus_frames = np.rint(
            stimulus_times.iloc[:, 0].dropna().to_numpy(dtype=float)
        ).astype(int)
    except Exception as e:
        print(f'Error loading {stim_file}: {e}')
        continue

    # Extract coordinates for the 'Back' and 'Tailbase' points
    # Adjust the keys according to your data structure
    try:
        scorer = data.columns.levels[0][0]
        back_x = data[scorer]['Back']['x'].values
        back_y = data[scorer]['Back']['y'].values
        tailbase_x = data[scorer]['Tailbase']['x'].values
        tailbase_y = data[scorer]['Tailbase']['y'].values
    except (AttributeError, IndexError, KeyError) as e:
        # AttributeError/IndexError: columns are not the expected MultiIndex.
        print(f'Error accessing body part coordinates for MouseID {mouse_id}: {e}')
        continue

    # Smooth the positional data
    back_x_smooth = moving_average(back_x, window_size)
    back_y_smooth = moving_average(back_y, window_size)
    tailbase_x_smooth = moving_average(tailbase_x, window_size)
    tailbase_y_smooth = moving_average(tailbase_y, window_size)

    # Per-step boolean flags for every movement category
    movement_flags = classify_movement(
        back_x_smooth, back_y_smooth, tailbase_x_smooth, tailbase_y_smooth,
        speed_min=speed_threshold,
        forward_max_deg=forward_threshold,
        backward_min_deg=backward_threshold,
    )
    n_steps = len(movement_flags['Backward'])

    # Remember where this mouse's rows start so they can be counted cheaply
    rows_before = len(all_data)

    # For each stimulus, calculate movement proportions around it
    for trial_id, stim_frame in enumerate(stimulus_frames):
        for bin_idx, lo, hi in iter_peri_stimulus_bins(
                int(stim_frame), n_steps,
                pre_stim_frames, post_stim_frames, bin_size_frames):
            bin_end_time = float(time_points[bin_idx])
            for movement_type in MOVEMENT_TYPES:
                all_data.append({
                    'MouseID': mouse_id,
                    'TrialID': trial_id,
                    'Time': bin_end_time,  # end of the bin
                    'Proportion': float(movement_flags[movement_type][lo:hi].mean()),
                    'MovementType': movement_type
                })

    print(f'Finished processing MouseID {mouse_id}.')
    # Check if data was collected for this mouse
    n_rows = len(all_data) - rows_before
    if n_rows == 0:
        print(f'No data collected for MouseID {mouse_id}.')
    else:
        print(f'Data collected for MouseID {mouse_id}, number of observations: {n_rows}')

print('Data processing complete.')

# Create a DataFrame from the list of dictionaries
df_all = pd.DataFrame(all_data)

# Without any rows the frame has no columns and every step below would fail
# with an opaque KeyError; stop here with an actionable message instead.
if df_all.empty:
    raise ValueError(
        f'No peri-stimulus data were collected from {data_directory!r}; '
        'check that the tracking and stimulus files exist and are readable.'
    )

# Check which MouseIDs are included
print('MouseIDs included in df_all:', df_all['MouseID'].unique())

# Data preprocessing
df_all['Time'] = df_all['Time'].astype(float)
df_all['MouseID'] = df_all['MouseID'].astype('category')
df_all['TrialID'] = df_all['TrialID'].astype('category')
df_all['MovementType'] = df_all['MovementType'].astype('category')

# Drop rows with NaN values in 'Proportion' (copy so the column assignments
# below act on an independent frame rather than a view of the original)
df_all = df_all.dropna(subset=['Proportion']).copy()

# Create Time_bin with individual time bins (for plotting)
df_all['Time_bin'] = df_all['Time'].apply(lambda x: f'{x:.1f}')

# Create Time_bin_LMM for the LMM analysis (combine pre-stimulus bins).
# 'Time' is the END of each bin, so the bin that ends at stimulus onset
# (Time == 0) lies entirely before the stimulus and belongs to 'PreStimulus'.
# The tolerance absorbs float round-off in bin end times computed by
# repeated addition (e.g. 5.6e-17 instead of exactly 0).
pre_stim_tolerance = 1e-9
df_all['Time_bin_LMM'] = df_all['Time'].apply(
    lambda x: 'PreStimulus' if x <= pre_stim_tolerance else f'Post_{x:.1f}'
)

# For the LMM, define categories with 'PreStimulus' as the reference
unique_time_bins_LMM = df_all['Time_bin_LMM'].unique()
time_bins_without_prestim = [tb for tb in unique_time_bins_LMM if tb != 'PreStimulus']
categories_LMM = ['PreStimulus'] + sorted(
    time_bins_without_prestim, key=lambda tb: float(tb.replace('Post_', ''))
)

df_all['Time_bin_LMM'] = pd.Categorical(df_all['Time_bin_LMM'], categories=categories_LMM, ordered=True)

# Numeric time axis for plotting (same values as 'Time')
df_all['Time_numeric'] = df_all['Time']

# Fit the LMM using Time_bin_LMM and MovementType with interaction.
# A random intercept is fitted per mouse only.
# NOTE(review): trials within a mouse, and the four movement types within a
# bin, are treated as independent observations - confirm this is intended.
print('Fitting Linear Mixed-Effects Model...')
model = smf.mixedlm("Proportion ~ Time_bin_LMM * MovementType", df_all, groups=df_all["MouseID"])
result = model.fit()

print(result.summary())

# Add predicted values to df_all
df_all['Predicted'] = result.predict()

# Aggregate data for plotting (using individual time bins).
# observed=True keeps only Time x MovementType combinations present in the
# data and avoids the pandas warning for categorical group keys.
agg_data = df_all.groupby(['Time', 'MovementType'], observed=True).agg(
    Mean_Proportion=('Proportion', 'mean'),
    SEM_Proportion=('Proportion', 'sem'),
    Mean_Predicted=('Predicted', 'mean')
).reset_index()

# Sort the DataFrame by Time for proper plotting
agg_data = agg_data.sort_values('Time')

# Save the aggregated data to a CSV file
output_csv = os.path.join(data_directory, 'aggregated_data_movement_types.csv')
agg_data.to_csv(output_csv, index=False)
print(f'Aggregated data saved to {output_csv}')

# Plot Mean Proportion with SEM Shaded Area for each MovementType
plt.figure(figsize=(12, 8))
movement_types = df_all['MovementType'].unique()
colors = {'Backward': 'blue', 'Forward': 'green', 'Leftward': 'orange', 'Rightward': 'purple'}

for movement_type in movement_types:
    data_subset = agg_data[agg_data['MovementType'] == movement_type]
    mean_curve = data_subset['Mean_Proportion']
    sem_band = data_subset['SEM_Proportion']
    plt.plot(data_subset['Time'], mean_curve, label=f'{movement_type}', color=colors[movement_type])
    plt.fill_between(data_subset['Time'],
                     mean_curve - sem_band,
                     mean_curve + sem_band,
                     color=colors[movement_type], alpha=0.2)
plt.axvline(0, color='k', linestyle='--', label='Stimulus Onset')
plt.xlabel('Time from Stimulus (s)')
plt.ylabel('Proportion of Movement')
plt.title('Mean Proportion of Movement Types Over Time with SEM')
plt.legend()
plt.tight_layout()
plt.show()
