In [47]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import pickle
from scipy.stats import kstest
from collections import Counter
from scipy.stats import sem 
import pingouin as pg
import statsmodels.formula.api as smf
from statsmodels.stats.multitest import multipletests
import seaborn as sns
from sklearn.preprocessing import StandardScaler

In [None]:
'''
First, add 'med' to the cell dict for each cell. 
For each cell we need to extract the 'med' value, which is the centre of the cell ROI (y,x).
For each cell pair, use the y,x values to compute a distance using d = √((x2 - x1)² + (y2 - y1)²) 
'''

In [2]:
# INPUTS:

# Folder that holds the compiled per-condition pickle files
# (saline_pre_dict.pkl, saline_post_dict.pkl, psilo_pre_dict.pkl, psilo_post_dict.pkl).
# NOTE(review): absolute, machine-specific path — must be edited before running elsewhere.
filepath = "F:/Two-Photon/Psilocybin Project/Evoked Cohort Mice/compiled_dicts"
# NOTE(review): presumably a z-score threshold; it is not used in the cells visible here —
# confirm where it is consumed or remove it.
z_thresh = 4

In [3]:
# Map each pickle filename to the notebook variable that should hold its contents
file_variable_mapping = {
    'saline_pre_dict.pkl': 'saline_pre',
    'saline_post_dict.pkl': 'saline_post',
    'psilo_pre_dict.pkl': 'psilo_pre',
    'psilo_post_dict.pkl': 'psilo_post'
}

# Every dataset starts as an empty dict so that a missing file still leaves a defined value
loaded_dicts = {variable_name: {} for variable_name in file_variable_mapping.values()}

# List the folder once; membership is checked against the exact names on disk
available_files = set(os.listdir(filepath))

# NOTE(security): pickle.load can execute arbitrary code — only load files from a trusted source.
for filename, variable_name in file_variable_mapping.items():
    if filename not in available_files:
        # Make the gap visible now instead of failing later on an unexpectedly empty dict
        print(f"Warning: '{filename}' not found in {filepath}; '{variable_name}' is left as an empty dict")
        continue

    file_path = os.path.join(filepath, filename)
    with open(file_path, 'rb') as file:
        loaded_dicts[variable_name] = pickle.load(file)

# Explicit assignments (instead of writing through globals()) keep the variable names
# visible to readers and to static tooling.
saline_pre = loaded_dicts['saline_pre']
saline_post = loaded_dicts['saline_post']
psilo_pre = loaded_dicts['psilo_pre']
psilo_post = loaded_dicts['psilo_post']

In [59]:
def prepare_cell_for_df(traces, freqs, ints, trials, n_baseline_frames=5):
    """Turn one cell's per-trial traces into a z-scored vector of baseline-subtracted responses.

    Parameters
    ----------
    traces : nested mapping
        Indexed as ``traces[freq][intensity][trial]``; each entry is one trial's
        trace. Assumes every trial has the same number of frames so the
        entries stack into a 4-D array — TODO confirm against the data.
    freqs, ints, trials : sequence
        Keys to read at each nesting level, in the order they should appear
        in the output.
    n_baseline_frames : int, default 5
        Number of leading frames treated as the baseline window. All
        remaining frames form the response window.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(freqs) * len(ints) * len(trials)`` ordered
        frequency-major, then intensity, then trial. Values are
        (response mean - baseline mean), standardised with ``StandardScaler``
        across all of this cell's trials.

    Raises
    ------
    ValueError
        If the stacked traces are not 4-D, or if ``n_baseline_frames`` does
        not leave at least one baseline frame and one response frame.
    """
    # Stack into shape (freq, intensity, trial, frame)
    cell_data = np.array([[[traces[i][j][k] for k in trials] for j in ints] for i in freqs])

    if cell_data.ndim != 4:
        raise ValueError(
            f"expected traces to stack into a 4-D (freq, intensity, trial, frame) array, "
            f"got shape {cell_data.shape}"
        )

    n_frames = cell_data.shape[-1]
    if not 0 < n_baseline_frames < n_frames:
        # An empty baseline or response window would silently average to NaN
        raise ValueError(
            f"n_baseline_frames must be between 1 and {n_frames - 1} for traces with "
            f"{n_frames} frames, got {n_baseline_frames}"
        )

    # Baseline-subtract each trial: mean of the response frames minus mean of the baseline frames
    baseline = cell_data[..., :n_baseline_frames].mean(axis=-1)
    response = cell_data[..., n_baseline_frames:].mean(axis=-1)
    cell_data_baselined = response - baseline

    # One column holding every freq x intensity x trial value, so the scaler
    # standardises across all of the cell's trials at once
    cell_data_flatten = cell_data_baselined.reshape(-1, 1)

    scaler = StandardScaler()
    scaled_trials = scaler.fit_transform(cell_data_flatten).flatten()

    return scaled_trials

In [66]:
def dict_to_pd(dict,condition):
    """Build a one-row-per-active-cell DataFrame from a compiled recording dictionary.

    Parameters
    ----------
    dict : mapping
        ``{mouse_id: {cell_id: cell_data}}``. For each cell the keys
        ``'active'``, ``'coords'`` and ``'deconvolved_traces'`` are read; the
        first cell's ``'traces'`` entry supplies the frequency / intensity /
        trial keys. (The parameter name shadows the builtin ``dict``; it is
        kept so existing callers keep working.)
    condition : str
        Label written to the ``'condition'`` column and embedded in
        ``'unique_cell_id'``.

    Returns
    -------
    pandas.DataFrame
        Identifier columns (``original_mouse_id``, ``mouse_id``, ``cell``,
        ``condition``, ``unique_cell_id``, ``coords``) followed by
        ``Trial_1 .. Trial_N`` holding the values returned by
        ``prepare_cell_for_df``. If the input contains no cells at all, an
        empty frame with only the identifier columns is returned.
    """
    id_columns = ['original_mouse_id', 'mouse_id', 'cell', 'condition', 'unique_cell_id', 'coords']

    # Use the first cell found in any mouse as the template for the key layout.
    # A default of None avoids StopIteration on an empty dict or an empty first mouse.
    template_cell = next(
        (cell for mouse_cells in dict.values() for cell in mouse_cells.values()),
        None,
    )
    if template_cell is None:
        return pd.DataFrame(columns=id_columns)

    # NOTE(review): the key layout is read from 'traces' but the values come from
    # 'deconvolved_traces' — this assumes both share the same keys; confirm.
    traces = template_cell['traces']
    freqs = list(traces.keys())
    ints = list(traces[freqs[0]].keys())
    trials  = list(traces[freqs[0]][ints[0]].keys())

    print(freqs, ints, trials, sep='\n')

    # One column per freq x intensity x trial combination
    trial_columns = [f'Trial_{i+1}' for i in range(len(freqs) * len(ints) * len(trials))]

    rows = []
    for mouse_id, mouse_data in dict.items():

        # The last three characters of the key are used as the mouse identifier
        base_mouse_id = mouse_id[-3:]

        for cell_id, cell_data in mouse_data.items():

            # '== True' is kept deliberately: it also matches numpy booleans
            if not (cell_data['active'] == True):
                continue

            row = {
                'original_mouse_id': mouse_id,
                'mouse_id': base_mouse_id,
                'cell': cell_id,
                'condition': condition,
                'unique_cell_id': f"{mouse_id}_{condition}_{cell_id}",
                'coords': cell_data['coords']
            }

            scaled_trials = prepare_cell_for_df(cell_data['deconvolved_traces'],freqs,ints,trials)
            row.update(zip(trial_columns, scaled_trials))

            rows.append(row)

    # Passing the columns explicitly keeps the schema stable even when no cell is active
    df = pd.DataFrame(rows, columns=id_columns + trial_columns)

    return df

In [67]:
# Convert the saline_pre dictionary into a per-cell DataFrame labelled 'Pre'.
# dict_to_pd also prints the frequency, intensity and trial keys it found.
saline_pre_data = dict_to_pd(saline_pre,'Pre')

[4364, 5371, 6612, 8140, 10020, 12335, 15184, 18691, 23009, 28324, 34867, 42922]
[35, 50, 65, 80]
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [6]:
# Display the resulting per-cell table
saline_pre_data

Unnamed: 0,original_mouse_id,mouse_id,cell,condition,unique_cell_id,coords
0,saline_1_186,186,4,Pre,saline_1_186_Pre_4,"[187, 207]"
1,saline_1_186,186,6,Pre,saline_1_186_Pre_6,"[263, 75]"
2,saline_1_186,186,12,Pre,saline_1_186_Pre_12,"[187, 483]"
3,saline_1_186,186,14,Pre,saline_1_186_Pre_14,"[183, 279]"
4,saline_1_186,186,15,Pre,saline_1_186_Pre_15,"[183, 311]"
...,...,...,...,...,...,...
1303,saline_1_474,474,1148,Pre,saline_1_474_Pre_1148,"[121, 289]"
1304,saline_1_474,474,1199,Pre,saline_1_474_Pre_1199,"[199, 405]"
1305,saline_1_474,474,1217,Pre,saline_1_474_Pre_1217,"[123, 365]"
1306,saline_1_474,474,1497,Pre,saline_1_474_Pre_1497,"[256, 498]"



`np.linalg.norm` calculates the Euclidean distance between two cells from their spatial coordinates. Here's a breakdown:

What it's doing:
Python

'distance': np.linalg.norm(
    mouse_cells.iloc[i]['coords'] - mouse_cells.iloc[j]['coords']
)
This computes √[(x₂-x₁)² + (y₂-y₁)²] for the 2-D (y, x) ROI centres used here; with 3-D coordinates a (z₂-z₁)² term would be added.

Example:
Python

# If cell1 has coords [10, 20] and cell2 has coords [13, 24]
coords1 = np.array([10, 20])
coords2 = np.array([13, 24])

distance = np.linalg.norm(coords2 - coords1)
# = np.sqrt((13-10)² + (24-20)²)
# = np.sqrt(9 + 16)
# = np.sqrt(25)
# = 5.0
Why this is useful for correlation analysis:
Test spatial dependencies: Are nearby cells more correlated?
Python

# After getting all correlations with distances
corr_df = get_all_pairwise_correlations(df)

# Plot correlation vs distance
import matplotlib.pyplot as plt
plt.scatter(corr_df['distance'], corr_df['correlation'], alpha=0.5)
plt.xlabel('Distance between cells')
plt.ylabel('Correlation')
plt.title('Spatial dependency of neural correlations')
Identify functional clusters: High correlation + close distance = potential functional unit
Python

# Find highly correlated nearby cells
functional_pairs = corr_df[
    (corr_df['correlation'] > 0.5) & 
    (corr_df['distance'] < 50)  # within 50 pixels/microns
]
Control for distance in analyses:
Python

# Compare correlations at different distance ranges
distance_bins = pd.cut(corr_df['distance'], bins=5)
corr_by_distance = corr_df.groupby(distance_bins)['correlation'].agg(['mean', 'std', 'count'])
If your coords are stored differently:
Python

# If coords are stored as a plain list, e.g. [y, x] (lists do not support subtraction, so convert first)
'distance': np.linalg.norm(
    np.array(mouse_cells.iloc[i]['coords']) - np.array(mouse_cells.iloc[j]['coords'])
)

# If coords are separate columns
'distance': np.sqrt(
    (mouse_cells.iloc[i]['x'] - mouse_cells.iloc[j]['x'])**2 +
    (mouse_cells.iloc[i]['y'] - mouse_cells.iloc[j]['y'])**2
)
This distance metric is particularly valuable in neuroscience because:

Local circuits: Nearby neurons often form functional connections
Spatial gradients: Some brain areas show gradual changes in response properties
Technical artifacts: Very close cells might show artificial correlations due to imaging artifacts
Next step: analyze the correlation–distance relationship in this dataset.

def get_all_pairwise_correlations(df):
    """Compute the trial-wise correlation and spatial distance for every within-mouse cell pair.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per cell. Must contain ``'mouse_id'`` and ``'unique_cell_id'``
        plus the response columns whose names start with ``'Trial_'``. An
        optional ``'coords'`` column holds each cell's position as a list,
        tuple or array of numbers.

    Returns
    -------
    pandas.DataFrame
        One row per unordered cell pair (``i < j`` in row order) with columns
        ``mouse_id``, ``cell1``, ``cell2``, ``correlation`` (Pearson, from
        ``DataFrame.corr``) and ``distance`` (Euclidean distance between the
        two ``coords``; ``None`` when ``df`` has no ``'coords'`` column).
        Mice with fewer than two cells contribute no rows. The columns are
        present even when no pairs exist.
    """
    columns = ['mouse_id', 'cell1', 'cell2', 'correlation', 'distance']

    # Both are the same for every mouse, so compute them once
    trial_cols = [col for col in df.columns if col.startswith('Trial_')]
    has_coords = 'coords' in df.columns

    all_correlations = []

    for mouse_id in df['mouse_id'].unique():
        mouse_cells = df[df['mouse_id'] == mouse_id]
        n_cells = len(mouse_cells)

        if n_cells < 2:
            continue

        # Transpose so each cell is a column; corr() then gives a cell x cell matrix
        corr_matrix = mouse_cells[trial_cols].T.corr().to_numpy()

        # Pull the per-cell values out once instead of calling .iloc inside the pair loop
        cell_ids = mouse_cells['unique_cell_id'].tolist()
        if has_coords:
            # Coerce to float arrays: coords stored as plain lists cannot be subtracted,
            # and unsigned integer arrays would wrap around on subtraction
            coords = [np.asarray(c, dtype=float) for c in mouse_cells['coords']]

        # Get all pairs
        for i in range(n_cells):
            for j in range(i + 1, n_cells):
                all_correlations.append({
                    'mouse_id': mouse_id,
                    'cell1': cell_ids[i],
                    'cell2': cell_ids[j],
                    'correlation': corr_matrix[i, j],
                    'distance': float(np.linalg.norm(coords[i] - coords[j])) if has_coords else None
                })

    return pd.DataFrame(all_correlations, columns=columns)