In [None]:
# Native libraries
import os
import math
import glob
# Essential Libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
# Preprocessing
from sklearn.preprocessing import MinMaxScaler,MaxAbsScaler
# Algorithms
from minisom import MiniSom

Load the dataframe

In [None]:
# Location of the input CSV. The path is an empty placeholder here and must be
# filled in before the notebook can run.
input_csv_path = r''
all_data_df = pd.read_csv(input_csv_path)

Calculate time from frame number (20 per frame, starting at 0) and keep only the first 15 minutes (time 0 to 900)

In [None]:
# Frame 1 maps to time 0 and consecutive frames are 20 time units apart
# (presumably seconds, which makes max_time = 900 equal to 15 minutes -- confirm).
frame_interval = 20
all_data_df['time'] = all_data_df['frame'].sub(1).mul(frame_interval)
print(all_data_df['time'].unique())

# Inclusive time window to keep: min_time <= time <= max_time
min_time = 0
max_time = 900
in_time_window = all_data_df['time'].between(min_time, max_time)
all_data_df = all_data_df[in_time_window]
print(all_data_df['time'].unique())

print(f"all_data_df rows: {len(all_data_df)}")

Remove rows where the cell is farther than 200 um (dmap_um_laser > 200)

In [None]:
# Keep only the rows whose dmap_um_laser value is at most 200.
# This drops individual rows; a cell that loses some of its rows here no longer
# has a row for every frame.
max_distance_um = 200
within_range = all_data_df['dmap_um_laser'].le(max_distance_um)
all_data_df = all_data_df.loc[within_range]
print(f"all_data_df rows after removing cells farther than 200 um: {len(all_data_df)}")

Drop all cells that don't appear in every frame

In [None]:
# Number of rows a cell needs in order to cover the whole [min_time, max_time]
# window: one row every 20 time units, both ends included.
expected_frames = (max_time - min_time) // 20 + 1

# Row count of each row's unique_cell_id, aligned to all_data_df's index
rows_per_cell = all_data_df['unique_cell_id'].map(all_data_df['unique_cell_id'].value_counts())

# Keep only the cells that have exactly expected_frames rows. The explicit
# copy makes the result an independent frame for the column assignments that follow.
all_data_df = all_data_df[rows_per_cell == expected_frames].copy()

print(f"all_data_df rows: {len(all_data_df)}")

Get normalized green/red deltaF/F0 values

In [None]:
# Per-row ratio of mean GFP intensity to mean RFP intensity.
# NOTE(review): a zero RFP value would yield inf here, which later isna() checks do not flag.
all_data_df['greenperred'] = all_data_df['intensity_mean_GFP'].div(all_data_df['intensity_mean_RFP'])

ratio_preview_cols = ['intensity_mean_GFP', 'intensity_mean_RFP', 'greenperred']
display(all_data_df[ratio_preview_cols].head())

In [None]:
# Last frame (inclusive) that counts as baseline
baseline_max_frame = 15

# Baseline F0 per cell: mean greenperred over the frames up to baseline_max_frame,
# reduced to one (unique_cell_id, greenperred_F0) row per cell.
baseline_df = (
    all_data_df.loc[all_data_df['frame'] <= baseline_max_frame]
    .assign(
        greenperred_F0=lambda rows: rows.groupby('unique_cell_id')['greenperred'].transform('mean')
    )
    [['unique_cell_id', 'greenperred_F0']]
    .drop_duplicates()
)

# Broadcast each cell's baseline onto all of its rows in the full table
f0_by_cell = baseline_df.set_index('unique_cell_id')['greenperred_F0']
all_data_df['greenperred_F0'] = all_data_df['unique_cell_id'].map(f0_by_cell)

# deltaF: change of greenperred relative to the baseline
all_data_df['greenperred_dF'] = all_data_df['greenperred'].sub(all_data_df['greenperred_F0'])

# deltaF/F0: that change expressed as a fraction of the baseline
all_data_df['gpr_dF/F0'] = all_data_df['greenperred_dF'].div(all_data_df['greenperred_F0'])

# Preview of the new columns
preview_cols = ['unique_cell_id', 'time', 'greenperred', 'greenperred_F0', 'gpr_dF/F0']
display(all_data_df[preview_cols].head())

Set is_iso and is_itga flag columns

In [None]:
# Boolean flag columns derived from substrings of unique_cell_id:
#   is_itga -> the id contains 'x1172' (presumably the identifier of the itga condition -- confirm)
#   is_iso  -> the id contains 'iso'
for flag_col, id_token in (('is_itga', 'x1172'), ('is_iso', 'iso')):
    all_data_df[flag_col] = all_data_df['unique_cell_id'].str.contains(id_token)

In [None]:
# One row per cell with its two flags, then the number of cells for every
# (is_iso, is_itga) combination that occurs.
cell_flags = all_data_df[["is_iso","is_itga","unique_cell_id"]].drop_duplicates()
cell_flags.groupby(["is_iso","is_itga"]).count()

Check if there are NaN values

In [None]:
# Rows whose gpr_dF/F0 is missing. Only NaN is counted here; +/-inf values
# (e.g. from a zero baseline) are not reported by isna().
missing_mask = all_data_df['gpr_dF/F0'].isna()
nan_gpr_dF_F0 = all_data_df.loc[missing_mask]
print(f"Number of NaN values in gpr_dF/F0: {len(nan_gpr_dF_F0)}")

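Filter cells: drop cells whose dF/F0 exceeds 10 at any timepoint, then keep only cells whose dF/F0 reaches at least 0.5 at some point after frame 15 (after wounding)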

In [None]:
# Cells whose gpr_dF/F0 is above 10 in at least one row (any timepoint)
exceeds_max = all_data_df['gpr_dF/F0'].gt(10)
cells_to_remove = all_data_df.loc[exceeds_max, 'unique_cell_id'].unique()

# Drop every row that belongs to one of those cells and renumber the index
is_removed = all_data_df['unique_cell_id'].isin(cells_to_remove)
all_data_max_value_filtered_df = all_data_df.loc[~is_removed].reset_index(drop=True)

In [None]:
# Rows recorded after frame 15 (frame > 15)
is_after_15 = all_data_max_value_filtered_df['frame'].gt(15)
after_15_df = all_data_max_value_filtered_df.loc[is_after_15]

# Cells whose gpr_dF/F0 is at least 0.5 in any of those later rows
reaches_threshold = after_15_df['gpr_dF/F0'].ge(0.5)
cells_with_peak = after_15_df.loc[reaches_threshold, 'unique_cell_id'].unique()

# Keep all rows (every frame, not just the later ones) of those cells
keep_rows = all_data_max_value_filtered_df['unique_cell_id'].isin(cells_with_peak)
max_and_min_filtered_df = all_data_max_value_filtered_df.loc[keep_rows].reset_index(drop=True)

CLUSTERING DATA INPUT

In [None]:
# Independent copy of the filtered table, used as the input of the clustering steps
data_for_clustering = max_and_min_filtered_df.copy()

In [None]:
# Build one time-indexed series of gpr_dF/F0 values per cell.
#   mySeriesData -> list of single-column ("value") DataFrames indexed by time
#   mySeriesName -> unique_cell_id of each series, in the same order
#   null_cell    -> ids of cells skipped because at least one value is missing
needed_cols=["time","gpr_dF/F0"]
mySeriesData=[]
mySeriesName=[]
null_cell = []

print(len(data_for_clustering["unique_cell_id"].unique()))

# A single groupby pass replaces re-masking the whole table once per cell;
# sort=False keeps the cells in order of first appearance, as .unique() did.
for u_cell, cell_rows in data_for_clustering.groupby("unique_cell_id", sort=False):
    # The series are later compared position by position, so every series must
    # be in time order; sort explicitly instead of relying on the row order.
    data_curr = cell_rows[needed_cols].sort_values("time", kind="stable")
    data_curr.columns =["time","value"]
    if data_curr["value"].isnull().any():
        null_cell.append(u_cell)
        continue

    mySeriesData.append(data_curr.set_index("time"))
    mySeriesName.append(u_cell)

In [None]:
# Length of every collected series, and the distinct lengths among them
series_lengths = [len(series) for series in mySeriesData]
unique_lengths = set(series_lengths)
print(f"Unique time series lengths in mySeriesData: {unique_lengths}")

# Number of cells for each distinct series length
for length in unique_lengths:
    print(f"Time series length: {length} - Number of cells: {series_lengths.count(length)}")

Normalizing data before clustering

In [None]:
# Min-max scale every cell's series on its own and store it as a 1-D vector.
# mySeries keeps the order of mySeriesData / mySeriesName.
mySeries=[]
for series_df in mySeriesData:
    # A fresh scaler per cell, so each series is rescaled by its own min and max
    scaler = MinMaxScaler()
    curr = scaler.fit_transform(series_df)
    # fit_transform returns a single-column 2-D array; flatten it to 1-D
    mySeries.append(curr.ravel())

ACTUAL MINISOM CLUSTERING

In [None]:
# SOM grid dimensions: som_x * som_y nodes, i.e. that many possible clusters
som_x = 2
som_y = 2

# Length of the input vectors, taken from the first series
input_len = len(mySeries[0])
som = MiniSom(
    som_x,
    som_y,
    input_len,
    sigma=0.3,
    learning_rate=0.5,
    random_seed=42,
)

# Initialise the node weights from the data, then train
som.random_weights_init(mySeries)
# NOTE(review): with use_epochs=True MiniSom is documented to treat 50000 as a
# number of epochs (full passes over the data), not single updates -- confirm
# this is intended, as it can make training very long.
som.train(mySeries, 50000, use_epochs=True)

MAPPING CLUSTER NUMBERS TO CELLS

In [None]:
# Assign each cell to a cluster
cluster_assignments = []
for i, series in enumerate(mySeries):
    winner = som.winner(series)  # (x, y) coordinates of the winning node
    cluster_assignments.append({'unique_cell_id': mySeriesName[i], 'cluster_x': winner[0], 'cluster_y': winner[1]})

# Convert to DataFrame
cluster_df = pd.DataFrame(cluster_assignments)

# Save to CSV
cluster_df.to_csv(r'', index=False)

In [None]:
# Label every cell "Cluster k", where k numbers the SOM nodes from 1 in
# row-major order: k = x * som_y + y + 1 for the winning node (x, y).
cluster_map = []
for cell_id, vector in zip(mySeriesName, mySeries):
    node_x, node_y = som.winner(vector)
    cluster_number = node_x * som_y + node_y + 1
    cluster_map.append((cell_id, f"Cluster {cluster_number}"))

# The sort is on the label text, so it is alphabetical rather than numeric
name_cluster = pd.DataFrame(cluster_map, columns=["Series", "Cluster"]).sort_values(by="Cluster")

In [None]:
# Attach the cluster label of each cell to all of its rows. The left join keeps
# every row of data_for_clustering; rows of cells missing from name_cluster get
# NaN in the "Series" and "Cluster" columns.
sixteen_cluster_dataframe = pd.merge(
    data_for_clustering,
    name_cluster,
    how="left",
    left_on="unique_cell_id",
    right_on="Series",
)

display(sixteen_cluster_dataframe.head())

In [None]:
# Write the per-row table with cluster labels to CSV, without the index column
# (the output path is an empty placeholder here and must be filled in).
sixteen_cluster_dataframe.to_csv(r'', index=False)