**Data Import and Structuring**

In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
base_path = '/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/QualityCheck/HomoCheck/CDKL5-E6D_T1_C1_05152024'
active_file_name = 'Compiled_ActivityScan.csv'
network_file_name = 'Compiled_Networks.csv'

In [37]:
active = pd.read_csv(f'{base_path}/{active_file_name}')
network = pd.read_csv(f'{base_path}/{network_file_name}')
# drop na
active = active.dropna()
network = network.dropna()

In [38]:
# Dropping the 'Run_ID' and 'Time' columns from both dataframes
activity_scan_clean = active.drop(['Run_ID', 'Time'], axis=1)
networks_clean = network.drop(['Run_ID', 'Time'], axis=1)

# Merge the two dataframes on specified keys
merged_df = pd.merge(activity_scan_clean, networks_clean, on=['DIV', 'Chip_ID', 'Well', 'NeuronType'], how='outer')

In [41]:
merged_df.head()

Unnamed: 0,DIV,Well,NeuronType,Chip_ID,Mean_FiringRate,Mean_SpikeAmplitude,Active_area,mean_IBI,cov_IBI,mean_Burst_Peak,cov_Burst_Peak,Number_Bursts,mean_Spike_per_Burst,cov_Spike_per_Burst,mean_Burst_Peak_Abs,cov_Burst_Peak_Abs,mean_BurstDuration,cov_BurstDuration,MeanNetworkISI,CoVNetworkISI,MeanWithinBurstISI,CoVWithinBurstISI,MeanOutsideBurstISI,CoVOutsideBurstISI,Fanofactor
0,5,1,MxHEMI,M07039,0.601701,72.853541,25.606061,11.74,52.305887,3.662474,113.861681,26.0,1694.0,129.106928,974.782407,162.874499,0.415385,0.0,1.267112,313.628048,0.073282,103.861536,0.357242,68.420284,178.884363
1,5,2,WT,M07039,1.005935,49.774901,15.848485,,,,,,,,,,,,,,,,,,
2,5,3,FxHET,M07039,0.668804,68.087166,14.712121,42.716667,77.475001,3.620016,80.39146,7.0,1498.571429,100.010203,652.269303,99.893396,0.314286,0.0,0.976505,287.602414,0.04819,92.642437,0.345847,69.80335,41.171271
3,5,4,MxHEMI,M07039,0.465627,74.338109,21.621212,17.62,39.863558,4.348048,98.804258,16.0,2083.8125,100.727789,624.089479,224.685341,0.5125,0.0,1.618887,295.330918,0.066445,107.517068,0.378431,66.455322,208.423889
4,5,5,WT,M07039,0.859133,48.574638,15.787879,,,,,,,,,,,,,,,,,,


In [39]:
unmatched_rows = merged_df[merged_df.isnull().any(axis=1)]
unmatched_rows

Unnamed: 0,DIV,Well,NeuronType,Chip_ID,Mean_FiringRate,Mean_SpikeAmplitude,Active_area,mean_IBI,cov_IBI,mean_Burst_Peak,cov_Burst_Peak,Number_Bursts,mean_Spike_per_Burst,cov_Spike_per_Burst,mean_Burst_Peak_Abs,cov_Burst_Peak_Abs,mean_BurstDuration,cov_BurstDuration,MeanNetworkISI,CoVNetworkISI,MeanWithinBurstISI,CoVWithinBurstISI,MeanOutsideBurstISI,CoVOutsideBurstISI,Fanofactor
1,5,2,WT,M07039,1.005935,49.774901,15.848485,,,,,,,,,,,,,,,,,,
4,5,5,WT,M07039,0.859133,48.574638,15.787879,,,,,,,,,,,,,,,,,,
7,5,2,WT,M07420,0.866217,62.207961,18.530303,,,,,,,,,,,,,,,,,,
17,5,6,WT,M07427,0.879758,61.943323,18.893939,,,,,,,,,,,,,,,,,,


In [67]:
import os
import pandas as pd

def merge_data_in_folder(base_path):
    directories = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]

    for directory in directories:
        folder_path = os.path.join(base_path, directory)
        # Specify the file names
        activity_file = os.path.join(folder_path, 'Compiled_ActivityScan.csv')
        network_file = os.path.join(folder_path, 'Compiled_Networks.csv')

        # Initialize an empty list to collect DataFrames
        data_frames = []

        # Read the specified activity scan file if it exists
        if os.path.exists(activity_file):
            df_activity = pd.read_csv(activity_file)
            df_activity = df_activity.drop(['Run_ID', 'Time'], axis=1, errors='ignore')
            data_frames.append(df_activity)

        # Read the specified network file if it exists
        if os.path.exists(network_file):
            df_network = pd.read_csv(network_file)
            df_network = df_network.drop(['Run_ID', 'Time'], axis=1, errors='ignore')
            data_frames.append(df_network)

        # Merge all data frames
        if data_frames:
            combined_df = data_frames[0]
            for df in data_frames[1:]:
                combined_df = pd.merge(combined_df, df, on=['DIV', 'Chip_ID', 'Well', 'NeuronType'], how='outer')

            # Save the combined data to a CSV file
            combined_df.to_csv(os.path.join(folder_path, 'combined_data.csv'), index=False)

            # Identify unmatched rows and select specific columns
            unmatched_rows = combined_df[combined_df.isnull().any(axis=1)]
            unmatched_rows = unmatched_rows[['DIV', 'Well', 'NeuronType', 'Chip_ID']]

            # Print unmatched rows with specific columns
            if not unmatched_rows.empty:
                print(f"Unmatched rows in {directory}:")
                print(unmatched_rows)

# Specify the base path
base_path = '/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/QualityCheck/HomoCheck'
merge_data_in_folder(base_path)

Unmatched rows in CDKL5-E6D_T1_C1_05152024:
    DIV  Well NeuronType Chip_ID
1     5     2         WT  M07039
4     5     5         WT  M07039
7     5     2         WT  M07420
17    5     6         WT  M07427
30    5     2         WT  M08018
31    5     2         WT  M08018
35    5     2         WT  M08018
36    5     2         WT  M08018
40    5     3      FxHET  M08018
45    5     3      FxHET  M08018
50    5     4     MxHEMI  M08018
55    5     4     MxHEMI  M08018
58    5     5         WT  M08018
59    5     5         WT  M08018
60    5     5         WT  M08018
61    5     5         WT  M08018
63    5     5         WT  M08018
64    5     5         WT  M08018
65    5     5         WT  M08018
66    5     5         WT  M08018
70    5     6      FxHET  M08018
75    5     6      FxHET  M08018
Unmatched rows in SYNGAP1_T1_C1_03212024:
     DIV  Well NeuronType Chip_ID
0      4     1         WT  M06691
1      4     2         WT  M06691
2      4     3         WT  M06691
3      4     4     

In [52]:
import os
import pandas as pd

def combine_all_data(base_path):
    # Initialize an empty DataFrame to store all combined data
    all_data = pd.DataFrame()

    # List all directories in the specified path
    directories = [d for d in os.listdir(base_path) if os.path.isdir(os.path.join(base_path, d))]

    for directory in directories:
        folder_path = os.path.join(base_path, directory)
        combined_data_path = os.path.join(folder_path, 'combined_data.csv')
        
        # Check if combined_data.csv exists in the folder
        if os.path.exists(combined_data_path):
            df = pd.read_csv(combined_data_path)
            # Add a new column 'Trial' with the folder name
            # Ensure 'Trial' is inserted right after 'Chip_ID'
            if 'Chip_ID' in df.columns:
                # Find index of 'Chip_ID' column
                loc = df.columns.get_loc('Chip_ID') + 1
                # Insert 'Trial' column right after 'Chip_ID'
                df.insert(loc, 'Trial', directory)
            else:
                df['Trial'] = directory  # Fallback if 'Chip_ID' is not in the columns

            # Append the DataFrame to the all_data DataFrame
            all_data = pd.concat([all_data, df], ignore_index=True)
        else:
            print(f"No combined_data.csv found in {directory}")

    # Save the final combined DataFrame to a CSV file
    all_data.to_csv(os.path.join(base_path, 'all_combined_data.csv'), index=False)

    # Print the shape of the final DataFrame to confirm the number of rows and columns
    print(f"Final combined data shape: {all_data.shape}")
    return all_data

# Specify the base path
base_path = '/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/QualityCheck/HomoCheck'
data = combine_all_data(base_path)

Final combined data shape: (2550, 29)


In [62]:
data.head()

Unnamed: 0,DIV,Well,NeuronType,Chip_ID,Trial,Mean_FiringRate,Mean_SpikeAmplitude,Active_area,mean_IBI,cov_IBI,mean_Burst_Peak,cov_Burst_Peak,Number_Bursts,mean_Spike_per_Burst,cov_Spike_per_Burst,mean_Burst_Peak_Abs,cov_Burst_Peak_Abs,mean_BurstDuration,cov_BurstDuration,MeanNetworkISI,CoVNetworkISI,MeanWithinBurstISI,CoVWithinBurstISI,MeanOutsideBurstISI,CoVOutsideBurstISI,Fanofactor,Burst_Peak_Abs,BurstDuration,Burst_Peak_Normalized
0,5,1,MxHEMI,M07039,CDKL5-E6D_T1_C1_05152024,0.601701,72.853541,25.606061,11.74,52.305887,3.662474,113.861681,26.0,1694.0,129.106928,974.782407,162.874499,0.415385,0.0,1.267112,313.628048,0.073282,103.861536,0.357242,68.420284,178.884363,,,
1,5,2,WT,M07039,CDKL5-E6D_T1_C1_05152024,1.005935,49.774901,15.848485,44.7,0.0,4.912981,102.508287,2.0,,,1069.009668,42.786425,,,0.740302,289.047687,,,,,12.371173,,,
2,5,3,FxHET,M07039,CDKL5-E6D_T1_C1_05152024,0.668804,68.087166,14.712121,42.716667,77.475001,3.620016,80.39146,7.0,1498.571429,100.010203,652.269303,99.893396,0.314286,0.0,0.976505,287.602414,0.04819,92.642437,0.345847,69.80335,41.171271,,,
3,5,4,MxHEMI,M07039,CDKL5-E6D_T1_C1_05152024,0.465627,74.338109,21.621212,17.62,39.863558,4.348048,98.804258,16.0,2083.8125,100.727789,624.089479,224.685341,0.5125,0.0,1.618887,295.330918,0.066445,107.517068,0.378431,66.455322,208.423889,,,
4,5,5,WT,M07039,CDKL5-E6D_T1_C1_05152024,0.859133,48.574638,15.787879,,,8.383602,0.0,1.0,,,883.220707,50.78876,,,0.866212,285.151705,,,,,14.770573,,,


In [55]:
# select WT data
wt_data = data[data['NeuronType'] == 'WT']
wt_data.shape
# 

(1206, 29)

In [63]:
wt_data

Unnamed: 0,DIV,Well,NeuronType,Chip_ID,Trial,Mean_FiringRate,Mean_SpikeAmplitude,Active_area,mean_IBI,cov_IBI,mean_Burst_Peak,cov_Burst_Peak,Number_Bursts,mean_Spike_per_Burst,cov_Spike_per_Burst,mean_Burst_Peak_Abs,cov_Burst_Peak_Abs,mean_BurstDuration,cov_BurstDuration,MeanNetworkISI,CoVNetworkISI,MeanWithinBurstISI,CoVWithinBurstISI,MeanOutsideBurstISI,CoVOutsideBurstISI,Fanofactor,Burst_Peak_Abs,BurstDuration,Burst_Peak_Normalized
1,5,2,WT,M07039,CDKL5-E6D_T1_C1_05152024,1.005935,49.774901,15.848485,44.7,0.0,4.912981,102.508287,2.0,,,1069.009668,42.786425,,,0.740302,289.047687,,,,,12.371173,,,
4,5,5,WT,M07039,CDKL5-E6D_T1_C1_05152024,0.859133,48.574638,15.787879,,,8.383602,0.0,1.0,,,883.220707,50.78876,,,0.866212,285.151705,,,,,14.770573,,,
6,5,1,WT,M07420,CDKL5-E6D_T1_C1_05152024,0.966328,60.385641,23.863636,29.1,106.383339,3.360783,131.467204,8.0,1767.875,132.433099,1039.799936,80.635453,0.6375,0.0,0.977964,351.166363,0.110062,129.623709,0.332086,71.646849,41.721689,,,
7,5,2,WT,M07420,CDKL5-E6D_T1_C1_05152024,0.866217,62.207961,18.530303,180.0,0.0,11.425463,6.181592,2.0,,,880.343671,106.199782,,,1.050997,388.106047,,,,,55.491613,,,
8,5,3,WT,M07420,CDKL5-E6D_T1_C1_05152024,0.973242,59.716902,18.030303,66.9,75.009672,5.474604,100.44748,5.0,2905.4,90.015728,916.495123,99.684316,0.82,0.0,0.96318,344.97174,0.083943,159.761376,0.331156,73.224236,56.76198,,,
12,5,1,WT,M07427,CDKL5-E6D_T1_C1_05152024,0.798251,65.478284,22.666667,23.77,51.746867,3.70795,118.3597,11.0,1658.363636,122.9152,1013.023434,101.68397,0.454545,0.0,1.094039,319.772731,0.07377,134.018509,0.361726,70.26719,61.884164,,,
13,5,2,WT,M07427,CDKL5-E6D_T1_C1_05152024,0.793323,63.881626,19.287879,28.683333,55.078259,2.89551,114.612929,7.0,1038.428571,152.441449,1047.532318,58.135328,0.314286,0.0,0.892005,274.592801,0.081997,139.198382,0.308592,80.456337,21.986679,,,
14,5,3,WT,M07427,CDKL5-E6D_T1_C1_05152024,0.858721,62.359941,21.848485,18.313333,53.918088,3.276439,105.171744,16.0,2826.5,93.276179,1224.428752,79.152745,1.64375,0.0,0.890179,304.319608,0.20567,105.448572,0.341332,70.764376,47.153915,,,
17,5,6,WT,M07427,CDKL5-E6D_T1_C1_05152024,0.879758,61.943323,18.893939,137.1,0.0,6.039422,104.897694,2.0,,,1148.101992,50.794781,,,0.804323,260.237687,,,,,18.026715,,,
28,5,2,WT,M08018,CDKL5-E6D_T1_C1_05152024,0.790435,60.064061,15.166667,88.8,104.43853,4.768413,84.635785,4.0,2058.5,99.997578,882.137564,72.028087,0.325,0.0,0.840752,270.209433,0.043369,93.062177,0.342234,73.736351,28.836673,,,


In [58]:
# check div 21
# Assuming 'Trial' is a column, find unique trials
trials = data['Trial'].unique()

# Define the DIV range
div_range = range(19, 26)  # This will include 20, 21, 22, 23, 24, 25

# Prepare to check for DIV presence in each trial
results = {}

for trial in trials:
    # Filter the data for the current trial
    trial_data = data[data['Trial'] == trial]
    present_divs = [div for div in div_range if any(trial_data['DIV'] == div)]
    
    # Store the results with a format
    if present_divs:
        results[trial] = f"div {', '.join(map(str, present_divs))} present in trial {trial}"
    else:
        results[trial] = f"No divs from {div_range.start} to {div_range.stop - 1} present in trial {trial}"

# Output the formatted results
for trial, result in results.items():
    print(result)

div 20, 23 present in trial CDKL5-E6D_T1_C1_05152024
div 19, 22 present in trial SYNGAP1_T1_C1_03212024
div 21, 25 present in trial ADNP_T2_10262023
div 21, 24 present in trial KCNT1_T3_C1_03122024
div 20, 23 present in trial SPTAN1_T1_07192023
div 21, 24 present in trial KCNT1_T1_08082023
div 21, 24 present in trial ADNP_T3_11072023
div 20, 23 present in trial SHANK3_T1_11222023
div 21, 25 present in trial CHD8_T2_C1_08252023


In [59]:
# Unique trials
trials = wt_data['Trial'].unique()

# Prepare a DataFrame to store selected data
selected_data = pd.DataFrame()

# Store DIV selection for each trial
div_selections = {}

for trial in trials:
    # Data for the current trial
    trial_data = wt_data[wt_data['Trial'] == trial]

    # Check if DIV 21 is present
    if (trial_data['DIV'] == 21).any():
        # Select DIV == 21 data
        selected_rows = trial_data[trial_data['DIV'] == 21]
        div_selections[trial] = 21
    else:
        # Find the DIV closest to 21
        closest_div = trial_data.iloc[(trial_data['DIV'] - 21).abs().argsort()[:1]]
        selected_rows = trial_data[trial_data['DIV'] == closest_div['DIV'].values[0]]
        div_selections[trial] = closest_div['DIV'].values[0]

    # Append the selected rows to the final DataFrame
    selected_data = pd.concat([selected_data, selected_rows], ignore_index=True)

# Print the DIV selected for each trial
for trial, div in div_selections.items():
    print(f"Selected DIV {div} for trial {trial}")

Selected DIV 20 for trial CDKL5-E6D_T1_C1_05152024
Selected DIV 22 for trial SYNGAP1_T1_C1_03212024
Selected DIV 21 for trial ADNP_T2_10262023
Selected DIV 21 for trial KCNT1_T3_C1_03122024
Selected DIV 20 for trial SPTAN1_T1_07192023
Selected DIV 21 for trial KCNT1_T1_08082023
Selected DIV 21 for trial ADNP_T3_11072023
Selected DIV 20 for trial SHANK3_T1_11222023
Selected DIV 21 for trial CHD8_T2_C1_08252023


In [60]:
selected_data

Unnamed: 0,DIV,Well,NeuronType,Chip_ID,Trial,Mean_FiringRate,Mean_SpikeAmplitude,Active_area,mean_IBI,cov_IBI,mean_Burst_Peak,cov_Burst_Peak,Number_Bursts,mean_Spike_per_Burst,cov_Spike_per_Burst,mean_Burst_Peak_Abs,cov_Burst_Peak_Abs,mean_BurstDuration,cov_BurstDuration,MeanNetworkISI,CoVNetworkISI,MeanWithinBurstISI,CoVWithinBurstISI,MeanOutsideBurstISI,CoVOutsideBurstISI,Fanofactor,Burst_Peak_Abs,BurstDuration,Burst_Peak_Normalized
0,20,2,WT,M07039,CDKL5-E6D_T1_C1_05152024,1.983021,97.197132,88.590909,2.658929,45.160749,9.958695,35.145689,113.0,5017.610619,66.797908,4705.70372,110.3964,0.472566,0.0,0.398187,274.921896,0.049217,217.271256,0.158023,129.098404,1073.250631,,,
1,20,5,WT,M07039,CDKL5-E6D_T1_C1_05152024,1.613199,89.469344,83.454545,3.682716,51.841482,9.131061,38.610374,82.0,4154.536585,50.12364,3373.967648,128.181803,0.457317,0.0,0.516129,296.456617,0.046678,198.192406,0.149487,127.401742,823.077234,,,
2,20,1,WT,M07420,CDKL5-E6D_T1_C1_05152024,1.781625,112.321619,93.515152,2.69,41.640164,8.48739,25.743467,111.0,4231.513514,190.171766,4099.740491,105.907697,0.436036,0.0,0.427575,272.28404,0.04659,249.315931,0.139338,131.124133,978.663563,,,
3,20,2,WT,M07420,CDKL5-E6D_T1_C1_05152024,2.140018,123.103925,88.878788,3.117708,45.617774,8.111178,22.958461,97.0,3217.453608,34.212383,3537.079643,99.196303,0.396907,0.0,0.489701,261.179964,0.033181,242.328415,0.147725,127.939483,795.637149,,,
4,20,3,WT,M07420,CDKL5-E6D_T1_C1_05152024,2.475439,105.979421,95.348485,2.406504,47.68484,7.40586,47.119325,124.0,6758.491935,209.015582,4124.119538,102.349998,0.73629,0.0,0.37648,299.768289,0.058735,192.957174,0.17321,117.009066,788.972594,,,
5,20,1,WT,M07427,CDKL5-E6D_T1_C1_05152024,2.200255,145.514002,96.30303,3.111458,35.912728,11.510904,31.371954,97.0,5369.896907,49.061096,4888.271384,118.416635,0.442268,0.0,0.412182,257.702921,0.041301,204.81625,0.165623,124.378529,1366.752653,,,
6,20,2,WT,M07427,CDKL5-E6D_T1_C1_05152024,1.882157,138.316538,94.878788,3.571084,41.193786,7.936748,40.584734,84.0,4165.821429,139.984038,3185.058542,112.314177,0.47619,0.0,0.484818,292.086727,0.044428,207.63522,0.126562,141.852358,737.108663,,,
7,20,3,WT,M07427,CDKL5-E6D_T1_C1_05152024,2.349175,125.155014,95.075758,1.963816,38.18949,9.680759,25.517591,153.0,4568.24183,50.141975,6100.734712,79.658652,0.453595,0.0,0.333632,254.72329,0.051107,242.665334,0.14546,138.449186,1169.900957,,,
8,20,6,WT,M07427,CDKL5-E6D_T1_C1_05152024,3.316905,123.754177,97.075758,2.75,38.433849,9.723359,31.231607,109.0,4398.853211,49.652401,4501.66904,107.526411,0.422936,0.0,0.419413,274.151688,0.043553,239.484555,0.144888,131.379835,990.476456,,,
9,20,2,WT,M08018,CDKL5-E6D_T1_C1_05152024,2.388229,93.072929,87.0,2.60354,44.363834,7.182515,36.763972,114.0,3425.640351,57.634858,4086.64518,77.841115,0.526316,0.0,0.363498,304.981703,0.054926,197.27702,0.128999,144.527057,399.141805,,,


**Exploratory Data Analysis (EDA)**

Selected features: Activity area, Spike Amplitude, Firing Rate, network: ..., ISI

**Statistical Testing for Homogeneity**

***Assessment of Normality***

Shapiro-Wilk Test or Kolmogorov-Smirnov Test can be used to assess whether each feature follows a normal distribution across trials. This will help determine whether parametric or non-parametric tests should be applied.

***Assessment of Homogeneity of Variance***

Use tests like Levene’s Test or Bartlett’s Test to check the homogeneity of variances across trials, which is an assumption for ANOVA.

***Multivariate Analysis of Variance (MANOVA)***

MANOVA can be used to assess the differences across trials on a combination of dependent variables. It helps determine if the vector of means of the dependent variables is the same across trials.

If assumptions above (normality, homogeneity of variances) are not violated, continue with MANOVA. If the assumptions of ANOVA are violated, consider using non-parametric alternatives like the Kruskal-Wallis test.

***PCA (Principal Component Analysis)***

to explore overall feature variation between trials

**Post-hoc Testing (Turkey's test)**

If ANOVA or Kruskal-Wallis shows significant differences, follow up with post-hoc tests like Tukey’s HSD (for ANOVA) or Dunn’s test (for Kruskal-Wallis) to pinpoint which trials are significantly different.

**Optional**

Between-study heterogeneity was assessed and quantified by Cochran’s Q-statistic along with corresponding p-values and Higgins I 2 test (Higgins and Thompson 2002). The p-value of Q-statistic ≥ 0.05 indicated insignificant heterogeneity among all studies, and the value of I 2 over 25%, 50%, and 75% implied low, moderate, and high levels of heterogeneity, respectively (Higgins et al. 2003).

**Visualization**

Use boxplots, scatter plots, or violin plots to visually compare the distribution of each feature across trials.
Consider plotting confidence intervals to see how much overlap there is between trials.