### Non-uniform temporal scaling of developmental processes in mammalian cortex
#### Paolino et al. 2023

This code implements the compositional statistical tests used in Paolino et al. 2023, submitted for review to Nature Communications. Other statistical tests are not included here, as they used external Python and R scripts that are detailed in the manuscript (see Methods) and reported in Supplementary Statistics Table 1.

In [10]:
# Import relevant packages
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math 
import pandas as pd
import pingouin as pg
import scipy.stats as stats
from scipy.stats import zscore
from sklearn.utils import resample
from skbio.stats.composition import clr, ilr
from statsmodels.multivariate.manova import MANOVA
from statsmodels.formula.api import ols

In [None]:
# Load one tab of the master Excel workbook into a DataFrame.
# Later cells read the 'Species', 'Cortex', 'Layer' and 'Prop_Ctx' columns from it.
sheet = 'name_of_relevant_excel_tab' # Change this to the name of the tab in the master Excel file.
comp_df = pd.read_excel('Paolino2023_master_data.xlsx', sheet_name=sheet)
# Preview the first rows of the loaded tab.
comp_df.head()

In [None]:
# Log-ratio transforms cannot handle zero proportions, so zeros are replaced by a
# small value and the amount added is subtracted evenly from the non-zero values.
def adjust_values(series):
    """Return a zero-adjusted copy of a series of proportions.

    Every value equal to 0 is replaced by a small constant (0.0001). The total
    amount added is then subtracted in equal shares from the non-zero values,
    so the sum of the series is preserved whenever at least one value is
    non-zero.

    Parameters
    ----------
    series : pandas.Series
        Proportions belonging to one composition. It is not modified.

    Returns
    -------
    pandas.Series
        A new float series with the same index as ``series``. If every value
        is zero, each one becomes the small constant and the sum is therefore
        not preserved.
    """
    # Define the small value that replaces each zero
    small_value = 0.0001
    # Work on a float copy: the caller's data is never modified in place, and
    # integer-typed input can hold the fractional replacement value.
    adjusted = series.astype(float)
    # Locate the zeros once, before any value is changed
    zero_values = adjusted == 0
    n_zero = zero_values.sum()
    n_nonzero = (~zero_values).sum()
    # Add the small value to the zero values
    adjusted[zero_values] += small_value
    # Only redistribute when there is something to subtract from
    if n_nonzero > 0:
        # Subtract the total added amount evenly from the non-zero values
        adjusted[~zero_values] -= small_value * n_zero / n_nonzero
    return adjusted

# Zero-adjust the 'Prop_Ctx' proportions separately within each (Species, Cortex)
# group and store the result in a new 'prop_cells_reg' column.
comp_df['prop_cells_reg'] = comp_df.groupby(['Species','Cortex'])['Prop_Ctx'].transform(adjust_values)
# Preview the first rows, now including the adjusted column.
comp_df.head()

In [None]:
# Isometric log ratio (ILR) transforms calculated from the zero-adjusted proportions.
# Each (Species, Cortex) group is passed to ilr() as a single composition, giving one
# output row per group.
# NOTE(review): this assumes the rows of every group are in the same layer order, so
# that ILR coordinates are comparable between groups - confirm against the data.
ilr_rows = []

for name, data in comp_df.groupby(['Species','Cortex']):
    prop_by_ctxandlayer = data['prop_cells_reg'].values
    # Print the group key and the total of its composition.
    print(name, np.sum(prop_by_ctxandlayer))
    # atleast_1d keeps the result iterable if ilr() returns a 0-d array
    ilr_prop_by_ctxandlayer = np.atleast_1d(ilr(prop_by_ctxandlayer))
    row = {'Species': name[0], 'Cortex': name[1]}
    # A composition of m parts yields m - 1 ILR coordinates. The columns are created
    # from the actual length, so the cell works for any number of parts instead of
    # silently dropping coordinates beyond the fourth or failing with fewer.
    for part, value in enumerate(ilr_prop_by_ctxandlayer, start=1):
        row['ILR_p{}'.format(part)] = value
    ilr_rows.append(row)

# Every group must contribute the same number of coordinates, otherwise the table
# would contain missing values that the downstream models cannot use.
coords_per_group = {len(row) - 2 for row in ilr_rows}
if len(coords_per_group) > 1:
    raise ValueError(
        'Groups have different numbers of compositional parts: ILR coordinate '
        'counts found = {}'.format(sorted(coords_per_group))
    )

# Convert to a pandas dataframe
ilr_dfs = pd.DataFrame(ilr_rows)
ilr_dfs

In [None]:
# Fit one OLS model per ILR coordinate with Species as a categorical predictor,
# and store each model's residuals in its own column of the dataframe.
residual_formulas = {
    'p1_residuals': 'ILR_p1 ~ C(Species)',
    'p2_residuals': 'ILR_p2 ~ C(Species)',
    'p3_residuals': 'ILR_p3 ~ C(Species)',
    'p4_residuals': 'ILR_p4 ~ C(Species)',
}
for residual_col, formula in residual_formulas.items():
    species_fit = ols(formula, data=ilr_dfs).fit()
    ilr_dfs[residual_col] = species_fit.resid
ilr_dfs.head()

In [None]:
# Example of assumptions tests for multivariate normality and homogeneity of covariance matrices.
# Both tests are run on the residual columns.
residual_cols = ['p1_residuals','p2_residuals','p3_residuals','p4_residuals']

# A notebook only displays the value of the last expression in a cell, so the
# normality result is printed explicitly; otherwise it would be discarded.
normality = pg.multivariate_normality(ilr_dfs[residual_cols])
print(normality)

# Box's M test, grouped by species; left as the final expression so it is displayed.
pg.box_m(ilr_dfs, dvs=residual_cols, group='Species')

In [None]:
# Parametric route: a MANOVA with the four ILR coordinates as dependent variables and
# Species as the predictor. Otherwise a non-parametric test is run in R instead
# (see Methods).
manova_formula = 'ILR_p1 + ILR_p2 + ILR_p3 + ILR_p4 ~ Species'
maov = MANOVA.from_formula(manova_formula, data=ilr_dfs)
manova_results = maov.mv_test()
print(manova_results)

In [None]:
# Calculate log ratio of geometric means and plot the result.

def calc_log_ratio(data, numerator='Ms', denominator='FTD', random_state=None):
    """Return one bootstrap estimate of the log ratio of geometric means.

    The rows of ``data`` are resampled with replacement (same number of rows
    as the input). The resample is split by the 'Species' column, and the log
    of the ratio between the geometric means of 'prop_cells_reg' in the two
    species is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Species' and 'prop_cells_reg'.
    numerator : str, optional
        'Species' label whose geometric mean is the numerator (default 'Ms').
    denominator : str, optional
        'Species' label whose geometric mean is the denominator (default 'FTD').
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the resample reproducible.
        The default (None) leaves the draw unseeded.

    Returns
    -------
    float
        log(geometric mean of numerator / geometric mean of denominator), or
        NaN when the resample contains no rows for one of the two species.
    """
    # Resample the data with replacement
    boot_data = data.sample(n=len(data), replace=True, random_state=random_state)

    # Split the resampled proportions by species
    species = boot_data['Species']
    num_props = boot_data.loc[species == numerator, 'prop_cells_reg']
    den_props = boot_data.loc[species == denominator, 'prop_cells_reg']

    # Rows are resampled from the pooled data, so a draw can miss one species
    # entirely. The ratio is then undefined and is reported as NaN.
    if num_props.empty or den_props.empty:
        return np.nan

    # log(gm_a / gm_b) equals mean(log a) - mean(log b); computing it this way
    # avoids an exp/log round trip.
    return np.mean(np.log(num_props)) - np.mean(np.log(den_props))

# Bootstrap the log ratio independently for every layer
N_BOOTSTRAP = 5000
results = {}
for layer in comp_df['Layer'].unique():
    # Restrict the data to the current layer
    layer_data = comp_df[comp_df['Layer'] == layer]
    print(layer_data)
    # One log-ratio estimate per bootstrap resample
    boot_results = [calc_log_ratio(layer_data) for _ in range(N_BOOTSTRAP)]
    results[layer] = boot_results

# One column per layer, one row per bootstrap iteration. dropna() removes every
# iteration in which any layer produced NaN.
results_df = pd.DataFrame(results).dropna()

# Bootstrap mean for each layer
means = results_df.mean()

# 95% percentile interval for each layer
# (for a 99% interval use the 0.5th and 99.5th percentiles instead)
ci_lower = results_df.apply(lambda col: np.percentile(col, 2.5))
ci_upper = results_df.apply(lambda col: np.percentile(col, 97.5))

# Error-bar lengths measured from the mean down to / up to the interval bounds
err_below = means.values - ci_lower.values
err_above = ci_upper.values - means.values
bar_colors = ['#009392','#EEB479','#E88471','#F2C6DE']

# Bar plot of the means with the percentile interval as error bars
# NOTE(review): the y-axis label reads 'Mm/Sc' while calc_log_ratio filters on the
# 'Ms' and 'FTD' species labels - confirm the label matches the data.
plt.figure(figsize=(2,4))
plt.bar(means.index, means.values, yerr=[err_below, err_above], capsize=4, color=bar_colors)
plt.axhline(0, linestyle='--', color='black')
plt.xlabel('Staining')
plt.ylabel('Log ratio (Mm/Sc)')
plt.xticks(rotation=45)
sns.despine()
plt.show()