# This notebook (.ipynb) contains code for the following tasks

1. Cell proportion counts

2. Kruskal-Wallis test

3. Linear mixed model for counts/proportions using lme4

4. scCODA method

# Cell proportions based on brain region

In [None]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata 

### Read the AnnData object produced after cell-type annotation.

# pyplot is imported here because this is the first cell that plots; the
# notebook otherwise only imports it in a later cell.
import matplotlib.pyplot as plt

bdata = sc.read_h5ad("path/dataset.h5ad")

#### Subset the data to the neuronal populations.

Neuron = bdata[bdata.obs['major_celltypes'].isin(["Excitatory", "Inhibitory"])]


#### Subset the neurons by brain region.

BA9 = Neuron[Neuron.obs["Brain.Region"] == "Frontal Cx (BA9)"]
BA7 = Neuron[Neuron.obs["Brain.Region"] == "Precuneous (BA7)"]
BA17 = Neuron[Neuron.obs["Brain.Region"] == "Primary Visual Cx (BA17)"]


#### Plot the cell-type proportions for one brain region (BA9 here).

# Number of cells for every (disease group, cluster) pair.
count = BA9.obs.groupby(['Disease.Group', 'Author_Annotation']).size()
new_df = count.to_frame(name='size').reset_index()

# Pivot to disease groups (rows) x clusters (columns).
freq = new_df.pivot(index='Disease.Group', columns='Author_Annotation')['size']
list2 = ["Ex1", "Ex2", "Ex3", "Ex4", "Ex5", "Ex6", "Ex7", "Ex8", "Ex9", "Ex10", "Ex11", "Ex12", "Ex13", "Ex14", "Ex15", "Ex16", "Ex17", "Ex18",
         "In1", "In2", "In3", "In4", "In5", "In6", "In7", "In8", "In9", "In10", "In11", "In12", "In13", "In14", "In15", "In16", "In17", "In18", "In19"]
# When the annotation labels already are the names in list2, reorder the
# columns BY LABEL first: the pivot may order them differently (e.g. "Ex10"
# before "Ex2"), and a purely positional rename would then mislabel clusters.
# Otherwise fall back to the positional rename.
if set(freq.columns) == set(list2):
    freq = freq.loc[:, list2]
freq.columns = list2

# Normalise each disease group to proportions, then put clusters on the rows.
percent = freq.div(freq.sum(axis=1), axis=0).T
percent['Clusters'] = percent.index

# Keep the disease groups in plotting order.
disease_groups = ['low', 'int', 'high']
T_new = percent[disease_groups + ['Clusters']]

# Standard error of each disease-group column.
# NOTE(review): this is one value per disease group, computed across the
# cluster proportions (not across subjects within a cluster), so every bar of
# a group gets the same error bar - confirm this is the intended uncertainty.
se_new = T_new[disease_groups].apply(lambda col: np.std(col, ddof=1) / np.sqrt(len(col)), axis=0)

# Grouped bar plot.
bar_group_width = 0.8
plt.rcParams.update({'font.size': 28})
T_new.plot(x='Clusters', kind='bar', figsize=(30, 7), width=bar_group_width, color=['cyan', 'blue', 'red'],
           edgecolor='black', capsize=2, title='Proportion of Neuronal Cell Types in BA9')
plt.legend(bbox_to_anchor=(1, 1), loc='upper left', borderaxespad=5)
plt.ylabel("Cell Proportion", fontsize=22)

# Error bars, centred on the bars: pandas splits the group width evenly
# between the series, so bar i sits at tick + (i - (k - 1) / 2) * (width / k).
x_positions = np.arange(len(T_new['Clusters']))
single_bar_width = bar_group_width / len(disease_groups)
for i, col in enumerate(disease_groups):
    offset = (i - (len(disease_groups) - 1) / 2) * single_bar_width
    # Look the standard error up by label; integer lookup on a label-indexed
    # Series is deprecated positional access.
    plt.errorbar(x_positions + offset, T_new[col], yerr=se_new[col], fmt='none', color='black', capsize=3)

plt.show()


# Kruskal-Wallis test

In [None]:
import statsmodels.api as sm
import pandas as pd
import numpy as np
import statsmodels.stats.multitest as smm
import scipy.stats as stats
import warnings

warnings.filterwarnings("ignore")

# Per-subject cell counts for every cluster.
count = BA9.obs.groupby(['subject', 'Disease.Group', 'Author_Annotation']).size()
new_df = count.to_frame(name='size').reset_index()

# Rows: (subject, disease group); columns: clusters.
freq = new_df.pivot(index=['subject', 'Disease.Group'], columns='Author_Annotation', values='size')

# A combination missing from the pivot means no cells were counted for it.
freq = freq.fillna(0)

# Drop (subject, disease group) rows that contain no cells at all.
freq = freq[freq.sum(axis=1) != 0]

# Disease stage of every remaining row, aligned with the rows of `freq`.
disease_stage = np.asarray(freq.index.get_level_values('Disease.Group'))
stage_order = ['low', 'int', 'high']

# One Kruskal-Wallis test per cluster. The test has to run INSIDE the loop;
# running it once after the loop only ever tests the last cluster.
p_values = []
for cluster in freq.columns:
    cell_count = freq[cluster]
    samples = [cell_count[disease_stage == stage] for stage in stage_order]
    try:
        statistic, p_value = stats.kruskal(*samples)
    except ValueError:
        # Raised by some scipy versions when every count is identical; the
        # test is undefined for that cluster.
        p_value = np.nan
    p_values.append(p_value)

# Benjamini-Hochberg FDR correction across ALL clusters at once (correcting a
# single p-value on its own leaves it unchanged). Undefined tests stay NaN.
p_values = np.asarray(p_values, dtype=float)
p_adjusted = np.full(p_values.shape, np.nan)
testable = ~np.isnan(p_values)
if testable.any():
    p_adjusted[testable] = smm.multipletests(p_values[testable], method='fdr_bh')[1]

for cluster, adjusted in zip(freq.columns, p_adjusted):
    print(cluster, adjusted)


# Linear mixed model (LMM)

# calculate the proportions

In [None]:
import scanpy as sc
import numpy as np
import pandas as pd
import statsmodels.api as sm
from scipy.sparse import hstack

# Unique cell types in the selected brain region (BA9 here; use BA17 / BA7 for
# the other regions).
unique_cell_types = BA9.obs['Author_Annotation'].unique()

# Count cells per (cell type, subject, disease group, assay) in one vectorised
# pass. `DataFrame.append` was removed in pandas 2.0, and growing a frame row
# by row is quadratic anyway. The disease column is 'Disease.Group', as in the
# other cells of this notebook. observed=True keeps only combinations that
# actually occur, so a subject with no cells of a type contributes no row.
groupby_cols = ['Author_Annotation', 'subject', 'Disease.Group', 'Assay']
cell_proportions_df = (
    BA9.obs.groupby(groupby_cols, observed=True)
    .size()
    .reset_index(name='Count')
    .rename(columns={
        'Author_Annotation': 'CellType',
        'subject': 'Subject',
        'Disease.Group': 'Disease_Group',
    })
)

# Store the proportion itself under 'Proportion'; the raw count used to be
# written there. The raw count is kept alongside it in 'Count'.
# NOTE(review): the denominator is the total number of cells of that cell type
# in the region, so within a cell type this is the count rescaled by a
# constant - confirm whether a per-subject denominator is wanted instead.
cell_type_totals = cell_proportions_df.groupby('CellType', observed=True)['Count'].transform('sum')
cell_proportions_df['Proportion'] = cell_proportions_df['Count'] / cell_type_totals

cell_proportions_df = cell_proportions_df[
    ['CellType', 'Subject', 'Assay', 'Disease_Group', 'Proportion', 'Count']
]



# lme4 mixed model for counts

In [None]:
library("lme4")
library("dplyr")

#### Input: the proportions table specific to one region (BA9, BA17 or BA7).
data <- cell_proportions_df

# Set the reference level ("low") ONCE, before any subsetting. Doing this
# inside the loop after the subset was taken left the first cell type with a
# different reference level from all the others.
data$Disease_Group <- factor(data$Disease_Group, levels = c("high", "int", "low"))
data$Disease_Group <- relevel(data$Disease_Group, ref = "low")

celltypes <- as.character(unique(data$CellType))
all_results <- list()
for (cell_type in celltypes)
{
  print(cell_type)
  cell_type_proportions <- data[data$CellType == cell_type, ]

  # Mixed linear regression: disease group and assay as fixed effects,
  # subject as a random intercept.
  model <- lmer(Proportion ~ Disease_Group + (1 | Subject) + Assay, data = cell_type_proportions)
  coef_summary <- summary(model)$coefficients

  # Two-sided p-values from the t statistics.
  # NOTE(review): the degrees of freedom are approximated as
  # (rows - number of fixed effects); lmer itself reports none.
  t_values <- coef_summary[, "t value"]
  df <- nrow(cell_type_proportions) - length(fixef(model))
  p_values <- 2 * pt(-abs(t_values), df)

  # Z-scores: coefficient estimate divided by its standard error.
  coef_estimates <- coef_summary[, "Estimate"]
  coef_standard_errors <- coef_summary[, "Std. Error"]
  z_scores <- coef_estimates / coef_standard_errors

  # One row per coefficient. The cell type is stored in its own column so the
  # combined table can be read back without parsing the row names.
  result_df <- data.frame(Coefficient = names(t_values),
                          t_value = t_values,
                          p_value = p_values,
                          z_scores = z_scores,
                          Celltypes = cell_type)
  print(result_df)
  all_results[[cell_type]] <- result_df
}
combined_results <- do.call(rbind, all_results)
write.csv(combined_results, "BA9_proportions_results_LMM.csv")

# Plot LMM

In [None]:
import matplotlib.pyplot as plt
# Load the LMM results table. It gets its own name so the BA9 AnnData object
# used by the other cells is not overwritten with a DataFrame.
lmm_results = pd.read_csv("BA9_proportions_results_LMM.csv")
X = lmm_results['z_scores']
# Point labels: the 'Celltypes' column when present, otherwise the first CSV
# column (the row names written by write.csv).
if 'Celltypes' in lmm_results.columns:
    labels = lmm_results['Celltypes']
else:
    labels = lmm_results.iloc[:, 0]
p_value = lmm_results["p_value"]
log_p_values = -np.log10(p_value)

# Significance threshold on the -log10 scale.
significance_threshold = -np.log10(0.05)

plt.figure(figsize=(8, 6))

# Plot the points one by one so each can get its own colour and marker.
palette = plt.get_cmap("tab20")
for i, (x, y, name) in enumerate(zip(X, log_p_values, labels)):
    marker = 'o'  # Default marker
    color = palette(i % 20)

    #### Highlight significant entries: red square plus a text label.
    if y >= significance_threshold:
        marker = 's'
        color = 'red'
        plt.annotate(name, (x - 0.2, y), xytext=(-10, 0), textcoords='offset points', fontsize=12)

    # `color=` rather than `c=`: a single RGBA tuple passed as `c` is ambiguous
    # (it can be read as a sequence of values to colour-map).
    plt.scatter(x, y, marker=marker, s=100, color=color, label=name, edgecolor="black")

plt.rcParams.update({'font.size': 24, 'font.style': 'italic'})

# Axis labels and limits.
# NOTE(review): the axis says "padj", but the p_value column written by the R
# cell is not multiplicity-adjusted - confirm which is intended.
plt.xlabel("Z scores")
plt.ylabel("-log10(padj)")
plt.xlim(-2.5, 2.5)

# Display the plot with a reference grid.
plt.grid(True, linestyle='--', alpha=0.7, color='black')
plt.show()


# scCODA

In [None]:
import importlib
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import pickle as pkl
import matplotlib.pyplot as plt
from sccoda.util import comp_ana as mod
from sccoda.util import cell_composition_data as dat
from sccoda.util import data_visualization as viz
import sccoda.datasets as scd

adata = anndata.read_h5ad("path/dataset.h5ad")
# NOTE(review): only the brain region is filtered here, so every annotated
# cell type in BA9 enters the model even though it is named
# `model_all_neurons` - confirm whether a neuronal subset was intended.
BA9 = adata[adata.obs['Brain.Region'].isin(["Frontal Cx (BA9)"])]

# The other cells of this notebook read the sample column as 'subject'; fall
# back to 'Subject' if that is how the column is actually spelled.
sample_variable = "subject" if "subject" in BA9.obs.columns else "Subject"

# Cell-level metadata as a DataFrame.
df = pd.DataFrame(BA9.obs)

# Count the cells per (sample, cluster).
cluster_counts = df.groupby([sample_variable, 'Author_Annotation']).size().reset_index(name='cell_count')

# Samples as rows, clusters as columns; missing combinations are zero counts.
cluster_counts_pivot = cluster_counts.pivot(index=sample_variable, columns='Author_Annotation', values='cell_count').fillna(0)
# Plain string column labels, so covariate columns can be added below.
cluster_counts_pivot.columns = cluster_counts_pivot.columns.astype(str)
# Drop samples that have no cells in this region.
cluster_counts_pivot = cluster_counts_pivot[cluster_counts_pivot.sum(axis=1) > 0]

# Sample-level covariates. The disease group is copied under a name without a
# dot, because the model formula is parsed by patsy and "Disease.Group" would
# be read as attribute access rather than as a column name.
sample_disease_group = df.groupby(sample_variable, observed=True)['Disease.Group'].first()
cluster_counts_pivot['Disease_Group'] = (
    sample_disease_group.reindex(cluster_counts_pivot.index).astype(str).to_numpy()
)
cluster_counts_pivot[sample_variable] = cluster_counts_pivot.index


# Running scCODA

# scCODA models a samples x cell-types count table, so build its data object
# from the pivot instead of passing the cell-level AnnData.
sccoda_data = dat.from_pandas(cluster_counts_pivot, covariate_columns=[sample_variable, 'Disease_Group'])

# "low" is the reference level, matching the mixed-model analysis.
model_all_neurons = mod.CompositionalAnalysis(
    sccoda_data,
    formula="C(Disease_Group, Treatment('low'))",
    reference_cell_type="automatic",
)

sim_results = model_all_neurons.sample_hmc()

df_results = sim_results.credible_effects()

df_results.to_csv("path/sccoda_results_BA9.csv")