## Load Libraries

In [None]:
import importlib
import matplotlib.pyplot as plt
import pathlib
import seaborn as sns
import scanpy as sc
import pandas as pd
import anndata
import os 
import numpy as np 

from scipy import stats


# Working directory at the time this cell runs; not referenced again in the visible cells.
local_path = os.getcwd()

## Load Data

In [None]:
# Segmented (and mapped) MERSCOPE data.
adata_spatial_file = anndata.read_h5ad("/allen/programs/celltypes/workgroups/hct/SEA-AD/MERSCOPE/MTG_PAPER_FINAL/MTG_Spatial_2024_07_26.h5ad")

# Keep only the rows flagged by obs.selected_cells, then drop every feature
# whose name contains "Blank".
adata_spatial_new = adata_spatial_file[adata_spatial_file.obs.selected_cells]
genes = [g for g in adata_spatial_new.var.index if "Blank" not in g]

adata_spatial_new = adata_spatial_new[:, adata_spatial_new.var.index.isin(genes)].copy()

anndata_genes = adata_spatial_new.var.index.values.copy()

# Load the average counts per donor in bulk RNASeq and MERSCOPE experiments.
spatial_and_bulk_counts = pd.read_csv("../scANVI/input/MERSCOPE_vs_Bulk_RNASeq_comparison_dataframe_2024_01_31.csv", index_col=0)

# Reorder this dataframe to match the anndata object. This is brittle (e.g. if
# we have multiple gene panels on the same donor) but it's easier than having
# to do a merge between an anndata object and a dataframe every time we want
# to do a comparison.

# Map each gene name to the row position of its first occurrence in the CSV.
first_row_of_gene = {}
for row_position, gene_name in enumerate(spatial_and_bulk_counts.loc[:, "gene"].values):
    first_row_of_gene.setdefault(gene_name, row_position)

# Fail with a readable message instead of an opaque IndexError when the CSV
# does not cover every gene of the anndata object.
missing_genes = [g for g in anndata_genes if g not in first_row_of_gene]
if missing_genes:
    raise KeyError(
        "genes missing from the 'gene' column of spatial_and_bulk_counts: {}".format(missing_genes)
    )

ad_order = np.array([first_row_of_gene[g] for g in anndata_genes], dtype=int)

# ad_order holds row *positions*, so select positionally with .iloc; .loc would
# interpret the numbers as index labels and only agree with the positions when
# the CSV index happens to be 0..n-1 in order.
spatial_and_bulk_counts = spatial_and_bulk_counts.iloc[ad_order, :]

avg_ad_bulk = spatial_and_bulk_counts.loc[:, "all_donors_bulk"].copy()


#mapping_filenames = np.array(list(adata_spatial_new.obs[['filename', 'Unique Donor ID']].value_counts().index))

## Plotting per Sample Metrics

In [None]:
# Display the reordered counts table for a visual check.
spatial_and_bulk_counts

## Make figures showing:
- MERSCOPE correlation across replicates per donor
- correlation between total reads in MERSCOPE and summed reads across segmented cells in MERSCOPE
- correlation between total reads in MERSCOPE and bulk RNASeq
- correlation between summed reads in MERSCOPE and bulk RNASeq

In [None]:
# Distinct 'Unique Donor ID' values with their last three characters removed.
# NOTE(review): presumably the trimmed suffix is a replicate tag - confirm against the ID format.
trimmed_ids = [section_id[:-3] for section_id in adata_spatial_new.obs["Unique Donor ID"].unique()]
section_names = np.unique(np.array(trimmed_ids))

In [None]:
# Display the trimmed section names computed in the previous cell.
section_names

In [None]:

# For every entry of section_names, draw one four-panel figure:
#   1. first section vs second section (only when at least two sections exist)
#   2. the donor's column of spatial_and_bulk_counts vs the first section
#   3. average bulk RNASeq vs the first section
#   4. average bulk RNASeq vs the donor's column of spatial_and_bulk_counts

# Loop-invariant: the average bulk profile as a flat array.
avg_bulk_values = avg_ad_bulk.to_numpy().squeeze()

for chosen_donor in section_names:

    temp = adata_spatial_new[adata_spatial_new.obs.Donor.isin([chosen_donor])].copy()
    unique_id = temp.obs['Unique Donor ID'].unique()

    ### plt.rcParams["figure.figsize"] = (20, 4)

    # Per-gene counts summed over the rows of each section. The sum can come
    # back 2-D with shape (1, n) (e.g. when X is a scipy sparse matrix); flatten
    # it so plt.plot draws one identity line instead of one single-point line
    # per column.
    data = [
        np.asarray(temp.X[temp.obs['Unique Donor ID'].isin([uid]), ].sum(axis=0)).ravel()
        for uid in unique_id
    ]

    # Nothing matched this name in obs.Donor: no figure to draw.
    if len(data) == 0:
        continue

    # Column name in spatial_and_bulk_counts: the first section ID without its
    # last dot-separated field.
    donor_name = (".").join(unique_id[0].split(".")[:-1])

    # Replicate Correlation
    plt.figure(figsize=[30, 6])
    plt.subplot(1, 4, 1)

    # if there's only one dataset, skip replicate correlation
    if len(data) > 1:

        plt.scatter(data[0], data[1], s=2)
        plt.plot(data[0], data[0], c="red")
        corr_comp = np.corrcoef(data[0], data[1])[0, 1]
        plt.title("Replicates Corr:{:.2f}   {} vs {}".format(corr_comp, unique_id[0], unique_id[1]))
        plt.grid(False)

    # Donors without a column in spatial_and_bulk_counts keep panels 2 and 4
    # empty instead of aborting the whole loop with a KeyError.
    has_total = donor_name in spatial_and_bulk_counts.columns
    if has_total:
        dat = spatial_and_bulk_counts.loc[:, donor_name]

    # Correlation Against Total Transcript Before Segmentation
    plt.subplot(1, 4, 2)
    if has_total:
        plt.scatter(data[0], dat, s=2)
        #plt.plot(data[0], data[0], c="red")
        corr_tot = np.corrcoef(dat, data[0])[0, 1]
        plt.title("Total Transcripts Corr:{:.2f}   {} vs {}".format(corr_tot, "Total", unique_id[0]))
    plt.grid(False)

    # Correlation of summed MERSCOPE cell data Against Avg Bulk
    plt.subplot(1, 4, 3)
    plt.scatter(avg_bulk_values, data[0], s=2)
    corr_comp = np.corrcoef(avg_bulk_values, data[0])[0, 1]
    plt.title("Bulk Corr:{:.2f}   {} vs average bulk RNAseq".format(corr_comp, unique_id[0]))
    plt.grid(False)

    # Correlation of total MERSCOPE Against Avg Bulk
    plt.subplot(1, 4, 4)
    if has_total:
        plt.scatter(avg_bulk_values, dat, s=2)
        corr_comp = np.corrcoef(avg_bulk_values, dat)[0, 1]
        plt.title("Bulk Corr:{:.2f}   {} vs average bulk RNASeq".format(corr_comp, unique_id[0]))
    plt.grid(False)

    #plt.savefig("../plots_for_figures/donor_correlations/new_Donor_"+chosen_donor+"_correlations.svg")

In [None]:
# Collect, for every donor with more than one section, the statistics that the
# histogram cell plots. The first section of a donor is the reference; each
# further section contributes one entry to every list (total_corr and
# total_vs_bulk_corr only when the donor has a column in spatial_and_bulk_counts).
replicates_corr = []      # reference section vs replicate
total_corr = []           # donor column of spatial_and_bulk_counts vs replicate
bulk_corr = []            # average bulk vs replicate
total_vs_bulk_corr = []   # donor column of spatial_and_bulk_counts vs average bulk
slope_replicates = []     # regression slope, replicate on reference section

# Loop-invariant: the average bulk profile as a flat array.
avg_bulk_values = avg_ad_bulk.to_numpy().squeeze()

for donor in adata_spatial_new.obs.Donor.unique():
    temp = adata_spatial_new[adata_spatial_new.obs.Donor.isin([donor])].copy()
    unique_id = temp.obs['Unique Donor ID'].unique()
    donor_name = (".").join(unique_id[0].split(".")[:-1])

    # Replicate statistics need at least two sections.
    if len(unique_id) <= 1:
        continue

    # Per-gene counts summed over the rows of each section, flattened to 1-D:
    # the sum can come back as a (1, n) matrix (e.g. for a scipy sparse X), and
    # linregress derives its sample size from len(x), which would then be 1.
    data = [
        np.asarray(temp.X[temp.obs['Unique Donor ID'].isin([uid]), :].sum(axis=0)).ravel()
        for uid in unique_id
    ]

    # The donor's column lookup does not depend on the replicate; do it once.
    dat = spatial_and_bulk_counts[donor_name] if donor_name in spatial_and_bulk_counts.columns else None

    reference = data[0]
    for replicate in data[1:]:
        slope_replicates.append(stats.linregress(reference, replicate).slope)
        replicates_corr.append(np.corrcoef(reference, replicate)[0, 1])

        if dat is not None:
            total_corr.append(np.corrcoef(dat, replicate)[0, 1])
            total_vs_bulk_corr.append(np.corrcoef(dat, avg_bulk_values)[0, 1])

        # Correlation Against Avg Bulk
        bulk_corr.append(np.corrcoef(avg_bulk_values, replicate)[0, 1])
                

In [None]:
# Number of replicate-vs-bulk correlations collected in the previous cell.
len(bulk_corr)

In [None]:
# List the column names of the obs table.
adata_spatial_new.obs.columns

# Note: the total number of sections analyzed here (42) is neither the number of donors used nor the number of sections; it is the number of sections from donors for which we also have bulk data.

In [None]:
# Count distinct 'Unique Donor ID' values under three row filters:
# no filter, non-empty layer_annotation, and selected_cells.
obs_table = adata_spatial_new.obs
section_ids = obs_table["Unique Donor ID"]

raw_total = len(section_ids.unique())
print(f"Number of Sections Considered: {raw_total}")

has_layer_annotation = obs_table.layer_annotation != ""
layer_cells_total = len(section_ids.loc[has_layer_annotation].unique())
print(f"Number of Sections Used for proportion analysis: {layer_cells_total}")

selected_cells_total = len(section_ids.loc[obs_table.selected_cells].unique())
print(f"Number of Sections with selected cells: {selected_cells_total}")

# Number of Sections Considered: 69


In [None]:
# One row of five 20-bin histograms summarising the per-section statistics.
# Each entry: (panel title, values, [xmin, xmax, ymin, ymax], x-axis label).
histogram_panels = [
    ("Correlation across replicates", replicates_corr, [0.5, 1, 0, 13], "correlation"),
    ("Slope across donor replicates", slope_replicates, [0.5, 4.0, 0, 15], "slope"),
    ("Cell Transcripts vs Total", total_corr, [0.5, 1, 0, 5], "correlation"),
    ("Cell Transcripts vs Bulk", bulk_corr, [0.5, 1, 0, 8], "correlation"),
    ("Total Transcripts vs Bulk", total_vs_bulk_corr, [0, 1, 0, 5], "correlation"),
]

plt.figure(figsize=[18, 3])
for panel_position, (panel_title, values, axis_limits, x_label) in enumerate(histogram_panels, start=1):
    plt.subplot(1, 5, panel_position)
    plt.title(panel_title)
    plt.hist(values, 20)
    plt.axis(axis_limits)
    plt.xlabel(x_label)
    plt.ylabel("sections")
    plt.grid(False)

plt.savefig("../plots_for_figures/new_Histogram_MERSCOPE_correlations.svg")
#plt.savefig("../plots_for_figures/new_Histogram_MERSCOPE_correlations.pdf")

In [None]:

# Print the mean of each collected statistic, labelled like the histogram panels.
summary_means = [
    ("Correlation across replicates", replicates_corr),
    ("Slope across donor replicates", slope_replicates),
    ("Cell Transcripts vs Total", total_corr),
    ("Cell Transcripts vs Bulk", bulk_corr),
    ("Total Transcripts vs Bulk", total_vs_bulk_corr),
]

for label, values in summary_means:
    print(label)
    print(np.mean(values))