In [2]:
# Compare the scRNA-seq heart atlas to the spatial data set.
# Goal: output a table with sample, lineage, and the proportion of cells in that lineage.

In [1]:
import scanpy as sc
import pandas as pd


In [2]:
# Load the heart atlas (Global_lognormalised.h5ad) into an AnnData object
atlas_h5ad_path = '/scratch/aoill/projects/heart_transplant/scrna_atlas/Global_lognormalised.h5ad'
adata = sc.read_h5ad(atlas_h5ad_path)

In [4]:
# Look at data: print the AnnData summary (dimensions plus the names of the
# obs / var / uns / obsm / obsp entries) to see which annotations are available
print(adata)

AnnData object with n_obs × n_vars = 704296 × 32732
    obs: 'sangerID', 'donor', 'donor_type', 'region', 'age', 'gender', 'facility', 'cell_or_nuclei', 'modality', 'kit_10x', 'flushed', 'cell_type', 'cell_state', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'total_counts_ribo', 'pct_counts_ribo', 'scrublet_score'
    var: 'gene_name_scRNA-0-original', 'gene_name_snRNA-1-original', 'gene_name_multiome-2-original', 'gene_id'
    uns: 'age_colors', 'cell_or_nuclei_colors', 'cell_state_colors', 'cell_type_colors', 'donor_colors', 'donor_type_colors', 'facility_colors', 'flushed_colors', 'gender_colors', 'kit_10x_colors', 'leiden', 'log1p', 'modality_colors', 'neighbors', 'original_or_new_colors', 'region_colors', 'region_finest_colors', 'scANVI_predictions_colors', 'umap'
    obsm: 'X_umap'
    obsp: 'connectivities', 'distances'


In [3]:
# Preview the first rows of the per-barcode metadata table (adata.obs)
print(adata.obs.head())

                                         sangerID donor donor_type region  \
barcode                                                                     
HCAHeart7606896_GATGAGGCACGGCTAC  HCAHeart7606896    D1        DBD     AX   
HCAHeart7606896_CGCTTCACATTTGCCC  HCAHeart7606896    D1        DBD     AX   
HCAHeart7606896_GTTAAGCAGAGACTAT  HCAHeart7606896    D1        DBD     AX   
HCAHeart7606896_TCGCGTTGTAAGAGGA  HCAHeart7606896    D1        DBD     AX   
HCAHeart7606896_GCTGCGAGTGTTGGGA  HCAHeart7606896    D1        DBD     AX   

                                    age  gender facility cell_or_nuclei  \
barcode                                                                   
HCAHeart7606896_GATGAGGCACGGCTAC  50-55  Female   Sanger           Cell   
HCAHeart7606896_CGCTTCACATTTGCCC  50-55  Female   Sanger           Cell   
HCAHeart7606896_GTTAAGCAGAGACTAT  50-55  Female   Sanger           Cell   
HCAHeart7606896_TCGCGTTGTAAGAGGA  50-55  Female   Sanger           Cell   
HCAHeart76

In [4]:
# Preview the first rows of the per-gene annotation table (adata.var)
print(adata.var.head())

              gene_name_scRNA-0-original gene_name_snRNA-1-original  \
gene_name-new                                                         
MIR1302-2HG                  MIR1302-2HG                MIR1302-2HG   
FAM138A                          FAM138A                    FAM138A   
OR4F5                              OR4F5                      OR4F5   
AL627309.1                    AL627309.1                 AL627309.1   
AL627309.3                    AL627309.3                 AL627309.3   

              gene_name_multiome-2-original          gene_id  
gene_name-new                                                 
MIR1302-2HG                     MIR1302-2HG  ENSG00000243485  
FAM138A                             FAM138A  ENSG00000237613  
OR4F5                                 OR4F5  ENSG00000186092  
AL627309.1                       AL627309.1  ENSG00000238009  
AL627309.3                       AL627309.3  ENSG00000239945  


In [5]:
# Which assay modalities contribute cells to the atlas? Is it scRNA only?
modality_column = adata.obs['modality']
unique_modalities = modality_column.unique()
print(unique_modalities)
# Three modalities are present: 'Multiome-RNA', 'scRNA', 'snRNA'.
# Not sure yet whether the modality matters for this comparison.
# None of them is spatial, so for now I'll just use all of the data.

['scRNA', 'snRNA', 'Multiome-RNA']
Categories (3, object): ['Multiome-RNA', 'scRNA', 'snRNA']


In [6]:
# List the distinct donors represented in the atlas
donor_column = adata.obs['donor']
unique_donors = donor_column.unique()
print(unique_donors)
# there are 22 donors

['D1', 'D3', 'D4', 'D5', 'D6', ..., 'AV10', 'AV14', 'AV3', 'AV13', 'AH2']
Length: 22
Categories (22, object): ['A61', 'AH1', 'AH2', 'AV3', ..., 'H4', 'H5', 'H6', 'H7']


In [7]:
# Which regions of the heart were sampled in the atlas?
# Open question: do we know the regions in our own data set, and do we want
# to match on region where available?
region_column = adata.obs['region']
unique_regions = region_column.unique()
print(unique_regions)
# 'SAN', 'AVN', 'RA', 'LA', 'RV', 'LV', 'SP', 'AX'

['AX', 'LV', 'RV', 'LA', 'SP', 'RA', 'SAN', 'AVN']
Categories (8, object): ['SAN', 'AVN', 'RA', 'LA', 'RV', 'LV', 'SP', 'AX']


In [8]:
# The metadata has a 'cell_type' column; list the distinct cell types
# annotated in this data set
cell_type_column = adata.obs['cell_type']
unique_cell_types = cell_type_column.unique()
print(unique_cell_types)

['Endothelial cell', 'Mural cell', 'Myeloid', 'Fibroblast', 'Lymphoid', ..., 'Lymphatic Endothelial cell', 'Mesothelial cell', 'Atrial Cardiomyocyte', 'Mast cell', 'Adipocyte']
Length: 12
Categories (12, object): ['Atrial Cardiomyocyte', 'Ventricular Cardiomyocyte', 'Fibroblast', 'Endothelial cell', ..., 'Adipocyte', 'Myeloid', 'Lymphoid', 'Mast cell']


In [10]:
#for cell_type in unique_cell_types:
#    print(cell_type)

# Draft cell type -> lineage assignment. NOTE: the mapping applied in the
# next cell uses the label 'Stromal' where these notes say 'Mesenchymal'.
#Endothelial cell - Endothelial
#Mural cell - Mesenchymal
#Myeloid - Immune
#Fibroblast - Mesenchymal
#Lymphoid - Immune
#Neural cell - Neural
#Ventricular Cardiomyocyte - Mesenchymal
#Lymphatic Endothelial cell - Endothelial
#Mesothelial cell - Mesenchymal
#Atrial Cardiomyocyte - Mesenchymal
#Mast cell - Immune
#Adipocyte - Mesenchymal

In [9]:
# Add a column in the meta data called lineage.
# Each atlas cell type is assigned to one of four broad lineages.
# NOTE(review): both cardiomyocyte types are grouped under 'Stromal' here —
# confirm this matches the lineage labels used in the spatial data set.
cell_type_to_lineage = {
    'Endothelial cell': 'Endothelial',
    'Mural cell': 'Stromal',
    'Myeloid': 'Immune',
    'Fibroblast': 'Stromal',
    'Lymphoid': 'Immune',
    'Neural cell': 'Neural',
    'Ventricular Cardiomyocyte': 'Stromal',
    'Lymphatic Endothelial cell': 'Endothelial',
    'Mesothelial cell': 'Stromal',
    'Atrial Cardiomyocyte': 'Stromal',
    'Mast cell': 'Immune',
    'Adipocyte': 'Stromal',
}

# Translate each cell's 'cell_type' into its lineage
lineage_by_cell = adata.obs['cell_type'].map(cell_type_to_lineage)

# Series.map leaves NaN for any cell type that is missing from the dictionary.
# Such cells would silently drop out of per-lineage counts while still being
# counted in per-donor totals, so fail loudly instead of producing proportions
# that do not sum to 1. Cells whose cell_type is itself missing are left as NaN.
unmapped_cells = lineage_by_cell.isna() & adata.obs['cell_type'].notna()
if unmapped_cells.any():
    unmapped_cell_types = sorted(
        adata.obs.loc[unmapped_cells, 'cell_type'].astype(str).unique()
    )
    raise ValueError(
        f"No lineage defined for cell type(s): {unmapped_cell_types}; "
        "add them to cell_type_to_lineage"
    )

# Add a new column 'lineage' based on the 'cell_type' column
adata.obs['lineage'] = lineage_by_cell

# Verify the result
print(adata.obs[['cell_type', 'lineage']].head())

                                         cell_type      lineage
barcode                                                        
HCAHeart7606896_GATGAGGCACGGCTAC  Endothelial cell  Endothelial
HCAHeart7606896_CGCTTCACATTTGCCC        Mural cell      Stromal
HCAHeart7606896_GTTAAGCAGAGACTAT  Endothelial cell  Endothelial
HCAHeart7606896_TCGCGTTGTAAGAGGA        Mural cell      Stromal
HCAHeart7606896_GCTGCGAGTGTTGGGA  Endothelial cell  Endothelial


In [10]:
# For each donor, get the lineage level proportions.
#
# `observed=False` is passed explicitly to both groupby calls: 'donor' is a
# categorical column, and relying on the implicit default is deprecated in
# recent pandas. Pinning it keeps the current output (one row per donor
# category and lineage) stable across pandas versions.

# Count cells for every (donor, lineage) pair
counts = (
    adata.obs
    .groupby(['donor', 'lineage'], observed=False)
    .size()
    .reset_index(name='count')
)

# Count all cells for each donor (the denominator of the proportion)
total_counts = (
    adata.obs
    .groupby('donor', observed=False)
    .size()
    .reset_index(name='total_count')
)

# Attach each donor's total to its per-lineage counts
proportions = counts.merge(total_counts, on='donor')

# Proportion of the donor's cells that belong to each lineage
proportions['proportion'] = proportions['count'] / proportions['total_count']

# Keep only the reporting columns and give them readable names; rename()
# returns a new frame instead of reassigning .columns on a derived one
result = proportions[['donor', 'lineage', 'proportion']].rename(
    columns={
        'donor': 'Donor ID',
        'lineage': 'Lineage',
        'proportion': 'Proportion',
    }
)

# Display the resulting DataFrame
print(result)


   Donor ID      Lineage  Proportion
0       A61  Endothelial    0.086023
1       A61       Immune    0.106192
2       A61       Neural    0.011963
3       A61      Stromal    0.795822
4       AH1  Endothelial    0.119064
..      ...          ...         ...
83       H6      Stromal    0.889906
84       H7  Endothelial    0.040788
85       H7       Immune    0.059919
86       H7       Neural    0.010639
87       H7      Stromal    0.888654

[88 rows x 3 columns]


In [11]:
# Write the donor/lineage proportion table to CSV (row index is not written)
result.to_csv(
    path_or_buf='/scratch/aoill/projects/heart_transplant/scrna_atlas/donor_lineage_proportions.csv',
    index=False,
)

In [12]:
# Take the metadata table (adata.obs) from the AnnData object; no copy is made here
metadata = adata.obs

# Write the metadata to CSV, keeping the index so the cell IDs are preserved
metadata.to_csv(
    path_or_buf='/scratch/aoill/projects/heart_transplant/scrna_atlas/scrna_atlas_metadata.csv',
    index=True,
)