In [27]:
import pysam 
import scanpy as sc
import numpy as np 
import pandas as pd
import seaborn as sns

In [3]:
# Input BAM for this notebook. Absolute path on the analysis host; presumably
# position-sorted, going by the `possorted` in the file name (TODO confirm).
PATH_BAM = '/node200data/18parkky/datasets/data/public/BAM/Kinker_et_al/Pool1_18.possorted_genome.bam'

# Open the BAM for reading ('rb'). The handle is left open on purpose: the
# counting cell below iterates over `bamfile`.
bamfile = pysam.AlignmentFile(PATH_BAM, 'rb')

## Create CB (cell barcode) → primary read count dictionary for each BAM

In [6]:
# Count primary alignments per cell barcode (the `CB` tag).
#
# - Secondary and supplementary alignments are skipped so that each read
#   contributes at most one count.
# - Reads without a `CB` tag are ignored.
# - NOTE(review): `fetch()` with no region needs a BAM index and, per the
#   pysam documentation, yields mapped reads only; pass `until_eof=True` if
#   unmapped reads should be counted as well — confirm what is intended.
dict_CB_to_PrimaryReadCount = dict()
for read in bamfile.fetch():
    if read.is_secondary or read.is_supplementary:
        continue

    # Explicit presence check instead of catching KeyError from get_tag().
    if not read.has_tag('CB'):
        continue
    CB = read.get_tag('CB')

    # dict.get with a default replaces the try/except-KeyError counter.
    dict_CB_to_PrimaryReadCount[CB] = dict_CB_to_PrimaryReadCount.get(CB, 0) + 1

In [21]:
# Per-barcode primary read counts as a flat list (one entry per barcode).
PrimaryReadCount_distribution = list(dict_CB_to_PrimaryReadCount.values())

# Subsample the distribution (e.g. for plotting).
# - `n` is capped at the number of barcodes: Series.sample raises ValueError
#   when asked for more items than exist (sampling is without replacement).
# - `random_state` is fixed so that re-running the notebook reproduces the
#   same subsample.
N_SAMPLED_BARCODES = 10000
PrimaryReadCount_distribution_sampled = pd.Series(PrimaryReadCount_distribution).sample(
    n=min(N_SAMPLED_BARCODES, len(PrimaryReadCount_distribution)),
    random_state=0,
)

# sns.histplot(x=PrimaryReadCount_distribution_sampled)

In [25]:
# Mean and standard deviation of primary reads per barcode.
# The standard deviation uses numpy's default ddof=0 (population form).
primary_read_counts = np.asarray(PrimaryReadCount_distribution)
primary_read_counts.mean(), primary_read_counts.std()

(291.8943223045676, 3588.1917506572795)

## Estimate coverage

In [None]:
# Load the Kinker et al. AnnData object (absolute path on the analysis host).
PATH_KINKER_H5AD = '/node200data/18parkky/datasets/data/public/processed_data/Kinker_et_al/CPM_data.metalabeled.h5ad'
adata = sc.read_h5ad(PATH_KINKER_H5AD)

# Compute scanpy's QC metrics and store them on `adata` itself
# (inplace=True), e.g. the `total_counts` column used further down.
sc.pp.calculate_qc_metrics(adata, inplace=True)

In [47]:
# Peek at one barcode key to see its format (e.g. whether it carries a suffix).
# Prints nothing when the dictionary is empty.
example_CB = next(iter(dict_CB_to_PrimaryReadCount), None)
if example_CB is not None:
    print(example_CB)

ATAAGAGCACTACAGT-1


In [51]:
# Re-key the counts by the barcode text before the first '-'
# (e.g. 'ATAAGAGCACTACAGT-1' -> 'ATAAGAGCACTACAGT').
# NOTE(review): if two barcodes share the same prefix, the later entry
# overwrites the earlier one — confirm all suffixes in this BAM are identical.
dict_CB_to_PrimaryReadCount2 = {
    barcode.split('-')[0]: read_count
    for barcode, read_count in dict_CB_to_PrimaryReadCount.items()
}

In [54]:
# Restrict the cell metadata to pool 18 and work on a copy, so that adding a
# column does not touch `adata.obs`.
is_pool18 = adata.obs['PoolID'] == '18'
adata_obs_pool18 = adata.obs.loc[is_pool18].copy()

# Barcode text before the first '-', used as the join key against the BAM counts.
# NOTE(review): the `CB` values shown in this cell's output end in '-1-18' and
# '-2-18', so cells with different middle suffixes are matched on sequence
# alone — confirm this BAM is the right source for both groups.
adata_obs_pool18['str_CB'] = [barcode.split('-')[0] for barcode in adata_obs_pool18['CB']]

# Display only the cells whose barcode was seen in the BAM.
found_in_bam = adata_obs_pool18['str_CB'].isin(dict_CB_to_PrimaryReadCount2.keys())
adata_obs_pool18.loc[found_in_bam]

Unnamed: 0,CB,CellLine,PoolID,CancerType,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,pct_counts_in_top_100_genes,pct_counts_in_top_200_genes,pct_counts_in_top_500_genes,str_CB
0,AAACCTGAGACATAAC-1-18,NCIH2126_LUNG,18,Lung Cancer,4318,8.370779,990492.895983,13.805959,32.354608,45.668343,57.257208,70.751576,AAACCTGAGACATAAC
1,AACGTTGTCACCCGAG-1-18,NCIH2126_LUNG,18,Lung Cancer,5200,8.556606,991547.667782,13.807023,29.400622,42.359627,53.893044,68.073635,AACGTTGTCACCCGAG
2,AACTGGTAGACACGAC-1-18,NCIH2126_LUNG,18,Lung Cancer,4004,8.295299,990979.660870,13.806450,31.149621,43.652357,55.231719,69.585576,AACTGGTAGACACGAC
3,AACTGGTAGGGCTTGA-1-18,NCIH2126_LUNG,18,Lung Cancer,4295,8.365440,992040.188205,13.807520,36.316953,47.853220,59.093503,72.016975,AACTGGTAGGGCTTGA
4,AACTGGTAGTACTTGC-1-18,NCIH2126_LUNG,18,Lung Cancer,4842,8.485290,989435.176133,13.804891,35.440173,46.069994,56.116530,69.293045,AACTGGTAGTACTTGC
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4672,TTGCGTCGTACCGAGA-2-18,KPNSI9S_AUTONOMIC_GANGLIA,18,Neuroblastoma,3005,8.008366,991380.986326,13.806855,39.299255,55.249052,65.008498,76.696300,TTGCGTCGTACCGAGA
4673,TTTATGCGTTCGCGAC-2-18,KPNSI9S_AUTONOMIC_GANGLIA,18,Neuroblastoma,2721,7.909122,992743.210231,13.808228,39.777086,56.618658,66.555692,77.701382,TTTATGCGTTCGCGAC
4674,TTTGCGCCACTAGTAC-2-18,KPNSI9S_AUTONOMIC_GANGLIA,18,Neuroblastoma,3544,8.173293,992555.831266,13.808040,35.994898,51.765306,62.219388,74.071429,TTTGCGCCACTAGTAC
4675,TTTGCGCGTCAACATC-2-18,KPNSI9S_AUTONOMIC_GANGLIA,18,Neuroblastoma,3562,8.178358,990567.216392,13.806034,32.673267,46.421139,56.776187,69.414139,TTTGCGCGTCAACATC


In [58]:
# Load the Joanito et al. AnnData object (absolute path on the analysis host)
# and display its per-cell metadata for comparison.
PATH_JOANITO_H5AD = '/node200data/18parkky/datasets/data/public/processed_data/Joanito_et_al/2_cell_type_labeled_h5ad/Joanito.preprocessed.NanoMnT.ManualAnnot.Epi.h5ad'
adata_j = sc.read_h5ad(PATH_JOANITO_H5AD)
adata_j.obs

Unnamed: 0,doublet_score,predicted_doublet,SampleID,PatientID,BiopsySite,n_genes_by_counts,log1p_n_genes_by_counts,total_counts,log1p_total_counts,pct_counts_in_top_50_genes,...,Author_CellType_lvl_1,Author_CellType_lvl_2,n_genes,leiden,AvgSTRDiff,StdSTRDiff,NumSTRLoci,MSI,BiopsySite2,Manual_CellType_lvl_1
AAACGGGTCGAGGTAG-1,0.054222,False,XHC078,CRC2783,Core,5290,8.573763,25282.0,10.137888,28.506447,...,Epithelial,,5290,6,-2.084337,2.612319,83,MSI-H,Tumor,Tumor epithelial
AAAGCAAAGATCACGG-1,0.031230,False,XHC078,CRC2783,Core,3378,8.125335,11074.0,9.312446,38.468485,...,Epithelial,,3378,6,-1.884615,2.334143,52,MSI-H,Tumor,Tumor epithelial
AACCATGCAGCCTGTG-1,0.034114,False,XHC078,CRC2783,Core,4212,8.345930,11043.0,9.309643,26.070814,...,Epithelial,,4212,6,-1.968085,2.528359,94,MSI-H,Tumor,Tumor epithelial
AACCATGGTCAATACC-1,0.026209,False,XHC078,CRC2783,Core,4628,8.440096,18315.0,9.815531,32.825553,...,Epithelial,,4628,6,-1.618421,1.953183,76,MSI-H,Tumor,Tumor epithelial
AACCGCGAGGCGCTCT-1,0.034114,False,XHC078,CRC2783,Core,5206,8.557759,40752.0,10.615285,45.062819,...,Epithelial,,5206,6,-1.963415,2.126747,82,MSI-H,Tumor,Tumor epithelial
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TGTGTTTCAGCTGCAC-1,0.048679,False,EXT115,SC041,Adjacent normal tissue,2758,7.922624,8870.0,9.090543,34.385569,...,Epithelial,,2758,3,0.255814,1.221761,43,MSS,Normal,Normal epithelial
TTCTCCTGTTGTTTGG-1,0.052786,False,EXT115,SC041,Adjacent normal tissue,5423,8.598589,25402.0,10.142623,29.690576,...,Epithelial,,5423,3,0.091954,1.228326,87,MSS,Normal,Normal epithelial
TTGACTTAGGAATTAC-1,0.074906,False,EXT115,SC041,Adjacent normal tissue,3597,8.188133,20543.0,9.930325,52.007983,...,Epithelial,,3597,3,-0.250000,1.944544,64,MSS,Normal,Normal epithelial
TTGGCAAAGGACACCA-1,0.057364,False,EXT115,SC041,Adjacent normal tissue,4368,8.382289,17133.0,9.748820,31.687387,...,Epithelial,,4368,3,-0.772727,2.472855,66,MSS,Normal,Normal epithelial


In [63]:
# Ratio of the mean per-cell `total_counts`: Kinker et al. (`adata`) over
# Joanito et al. (`adata_j`).
# NOTE(review): `adata` is read from a file named `CPM_data...h5ad`, and every
# row displayed for it has total_counts of roughly 9.9e5, whereas `adata_j`
# shows integer-valued totals in the 1e4 range. The two objects may be on
# different scales (normalised vs. raw) — confirm before interpreting this ratio.
mean_total_counts_kinker = adata.obs['total_counts'].mean()
mean_total_counts_joanito = adata_j.obs['total_counts'].mean()
mean_total_counts_kinker / mean_total_counts_joanito

39.02190609962194