In [1]:
import pysam
import pandas as pd 
from Bio.Seq import Seq

In [35]:
def reverse_complement(seq):
    """Reverse complementary sequence

    :param original seq
    :return Reverse complementary sequence
    """
    return str(Seq(seq).reverse_complement())

In [76]:
"""
5 prime featureCounts assigned reads
3 prime featureCounts assigned reads
5 prime reads in 3 prime CB(cell barcode)
3 prime reads in 3 prime CB
5 prime reads in 3 prime CB + UMI
5 prime reads in 3 prime CB + UMI + gene
"""
prime5_assign_reads = 0
prime3_assign_reads = 0
prime5_in_prime3_cb = 0
prime3_in_prime3_cb = 0
prime5_in_prime3_cb_umi = 0
prime5_in_prime3_cb_umi_gene = 0

In [77]:
bam3 = "/SGRNJ06/randd/USER/cjj/celedev/rna/20231018/3/Mus_0928spleen2_N3lib/02.featureCounts/Mus_0928spleen2_N3lib_aligned_posSorted_addTag.bam"
bam5 = "/SGRNJ06/randd/USER/cjj/celedev/rna/20231018/5/Mus_0928spleen2_20_N5lib/02.featureCounts/Mus_0928spleen2_20_N5lib_aligned_posSorted_addTag.bam"
prime3_cb = "/SGRNJ06/randd/USER/cjj/celedev/rna/20231018/3/Mus_0928spleen2_N3lib/03.count/Mus_0928spleen2_N3lib_filtered_feature_bc_matrix/barcodes.tsv"

In [78]:
prime3_cb_set = pd.read_csv(prime3_cb, header=None)
prime3_cb_set = prime3_cb_set.rename(columns={0: "bc"})
prime3_cb_set = set(prime3_cb_set.bc)

In [79]:
prime3_cb_umi_set = set()
prime3_cb_umi_gene_set = set()

In [80]:
        inputFile3 = pysam.AlignmentFile(bam3, "rb")
        inputFile5 = pysam.AlignmentFile(bam5, "rb")
        
        for read in inputFile3:
            stat = read.get_tag("XS")
            if stat == "Assigned":
                prime3_assign_reads += 1
                cb = read.get_tag("CB")
                umi = read.get_tag("UB")
                try:
                    gene_id = read.get_tag("XT")
                except KeyError:
                    gene_id = None
                if cb in prime3_cb_set:
                    prime3_in_prime3_cb += 1
                    prime3_cb_set.add( reverse_complement(cb) )
                    prime3_cb_umi_set.add( (reverse_complement(cb), reverse_complement(umi)) )
                    prime3_cb_umi_gene_set.add( (reverse_complement(cb), reverse_complement(umi), gene_id) )
        
        for read in inputFile5:
            stat = read.get_tag("XS")
            if stat == "Assigned":
                prime5_assign_reads += 1
                cb = read.get_tag("CB")
                umi = read.get_tag("UB")
                try:
                    gene_id = read.get_tag("XT")
                except KeyError:
                    gene_id = None
                if cb in prime3_cb_set:
                    prime5_in_prime3_cb += 1
                if (cb, umi) in prime3_cb_umi_set:
                    prime5_in_prime3_cb_umi += 1
                if (cb, umi, gene_id) in prime3_cb_umi_gene_set:
                    prime5_in_prime3_cb_umi_gene += 1

In [81]:
prime5_assign_reads

22708360

In [82]:
prime3_assign_reads

16336623

In [83]:
prime5_in_prime3_cb

9570556

In [84]:
prime3_in_prime3_cb

7566198

In [85]:
prime5_in_prime3_cb_umi

2326503

In [86]:
prime5_in_prime3_cb_umi_gene

1539472

In [87]:
format(prime5_in_prime3_cb_umi_gene,',')

'1,539,472'

In [88]:
out_file = open("/SGRNJ06/randd/USER/cjj/celedev/rna/20231018/read_count/Mus_0928spleen2_20_N5lib.txt", 'w')
out_file.write(f"5 prime featureCounts assigned reads : {format(prime5_assign_reads, ',')}\n")
out_file.write(f"3 prime featureCounts assigned reads : {format(prime3_assign_reads, ',')}\n")
out_file.write(f"5 prime reads in 3 prime CB : {format(prime5_in_prime3_cb, ',')}\n")
out_file.write(f"3 prime reads in 3 prime CB : {format(prime3_in_prime3_cb, ',')}\n")
out_file.write(f"5 prime reads in 3 prime CB + UMI : {format(prime5_in_prime3_cb_umi, ',')}\n")
out_file.write(f"5 prime reads in 3 prime CB + UMI + gene : {format(prime5_in_prime3_cb_umi_gene, ',')}\n")
out_file.close()

In [None]:
# barcode 统计

In [40]:
prime3_cb = "/SGRNJ06/randd/USER/cjj/celedev/rna/20230913/3/HM_PBMC_0831_2_N3lib/03.count/HM_PBMC_0831_2_N3lib_filtered_feature_bc_matrix/barcodes.tsv"

In [41]:
prime5_cb = "/SGRNJ06/randd/USER/cjj/celedev/rna/20230925/5/HM_PBMC_0831_20_20_2_N5lib/03.count/HM_PBMC_0831_20_20_2_N5lib_filtered_feature_bc_matrix/barcodes.tsv"

In [42]:
prime3_cb_set = pd.read_csv(prime3_cb, header=None)
prime3_cb_set = prime3_cb_set.rename(columns={0: "bc"})
prime3_cb_set = set(prime3_cb_set.bc)

prime5_cb_set = pd.read_csv(prime5_cb, header=None)
prime5_cb_set = prime5_cb_set.rename(columns={0: "bc"})
prime5_cb_set['bc'] = prime5_cb_set['bc'].apply(lambda x: reverse_complement(x))
prime5_cb_set = set(prime5_cb_set.bc)

In [43]:
len(prime3_cb_set)

3304

In [44]:
len(prime5_cb_set)

2177

In [45]:
len(prime3_cb_set & prime5_cb_set)

2170