In [1]:
import pandas as pd
import numpy as np
import pysam
import copy
import os
import subprocess
from Bio.Seq import Seq
from celescope.tools import utils

In [190]:
class Auto():
    """
    Derive a count threshold from the upper part of a count distribution.

    threshold = int(np.percentile(top `expected_cell_num` counts, percentile) / coef)
    Counts are usually UMI counts; non-positive counts are discarded up front.

    >>> array = [50] * 100 + [30] * 100 + [10] * 100 + [4] * 100
    >>> Auto(array, coef=10).run()
    5
    >>> Auto(array, percentile=70, coef=3).run()
    10
    >>> Auto(array, percentile=50, coef=10, expected_cell_num=100).run()
    5
    >>> Auto([1, 2, 20, 30, 40], expected_cell_num=4, percentile=50, coef=10).run()
    2
    """
    def __init__(self, array, percentile=99, coef=3, expected_cell_num=None, **kwargs):
        # Only strictly positive counts take part in the estimate.
        self.array = list(filter(lambda count: count > 0, array))
        self.percentile = percentile
        # The divisor is truncated to an integer.
        self.coef = int(coef)
        self.expected_cell_num = expected_cell_num
        # Extra keyword arguments are stored but not used by run().
        self.kwargs = kwargs

    def run(self):
        """Return the integer threshold, or 1 when there are no positive counts."""
        counts = self.array
        n_available = len(counts)
        if n_available == 0:
            return 1

        # Fall back to every available count when no expected cell number is given.
        n_top = self.expected_cell_num if self.expected_cell_num else n_available
        if n_top > n_available:
            print('Warning: expected_cell_num > len(array)')
            n_top = n_available

        top_counts = sorted(counts, reverse=True)[:n_top]
        reference_count = np.percentile(top_counts, self.percentile)
        return int(reference_count / self.coef)

In [191]:
def reversed_compl(seq):
    """Return the reverse complement of `seq` as a plain string."""
    reverse_complemented = Seq(seq).reverse_complement()
    return str(reverse_complemented)

In [192]:
def target_cell_calling(df_UMI_sum, expected_target_cell_num=3000, target_barcodes=None, weight=6, coef=5,
    percentile=80, umi_col='umis'):
    """
    Select the contigs whose (optionally up-weighted) UMI count reaches an automatic threshold.

    The threshold is computed by `Auto` from the unweighted `umi_col` values. Rows whose
    `barcode` matches a reverse-complemented entry of `target_barcodes` then have their UMI
    count multiplied by `weight` before it is compared with the threshold. The input
    dataframe is never modified.

    Args:
        df_UMI_sum: DataFrame with a `contig_id` column and a `umi_col` column. A `barcode`
            column is also required when `target_barcodes` is non-empty.
        expected_target_cell_num: forwarded to `Auto` as `expected_cell_num`.
        target_barcodes: optional iterable of barcode strings; each one is reverse
            complemented before being matched against the `barcode` column.
        weight: multiplier applied to the UMI count of rows with a target barcode.
        coef: forwarded to `Auto`.
        percentile: forwarded to `Auto`.
        umi_col: name of the UMI count column.

    Returns:
        set: `contig_id` values whose weighted UMI count is >= the threshold.

    >>> df = pd.DataFrame({
    ...     "barcode": ["AAA", "CCC", "GGG", "TTT", "ACG"],
    ...     "contig_id": ["c1", "c2", "c3", "c4", "c5"],
    ...     "umis": [1, 2, 1, 30, 40],
    ... })
    >>> contigs = target_cell_calling(df, expected_target_cell_num=5, percentile=80, coef=5, target_barcodes=["TTT", "CCC"])
    >>> contigs == {"c1", "c3", "c4", "c5"}
    True
    """
    if target_barcodes is not None:
        target_barcodes = {reversed_compl(i) for i in target_barcodes}

    # The threshold is always derived from the raw, unweighted counts.
    umi_threshold = Auto(
        list(df_UMI_sum[umi_col]),
        expected_cell_num=expected_target_cell_num,
        coef=coef,
        percentile=percentile,
    ).run()

    # Work on a derived Series so the caller's dataframe is left untouched.
    umis = df_UMI_sum[umi_col]
    if target_barcodes:
        is_target = df_UMI_sum['barcode'].isin(target_barcodes)
        # Keep the raw count for non-target rows, use the weighted count for target rows.
        umis = umis.where(~is_target, umis * weight)

    return set(df_UMI_sum.loc[umis >= umi_threshold, 'contig_id'])

In [251]:
        # Load the headerless, tab-separated contig table and name its columns.
        df = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220517merTRflilter/sc-1/03.assemble/assemble/sc-1_contig.csv', sep='\t', header=None)
        df.columns = ['barcode', 'is_cell', 'contig_id', 'high_confidence', 'length', 'chain', 'v_gene', 'd_gene', 'j_gene', 'c_gene', 'full_length', 'productive', 'cdr3', 'cdr3_nt', 'reads', 'umis']

        # A gene value of '*' becomes the string 'None'; any other value keeps the text before the first '('.
        for gene_col in ('d_gene', 'c_gene'):
            df[gene_col] = df[gene_col].apply(lambda x: 'None' if x == '*' else x.split('(')[0])

        def _translate_cdr3(cdr3_nt):
            """Translate a CDR3 nucleotide string; return 'None' if out of frame or containing a stop ('*')."""
            # Check the frame first so out-of-frame sequences are never translated,
            # and translate in-frame sequences only once.
            if len(cdr3_nt) % 3 != 0:
                return 'None'
            cdr3_aa = str(Seq(cdr3_nt).translate())
            return 'None' if '*' in cdr3_aa else cdr3_aa

        df['cdr3'] = df['cdr3_nt'].apply(_translate_cdr3)
        # A contig is flagged productive exactly when its CDR3 translated cleanly.
        df['productive'] = df['cdr3'] != 'None'



In [345]:
# Sort by UMI count (descending) so that drop_duplicates keeps the highest-UMI
# contig per barcode within each chain group.
df = df.sort_values(by='umis', ascending=False)
df_chain_heavy = df[df['chain'] == 'IGH'].drop_duplicates(['barcode'])
df_chain_light = df[df['chain'].isin(['IGK', 'IGL'])].drop_duplicates(['barcode'])
df_for_clono = pd.concat([df_chain_heavy, df_chain_light], ignore_index=True)

trust_report = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220517merTRflilter/sc-1/03.assemble/assemble/trust_filter_report.out', sep='\t')

# Keep only CDR3 values that also appear in the trust report, start with 'C',
# have at least 5 characters and contain none of the listed stop-codon triplets.
# The previous expression `'UAG' or 'UAA' or 'UGA' not in i` always evaluated to the
# truthy string 'UAG', so that last filter never removed anything.
# NOTE(review): `cdr3` looks like an amino-acid string here, so searching it for RNA
# codon triplets may not be the intended stop-codon check — confirm.
stop_codons = ('UAG', 'UAA', 'UGA')
shared_cdr3 = set(df_for_clono.cdr3).intersection(set(trust_report.CDR3aa))
correct_cdr3 = [
    i for i in shared_cdr3
    if i.startswith('C')
    and len(i) >= 5
    and not any(stop in i for stop in stop_codons)
]
df_for_clono = df_for_clono[df_for_clono['cdr3'].isin(correct_cdr3)]

In [346]:
# Re-derive the heavy (IGH) and light (IGK / IGL) chain tables from the filtered contigs.
df_chain_heavy = df_for_clono.loc[df_for_clono['chain'].eq('IGH')]
df_chain_light = df_for_clono.loc[df_for_clono['chain'].isin(['IGK', 'IGL'])]

In [347]:
# Read the target barcode file and keep element 0 of what read_one_col returns.
# NOTE(review): presumably element 0 is the list of values in the file's single column —
# confirm against celescope.tools.utils.read_one_col.
target_barcodes = utils.read_one_col('/SGRNJ06/randd/USER/cjj/TESTDATA/test_trust/barcodelist.csv')[0]

In [348]:
#target_barcodes

In [349]:
        # Run target-cell calling separately on heavy and light chains and pool the
        # selected contig ids. The expected cell number is the size of the target list.
        # (Previously tried: expected_target_cell_num=3000, target_barcodes=None.)
        filtered_congtigs_id = set()
        for _df in (df_chain_heavy, df_chain_light):
            target_contigs = target_cell_calling(
                _df,
                expected_target_cell_num=len(target_barcodes),
                target_barcodes=target_barcodes,
                weight=6,
                coef=5,
                percentile=85,
            )
            filtered_congtigs_id |= target_contigs

        # Keep only the contigs selected for at least one chain group.
        is_selected = df_for_clono.contig_id.isin(filtered_congtigs_id)
        df_for_clono = df_for_clono[is_selected]

In [350]:
# Display the contig table that remains after target-cell filtering.
df_for_clono

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis
0,TGATACGTTCAGGAGGTTGTTCCA,True,TGATACGTTCAGGAGGTTGTTCCA_899,True,517,IGH,IGHV3-20*01,IGHD1-20*01,IGHJ4*02,IGHG2B*03,True,True,CARGITGIDYW,TGTGCGAGGGGTATAACTGGAATTGACTACTGG,242751,105937
1,TAGACGGAGTCGCTATTCGAGCGT,True,TAGACGGAGTCGCTATTCGAGCGT_1286,True,605,IGH,IGHV3-23*04,IGHD7-27*01,IGHJ2*01,IGHG1*01,True,True,CAKDRNWGYEYFDLW,TGTGCGAAAGATCGAAACTGGGGATACGAGTACTTCGATCTCTGG,146639,79062
2,GTTGTCGGTCTCGGTTTGCATAGT,True,GTTGTCGGTCTCGGTTTGCATAGT_525,True,804,IGH,IGHV3-20*01,IGHD3-9*01,IGHJ4*02,IGHG2B*03,True,True,CARDLTLFDYW,TGTGCGAGAGATCTAACTCTTTTTGACTACTGG,138706,71297
3,TTCTGTGTGGTCGTGTCGATGTTT,True,TTCTGTGTGGTCGTGTCGATGTTT_5973,True,522,IGH,IGHV4-39*01,IGHD1-7*01,IGHJ2*01,IGHA*09,True,True,CASLYNWNYDWFDPWGLW,TGTGCGAGCCTGTATAACTGGAACTACGACTGGTTCGACCCCTGGG...,74544,45872
4,GTGTGTCGGCTAACTCTCGAAGTG,True,GTGTGTCGGCTAACTCTCGAAGTG_2191,True,536,IGH,IGHV3-48*03,IGHD3-22*01,IGHJ4*02,IGHA*09,True,True,CAREADWLVHYYFDYW,TGTGCGAGAGAGGCGGACTGGCTGGTTCATTACTACTTTGACTACTGG,55476,39029
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10428,TGACAGACGCACTGTCTGTCTATC,True,TGACAGACGCACTGTCTGTCTATC_2700,True,516,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,207,153
10429,TCCGTCTTTCAGGAGGTGCGATCT,True,TCCGTCTTTCAGGAGGTGCGATCT_3695,True,503,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,189,153
10430,TCGAAGTGTCTCACGGTATGTGGC,True,TCGAAGTGTCTCACGGTATGTGGC_2885,True,522,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,229,153
10431,TCCGTCTTGAGGATGGGTGTGTCG,True,TCCGTCTTGAGGATGGGTGTGTCG_2513,True,523,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,172,153


In [351]:
# Summarise chain / pairing metrics for the filtered contigs.
# NOTE: get_vj_annot is defined in a cell that appears later in this notebook
# (execution count 22); that cell has to be run before this one.
get_vj_annot(df_for_clono, ['IGH', 'IGL', 'IGK'], ['IGH_IGL', 'IGH_IGK'])

[{'name': 'Cells With Productive V-J Spanning Pair',
  'value': 4048,
  'total': 4612},
 {'name': 'Cells With Productive V-J Spanning (IGH, IGL) Pair',
  'value': 0,
  'total': 4612,
  'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"},
 {'name': 'Cells With Productive V-J Spanning (IGH, IGK) Pair',
  'value': 4048,
  'total': 4612,
  'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start c

In [22]:
def get_vj_annot(df, chains, pairs):
    """
    Build a list of per-cell summary metrics from a contig annotation table.

    Args:
        df: DataFrame with `barcode`, `chain`, `productive` and `full_length` columns.
        chains: iterable of chain names, e.g. ['IGH', 'IGL', 'IGK'].
        pairs: iterable of 'chain1_chain2' strings, e.g. ['IGH_IGL', 'IGH_IGK'].

    Returns:
        list of dicts, each with 'name', 'value' and 'total' (number of distinct
        barcodes in `df`) and, for most entries, a 'help_info' description. The order
        is: the overall pair metric, one entry per pair, then four entries per chain.
    """
    # `== True` is kept on purpose so that non-boolean columns behave as before.
    is_productive = df['productive'] == True
    is_full_length = df['full_length'] == True

    def _barcodes(mask):
        """Distinct barcodes of the rows selected by `mask`."""
        return set(df.loc[mask, 'barcode'].tolist())

    cell_nums = len(set(df['barcode'].tolist()))

    # Barcodes carrying at least two productive contigs. The value_counts() Series is
    # used directly because the name of its values differs between pandas versions
    # ('barcode' before 2.0, 'count' from 2.0 on), which broke the column lookup.
    productive_per_barcode = df.loc[is_productive, 'barcode'].value_counts()
    metrics = [{
        'name': 'Cells With Productive V-J Spanning Pair',
        'value': int((productive_per_barcode >= 2).sum()),
        'total': cell_nums,
    }]

    for p in pairs:
        parts = p.split('_')
        chain1, chain2 = parts[0], parts[1]
        cbs1 = _barcodes(is_full_length & is_productive & (df['chain'] == chain1))
        cbs2 = _barcodes(is_full_length & is_productive & (df['chain'] == chain2))
        metrics.append({
            'name': f'Cells With Productive V-J Spanning ({chain1}, {chain2}) Pair',
            'value': len(cbs1.intersection(cbs2)),
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
        })

    for c in chains:
        is_chain = df['chain'] == c
        metrics.append({
            'name': f'Cells With {c} Contig',
            'value': len(_barcodes(is_chain)),
            'total': cell_nums,
            'help_info': f'Fraction of cell-associated barcodes with at least one {c} contig annotated as a full or partial V(D)J gene'
        })
        metrics.append({
            'name': f'Cells With CDR3-annotated {c} Contig',
            'value': len(_barcodes(is_chain & is_productive)),
            'total': cell_nums,
        })
        metrics.append({
            'name': f'Cells With V-J Spanning {c} Contig',
            'value': len(_barcodes(is_full_length & is_chain)),
            'total': cell_nums,
            'help_info': f"Fraction of cell-associated barcodes with at least one contig spanning the 5' end of the V region to the 3' end of the J region for {c}"
        })
        metrics.append({
            'name': f'Cells With Productive {c} Contig',
            'value': len(_barcodes(is_full_length & is_productive & is_chain)),
            'total': cell_nums,
            # The chain name is interpolated; it used to be hard-coded to IGL for every chain.
            'help_info': f"Fraction of cell-associated barcodes with productive {c} chain. A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
        })

    return metrics

In [2]:
# Load the per-cell metadata table; the commented-out lines are alternative input files.
#meta = pd.read_csv('/SGRNJ06/randd/USER/cjj/TESTDATA/test_trust/20220530TR_SR/fltest/04.summarize/fltest_assign.csv')
#meta =pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220517merTRflilter/mappingmanual/220417014_meta.csv')
meta =pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220517merTRflilter/mappingmanual/220417016_meta.csv')

In [3]:
# Display the metadata table as loaded.
meta

Unnamed: 0.1,Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,sample,percent.mt,RNA_snn_res.0.8,seurat_clusters,celltype,Class
0,220417016_AAACATCGAAACATCGGAGCTGAA,220417016,7928,2606,220417016,0,7,7,GCB,T/BCR
1,220417016_AAACATCGAACAACCACCGAAGTA,220417016,20544,2098,220417016,0,10,10,PlasmaCells,T/BCR
2,220417016_AAACATCGAACAACCAGCTCGGTA,220417016,30676,2149,220417016,0,4,4,PlasmaCells,T/BCR
3,220417016_AAACATCGAACCGAGAACAGCAGA,220417016,10727,1183,220417016,0,4,4,PlasmaCells,T/BCR
4,220417016_AAACATCGAACCGAGATTCACGCA,220417016,2893,1175,220417016,0,2,2,NaiveB,T/BCR
...,...,...,...,...,...,...,...,...,...,...
16350,220417016_TTCACGCATGGCTTCAAAGAGATC,220417016,16833,4074,220417016,0,12,12,TCells,T/BCR
16351,220417016_TTCACGCATGGCTTCAAGCACCTC,220417016,3621,1379,220417016,0,3,3,GCB,T/BCR
16352,220417016_TTCACGCATGGCTTCAGCTCGGTA,220417016,2161,926,220417016,0,2,2,NaiveB,
16353,220417016_TTCACGCATGGCTTCATAGGATGA,220417016,1967,952,220417016,0,2,2,NaiveB,T/BCR


In [4]:
# Give the unnamed first CSV column (prefixed cell barcodes) an explicit name.
meta = meta.rename(columns={'Unnamed: 0': 'barcode'})

In [5]:
# Per-celltype count of non-null values in every other column.
meta.groupby('celltype').count()

Unnamed: 0_level_0,barcode,orig.ident,nCount_RNA,nFeature_RNA,sample,percent.mt,RNA_snn_res.0.8,seurat_clusters,Class
celltype,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
GCB,5976,5976,5976,5976,5976,5976,5976,5976,5803
NaiveB,1679,1679,1679,1679,1679,1679,1679,1679,1511
PlasmaCells,2108,2108,2108,2108,2108,2108,2108,2108,2087
ProliferatingBCells,5679,5679,5679,5679,5679,5679,5679,5679,2095
TCells,913,913,913,913,913,913,913,913,810


In [6]:
# Cell types whose barcodes are kept as targets.
# Previously tried: tar = {'B_cell'}
tar = {
    'GCB',
    'NaiveB',
    'PlasmaCells',
    'ProliferatingBCells',
}

In [7]:
# Keep only the rows whose celltype is one of the target cell types.
is_target_celltype = meta['celltype'].isin(tar)
meta = meta.loc[is_target_celltype]

In [8]:
# Reduce the table to a single-column DataFrame holding the barcodes.
meta = meta.loc[:, ['barcode']]

In [9]:
# Keep the second '_'-separated field of each value (e.g. '220417016_AAAC...' -> 'AAAC...').
# `assign` builds a new frame instead of writing into one derived by slicing, which
# avoids pandas' SettingWithCopyWarning on this step.
meta = meta.assign(barcode=meta['barcode'].apply(lambda x: x.split('_')[1]))

In [10]:
# Write the barcodes to disk without the index column and without a header row.
# NOTE(review): this writes barcodelist3.csv, while the target-calling cell reads
# barcodelist.csv — confirm which file is meant to be used.
meta.to_csv('/SGRNJ06/randd/USER/cjj/TESTDATA/test_trust/barcodelist3.csv',index=False,header=None)

In [11]:
# Display the final single-column barcode table.
meta

Unnamed: 0,barcode
0,AAACATCGAAACATCGGAGCTGAA
1,AAACATCGAACAACCACCGAAGTA
2,AAACATCGAACAACCAGCTCGGTA
3,AAACATCGAACCGAGAACAGCAGA
4,AAACATCGAACCGAGATTCACGCA
...,...
16349,TTCACGCATGGAACAACGAACTTA
16351,TTCACGCATGGCTTCAAGCACCTC
16352,TTCACGCATGGCTTCAGCTCGGTA
16353,TTCACGCATGGCTTCATAGGATGA
