In [1]:
import pandas as pd
import pysam
import copy
import subprocess
import numpy as np 
import copy

from collections import defaultdict
from celescope.tools import utils
from celescope.tools.capture.threshold import Auto
from celescope.tools.step import Step, s_common

In [2]:
class Auto():
    """
    threshold = top {percentile}% cell count / coef
    count is usually UMI count.
    >>> array = [50] * 100 + [30] * 100 + [10] * 100 + [4] * 100
    >>> Auto(array, coef=10).run()
    5
    >>> Auto(array, percentile=70, coef=3).run()
    10
    >>> Auto(array, percentile=50, coef=10, expected_cell_num=100).run()
    5
    >>> Auto([1, 2, 20, 30, 40], expected_cell_num=4, percentile=50, coef=10).run()
    2
    """
    def __init__(self, array, percentile=99, coef=3, expected_cell_num=None, **kwargs):
        self.array = [x for x in array if x > 0 ]
        self.percentile = percentile
        self.coef = int(coef)
        self.expected_cell_num = expected_cell_num
        self.kwargs = kwargs
    
    def run(self):
        array = self.array
        if not array:
            return 1

        if not self.expected_cell_num:
            expected_cell_num = len(array)
        else:
            expected_cell_num = self.expected_cell_num
            if expected_cell_num > len(array):
                print('Warning: expected_cell_num > len(array)')
                expected_cell_num = len(array)
                      
        sorted_counts = sorted(array, reverse=True)
        count_cell_percentile = np.percentile(sorted_counts[:expected_cell_num], self.percentile)
        threshold = int(count_cell_percentile / self.coef)

        return threshold
def target_cell_calling(df_UMI_sum, expected_target_cell_num=3000, target_barcodes=None, weight=6, coef=5, 
    percentile=85, umi_col='umis'):
    """
    Args:
        df_UMI_sum: A dataframe with columns highest umi's contig and UMI.
    
    Returns:
        target_contigs_id: list
    >>> df_UMI_sum = pd.DataFrame({"contig_id": ["A", "B", "C", "D", "E"], "UMI": [1, 2, 1, 30, 40]})
    >>> target_contigs_id = target_cell_calling(df_UMI_sum, expected_target_cell_num=5, percentile=80, coef=5, target_barcodes=["A", "C"])
    >>> target_contigs_id == {'A_1', 'C_1', 'D_1', 'E_1'}
    True
    """
    if target_barcodes != None:
        target_barcodes = {i for i in target_barcodes}
    umi_threshold = Auto(list(df_UMI_sum[umi_col]), expected_cell_num=expected_target_cell_num, coef=coef, percentile=percentile).run()

    # avoid change the original dataframe
    df_temp = df_UMI_sum.copy()
    if target_barcodes:
        df_temp[umi_col] = df_temp.apply(
            lambda row:  row[umi_col] * weight if row['barcode'] in target_barcodes else row[umi_col], axis=1)
             
    target_contigs = set(df_temp.loc[df_temp[umi_col] >= umi_threshold].contig_id)

    return target_contigs

In [3]:
df = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220609cs_vdj/Test-6-VDJ-2/04.summarize/Test-6-VDJ-2_b.csv')

In [4]:
        df['productive'] = df['full_length']
        contig_set = set(df.contig_id)

In [5]:
        len_dict = dict()

        with pysam.FastxFile('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220609cs_vdj/Test-6-VDJ-2/03.assemble/assemble/Test-6-VDJ-2_annotate.fa') as fa:
            for read in fa:
                len_dict[read.name] = read.comment.split(' ')[0] 

In [6]:
df['length'] = df['contig_id'].apply(len_dict.get)

In [7]:
df

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
0,CCAGCGATCGTTGACA,True,CCAGCGATCGTTGACA_55152,True,565,IGH,IGHV4-34*01,IGHD4-11*01,IGHJ6*02,IGHG2B*03,True,True,CARERDYSNWDYYHYYGMDVW,TGTGCGAGAGAGAGGGACTACAGTAACTGGGATTATTACCACTACT...,1.0,1.0,,
1,CCAGCGATCGTTGACA,True,CCAGCGATCGTTGACA_14416,True,495,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,3.0,3.0,,
2,TCGCGAGGTAGGCTGA,True,TCGCGAGGTAGGCTGA_45962,True,155,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,False,False,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,3.0,3.0,,
3,ACACTGAAGAGTGAGA,True,ACACTGAAGAGTGAGA_222255,True,242,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,False,False,CQQRGNRPPTF,TGTCAGCAACGTGGCAACCGGCCTCCGACGTTC,2.0,2.0,,
4,ACATGGTCAACACCCG,True,ACATGGTCAACACCCG_76338,True,198,IGL,IGLV1*01,,IGLJ1*01,IGLC1*01,False,False,CVLWYSNHWVF,TGTGTTCTATGGTACAGCAACCATTGGGTGTTC,1.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176310,ATCTACTCATCACGTA,True,ATCTACTCATCACGTA_21842,True,150,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,False,False,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,1.0,1.0,,
176311,CTCGTACTCAGAGCTT,True,CTCGTACTCAGAGCTT_16107,True,509,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,5.0,5.0,,
176312,GTCAAGTGTTAAGACA,True,GTCAAGTGTTAAGACA_5741,True,512,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,3.0,3.0,,
176313,AAACGGGTCACCCGAG,True,AAACGGGTCACCCGAG_232276,True,150,IGH,IGHV3-21*01,IGHD3-10*01,IGHJ3*02,IGHM*04,False,False,CARGSGSYAFDIW,TGTGCGAGAGGTTCGGGGAGTTATGCTTTTGATATCTGG,1.0,1.0,,


In [8]:
            df_chain_heavy = df[df['chain']=='IGH']
            df_chain_light = df[(df['chain']=='IGK') | (df['chain']=='IGL')]
            df_chain_heavy = df_chain_heavy.drop_duplicates(['barcode'])
            df_chain_light = df_chain_light.drop_duplicates(['barcode'])
            df_for_clono = pd.concat([df_chain_heavy, df_chain_light], ignore_index=True)

In [9]:
        trust_report = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220609cs_vdj/Test-6-VDJ-2/03.assemble/assemble/Test-6-VDJ-2_filter_report.tsv', sep='\t')
        correct_cdr3 = set(df_for_clono.cdr3).intersection(set(trust_report.CDR3aa))
        correct_cdr3 = [i for i in correct_cdr3 if i.startswith('C')]
        correct_cdr3 = [i for i in correct_cdr3 if len(i)>=5]
        correct_cdr3 = [i for i in correct_cdr3 if 'UAG' or 'UAA' or 'UGA' not in i]
        df_for_clono = df_for_clono[df_for_clono['cdr3'].isin(correct_cdr3)]

In [10]:
            df_chain_heavy = df_for_clono[df_for_clono['chain']=='IGH']
            df_chain_light = df_for_clono[(df_for_clono['chain']=='IGK') | (df_for_clono['chain']=='IGL')]

In [11]:
        filtered_congtigs_id = set()
        for _df in [df_chain_heavy, df_chain_light]:
            target_contigs = target_cell_calling(
            _df, 
            expected_target_cell_num=15000, 
            target_barcodes=None,
            weight = 6.0,
            coef = 5
            )
            filtered_congtigs_id = filtered_congtigs_id | target_contigs       
        
        df_for_clono = df_for_clono[df_for_clono.contig_id.isin(filtered_congtigs_id)]
        df_for_clono_pro = df_for_clono[df_for_clono['productive']==True]
        cell_barcodes = set(df_for_clono_pro['barcode'])

In [12]:
        filtered_contig = set(df_for_clono.contig_id)

In [None]:
df_for_clono_pro['chain_cdr3aa'] = df_for_clono_pro[['chain', 'cdr3']].apply(':'.join, axis=1)
df_for_clono_pro['chain_cdr3nt'] = df_for_clono_pro[['chain', 'cdr3_nt']].apply(':'.join, axis=1)

In [None]:
df_for_clono_pro.sort_values('barcode',ascending=False, inplace=True)

In [None]:
df_for_clono_pro

In [None]:
        cbs = set(df_for_clono_pro['barcode'])
        clonotypes = open('/SGRNJ06/randd/USER/cjj/TESTDATA/test_trust/20220616clonotypes/clonotypes.csv', 'w')
        clonotypes.write('barcode\tcdr3s_aa\tcdr3s_nt\n')
        for cb in cbs:
            temp = df_for_clono_pro[df_for_clono_pro['barcode']==cb]
            temp = temp.sort_values(by='chain', ascending=True)
            aa_chain = ';'.join(list(temp['chain_cdr3aa']))
            nt_chain = ';'.join(list(temp['chain_cdr3nt']))
            clonotypes.write(f'{cb}\t{aa_chain}\t{nt_chain}\n')
        clonotypes.close()

In [32]:
df_clonotypes = pd.read_csv('/SGRNJ06/randd/USER/cjj/TESTDATA/test_trust/20220616clonotypes/clonotypes.csv', sep='\t', index_col=None)

In [33]:
contig_with_clonotype = copy.deepcopy(df_clonotypes)

In [34]:
contig_with_clonotype

Unnamed: 0,barcode,cdr3s_aa,cdr3s_nt
0,AACCATGTCATCTGCC,IGK:CQQRSNWPPTF,IGK:TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC
1,CAAGATCTCAGCTCTC,IGK:CQQRNNWPPTF,IGK:TGTCAGCAGCGTAACAACTGGCCTCCGACGTTC
2,CCCAGTTTCCAAACTG,IGK:CQQRSNWPPTF,IGK:TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC
3,ACGATACCAAGGGTCA,IGH:CARGIAVAALFDSW,IGH:TGTGCGAGAGGTATCGCAGTGGCTGCCCTCTTTGACTCCTGG
4,ACAGCTATCGAATGGG,IGK:CQQRSNWPPTF,IGK:TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC
...,...,...,...
15330,TTGACTTGTTGGTAAA,IGK:CQQRTNWPPTF,IGK:TGTCAGCAGCGTACCAACTGGCCTCCGACGTTC
15331,GGTATTGCAGGTGGAT,IGH:CAREPDYSNYWFDPW;IGK:CQQRSNWPPTF,IGH:TGTGCGAGAGAACCTGACTACAGTAATTACTGGTTCGACCCC...
15332,CTCGAGGAGACAAAGG,IGK:CQQRSNWPPTF,IGK:TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC
15333,CGTTCTGCAAAGCGGT,IGK:CQQRSNWPPTF,IGK:TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC


In [35]:
df_dict = df_clonotypes[["cdr3s_nt", "cdr3s_aa"]].set_index("cdr3s_nt").to_dict(orient='dict')['cdr3s_aa']

In [36]:
df_clonotypes = df_clonotypes.groupby('cdr3s_nt', as_index=False).agg({'barcode': 'count'})

In [37]:
df_clonotypes.rename(columns={'barcode': 'frequency'}, inplace=True)

In [38]:
sum_f = df_clonotypes['frequency'].sum()

In [39]:
df_clonotypes

Unnamed: 0,cdr3s_nt,frequency
0,IGH:TGCGCGAAAAGGCCGTCGAGTATAGCAGCCCCCTTTGACTACTGG,3
1,IGH:TGTACAAAAAATGGGAACTACTACGATTTGGACGTCTGG;IG...,1
2,IGH:TGTACAAAAGATGGGAACTACTACGCTATGGACGTCTGG;IG...,1
3,IGH:TGTACAACAGGCCCTCGGTACAACTGGTACGACGTCGACTTT...,1
4,IGH:TGTACAAGAGAACGTATAACACTGGCTGCTTTTGATGTCTGG...,1
...,...,...
1633,IGL:TGTGCTCTATGGTACAGCAACCATTTGGTGTTC,1
1634,IGL:TGTGCTCTCTTATACAGCAACCATTGGGTGTTC,1
1635,IGL:TGTGGTGTGGGTGATACAATTAAGGAACAATTTGTGTATGTTTTC,6
1636,IGL:TGTGTTCTATGGTACAGCAACCATTGGGTGTTC,1


In [40]:
        df_clonotypes['proportion'] = df_clonotypes['frequency'].apply(lambda x: x/sum_f)
        df_clonotypes.sort_values(by='frequency', ascending=False, inplace=True)
        df_clonotypes['clonotype_id'] = [f'clonotype{i}' for i in range(1, df_clonotypes.shape[0]+1)]
        df_clonotypes['cdr3s_aa'] = df_clonotypes['cdr3s_nt'].apply(lambda x:df_dict[x])
        df_clonotypes = df_clonotypes.reindex(columns=['clonotype_id', 'frequency', 'proportion', 'cdr3s_aa', 'cdr3s_nt'])

In [41]:
df_clonotypes

Unnamed: 0,clonotype_id,frequency,proportion,cdr3s_aa,cdr3s_nt
1557,clonotype1,7399,0.482491,IGK:CQQRSNWPPTF,IGK:TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC
1084,clonotype2,355,0.023150,IGH:CARGADPW;IGK:CQQRSNWPPTF,IGH:TGTGCGAGAGGGGCCGACCCCTGG;IGK:TGTCAGCAGCGTA...
1068,clonotype3,329,0.021454,IGH:CARGADPW,IGH:TGTGCGAGAGGGGCCGACCCCTGG
1058,clonotype4,236,0.015390,IGH:CARGADYW;IGK:CQQRSNWPPTF,IGH:TGTGCGAGAGGGGCAGACTACTGG;IGK:TGTCAGCAGCGTA...
1258,clonotype5,178,0.011607,IGH:CARSIVGATINWFDPW,IGH:TGTGCGAGGTCGATAGTGGGAGCTACCATCAACTGGTTCGAC...
...,...,...,...,...,...
631,clonotype1634,1,0.000065,IGH:CARDHDYTNYDYFHYYGVDVW,IGH:TGTGCGAGAGACCATGACTACACTAACTACGATTACTTCCAC...
629,clonotype1635,1,0.000065,IGH:CARELGLAARPRYYYGMDVW;IGK:CQQRSNWPPTF,IGH:TGTGCGAGAGAATTAGGGCTAGCAGCTCGTCCTAGGTACTAC...
628,clonotype1636,1,0.000065,IGH:CAREWWESMDVW;IGK:CQQRSNWPPTF,IGH:TGTGCGAGAGAATGGTGGGAGAGTATGGACGTCTGG;IGK:T...
627,clonotype1637,1,0.000065,IGH:CAREVAGSYYYGMDVW;IGK:CQQRSNWPPTF,IGH:TGTGCGAGAGAAGTGGCTGGTTCCTACTACTACGGTATGGAC...


In [42]:
used_for_merge = df_clonotypes[['cdr3s_nt','clonotype_id']]

In [43]:
used_for_merge

Unnamed: 0,cdr3s_nt,clonotype_id
1557,IGK:TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,clonotype1
1084,IGH:TGTGCGAGAGGGGCCGACCCCTGG;IGK:TGTCAGCAGCGTA...,clonotype2
1068,IGH:TGTGCGAGAGGGGCCGACCCCTGG,clonotype3
1058,IGH:TGTGCGAGAGGGGCAGACTACTGG;IGK:TGTCAGCAGCGTA...,clonotype4
1258,IGH:TGTGCGAGGTCGATAGTGGGAGCTACCATCAACTGGTTCGAC...,clonotype5
...,...,...
631,IGH:TGTGCGAGAGACCATGACTACACTAACTACGATTACTTCCAC...,clonotype1634
629,IGH:TGTGCGAGAGAATTAGGGCTAGCAGCTCGTCCTAGGTACTAC...,clonotype1635
628,IGH:TGTGCGAGAGAATGGTGGGAGAGTATGGACGTCTGG;IGK:T...,clonotype1636
627,IGH:TGTGCGAGAGAAGTGGCTGGTTCCTACTACTACGGTATGGAC...,clonotype1637


In [44]:
df_merge = pd.merge(used_for_merge, contig_with_clonotype, on='cdr3s_nt', how='outer')

In [45]:
df_merge = df_merge[['barcode', 'clonotype_id']]

In [46]:
df_merge

Unnamed: 0,barcode,clonotype_id
0,AACCATGTCATCTGCC,clonotype1
1,CCCAGTTTCCAAACTG,clonotype1
2,ACAGCTATCGAATGGG,clonotype1
3,AGCTTGACATTAGGCT,clonotype1
4,AAGCCGCCATTCCTCG,clonotype1
...,...,...
15330,GTGCATATCTCACATT,clonotype1634
15331,GGATGTTGTTCACCTC,clonotype1635
15332,GTGCTTCCAAGAGGCT,clonotype1636
15333,GCTTGAATCAAGGCTT,clonotype1637


In [59]:
df_all_contig = pd.merge(df_merge, df, on='barcode',how='outer')

In [60]:
df_all_contig.fillna('',inplace = True)

In [61]:
df_all_contig = df_all_contig[['barcode', 'is_cell', 'contig_id', 'high_confidence', 'length', 'chain', 'v_gene', 'd_gene', 'j_gene', 'c_gene', 'full_length', 'productive', 'cdr3', 'cdr3_nt', 'reads', 'umis', 'clonotype_id']]

In [62]:
df_all_contig

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,clonotype_id
0,AACCATGTCATCTGCC,True,AACCATGTCATCTGCC_71680,True,567,IGH,IGHV3-48*03,,IGHJ5*02,IGHG2B*03,True,True,CARGADPW,TGTGCGAGAGGGGCCGACCCCTGG,2.0,2.0,clonotype1
1,AACCATGTCATCTGCC,True,AACCATGTCATCTGCC_1499,True,502,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,32.0,32.0,clonotype1
2,CCCAGTTTCCAAACTG,True,CCCAGTTTCCAAACTG_86019,True,591,IGH,IGHV3-23*01,IGHD1-7*01,IGHJ4*02,IGHG2C*03,True,True,CAKGGYNWNYENFDYW,TGTGCGAAAGGGGGTTATAACTGGAACTACGAGAACTTTGACTACTGG,4.0,4.0,clonotype1
3,CCCAGTTTCCAAACTG,True,CCCAGTTTCCAAACTG_5358,True,670,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*02,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,27.0,27.0,clonotype1
4,ACAGCTATCGAATGGG,True,ACAGCTATCGAATGGG_77798,True,566,IGH,IGHV5-51*01,IGHD3-10*01,IGHJ5*02,IGHG2C*03,True,True,CARQSSGSYYNWFDPW,TGTGCGAGACAGAGTTCGGGGAGTTATTATAACTGGTTCGACCCCTGG,3.0,3.0,clonotype1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
176310,ATCTACTCATCACGTA,True,ATCTACTCATCACGTA_21842,True,150,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,False,False,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,1.0,1.0,
176311,CTCGTACTCAGAGCTT,True,CTCGTACTCAGAGCTT_16107,True,509,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,5.0,5.0,
176312,GTCAAGTGTTAAGACA,True,GTCAAGTGTTAAGACA_5741,True,512,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,3.0,3.0,
176313,AAACGGGTCACCCGAG,True,AAACGGGTCACCCGAG_232276,True,150,IGH,IGHV3-21*01,IGHD3-10*01,IGHJ3*02,IGHM*04,False,False,CARGSGSYAFDIW,TGTGCGAGAGGTTCGGGGAGTTATGCTTTTGATATCTGG,1.0,1.0,


In [None]:
df_filter_contig = df_all_contig[df_all_contig['contig_id'].isin(filtered_contig)]

In [81]:
df_filter_contig = df_all_contig[df_all_contig['barcode'].isin(cell_barcodes)]

In [85]:
df_filter_contig.loc[~df_filter_contig.contig_id.isin(filtered_contig), 'clonotype_id']=''

In [86]:
df_filter_contig

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,clonotype_id
0,AACCATGTCATCTGCC,True,AACCATGTCATCTGCC_71680,True,567,IGH,IGHV3-48*03,,IGHJ5*02,IGHG2B*03,True,True,CARGADPW,TGTGCGAGAGGGGCCGACCCCTGG,2.0,2.0,
1,AACCATGTCATCTGCC,True,AACCATGTCATCTGCC_1499,True,502,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,32.0,32.0,clonotype1
2,CCCAGTTTCCAAACTG,True,CCCAGTTTCCAAACTG_86019,True,591,IGH,IGHV3-23*01,IGHD1-7*01,IGHJ4*02,IGHG2C*03,True,True,CAKGGYNWNYENFDYW,TGTGCGAAAGGGGGTTATAACTGGAACTACGAGAACTTTGACTACTGG,4.0,4.0,
3,CCCAGTTTCCAAACTG,True,CCCAGTTTCCAAACTG_5358,True,670,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*02,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,27.0,27.0,clonotype1
4,ACAGCTATCGAATGGG,True,ACAGCTATCGAATGGG_77798,True,566,IGH,IGHV5-51*01,IGHD3-10*01,IGHJ5*02,IGHG2C*03,True,True,CARQSSGSYYNWFDPW,TGTGCGAGACAGAGTTCGGGGAGTTATTATAACTGGTTCGACCCCTGG,3.0,3.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29538,GTGCTTCCAAGAGGCT,True,GTGCTTCCAAGAGGCT_719,True,523,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,214.0,214.0,clonotype1636
29539,GCTTGAATCAAGGCTT,True,GCTTGAATCAAGGCTT_52253,True,581,IGH,IGHV3-48*03,IGHD2-1*01,IGHJ6*02,IGHG3*03,True,True,CAREVAGSYYYGMDVW,TGTGCGAGAGAAGTGGCTGGTTCCTACTACTACGGTATGGACGTCTGG,9.0,9.0,clonotype1637
29540,GCTTGAATCAAGGCTT,True,GCTTGAATCAAGGCTT_3103,True,517,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,69.0,69.0,clonotype1637
29541,CAGGTGCCAGGCTCAC,True,CAGGTGCCAGGCTCAC_171608,True,573,IGH,IGHV4-34*01,IGHD3-3*01,IGHJ5*02,IGHG1*01,True,True,CAAKEYYDFWSGHFNWFDPW,TGTGCGGCCAAGGAGTATTACGATTTTTGGAGTGGTCACTTCAATT...,5.0,5.0,


In [69]:
len(set(df_filter_contig.barcode))

15335

In [70]:
df_all_contig.to_csv('/SGRNJ06/randd/USER/cjj/TESTDATA/test_trust/20220616clonotypes/all_contig.csv', sep=',', index=False)

In [87]:
df_filter_contig.to_csv('/SGRNJ06/randd/USER/cjj/TESTDATA/test_trust/20220616clonotypes/filtered_contig.csv', sep=',', index=False)

In [67]:
len(set(df_filter_contig[df_filter_contig['productive']==True].barcode))

15335

In [73]:
df_filter_contig

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,clonotype_id
0,AACCATGTCATCTGCC,True,AACCATGTCATCTGCC_71680,True,567,IGH,IGHV3-48*03,,IGHJ5*02,IGHG2B*03,True,True,CARGADPW,TGTGCGAGAGGGGCCGACCCCTGG,2.0,2.0,clonotype1
1,AACCATGTCATCTGCC,True,AACCATGTCATCTGCC_1499,True,502,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,32.0,32.0,clonotype1
2,CCCAGTTTCCAAACTG,True,CCCAGTTTCCAAACTG_86019,True,591,IGH,IGHV3-23*01,IGHD1-7*01,IGHJ4*02,IGHG2C*03,True,True,CAKGGYNWNYENFDYW,TGTGCGAAAGGGGGTTATAACTGGAACTACGAGAACTTTGACTACTGG,4.0,4.0,clonotype1
3,CCCAGTTTCCAAACTG,True,CCCAGTTTCCAAACTG_5358,True,670,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*02,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,27.0,27.0,clonotype1
4,ACAGCTATCGAATGGG,True,ACAGCTATCGAATGGG_77798,True,566,IGH,IGHV5-51*01,IGHD3-10*01,IGHJ5*02,IGHG2C*03,True,True,CARQSSGSYYNWFDPW,TGTGCGAGACAGAGTTCGGGGAGTTATTATAACTGGTTCGACCCCTGG,3.0,3.0,clonotype1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29538,GTGCTTCCAAGAGGCT,True,GTGCTTCCAAGAGGCT_719,True,523,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,214.0,214.0,clonotype1636
29539,GCTTGAATCAAGGCTT,True,GCTTGAATCAAGGCTT_52253,True,581,IGH,IGHV3-48*03,IGHD2-1*01,IGHJ6*02,IGHG3*03,True,True,CAREVAGSYYYGMDVW,TGTGCGAGAGAAGTGGCTGGTTCCTACTACTACGGTATGGACGTCTGG,9.0,9.0,clonotype1637
29540,GCTTGAATCAAGGCTT,True,GCTTGAATCAAGGCTT_3103,True,517,IGK,IGKV3-11*01,,IGKJ1*01,IGKC*03,True,True,CQQRSNWPPTF,TGTCAGCAGCGTAGCAACTGGCCTCCGACGTTC,69.0,69.0,clonotype1637
29541,CAGGTGCCAGGCTCAC,True,CAGGTGCCAGGCTCAC_171608,True,573,IGH,IGHV4-34*01,IGHD3-3*01,IGHJ5*02,IGHG1*01,True,True,CAAKEYYDFWSGHFNWFDPW,TGTGCGGCCAAGGAGTATTACGATTTTTGGAGTGGTCACTTCAATT...,5.0,5.0,clonotype1638


In [71]:
    def get_vdj_metric(df, chains, pairs):
        """
        Add vdj metrics in html.
        """
        metric_result = []
        fl_pro_pair_df = pd.DataFrame(df[df['productive']==True].barcode.value_counts())
        fl_pro_pair_df = fl_pro_pair_df[fl_pro_pair_df['barcode']>=2]
        cell_nums = len(set(df['barcode']))

        metric_result.append({
            'name': 'Cells With Productive V-J Spanning Pair',
            'value': fl_pro_pair_df.shape[0],
            'total': cell_nums,
        })

        for pair in pairs:
            chain1, chain2 = pair.split('_')[0], pair.split('_')[1]
            cbs1 = set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain1)].barcode)
            cbs2 = set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain2)].barcode)
            paired_cbs = len(cbs1.intersection(cbs2))

            metric_result.append({
                'name': f'Cells With Productive V-J Spanning ({chain1}, {chain2}) Pair',
                'value': paired_cbs,
                'total': cell_nums,
                'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
            })

        for chain in chains:
        
            metric_result.append({
                'name': f'Cells With {chain} Contig',
                'value': len(set(df[df['chain']==chain].barcode)),
                'total': cell_nums,
                'help_info': f'Fraction of cell-associated barcodes with at least one {chain} contig annotated as a full or partial V(D)J gene'
            })
            metric_result.append({
                'name': f'Cells With CDR3-annotated {chain} Contig',
                'value': len(set(df[(df['chain']==chain)&(df['cdr3']!=None)].barcode)),
                'total': cell_nums,
            })
            metric_result.append({
                'name': f'Cells With V-J Spanning {chain} Contig',
                'value': len(set(df[(df['full_length']==True)&(df['chain']==chain)].barcode)),
                'total': cell_nums,
                'help_info': f"Fraction of cell-associated barcodes with at least one contig spanning the 5' end of the V region to the 3' end of the J region for {chain}"
            })
            metric_result.append({
                'name': f'Cells With Productive {chain} Contig',
                'value': len(set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain)].barcode)),
                'total': cell_nums,
                'help_info': "Fraction of cell-associated barcodes with productive IGL chain. A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
            })

        return metric_result 

In [None]:
get_vdj_metric(df_filter_contig,['IGH', 'IGL', 'IGK'],['IGH_IGL', 'IGH_IGK'])

[{'name': 'Cells With Productive V-J Spanning Pair',
  'value': 10947,
  'total': 15335},
 {'name': 'Cells With Productive V-J Spanning (IGH, IGL) Pair',
  'value': 7,
  'total': 15335,
  'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"},
 {'name': 'Cells With Productive V-J Spanning (IGH, IGK) Pair',
  'value': 10940,
  'total': 15335,
  'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a st