In [1]:
from collections import defaultdict

import pandas as pd
import numpy as np
import pysam
import copy

In [30]:
class Auto():
    """
    threshold = top {percentile}% cell count / coef
    count is usually UMI count.
    >>> array = [50] * 100 + [30] * 100 + [10] * 100 + [4] * 100
    >>> Auto(array, coef=10).run()
    5
    >>> Auto(array, percentile=70, coef=3).run()
    10
    >>> Auto(array, percentile=50, coef=10, expected_cell_num=100).run()
    5
    >>> Auto([1, 2, 20, 30, 40], expected_cell_num=4, percentile=50, coef=10).run()
    2
    """
    def __init__(self, array, percentile=99, coef=3, expected_cell_num=None, **kwargs):
        self.array = [x for x in array if x > 0 ]
        self.percentile = percentile
        self.coef = int(coef)
        self.expected_cell_num = expected_cell_num
        self.kwargs = kwargs
    
    def run(self):
        array = self.array
        if not array:
            return 1

        if not self.expected_cell_num:
            expected_cell_num = len(array)
        else:
            expected_cell_num = self.expected_cell_num
            if expected_cell_num > len(array):
                print('Warning: expected_cell_num > len(array)')
                expected_cell_num = len(array)
                      
        sorted_counts = sorted(array, reverse=True)
        count_cell_percentile = np.percentile(sorted_counts[:expected_cell_num], self.percentile)
        threshold = int(count_cell_percentile / self.coef)

        return threshold
def target_cell_calling(df_UMI_sum, expected_target_cell_num=3000, target_barcodes=None, weight=6, coef=5, 
    percentile=85, umi_col='umis'):
    """
    Args:
        df_UMI_sum: A dataframe with columns highest umi's contig and UMI.
    
    Returns:
        target_contigs_id: list
    >>> df_UMI_sum = pd.DataFrame({"contig_id": ["A", "B", "C", "D", "E"], "UMI": [1, 2, 1, 30, 40]})
    >>> target_contigs_id = target_cell_calling(df_UMI_sum, expected_target_cell_num=5, percentile=80, coef=5, target_barcodes=["A", "C"])
    >>> target_contigs_id == {'A_1', 'C_1', 'D_1', 'E_1'}
    True
    """
    if target_barcodes != None:
        target_barcodes = {i for i in target_barcodes}
    umi_threshold = Auto(list(df_UMI_sum[umi_col]), expected_cell_num=expected_target_cell_num, coef=coef, percentile=percentile).run()

    # avoid change the original dataframe
    df_temp = df_UMI_sum.copy()
    if target_barcodes:
        df_temp[umi_col] = df_temp.apply(
            lambda row:  row[umi_col] * weight if row['barcode'] in target_barcodes else row[umi_col], axis=1)
             
    target_contigs = set(df_temp.loc[df_temp[umi_col] >= umi_threshold].contig_id)

    return target_contigs
def get_vj_annot(df, chains, pairs):
    fl_pro_pair_df = pd.DataFrame(df[df['productive']==True].barcode.value_counts())
    fl_pro_pair_df = fl_pro_pair_df[fl_pro_pair_df['barcode']>=2]
    Result = []
    cell_nums = len(set(df['barcode'].tolist()))
    Result.append({
        'name': 'Cells With Productive V-J Spanning Pair',
        'value': fl_pro_pair_df.shape[0],
        'total': cell_nums,
    })
    for p in pairs:
        chain1 = p.split('_')[0]
        chain2 = p.split('_')[1]
        cbs1 = set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain1)].barcode.tolist())
        cbs2 = set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain2)].barcode.tolist())
        paired_cbs = len(cbs1.intersection(cbs2))
        Result.append({
            'name': f'Cells With Productive V-J Spanning ({chain1}, {chain2}) Pair',
            'value': paired_cbs,
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
        })
    for c in chains:
        Result.append({
            'name': f'Cells With {c} Contig',
            'value': len(set(df[df['chain']==c].barcode.tolist())),
            'total': cell_nums,
            'help_info': f'Fraction of cell-associated barcodes with at least one {c} contig annotated as a full or partial V(D)J gene'
        })
        Result.append({
            'name': f'Cells With CDR3-annotated {c} Contig',
            'value': len(set(df[(df['chain']==c)&(df['cdr3']!=None)].barcode.tolist())),
            'total': cell_nums,
        })
        Result.append({
            'name': f'Cells With V-J Spanning {c} Contig',
            'value': len(set(df[(df['full_length']==True)&(df['chain']==c)].barcode.tolist())),
            'total': cell_nums,
            'help_info': f"Fraction of cell-associated barcodes with at least one contig spanning the 5' end of the V region to the 3' end of the J region for {c}"
        })
        Result.append({
            'name': f'Cells With Productive {c} Contig',
            'value': len(set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==c)].barcode.tolist())),
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with productive IGL chain. A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
        })

    return Result

In [4]:
df = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/trust_cr_compare/tr_bcr/HA1_0310PBMC_T5_B_3NL/04.summarize/HA1_0310PBMC_T5_B_3NL_b.csv')

In [5]:
        df['productive'] = df['full_length']
        contig_set = set(df.contig_id)

In [7]:
        len_dict = dict()

        with pysam.FastxFile('/SGRNJ06/randd/USER/cjj/celedev/trust_cr_compare/tr_bcr/HA1_0310PBMC_T5_B_3NL/03.assemble/assemble/HA1_0310PBMC_T5_B_3NL_annotate.fa') as fa:
            for read in fa:
                len_dict[read.name] = read.comment.split(' ')[0] 

In [8]:
df['length'] = df['contig_id'].apply(lambda x: len_dict.get(x))

In [12]:
df['length'] = df['contig_id'].apply(len_dict.get)

In [21]:
df

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
0,CCAGTTCAAACTCACCGCTAACGA,True,CCAGTTCAAACTCACCGCTAACGA_2185,True,482,IGL,IGLV3-10*01,,IGLJ3*02,,False,False,CFSTDYTGNVRLF,TGTTTCTCCACAGACTACACTGGTAATGTAAGACTGTTC,1.0,1.0,,
1,CATCAAGTGCGAGTAACGGATTGC,True,CATCAAGTGCGAGTAACGGATTGC_13939,True,545,IGH,IGHV4-34*01,IGHD3-10*01,IGHJ5*02,IGHM,True,True,CARGKPRITMVRGGRPGWFDPW,TGTGCGAGAGGCAAGCCACGTATTACTATGGTTCGGGGAGGCCGCC...,1.0,1.0,,
2,CATCAAGTGCGAGTAACGGATTGC,True,CATCAAGTGCGAGTAACGGATTGC_33329,True,150,IGK,IGKV1-NL1*01,,IGKJ4*01,IGKC,False,False,CQQYYSAPLTF,TGTCAACAGTATTATAGTGCCCCTCTCACTTTC,2.0,2.0,,
3,ATTGGCTCAGATCGCAAAGAGATC,True,ATTGGCTCAGATCGCAAAGAGATC_35461,True,150,IGH,IGHV3-48*03,IGHD3-3*01,IGHJ5*02,,False,False,CAREKGFDFSGFDPW,TGTGCGAGAGAGAAAGGGTTCGATTTTTCGGGGTTCGACCCCTGG,1.0,1.0,,
4,ATTGGCTCAGATCGCAAAGAGATC,True,ATTGGCTCAGATCGCAAAGAGATC_1846,True,510,IGL,IGLV2-11*01,,IGLJ2*01,IGLC2,True,True,CSSFAGSVLPVIF,TGCTCCTCATTTGCGGGCAGCGTTTTACCTGTCATTTTC,1.0,1.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,TAGGATGAACCACTGTCATACCAA,True,TAGGATGAACCACTGTCATACCAA_982,True,573,IGK,IGKV3-11*01,,IGKJ2*01,IGKC,True,True,CQQRSNGPMYTF,TGTCAGCAGCGTAGCAACGGTCCCATGTACACTTTT,12.0,12.0,,
7363,CAAGACTAAAACATCGGCTCGGTA,True,CAAGACTAAAACATCGGCTCGGTA_19492,True,542,IGH,IGHV3-23*01,IGHD2-8*01,IGHJ4*02,IGHD,True,True,CATNRGDYW,TGTGCGACCAACCGAGGGGACTACTGG,2.0,2.0,,
7364,CAAGACTAAAACATCGGCTCGGTA,True,CAAGACTAAAACATCGGCTCGGTA_6153,True,552,IGK,IGKV3-20*01,,IGKJ5*01,IGKC,True,True,CQQYAGPPITF,TGTCAGCAATATGCTGGGCCACCGATCACCTTC,9.0,9.0,,
7365,GATGAATCGTCTGTCACTAAGGTC,True,GATGAATCGTCTGTCACTAAGGTC_5195,True,295,IGH,IGHV3-23*01,IGHD6-6*01,IGHJ6*02,IGHG1,False,False,CARELAGRPGRSHYNYYYGMDVW,TGTGCGAGAGAATTAGCAGGTCGTCCCGGGCGGAGTCATTACAATT...,1.0,1.0,,


In [None]:
l1 = df.length.tolist()

In [None]:
df['length'] = df['contig_id'].map(len_dict, na_action='ignore')

In [None]:
l2 = df.length.tolist()

In [None]:
for i in range(len(l1)):
    if l1[i] != l2[i]:
        print(l1[i])
        print(l2[i])

In [None]:
df[df['length']]

In [31]:
            df_chain_heavy = df[df['chain']=='IGH']
            df_chain_light = df[(df['chain']=='IGK') | (df['chain']=='IGL')]
            df_chain_heavy = df_chain_heavy.drop_duplicates(['barcode'])
            df_chain_light = df_chain_light.drop_duplicates(['barcode'])
            df_for_clono = pd.concat([df_chain_heavy, df_chain_light], ignore_index=True)

In [32]:
        trust_report = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/trust_cr_compare/tr_bcr/HA1_0310PBMC_T5_B_3NL/03.assemble/assemble//HA1_0310PBMC_T5_B_3NL_filter_report.tsv', sep='\t')
        correct_cdr3 = set(df_for_clono.cdr3).intersection(set(trust_report.CDR3aa))
        correct_cdr3 = [i for i in correct_cdr3 if i.startswith('C')]
        correct_cdr3 = [i for i in correct_cdr3 if len(i)>=5]
        correct_cdr3 = [i for i in correct_cdr3 if 'UAG' or 'UAA' or 'UGA' not in i]
        df_for_clono = df_for_clono[df_for_clono['cdr3'].isin(correct_cdr3)]

In [33]:
df_for_clono

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
0,CATCAAGTGCGAGTAACGGATTGC,True,CATCAAGTGCGAGTAACGGATTGC_13939,True,545,IGH,IGHV4-34*01,IGHD3-10*01,IGHJ5*02,IGHM,True,True,CARGKPRITMVRGGRPGWFDPW,TGTGCGAGAGGCAAGCCACGTATTACTATGGTTCGGGGAGGCCGCC...,1.0,1.0,,
1,ATTGGCTCAGATCGCAAAGAGATC,True,ATTGGCTCAGATCGCAAAGAGATC_35461,True,150,IGH,IGHV3-48*03,IGHD3-3*01,IGHJ5*02,,False,False,CAREKGFDFSGFDPW,TGTGCGAGAGAGAAAGGGTTCGATTTTTCGGGGTTCGACCCCTGG,1.0,1.0,,
2,GATAGACAAAGAGATCAAGGACAC,True,GATAGACAAAGAGATCAAGGACAC_35465,True,150,IGH,IGHV4-34*01,IGHD3-22*01,IGHJ4*02,IGHM,False,False,CARGWEVVTSFDYW,TGTGCGAGAGGTTGGGAAGTGGTTACCTCTTTTGACTACTGG,1.0,1.0,,
3,AGCAGGAAAGCACCTCCATACCAA,True,AGCAGGAAAGCACCTCCATACCAA_3757,True,620,IGH,IGHV2-5*01,IGHD3-3*01,IGHJ3*01,IGHG2,True,True,CAHLDSKKSYTYDFW,TGTGCACACTTGGATTCCAAGAAGTCTTACACTTACGATTTTTGG,1.0,1.0,,
4,GACAGTGCCGCTGATCCTAAGGTC,True,GACAGTGCCGCTGATCCTAAGGTC_15483,True,261,IGH,IGHV3-74*01,IGHD3-22*01,IGHJ5*02,IGHD,False,False,CARDGYYYDSSGYAGDWFDPW,TGTGCAAGAGATGGGTATTACTATGATAGTAGTGGTTATGCAGGGG...,37.0,37.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7362,CCAGTTCAGCTAACGATGAAGAGA,True,CCAGTTCAGCTAACGATGAAGAGA_3527,True,571,IGK,IGKV3-15*01,,IGKJ1*01,IGKC,True,True,CQQYNAWPRTF,TGTCAGCAGTATAATGCCTGGCCTCGGACGTTC,1.0,1.0,,
7363,GCGAGTAAAATGTTGCGTACGCAA,True,GCGAGTAAAATGTTGCGTACGCAA_24183,True,457,IGK,IGKV1-NL1*01,,IGKJ4*01,IGKC,True,True,CQQYDNLPLTF,TGTCAACAGTATGATAATCTCCCGCTCACTTTC,2.0,2.0,,
7364,TAGGATGAACCACTGTCATACCAA,True,TAGGATGAACCACTGTCATACCAA_982,True,573,IGK,IGKV3-11*01,,IGKJ2*01,IGKC,True,True,CQQRSNGPMYTF,TGTCAGCAGCGTAGCAACGGTCCCATGTACACTTTT,12.0,12.0,,
7365,CAAGACTAAAACATCGGCTCGGTA,True,CAAGACTAAAACATCGGCTCGGTA_6153,True,552,IGK,IGKV3-20*01,,IGKJ5*01,IGKC,True,True,CQQYAGPPITF,TGTCAGCAATATGCTGGGCCACCGATCACCTTC,9.0,9.0,,


In [34]:
        # 分开过滤
        df_chain_heavy = df_for_clono[df_for_clono['chain']=='IGH']
        df_chain_light = df_for_clono[(df_for_clono['chain']=='IGK') | (df_for_clono['chain']=='IGL')]

In [35]:
        filtered_congtigs_id = set()
        for _df in [df_chain_heavy, df_chain_light]:
            target_contigs = target_cell_calling(
            _df, 
            expected_target_cell_num=3000, 
            target_barcodes=None,
            weight = 6.0,
            coef = 5
            )
            filtered_congtigs_id = filtered_congtigs_id | target_contigs       
        
        df_for_clono = df_for_clono[df_for_clono.contig_id.isin(filtered_congtigs_id)]

In [37]:
df_for_clono

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
4,GACAGTGCCGCTGATCCTAAGGTC,True,GACAGTGCCGCTGATCCTAAGGTC_15483,True,261,IGH,IGHV3-74*01,IGHD3-22*01,IGHJ5*02,IGHD,False,False,CARDGYYYDSSGYAGDWFDPW,TGTGCAAGAGATGGGTATTACTATGATAGTAGTGGTTATGCAGGGG...,37.0,37.0,,
8,CCGAAGTAAGATGTACAGAGTCAA,True,CCGAAGTAAGATGTACAGAGTCAA_11521,True,556,IGH,IGHV4-39*01,IGHD2-2*01,IGHJ4*02,IGHM,True,True,CARREYCSSTDCYDGYW,TGTGCGAGACGGGAATATTGTAGTAGTACCGACTGCTATGATGGTT...,24.0,24.0,,
9,GCTCGGTACATACCAATTCACGCA,True,GCTCGGTACATACCAATTCACGCA_16759,True,312,IGH,IGHV4-39*01,IGHD3-22*01,IGHJ6*02,IGHA1,False,False,CARHRVVGSFSYGYYYGMDVW,TGTGCGAGGCATCGGGTTGTCGGGAGCTTCTCCTATGGATATTATT...,2.0,2.0,,
10,CATCAAGTGTACGCAAGACTAGTA,True,CATCAAGTGTACGCAAGACTAGTA_8101,True,667,IGH,IGHV3-30*04,IGHD3-10*01,IGHJ4*02,IGHG1,True,True,CARGGSYFDHW,TGTGCGCGGGGGGGAAGTTATTTTGACCACTGG,2.0,2.0,,
13,AACTCACCGAGCTGAAACAGATTC,True,AACTCACCGAGCTGAAACAGATTC_2692,True,703,IGH,IGHV3-15*01,IGHD3-3*01,IGHJ4*02,IGHA2,True,True,CTTHPPFYYPSSKKGADYW,TGTACCACACACCCCCCCTTTTATTATCCCTCGTCGAAAAAAGGGG...,16.0,16.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7347,CAGATCTGAGCACCTCCAAGGAGC,True,CAGATCTGAGCACCTCCAAGGAGC_14994,True,426,IGL,IGLV3-10*01,,IGLJ3*02,,False,False,CFSTDNTGNVRLF,TGTTTCTCCACAGACAACACTGGTAATGTAAGACTGTTC,12.0,12.0,,
7352,CATCAAGTGCGAGTAAACCTCCAA,True,CATCAAGTGCGAGTAAACCTCCAA_6868,True,561,IGK,IGKV1-12*01,,IGKJ5*01,IGKC,True,True,CQQANSFPITF,TGTCAACAGGCTAACAGTTTCCCGATCACCTTC,5.0,5.0,,
7359,CTGTAGCCAACGTGATCCTCTATC,True,CTGTAGCCAACGTGATCCTCTATC_17309,True,365,IGL,IGLV3-25*03,,IGLJ1*01,,False,False,CQSIDSSGTYVF,TGTCAATCAATAGACAGCAGTGGGACTTATGTCTTC,7.0,7.0,,
7364,TAGGATGAACCACTGTCATACCAA,True,TAGGATGAACCACTGTCATACCAA_982,True,573,IGK,IGKV3-11*01,,IGKJ2*01,IGKC,True,True,CQQRSNGPMYTF,TGTCAGCAGCGTAGCAACGGTCCCATGTACACTTTT,12.0,12.0,,


In [38]:
get_vj_annot(df_for_clono, ['IGH', 'IGK','IGL'], ['IGH_IGK, IGH_IGL'])

[{'name': 'Cells With Productive V-J Spanning Pair',
  'value': 576,
  'total': 1793},
 {'name': 'Cells With Productive V-J Spanning (IGH, IGK, IGH) Pair',
  'value': 0,
  'total': 1793,
  'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"},
 {'name': 'Cells With IGH Contig',
  'value': 1527,
  'total': 1793,
  'help_info': 'Fraction of cell-associated barcodes with at least one IGH contig annotated as a full or partial V(D)J gene'},
 {'name': 'Cells With CDR3-annotated IGH Contig',
  'value': 1527,
  'total': 1793},
 {'name': 'Cells With V-J Spanning IGH Contig',
  'value': 1157,
  'total': 1793,
  'help_inf

In [None]:
df_UMI_sum = df_for_clono.groupby(['barcode'], as_index=False).agg({"umis": "sum"})

In [None]:
df_UMI_sum

In [None]:
            target_barcodes = target_cell_calling(
            df_UMI_sum, 
            expected_target_cell_num=3000, 
            target_barcodes=None,
            weight = 6.0,
            coef = 5
            )   

In [None]:
        
        df_for_clono = df_for_clono[df_for_clono.barcode.isin(target_barcodes)]

In [None]:
df_for_clono

In [None]:
df_for_clono

In [None]:
        df_for_clono_pro = df_for_clono[df_for_clono['productive']==True]
        cell_barcodes = set(df_for_clono_pro['barcode'])

In [None]:
        df_filter = df[df.barcode.isin(cell_barcodes)]

In [None]:
get_vj_annot(df_filter, ['TRA', 'TRB'], ['TRA_TRB'])

In [84]:
def get_vj_annot(df, chains, pairs):
    fl_pro_pair_df = pd.DataFrame(df[df['productive']==True].barcode.value_counts())
    fl_pro_pair_df = fl_pro_pair_df[fl_pro_pair_df['barcode']>=2]
    Result = []
    cell_nums = len(set(df['barcode'].tolist()))
    Result.append({
        'name': 'Cells With Productive V-J Spanning Pair',
        'value': fl_pro_pair_df.shape[0],
        'total': cell_nums,
    })
    for p in pairs:
        chain1 = p.split('_')[0]
        chain2 = p.split('_')[1]
        cbs1 = set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain1)].barcode.tolist())
        cbs2 = set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain2)].barcode.tolist())
        paired_cbs = len(cbs1.intersection(cbs2))
        Result.append({
            'name': f'Cells With Productive V-J Spanning ({chain1}, {chain2}) Pair',
            'value': paired_cbs,
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
        })
    for c in chains:
        Result.append({
            'name': f'Cells With {c} Contig',
            'value': len(set(df[df['chain']==c].barcode.tolist())),
            'total': cell_nums,
            'help_info': f'Fraction of cell-associated barcodes with at least one {c} contig annotated as a full or partial V(D)J gene'
        })
        Result.append({
            'name': f'Cells With CDR3-annotated {c} Contig',
            'value': len(set(df[(df['chain']==c)&(df['cdr3']!=None)].barcode.tolist())),
            'total': cell_nums,
        })
        Result.append({
            'name': f'Cells With V-J Spanning {c} Contig',
            'value': len(set(df[(df['full_length']==True)&(df['chain']==c)].barcode.tolist())),
            'total': cell_nums,
            'help_info': f"Fraction of cell-associated barcodes with at least one contig spanning the 5' end of the V region to the 3' end of the J region for {c}"
        })
        Result.append({
            'name': f'Cells With Productive {c} Contig',
            'value': len(set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==c)].barcode.tolist())),
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with productive IGL chain. A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
        })

    return Result

In [108]:
def get_vj_annot(df, chains, pairs):
    fl_pro_pair_df = pd.DataFrame(df.barcode.value_counts())
    fl_pro_pair_df_one_chain_bc = set(fl_pro_pair_df[fl_pro_pair_df.barcode==1].index)
    fl_pro_pair_df = df[df.barcode.isin(fl_pro_pair_df_one_chain_bc)]
    count = fl_pro_pair_df[fl_pro_pair_df['productive']==True].shape[0]
    
    fl_pro_pair_df = pd.DataFrame(df[df['productive']==True].barcode.value_counts())
    count += fl_pro_pair_df[fl_pro_pair_df['barcode']>=2].shape[0]
    
    Result = []
    cell_nums = len(set(df['barcode'].tolist()))
    Result.append({
        'name': 'Cells With Productive V-J Spanning Pair',
        'value': count,
        'total': cell_nums,
    })
    for p in pairs:
        chain1 = p.split('_')[0]
        chain2 = p.split('_')[1]
        cbs1 = set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain1)].barcode.tolist())
        cbs2 = set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==chain2)].barcode.tolist())
        paired_cbs = len(cbs1.intersection(cbs2))
        Result.append({
            'name': f'Cells With Productive V-J Spanning ({chain1}, {chain2}) Pair',
            'value': paired_cbs,
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
        })
    for c in chains:
        Result.append({
            'name': f'Cells With {c} Contig',
            'value': len(set(df[df['chain']==c].barcode.tolist())),
            'total': cell_nums,
            'help_info': f'Fraction of cell-associated barcodes with at least one {c} contig annotated as a full or partial V(D)J gene'
        })
        Result.append({
            'name': f'Cells With CDR3-annotated {c} Contig',
            'value': len(set(df[(df['chain']==c)&(df['cdr3']!=None)].barcode.tolist())),
            'total': cell_nums,
        })
        Result.append({
            'name': f'Cells With V-J Spanning {c} Contig',
            'value': len(set(df[(df['full_length']==True)&(df['chain']==c)].barcode.tolist())),
            'total': cell_nums,
            'help_info': f"Fraction of cell-associated barcodes with at least one contig spanning the 5' end of the V region to the 3' end of the J region for {c}"
        })
        Result.append({
            'name': f'Cells With Productive {c} Contig',
            'value': len(set(df[(df['full_length']==True)&(df['productive']==True)&(df['chain']==c)].barcode.tolist())),
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with productive IGL chain. A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"
        })

    return Result

In [73]:
df_filter= pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/trust_cr_compare/tr_bcr/HA1_0310PBMC_T5_B_3NL/04.summarize/HA1_0310PBMC_T5_B_3NL_filtered_contig.csv')

In [109]:
get_vj_annot(df_filter, ['IGH','IGK','IGL'], ['IGH_IGK', 'IGH_IGL'])

[{'name': 'Cells With Productive V-J Spanning Pair',
  'value': 1000,
  'total': 1529},
 {'name': 'Cells With Productive V-J Spanning (IGH, IGK) Pair',
  'value': 499,
  'total': 1529,
  'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"},
 {'name': 'Cells With Productive V-J Spanning (IGH, IGL) Pair',
  'value': 395,
  'total': 1529,
  'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair.A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start 

In [96]:
df_filter_filter = pd.DataFrame(df_filter.barcode.value_counts())

In [107]:
df_filter_filter[df_filter_filter.barcode==1]

Unnamed: 0,barcode
GTACGCAAAAGAGATCCGCTGATC,1
GTACGCAAACAGATTCGTGTTCTA,1
AGCCATGCGCTAACGAGTCTGTCA,1
AGCACCTCATAGCGACCCTAATCC,1
AAACATCGATGCCTAACAATGGAA,1
...,...
CTGGCATAATTGGCTCAATGTTGC,1
ACAGATTCAGCACCTCAATGTTGC,1
CGACACACTATCAGCAACCTCCAA,1
GCTAACGACAACCACAGTCGTAGA,1


In [80]:
df_filter[df_filter['barcode']=='CCGTGAGACCATCCTCCGCTGATC']

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,clonotype_id
1000,CCGTGAGACCATCCTCCGCTGATC,True,CCGTGAGACCATCCTCCGCTGATC_3090,True,573,IGK,IGKV1-39*01,,IGKJ1*01,IGKC,True,True,CQQSYSTPWTF,TGTCAACAGAGTTACAGTACCCCGTGGACGTTC,5.0,5.0,clonotype78


In [51]:
ten = pd.read_csv('/SGRNJ03/randd/RD20073101_SCOPEv2_TCR/20220322_1/HA1_0310PBMC_T5_B_3NL/03.assemble/HA1_0310PBMC_T5_B_3NL/outs/filtered_contig_annotations.csv')

In [64]:
ten

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
0,AAACCTGAGAAACCAT-1,True,AAACCTGAGAAACCAT-1_contig_1,True,801,IGL,IGLV2-8,,IGLJ2,IGLC2,True,True,CSSYAGGHNVGVF,TGCAGCTCATATGCAGGCGGCCACAATGTGGGGGTATTC,66421,3365,clonotype9,clonotype9_consensus_1
1,AAACCTGAGAAACCAT-1,True,AAACCTGAGAAACCAT-1_contig_2,True,576,Multi,IGHV3-30,,IGKJ1,IGHD,True,False,CVKWHQAF,TGTGTGAAATGGCACCAGGCGTTC,4602,186,clonotype9,
2,AAACCTGAGAAACCTA-1,True,AAACCTGAGAAACCTA-1_contig_1,True,647,IGL,IGLV6-57,,IGLJ3,IGLC2,True,True,CQSYDNREEDSIWMF,TGTCAGTCTTATGATAATAGAGAGGAAGACAGTATTTGGATGTTC,36023,3448,clonotype21,clonotype21_consensus_2
3,AAACCTGAGAAACCTA-1,True,AAACCTGAGAAACCTA-1_contig_2,True,599,IGH,IGHV3-48,,IGHJ4,IGHD,True,True,CARATIETFYYFDSW,TGTGCGAGGGCGACTATCGAAACTTTTTACTATTTTGACTCCTGG,33484,771,clonotype21,clonotype21_consensus_1
4,AAACCTGAGAAACCTA-1,True,AAACCTGAGAAACCTA-1_contig_3,True,555,IGL,IGLV2-8,,IGLJ2,,True,False,CSSYAGSNKVF,TGCAGCTCATATGCAGGCAGCAACAAAGTATTC,41,9,clonotype21,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3325,ACATCAGCATAGTAAG-1,True,ACATCAGCATAGTAAG-1_contig_1,True,527,IGK,IGKV1D-13,,IGKJ2,IGKC,True,True,CQQYYGTPRTF,TGTCAACAGTATTATGGTACCCCTCGAACTTTT,26,3,clonotype930,clonotype930_consensus_1
3326,ACATCAGCATAGTAAG-1,True,ACATCAGCATAGTAAG-1_contig_2,True,638,IGH,IGHV3-33,,IGHJ6,IGHA1,True,True,CAKAQDILTGLLHFGMDVW,TGTGCGAAAGCGCAGGATATTTTGACTGGATTACTCCACTTCGGTA...,53,1,clonotype930,clonotype930_consensus_2
3327,ACATCAGCATAGTAAG-1,True,ACATCAGCATAGTAAG-1_contig_3,True,374,IGK,IGKV4-1,,IGKJ2,IGKC,True,False,CQQYYGTPRTF,TGTCAACAGTATTATGGTACCCCTCGAACTTTT,12,2,clonotype930,clonotype930_consensus_1
3328,ACATCAGCATAGTAAG-1,True,ACATCAGCATAGTAAG-1_contig_4,True,576,IGH,IGHV3-23,,IGHJ6,IGHA1,True,False,CAKAQDILTGLLHFGMDVW,TGTGCGAAAGCGCAGGATATTTTGACTGGATTACTCCACTTCGGTA...,20,3,clonotype930,clonotype930_consensus_2


In [65]:
len(set(ten.barcode))

1124

In [69]:
flten = pd.DataFrame(ten[ten['productive']==True].barcode.value_counts())


In [70]:
flten

Unnamed: 0,barcode
AAACCTGCACAGAGGT-1,8
AAACCTGGTGATGTGG-1,7
AAAGTAGCAAGCCCAC-1,7
AAACCTGAGGGCACTA-1,7
AACCGCGAGTACGTTC-1,7
...,...
AAAGATGTCCCGACTT-1,1
AACACGTGTTAGGGTG-1,1
AACACGTGTTCGTTGA-1,1
AAACCTGTCATCGATG-1,1


In [63]:
flten[flten['barcode']>=2].shape[0]

1087

In [72]:
ten[ten['barcode']=='AAACCTGTCACGCGGT-1']

Unnamed: 0,barcode,is_cell,contig_id,high_confidence,length,chain,v_gene,d_gene,j_gene,c_gene,full_length,productive,cdr3,cdr3_nt,reads,umis,raw_clonotype_id,raw_consensus_id
1049,AAACCTGTCACGCGGT-1,True,AAACCTGTCACGCGGT-1_contig_1,True,743,IGK,IGKV4-1,,IGKJ2,IGKC,True,True,CQQYYTTPNTF,TGTCAACAATATTATACTACTCCTAACACTTTT,1137,4,clonotype303,clonotype303_consensus_1
