In [None]:
from collections import defaultdict

import pandas as pd
import numpy as np
import pysam
import copy

In [None]:
class Auto():
    """
    Automatic count threshold.

    threshold = (`percentile`-th percentile of the top `expected_cell_num`
    counts) / coef, truncated to an integer. Counts are usually UMI counts.

    >>> array = [50] * 100 + [30] * 100 + [10] * 100 + [4] * 100
    >>> Auto(array, coef=10).run()
    5
    >>> Auto(array, percentile=70, coef=3).run()
    10
    >>> Auto(array, percentile=50, coef=10, expected_cell_num=100).run()
    5
    >>> Auto([1, 2, 20, 30, 40], expected_cell_num=4, percentile=50, coef=10).run()
    2
    """
    def __init__(self, array, percentile=99, coef=3, expected_cell_num=None, **kwargs):
        # Only strictly positive counts take part in the computation.
        self.array = list(filter(lambda value: value > 0, array))
        self.percentile = percentile
        # coef is truncated to an integer before it is used as the divisor.
        self.coef = int(coef)
        self.expected_cell_num = expected_cell_num
        # Extra keyword arguments are stored but not used by this class.
        self.kwargs = kwargs

    def run(self):
        """Return the integer threshold, or 1 when there is no positive count."""
        counts = self.array
        if not counts:
            return 1

        n_available = len(counts)
        # A missing (or zero) expected cell number means "use every count".
        n_cells = self.expected_cell_num or n_available
        if n_cells > n_available:
            print('Warning: expected_cell_num > len(array)')
            n_cells = n_available

        top_counts = sorted(counts, reverse=True)[:n_cells]
        return int(np.percentile(top_counts, self.percentile) / self.coef)
def target_cell_calling(df_UMI_sum, expected_target_cell_num=3000, target_barcodes=None, weight=6, coef=5,
    percentile=85, umi_col='umis', barcode_col='barcode', id_col='contig_id'):
    """
    Select the rows whose (optionally weighted) UMI count reaches an automatic threshold.

    The threshold is computed by `Auto` on the unweighted UMI counts. Rows whose
    barcode is in `target_barcodes` then have their UMI count multiplied by
    `weight` before being compared with the threshold.

    Args:
        df_UMI_sum: dataframe with at least the columns `umi_col` and `id_col`
            (and `barcode_col` when `target_barcodes` is given). It is not modified.
        expected_target_cell_num: passed to `Auto` as `expected_cell_num`.
        target_barcodes: optional iterable of barcodes to favour.
        weight: multiplier applied to the UMI count of favoured barcodes.
        coef: passed to `Auto`.
        percentile: passed to `Auto`.
        umi_col: name of the UMI count column.
        barcode_col: name of the barcode column used to match `target_barcodes`.
        id_col: name of the column whose values are returned.

    Returns:
        set of `id_col` values of the selected rows.

    >>> df_UMI_sum = pd.DataFrame({
    ...     "contig_id": ["A_1", "B_1", "C_1", "D_1", "E_1"],
    ...     "barcode": ["A", "B", "C", "D", "E"],
    ...     "umis": [1, 2, 1, 30, 40]})
    >>> target_contigs_id = target_cell_calling(df_UMI_sum, expected_target_cell_num=5, percentile=80, coef=5, target_barcodes=["A", "C"])
    >>> target_contigs_id == {'A_1', 'C_1', 'D_1', 'E_1'}
    True
    """
    # Identity check: `!= None` is evaluated element-wise on array-like inputs.
    if target_barcodes is not None:
        target_barcodes = set(target_barcodes)

    umi_threshold = Auto(
        list(df_UMI_sum[umi_col]),
        expected_cell_num=expected_target_cell_num,
        coef=coef,
        percentile=percentile,
    ).run()

    # Work on a derived Series so the caller's dataframe is never modified.
    weighted_umis = df_UMI_sum[umi_col]
    if target_barcodes:
        is_target = df_UMI_sum[barcode_col].isin(target_barcodes)
        # Keep the count of non-target rows, scale the count of target rows.
        weighted_umis = weighted_umis.where(~is_target, weighted_umis * weight)

    return set(df_UMI_sum.loc[weighted_umis >= umi_threshold, id_col])
def get_vj_annot(df, chains, pairs):
    """
    Summarise V(D)J annotation metrics per cell barcode.

    Args:
        df: contig-level dataframe with the columns `barcode`, `chain`, `cdr3`,
            `full_length` and `productive`.
        chains: chain names to report on, e.g. ['IGH', 'IGK', 'IGL'].
        pairs: chain pairs written as '<chain1>_<chain2>', e.g. ['IGH_IGK'].

    Returns:
        list of dicts with the keys `name`, `value` (a number of barcodes),
        `total` (number of distinct barcodes in `df`) and, for most metrics,
        `help_info`.
    """
    # Wording shared by the two "productive" help texts.
    productive_criteria = "A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"

    # `== True` (rather than truthiness) keeps NaN / non-boolean flags out of the masks.
    is_productive = df['productive'] == True
    is_full_length = df['full_length'] == True

    def count_barcodes(mask):
        """Number of distinct barcodes among the rows selected by `mask`."""
        return len(set(df.loc[mask, 'barcode'].tolist()))

    cell_nums = len(set(df['barcode'].tolist()))

    # Use the value_counts() Series directly: wrapping it in a DataFrame and
    # reading the 'barcode' column only works on pandas < 2.0, where the counts
    # column was named after the input Series.
    productive_per_barcode = df.loc[is_productive, 'barcode'].value_counts()

    Result = [{
        'name': 'Cells With Productive V-J Spanning Pair',
        'value': int((productive_per_barcode >= 2).sum()),
        'total': cell_nums,
    }]

    for p in pairs:
        parts = p.split('_')
        chain1, chain2 = parts[0], parts[1]
        cbs1 = set(df.loc[is_full_length & is_productive & (df['chain'] == chain1), 'barcode'].tolist())
        cbs2 = set(df.loc[is_full_length & is_productive & (df['chain'] == chain2), 'barcode'].tolist())
        Result.append({
            'name': f'Cells With Productive V-J Spanning ({chain1}, {chain2}) Pair',
            'value': len(cbs1 & cbs2),
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair." + productive_criteria
        })

    for c in chains:
        is_chain = df['chain'] == c
        Result.append({
            'name': f'Cells With {c} Contig',
            'value': count_barcodes(is_chain),
            'total': cell_nums,
            'help_info': f'Fraction of cell-associated barcodes with at least one {c} contig annotated as a full or partial V(D)J gene'
        })
        Result.append({
            'name': f'Cells With CDR3-annotated {c} Contig',
            # notna() really drops missing CDR3s; `!= None` is True for every row.
            'value': count_barcodes(is_chain & df['cdr3'].notna()),
            'total': cell_nums,
        })
        Result.append({
            'name': f'Cells With V-J Spanning {c} Contig',
            'value': count_barcodes(is_full_length & is_chain),
            'total': cell_nums,
            'help_info': f"Fraction of cell-associated barcodes with at least one contig spanning the 5' end of the V region to the 3' end of the J region for {c}"
        })
        Result.append({
            'name': f'Cells With Productive {c} Contig',
            'value': count_barcodes(is_full_length & is_productive & is_chain),
            'total': cell_nums,
            # Name the chain being reported instead of a hard-coded "IGL".
            'help_info': f"Fraction of cell-associated barcodes with productive {c} chain. " + productive_criteria
        })

    return Result

In [None]:
# Load the per-contig annotation table for sample Test-6-VDJ-2.
# NOTE(review): absolute, user-specific path; the notebook only re-runs where this file exists.
df = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220609cs_vdj/Test-6-VDJ-2/04.summarize/Test-6-VDJ-2_b.csv')

In [None]:
        # Overwrite `productive` with the `full_length` flag, so every later
        # `productive` filter on `df` is really a full-length filter.
        df['productive'] = df['full_length']
        # Set of all contig ids in the table (not referenced again in the visible cells).
        contig_set = set(df.contig_id)

In [None]:
        # Map each FASTA record name to the first space-separated token of its header comment.
        # The token is stored as a string; presumably it is the contig length.
        # TODO confirm against the annotate.fa header format.
        len_dict = dict()

        with pysam.FastxFile('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220609cs_vdj/Test-6-VDJ-2/03.assemble/assemble/Test-6-VDJ-2_annotate.fa') as fa:
            for read in fa:
                len_dict[read.name] = read.comment.split(' ')[0] 

In [None]:
# Variant 1: look up each contig id in len_dict through a lambda.
df['length'] = df['contig_id'].apply(lambda x: len_dict.get(x))

In [None]:
# Variant 2: same lookup, passing the bound dict method directly.
df['length'] = df['contig_id'].apply(len_dict.get)

In [None]:
df

In [None]:
# Snapshot of the `apply`-based result for the comparison below.
l1 = df.length.tolist()

In [None]:
# Variant 3: let pandas do the dictionary lookup with Series.map.
df['length'] = df['contig_id'].map(len_dict, na_action='ignore')

In [None]:
# Snapshot of the `map`-based result.
l2 = df.length.tolist()

In [None]:
# Print every position where the two variants disagree.
# NOTE(review): ids missing from len_dict probably show up here as None (dict.get) vs NaN (map), not as real differences. Confirm.
for i in range(len(l1)):
    if l1[i] != l2[i]:
        print(l1[i])
        print(l2[i])

In [None]:
# Show the contigs that have no entry in len_dict.
# `df[df['length']]` would use the length strings as column labels and raise a
# KeyError; a boolean mask is needed to select rows.
df[df['length'].isna()]

In [None]:
            # Keep the first contig of every barcode, separately for the heavy
            # chain (IGH) and for the light chains (IGK / IGL), then stack both.
            df_chain_heavy = df.loc[df['chain'] == 'IGH'].drop_duplicates(subset=['barcode'])
            df_chain_light = df.loc[df['chain'].isin(['IGK', 'IGL'])].drop_duplicates(subset=['barcode'])
            df_for_clono = pd.concat([df_chain_heavy, df_chain_light], ignore_index=True)

In [None]:
        trust_report = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220609cs_vdj/Test-6-VDJ-2/03.assemble/assemble/Test-6-VDJ-2_filter_report.tsv', sep='\t')
        # CDR3 sequences present both in the selected contigs and in the report's CDR3aa column.
        correct_cdr3 = set(df_for_clono.cdr3).intersection(set(trust_report.CDR3aa))
        # Keep CDR3s that start with 'C', are at least 5 characters long and
        # contain none of the stop codons. The original test
        # `'UAG' or 'UAA' or 'UGA' not in i` was always truthy and filtered nothing.
        # NOTE(review): the sequences are matched against `CDR3aa`, i.e. they look like
        # amino-acid strings; confirm that nucleotide triplets are the right thing to test.
        correct_cdr3 = [
            i for i in correct_cdr3
            if isinstance(i, str)  # a missing CDR3 (NaN) is a float and has no .startswith
            and i.startswith('C')
            and len(i) >= 5
            and not any(stop_codon in i for stop_codon in ('UAG', 'UAA', 'UGA'))
        ]
        df_for_clono = df_for_clono[df_for_clono['cdr3'].isin(correct_cdr3)]

In [None]:
# Inspect the table after the CDR3 filter.
df_for_clono

In [None]:
        # Filter heavy and light chains separately
        df_chain_heavy = df_for_clono[df_for_clono['chain']=='IGH']
        df_chain_light = df_for_clono[(df_for_clono['chain']=='IGK') | (df_for_clono['chain']=='IGL')]

In [None]:
        # Run the UMI-threshold cell calling once per chain class and pool the selected contig ids.
        filtered_congtigs_id = set()
        for _df in [df_chain_heavy, df_chain_light]:
            # target_barcodes=None, so `weight` has no effect in this call.
            target_contigs = target_cell_calling(
            _df, 
            expected_target_cell_num=15000, 
            target_barcodes=None,
            weight = 6.0,
            coef = 5
            )
            filtered_congtigs_id = filtered_congtigs_id | target_contigs       
        
        # Keep only the contigs selected for either chain class.
        df_for_clono = df_for_clono[df_for_clono.contig_id.isin(filtered_congtigs_id)]

In [None]:
df_for_clono

In [None]:
# Two separate pair strings: 'IGH_IGK, IGH_IGL' as a single string would be
# split into the chains 'IGH' and 'IGK, IGH', which match nothing.
get_vj_annot(df_for_clono, ['IGH', 'IGK', 'IGL'], ['IGH_IGK', 'IGH_IGL'])

In [None]:
# Total UMI count per barcode; the result has exactly the columns `barcode` and `umis`.
df_UMI_sum = df_for_clono.groupby(['barcode'], as_index=False).agg({"umis": "sum"})

In [None]:
df_UMI_sum

In [None]:
            # df_UMI_sum is aggregated per barcode and has no `contig_id` column,
            # which target_cell_calling returns values from. Expose the barcode
            # under that name so the call returns the selected barcodes.
            target_barcodes = target_cell_calling(
                df_UMI_sum.assign(contig_id=df_UMI_sum['barcode']),
                expected_target_cell_num=3000,
                target_barcodes=None,
                weight=6.0,
                coef=5,
            )

In [None]:
        
        # Keep only the contigs whose barcode was returned by the barcode-level cell calling.
        df_for_clono = df_for_clono[df_for_clono.barcode.isin(target_barcodes)]

In [None]:
# Inspect the filtered table.
df_for_clono

In [None]:
df_for_clono

In [None]:
        # Barcodes that still have at least one contig flagged `productive` after filtering.
        df_for_clono_pro = df_for_clono[df_for_clono['productive']==True]
        cell_barcodes = set(df_for_clono_pro['barcode'])

In [None]:
        # All contigs of the full table `df` that belong to those barcodes
        # (not only the contigs kept in df_for_clono).
        df_filter = df[df.barcode.isin(cell_barcodes)]

In [None]:
# Every other cell of this notebook selects the IGH / IGK / IGL chains, so the
# TCR chains ('TRA', 'TRB') used here before would report zero for each metric.
# NOTE(review): confirm the sample is immunoglobulin (BCR) data.
get_vj_annot(df_filter, ['IGH', 'IGK', 'IGL'], ['IGH_IGK', 'IGH_IGL'])

In [None]:
def get_vj_annot(df, chains, pairs):
    """
    Summarise V(D)J annotation metrics per cell barcode.

    Args:
        df: contig-level dataframe with the columns `barcode`, `chain`, `cdr3`,
            `full_length` and `productive`.
        chains: chain names to report on, e.g. ['IGH', 'IGK', 'IGL'].
        pairs: chain pairs written as '<chain1>_<chain2>', e.g. ['IGH_IGK'].

    Returns:
        list of dicts with the keys `name`, `value` (a number of barcodes),
        `total` (number of distinct barcodes in `df`) and, for most metrics,
        `help_info`.
    """
    # Wording shared by the two "productive" help texts.
    productive_criteria = "A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"

    # `== True` (rather than truthiness) keeps NaN / non-boolean flags out of the masks.
    is_productive = df['productive'] == True
    is_full_length = df['full_length'] == True

    def count_barcodes(mask):
        """Number of distinct barcodes among the rows selected by `mask`."""
        return len(set(df.loc[mask, 'barcode'].tolist()))

    cell_nums = len(set(df['barcode'].tolist()))

    # Use the value_counts() Series directly: wrapping it in a DataFrame and
    # reading the 'barcode' column only works on pandas < 2.0, where the counts
    # column was named after the input Series.
    productive_per_barcode = df.loc[is_productive, 'barcode'].value_counts()

    Result = [{
        'name': 'Cells With Productive V-J Spanning Pair',
        'value': int((productive_per_barcode >= 2).sum()),
        'total': cell_nums,
    }]

    for p in pairs:
        parts = p.split('_')
        chain1, chain2 = parts[0], parts[1]
        cbs1 = set(df.loc[is_full_length & is_productive & (df['chain'] == chain1), 'barcode'].tolist())
        cbs2 = set(df.loc[is_full_length & is_productive & (df['chain'] == chain2), 'barcode'].tolist())
        Result.append({
            'name': f'Cells With Productive V-J Spanning ({chain1}, {chain2}) Pair',
            'value': len(cbs1 & cbs2),
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair." + productive_criteria
        })

    for c in chains:
        is_chain = df['chain'] == c
        Result.append({
            'name': f'Cells With {c} Contig',
            'value': count_barcodes(is_chain),
            'total': cell_nums,
            'help_info': f'Fraction of cell-associated barcodes with at least one {c} contig annotated as a full or partial V(D)J gene'
        })
        Result.append({
            'name': f'Cells With CDR3-annotated {c} Contig',
            # notna() really drops missing CDR3s; `!= None` is True for every row.
            'value': count_barcodes(is_chain & df['cdr3'].notna()),
            'total': cell_nums,
        })
        Result.append({
            'name': f'Cells With V-J Spanning {c} Contig',
            'value': count_barcodes(is_full_length & is_chain),
            'total': cell_nums,
            'help_info': f"Fraction of cell-associated barcodes with at least one contig spanning the 5' end of the V region to the 3' end of the J region for {c}"
        })
        Result.append({
            'name': f'Cells With Productive {c} Contig',
            'value': count_barcodes(is_full_length & is_productive & is_chain),
            'total': cell_nums,
            # Name the chain being reported instead of a hard-coded "IGL".
            'help_info': f"Fraction of cell-associated barcodes with productive {c} chain. " + productive_criteria
        })

    return Result

In [None]:
def get_vj_annot(df, chains, pairs):
    """
    Summarise V(D)J annotation metrics per cell barcode.

    In this variant the first metric counts the barcodes that have a single
    contig which is productive, plus the barcodes that have at least two
    productive contigs.

    Args:
        df: contig-level dataframe with the columns `barcode`, `chain`, `cdr3`,
            `full_length` and `productive`.
        chains: chain names to report on, e.g. ['IGH', 'IGK', 'IGL'].
        pairs: chain pairs written as '<chain1>_<chain2>', e.g. ['IGH_IGK'].

    Returns:
        list of dicts with the keys `name`, `value` (a number of barcodes),
        `total` (number of distinct barcodes in `df`) and, for most metrics,
        `help_info`.
    """
    # Wording shared by the two "productive" help texts.
    productive_criteria = "A productive contig satisfies the following conditions: the contig annotations span the 5' end of the V region to the 3' end of the J region of the chain, a start codon was found in the expected part of the V sequence, an in-frame CDR3 amino acid motif was found, and no stop codons were found in the aligned V-J region"

    # `== True` (rather than truthiness) keeps NaN / non-boolean flags out of the masks.
    is_productive = df['productive'] == True
    is_full_length = df['full_length'] == True

    def count_barcodes(mask):
        """Number of distinct barcodes among the rows selected by `mask`."""
        return len(set(df.loc[mask, 'barcode'].tolist()))

    # Use the value_counts() Series directly: reading a 'barcode' column or
    # attribute from the wrapped DataFrame only works on pandas < 2.0, where the
    # counts column was named after the input Series.
    contigs_per_barcode = df['barcode'].value_counts()
    single_contig_barcodes = set(contigs_per_barcode[contigs_per_barcode == 1].index)
    # Barcodes with exactly one contig, counted when that contig is productive.
    count = int((df['barcode'].isin(single_contig_barcodes) & is_productive).sum())

    # Barcodes with two or more productive contigs.
    productive_per_barcode = df.loc[is_productive, 'barcode'].value_counts()
    count += int((productive_per_barcode >= 2).sum())

    cell_nums = len(set(df['barcode'].tolist()))
    Result = [{
        'name': 'Cells With Productive V-J Spanning Pair',
        'value': count,
        'total': cell_nums,
    }]

    for p in pairs:
        parts = p.split('_')
        chain1, chain2 = parts[0], parts[1]
        cbs1 = set(df.loc[is_full_length & is_productive & (df['chain'] == chain1), 'barcode'].tolist())
        cbs2 = set(df.loc[is_full_length & is_productive & (df['chain'] == chain2), 'barcode'].tolist())
        Result.append({
            'name': f'Cells With Productive V-J Spanning ({chain1}, {chain2}) Pair',
            'value': len(cbs1 & cbs2),
            'total': cell_nums,
            'help_info': "Fraction of cell-associated barcodes with one productive contig for each chain of the receptor pair." + productive_criteria
        })

    for c in chains:
        is_chain = df['chain'] == c
        Result.append({
            'name': f'Cells With {c} Contig',
            'value': count_barcodes(is_chain),
            'total': cell_nums,
            'help_info': f'Fraction of cell-associated barcodes with at least one {c} contig annotated as a full or partial V(D)J gene'
        })
        Result.append({
            'name': f'Cells With CDR3-annotated {c} Contig',
            # notna() really drops missing CDR3s; `!= None` is True for every row.
            'value': count_barcodes(is_chain & df['cdr3'].notna()),
            'total': cell_nums,
        })
        Result.append({
            'name': f'Cells With V-J Spanning {c} Contig',
            'value': count_barcodes(is_full_length & is_chain),
            'total': cell_nums,
            'help_info': f"Fraction of cell-associated barcodes with at least one contig spanning the 5' end of the V region to the 3' end of the J region for {c}"
        })
        Result.append({
            'name': f'Cells With Productive {c} Contig',
            'value': count_barcodes(is_full_length & is_productive & is_chain),
            'total': cell_nums,
            # Name the chain being reported instead of a hard-coded "IGL".
            'help_info': f"Fraction of cell-associated barcodes with productive {c} chain. " + productive_criteria
        })

    return Result

In [None]:
        # Productive contigs per barcode, as a one-column frame named 'barcode'.
        # The name is pinned explicitly: since pandas 2.0 value_counts() calls its
        # result 'count' and names the index 'barcode', so `bc_count.barcode` and
        # `['barcode']` would otherwise fail.
        bc_count = df[df['productive']==True].barcode.value_counts().rename('barcode').rename_axis(None).to_frame()
        bc_count_one_chain = set(bc_count[bc_count['barcode']==1].index)
        # Productive rows of the barcodes that have exactly one productive contig.
        vdj_spanning_pair = df[df.barcode.isin(bc_count_one_chain)]
        count = vdj_spanning_pair[vdj_spanning_pair['productive']==True].shape[0]

        # Add the barcodes that have two or more productive contigs.
        vdj_spanning_pair = bc_count.copy()
        count += vdj_spanning_pair[vdj_spanning_pair['barcode']>=2].shape[0]

In [None]:
# Load the filtered contig table from the 04.summarize directory.
# This rebinds `df_filter`, replacing the frame derived from `df` in an earlier cell.
df_filter= pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220609cs_vdj/Test-6-VDJ-2/04.summarize/Test-6-VDJ-2_filtered_contig.csv')

In [None]:
# Metrics for the three chains and both IGH pairings; uses whichever get_vj_annot definition ran last.
get_vj_annot(df_filter, ['IGH','IGK','IGL'], ['IGH_IGK', 'IGH_IGL'])

In [None]:
df_filter_filter = pd.DataFrame(df_filter.barcode.value_counts())

In [None]:
df_filter_filter[df_filter_filter.barcode==1]

In [None]:
df_filter[df_filter['barcode']=='CCGTGAGACCATCCTCCGCTGATC']

In [None]:
# Load a second contig table (`outs/filtered_contig_annotations.csv` of run Test-7-VDJ-2) for comparison.
# NOTE(review): the path layout looks like Cell Ranger output; confirm.
ten = pd.read_csv('/SGRNJ06/randd/USER/cjj/celedev/kemai/20220609cr_vdj/7VDJ2/Test-7-VDJ-2/outs/filtered_contig_annotations.csv')

In [None]:
ten

In [None]:
# Number of distinct barcodes in the comparison table.
len(set(ten.barcode))

In [None]:
flten = pd.DataFrame(ten[ten['productive']==True].barcode.value_counts())


In [None]:
flten

In [None]:
flten[flten['barcode']>=2].shape[0]

In [None]:
ten[ten['barcode']=='ACGAGGAGTGCACTTA-1']