In [31]:
import pandas as pd
import numpy as np
import numbers
from celescope.tools import utils
from celescope.tools.step import Step, s_common
from celescope.vdj.__init__ import CHAINS


def format_value(value, total):
    """Render a metric value as display text.

    Args:
        value: the value to render. Anything that is not a
            ``numbers.Number`` is returned unchanged.
        total: denominator for the percentage suffix. When falsy
            (``0``, ``None``, ...) no percentage is appended.
    Returns:
        For numbers, a string with thousands separators, followed by
        ``(<pct>%)`` when ``total`` is truthy, where ``<pct>`` is
        ``value / total * 100`` rounded to two decimals. Otherwise ``value``.
    """
    if isinstance(value, numbers.Number):
        text = f'{value:,}'
        if not total:
            return text
        share = round(value / total * 100, 2)
        return f'{text}({share}%)'
    return value


def correct_cdr3_nt(umi_dict, percent=0.1):
    """
    Merge rare CDR3 nucleotide sequences into abundant one-mismatch neighbours.

    Corrects umi_dict in place.
    Args:
        umi_dict: {cdr3_nt: umi_count}
        percent: a low-count sequence is merged into a higher-count one when
            both have the same length, hamming_distance(low, high) == 1 and
            low_count / high_count <= percent. The counts compared are the
            ones present when the function was called; counts gained through
            earlier merges are not re-read for this ratio.
    Returns:
        correct_dict: dict {low_umi_cdr3: high_umi_cdr3}
    """
    correct_dict = dict()

    # Descending by (count, sequence): the least abundant entry sits at the end.
    umi_arr = sorted(
        umi_dict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)

    # Repeatedly take the least abundant entry and look for a merge target among
    # the remaining, more abundant ones. Empty and single-entry inputs fall through.
    while len(umi_arr) > 1:
        low_seq, low_count = umi_arr.pop()

        for high_seq, high_count in umi_arr:
            if len(low_seq) != len(high_seq):
                # Hamming distance is only meaningful for equal lengths; skip
                # this candidate but keep looking (a `break` here would discard
                # every remaining same-length candidate).
                continue
            if low_count / high_count > percent:
                # Candidates are in descending count order, so every later
                # candidate would exceed the ratio as well.
                break
            if utils.hamming_distance(low_seq, high_seq) == 1:
                correct_dict[low_seq] = high_seq
                # Current count of low_seq, including anything merged into it.
                n_low = umi_dict[low_seq]
                # merge
                umi_dict[high_seq] += n_low
                del umi_dict[low_seq]
                break

    return correct_dict

In [23]:
# Chain labels used to filter the table and to drive the per-chain correction loop.
# NOTE(review): CHAINS is imported from celescope.vdj above but this notebook uses
# its own lowercase list instead — confirm that is intentional.
chains = ["TRA", "TRB"]

In [24]:
# Load the tab-separated productive-sequence table from the 04.mapping_vdj
# output directory. Despite its name, `productive_file` holds a DataFrame.
productive_file = pd.read_csv(
    "/SGRNJ06/randd/PROJECT/RD20073101_ScRNA_VDJ/bulk_TCR/20230814_spike-in/H_0807K562_CF_3_T3lib/04.mapping_vdj//H_0807K562_CF_3_T3lib_productive.tsv",
    sep='\t',
)

In [25]:
# Keep only rows whose chain is listed in `chains`. The explicit .copy() makes the
# result an independent frame, so the later column assignments on it (adding
# "umi", rewriting nSeqCDR3/aaSeqCDR3) do not act on a slice of the loaded table
# and do not trigger SettingWithCopyWarning.
productive_file = productive_file[productive_file["chain"].isin(chains)].copy()

In [26]:
# Inspect the chain-filtered table (rendered as the cell's output).
productive_file

Unnamed: 0,barcode,sequence_id,chain,bestVGene,bestDGene,bestJGene,nSeqCDR3,aaSeqCDR3
0,ACGTCT,ACGTCT_AAAAAGGTGTGTTCCA_56,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF
1,ACGTCT,ACGTCT_AAAACCACTGGTGGGG_89,TRB,TRBV3-1,,TRBJ2-3,TGTGCNAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF
2,ACGTCT,ACGTCT_AAAACGCGAAAGATCC_110,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF
3,ACGTCT,ACGTCT_AAACGGACCATTGATG_276,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF
4,ACGTCT,ACGTCT_AAACGTTTGTGACACG_288,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF
...,...,...,...,...,...,...,...,...
33932,TTCTCG,TTCTCG_TTTTGGGGAGTGCCCA_222454,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF
33933,TTCTCG,TTCTCG_TTTTGGTGAGTGCCCA_222458,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF
33934,TTCTCG,TTCTCG_TTTTGGTGATTATTGG_222459,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF
33935,TTCTCG,TTCTCG_TTTTTGCGAGTGCCCA_222529,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF


In [27]:
# The UMI is the second underscore-separated field of sequence_id
# (e.g. "ACGTCT_AAAAAGGTGTGTTCCA_56" -> "AAAAAGGTGTGTTCCA").
productive_file["umi"] = productive_file["sequence_id"].map(
    lambda seq_id: seq_id.split('_')[1]
)

In [41]:
        # For each chain: count umi entries per CDR3 nucleotide sequence, ask
        # correct_cdr3_nt which rare sequences should be folded into an abundant
        # one, then rewrite those rows of productive_file in place.
        for chain in chains:
            df_tmp = productive_file.loc[productive_file["chain"] == chain]
            groupby_elements = ["chain", "nSeqCDR3"]
            clonetypes = (
                df_tmp.groupby(groupby_elements, as_index=False)
                .agg({"umi": "count"})
                .sort_values("umi", ascending=False)
            )
            umi_dict = dict(zip(clonetypes["nSeqCDR3"], clonetypes["umi"]))

            # No guard is needed for an empty umi_dict (chain absent from the
            # data): correct_cdr3_nt returns an empty mapping in that case.
            correct_dict = correct_cdr3_nt(umi_dict)

            for low_nt, high_nt in correct_dict.items():
                # Amino-acid sequence of the first row carrying the abundant CDR3.
                high_aa = df_tmp.loc[df_tmp["nSeqCDR3"] == high_nt, "aaSeqCDR3"].iloc[0]
                # Select the rows once, before nSeqCDR3 itself is overwritten.
                low_rows = productive_file["nSeqCDR3"] == low_nt
                productive_file.loc[low_rows, "aaSeqCDR3"] = high_aa
                productive_file.loc[low_rows, "nSeqCDR3"] = high_nt

In [42]:
# Inspect the table after the per-chain correction; it now carries the `umi` column.
productive_file

Unnamed: 0,barcode,sequence_id,chain,bestVGene,bestDGene,bestJGene,nSeqCDR3,aaSeqCDR3,umi
0,ACGTCT,ACGTCT_AAAAAGGTGTGTTCCA_56,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,AAAAAGGTGTGTTCCA
1,ACGTCT,ACGTCT_AAAACCACTGGTGGGG_89,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,AAAACCACTGGTGGGG
2,ACGTCT,ACGTCT_AAAACGCGAAAGATCC_110,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,AAAACGCGAAAGATCC
3,ACGTCT,ACGTCT_AAACGGACCATTGATG_276,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,AAACGGACCATTGATG
4,ACGTCT,ACGTCT_AAACGTTTGTGACACG_288,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,AAACGTTTGTGACACG
...,...,...,...,...,...,...,...,...,...
33932,TTCTCG,TTCTCG_TTTTGGGGAGTGCCCA_222454,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,TTTTGGGGAGTGCCCA
33933,TTCTCG,TTCTCG_TTTTGGTGAGTGCCCA_222458,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,TTTTGGTGAGTGCCCA
33934,TTCTCG,TTCTCG_TTTTGGTGATTATTGG_222459,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,TTTTGGTGATTATTGG
33935,TTCTCG,TTCTCG_TTTTTGCGAGTGCCCA_222529,TRB,TRBV3-1,,TRBJ2-3,TGTGCCAGCAGCCTTGGGACAGATACGCAGTATTTT,CASSLGTDTQYF,TTTTTGCGAGTGCCCA


In [54]:
# Isolate the TRA rows to examine the empty-chain case on its own.
df_tmp = productive_file.loc[productive_file["chain"] == "TRA"]

In [55]:
# Inspect the TRA subset; the recorded output below shows only the header row.
df_tmp

Unnamed: 0,barcode,sequence_id,chain,bestVGene,bestDGene,bestJGene,nSeqCDR3,aaSeqCDR3,umi


In [56]:
            # Same aggregation as in the per-chain loop, applied to the current df_tmp:
            # umi entries counted per (chain, nSeqCDR3), most frequent first.
            groupby_elements = ["chain", "nSeqCDR3"]
            clonetypes = (
                df_tmp.groupby(groupby_elements, as_index=False)
                .agg({"umi": "count"})
                .sort_values("umi", ascending=False)
            )
            umi_dict = dict(zip(clonetypes["nSeqCDR3"], clonetypes["umi"]))

In [57]:
        # Show the {nSeqCDR3: count} mapping built from df_tmp.
        umi_dict

{}

In [51]:
            # Run the correction on whatever umi_dict is currently in the kernel;
            # umi_dict is modified in place and the low -> high mapping is returned.
            correct_dict = correct_cdr3_nt(umi_dict)

In [52]:
# Show the {low_cdr3: high_cdr3} mapping returned by correct_cdr3_nt.
correct_dict

{}

In [53]:
# Print each corrected pair on two lines: the rare sequence, then its target.
for low_nt, high_nt in correct_dict.items():
    print(low_nt, high_nt, sep='\n')