In [241]:
import numpy as np
import pandas as pd
import math

In [320]:
def hamming_distance(string1, string2):
    distance = 0
    length = len(string1)
    length2 = len(string2)
    if (length != length2):
        raise Exception(f"string1({length}) and string2({length2}) do not have same length")
    for i in range(length):
        if string1[i] != string2[i]:
            distance += 1
    return distance
def correct_cdr3_nt(umi_dict, percent=0.1):
    """
    Correct umi_dict in place.
    Args:
        umi_dict: {cdr3_nt: umi_count}
        percent: if hamming_distance(low_umi_cdr3, high_umi_cdr3) == 1 and
            low_count / high_count < percent, merge low to high.
    Returns:
        correct_dict: dict {low_umi_cdr3: high_umi_cdr3}
    """
    correct_dict = dict()
    
    umi_arr = sorted(
        umi_dict.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    
    while True:
        # break when only highest in umi_arr
        if len(umi_arr) == 1:
            break
        umi_low = umi_arr.pop()
        low_seq = umi_low[0]
        low_count = umi_low[1]

        for umi_kv in umi_arr:
            high_seq = umi_kv[0]
            high_count = umi_kv[1]
            if len(low_seq) != len(high_seq):
                break
            if float(low_count / high_count) > percent:
                break
            if hamming_distance(low_seq, high_seq) == 1:
                correct_dict[low_seq] = high_seq
                n_low = umi_dict[low_seq]
                # merge
                umi_dict[high_seq] += n_low
                del (umi_dict[low_seq])
                break
            
    return correct_dict

In [321]:
productive_file = pd.read_csv("/SGRNJ06/randd/USER/cjj/celedev/vdj_bulk/20230209tcr/ts16/R230203023C/04.mapping_vdj/R230203023C_productive.tsv", sep='\t')
chains = ["TRA","TRB"]
productive_file = productive_file[productive_file["chain"].isin(chains)]

In [322]:
productive_file["umi"] = productive_file["sequence_id"].apply(lambda x: x.split('_')[1])

In [323]:
productive_file

Unnamed: 0,barcode,sequence_id,chain,bestVGene,bestDGene,bestJGene,nSeqCDR3,aaSeqCDR3,umi
0,TCATGA,TCATGA_AAAAAAACCAAACAAC_4,TRB,TRBV7-9,,TRBJ2-7,TGTGCCAGCAGCTTCCTGGGCGAGCAGTACTTC,CASSFLGEQYF,AAAAAAACCAAACAAC
1,TCATGA,TCATGA_AAAAAAAGTGGGGTCA_5,TRB,TRBV19,TRBD2,TRBJ2-1,TGTGCCAGTAGTCGGGGGGCCCACGGGGACAATGAGCAGGTCTTC,CASSRGAHGDNEQVF,AAAAAAAGTGGGGTCA
2,TCATGA,TCATGA_AAAAAAGAAAAGTAGC_9,TRB,TRBV5-1,,TRBJ2-3,TGCGCCAGCAGCTCTAACACAGAAACGCAGTATTTT,CASSSNTETQYF,AAAAAAGAAAAGTAGC
3,TCATGA,TCATGA_AAAAAAGCCTGGACCA_11,TRB,TRBV9,TRBD2,TRBJ2-3,TGTGCCAGCAGACCCAACAGCGGGGGCACAGATACGCAGTATTTT,CASRPNSGGTDTQYF,AAAAAAGCCTGGACCA
4,TCATGA,TCATGA_AAAAAAGGTAGAGTGG_12,TRB,TRBV11-2,,TRBJ2-3,TGTGCCAGCACCCCCAGTTGGGTCGCAGATACGCAGTATTTT,CASTPSWVADTQYF,AAAAAAGGTAGAGTGG
...,...,...,...,...,...,...,...,...,...
98214,TCATGA,TCATGA_TTTTTGTCATAACTTG_189902,TRB,TRBV6-6,,TRBJ1-2,TGTGCCAGCAGTCGTACAGTGAATATCTATGGCTACACCTTC,CASSRTVNIYGYTF,TTTTTGTCATAACTTG
98215,TCATGA,TCATGA_TTTTTTAAGGCCATGC_189908,TRA,TRAV20,,TRAJ57,TGTGCTGCCCCCTCACAGGGCGGATCTGAAAAGCTGGTCTTT,CAAPSQGGSEKLVF,TTTTTTAAGGCCATGC
98216,TCATGA,TCATGA_TTTTTTCGAAAATGAG_189913,TRB,TRBV10-3,TRBD1,TRBJ2-1,TGTGCCATCCATCGAGGGGACAATGAGCAGTTCTTC,CAIHRGDNEQFF,TTTTTTCGAAAATGAG
98217,TCATGA,TCATGA_TTTTTTGAAAACCCAG_189914,TRB,TRBV21-1,TRBD2,TRBJ2-1,TGTGCCAGCAGCTTGCGACTCACGGGGGGGTACAATGAGCAGTTCTTC,CASSLRLTGGYNEQFF,TTTTTTGAAAACCCAG


In [324]:
len(set(productive_file.nSeqCDR3))

38904

In [325]:
len(set(productive_file.aaSeqCDR3))

36309

In [326]:

n_corrected_cdr3_nt, n_corrected_umi = 0, 0
for chain in chains:
    df_tmp = productive_file[productive_file["chain"] == chain]
    groupby_elements = ["chain","nSeqCDR3"]
    clonetypes = df_tmp.groupby(groupby_elements, as_index=False).agg({"umi": "count"})
    clonetypes = clonetypes.sort_values("umi", ascending=False)
    umi_dict = dict(zip(list(clonetypes.nSeqCDR3),list(clonetypes.umi)))
    
    correct_dict = correct_cdr3_nt(umi_dict)
    
    for low_nt, high_nt in correct_dict.items():
        high_aa = df_tmp.loc[df_tmp.nSeqCDR3 == high_nt].aaSeqCDR3.iloc[0]

        productive_file.loc[productive_file.nSeqCDR3 == low_nt, 'aaSeqCDR3'] = high_aa
        productive_file.loc[productive_file.nSeqCDR3 == low_nt, 'nSeqCDR3'] = high_nt

In [327]:
len(set(productive_file.nSeqCDR3))

38721

In [328]:
len(set(productive_file.aaSeqCDR3))

36238

In [310]:
n_corrected_cdr3_nt

183

In [311]:
n_corrected_umi

788

In [329]:
productive_file

Unnamed: 0,barcode,sequence_id,chain,bestVGene,bestDGene,bestJGene,nSeqCDR3,aaSeqCDR3,umi
0,TCATGA,TCATGA_AAAAAAACCAAACAAC_4,TRB,TRBV7-9,,TRBJ2-7,TGTGCCAGCAGCTTCCTGGGCGAGCAGTACTTC,CASSFLGEQYF,AAAAAAACCAAACAAC
1,TCATGA,TCATGA_AAAAAAAGTGGGGTCA_5,TRB,TRBV19,TRBD2,TRBJ2-1,TGTGCCAGTAGTCGGGGGGCCCACGGGGACAATGAGCAGGTCTTC,CASSRGAHGDNEQVF,AAAAAAAGTGGGGTCA
2,TCATGA,TCATGA_AAAAAAGAAAAGTAGC_9,TRB,TRBV5-1,,TRBJ2-3,TGCGCCAGCAGCTCTAACACAGAAACGCAGTATTTT,CASSSNTETQYF,AAAAAAGAAAAGTAGC
3,TCATGA,TCATGA_AAAAAAGCCTGGACCA_11,TRB,TRBV9,TRBD2,TRBJ2-3,TGTGCCAGCAGACCCAACAGCGGGGGCACAGATACGCAGTATTTT,CASRPNSGGTDTQYF,AAAAAAGCCTGGACCA
4,TCATGA,TCATGA_AAAAAAGGTAGAGTGG_12,TRB,TRBV11-2,,TRBJ2-3,TGTGCCAGCACCCCCAGTTGGGTCGCAGATACGCAGTATTTT,CASTPSWVADTQYF,AAAAAAGGTAGAGTGG
...,...,...,...,...,...,...,...,...,...
98214,TCATGA,TCATGA_TTTTTGTCATAACTTG_189902,TRB,TRBV6-6,,TRBJ1-2,TGTGCCAGCAGTCGTACAGTGAATATCTATGGCTACACCTTC,CASSRTVNIYGYTF,TTTTTGTCATAACTTG
98215,TCATGA,TCATGA_TTTTTTAAGGCCATGC_189908,TRA,TRAV20,,TRAJ57,TGTGCTGCCCCCTCACAGGGCGGATCTGAAAAGCTGGTCTTT,CAAPSQGGSEKLVF,TTTTTTAAGGCCATGC
98216,TCATGA,TCATGA_TTTTTTCGAAAATGAG_189913,TRB,TRBV10-3,TRBD1,TRBJ2-1,TGTGCCATCCATCGAGGGGACAATGAGCAGTTCTTC,CAIHRGDNEQFF,TTTTTTCGAAAATGAG
98217,TCATGA,TCATGA_TTTTTTGAAAACCCAG_189914,TRB,TRBV21-1,TRBD2,TRBJ2-1,TGTGCCAGCAGCTTGCGACTCACGGGGGGGTACAATGAGCAGTTCTTC,CASSLRLTGGYNEQFF,TTTTTTGAAAACCCAG


In [None]:
import pandas as pd

In [None]:
airr = pd.read_csv("/SGRNJ06/randd/USER/cjj/celedev/vdj_bulk/20230209tcr/ts16/R230203023C/04.mapping_vdj/R230203023C_airr.tsv", sep='\t')

In [None]:
airr = airr[airr["locus"].isin(["TRA","TRB"])]
airr = airr[airr["productive"]=="T"]

In [None]:
airr

In [None]:
productive_file["cdr3_len"] = productive_file["aaSeqCDR3"].apply(lambda x: len(x))

In [None]:
productive_file.sort_values("cdr3_len",ascending=False)

In [None]:
clonetypes

In [None]:
umitot = clonetypes[clonetypes["chain"]=="TRA"].umi.shape[0] + clonetypes[clonetypes["chain"]=="TRB"].umi.shape[0]

In [None]:
n_50 = np.median(clonetypes.umi)

In [None]:
n_10 = list(clonetypes.umi)[clonetypes.shape[0]//10]

In [None]:
n_50

In [None]:
n_10

In [None]:
umin = min( n_10, n_50 - (4 * math.sqrt(n_50)) )

In [None]:
umin

In [None]:
math.sqrt(1)

In [None]:
# 如果一对链具有相同长度的 VJ 基因和 CDR3 片段，则合并两个精确的亚克隆型。


In [None]:
df = airr
df.fillna("", inplace=True)

In [None]:
df = df[(df["v_call"]!="") | ((df["d_call"]!="")) | ((df["j_call"]!=""))]

In [None]:
df_cdr3 = df[(df["cdr3_aa"]!="") & (df["junction_aa"]!="")]

In [None]:
df_correct_cdr3 = df_cdr3[~(df_cdr3["cdr3_aa"].str.contains(r"\*")) & ~(df_cdr3["cdr3_aa"].str.contains("X"))]

In [None]:
df_confident = df_correct_cdr3[df_correct_cdr3["productive"]=="T"]

In [None]:
productive_file = df_confident[df_confident["locus"].isin(["TRA","TRB"])]

In [None]:
productive_file

In [None]:
productive_file["v_len"] = productive_file["v_alignment_end"] - productive_file["v_alignment_start"]
productive_file["j_len"] = productive_file["j_alignment_end"] - productive_file["j_alignment_start"]

In [None]:
productive_file = productive_file[productive_file["junction_aa"].str.len()>7]
productive_file = productive_file[productive_file["junction_aa"].str.startswith('C')]

In [None]:
groupby_elements = ["v_len","j_len","junction_aa"]
clonetypes = productive_file.groupby(groupby_elements, as_index=False).count()

In [None]:
clonetypes

In [None]:
len(set(clonetypes["junction_aa"]))