In [None]:
import pandas as pd
import glob

In [None]:
def simpson_di(data):
    """Compute the Simpson Diversity Index of a ``{'species': count}`` mapping.

    The index is the sum of the squared relative abundances of all species.
    Species whose count is zero are skipped, so an empty mapping (or one
    holding only zeros) yields 0.

    >>> simpson_di({'a': 10, 'b': 20, 'c': 30,})
    0.3888888888888889
    """
    total = sum(data.values())

    squared_shares = []
    for count in data.values():
        if count != 0:
            # Relative abundance of this species, squared.
            squared_shares.append((float(count) / total) ** 2)

    return sum(squared_shares)


def inverse_simpson_di(data):
    """Return the reciprocal of the Simpson Diversity Index of ``data``.

    ``data`` is a ``{'species': count}`` mapping, passed unchanged to
    ``simpson_di``. A ``ZeroDivisionError`` is raised when ``simpson_di``
    returns 0 (for example for an empty mapping).

    >>> inverse_simpson_di({'a': 10, 'b': 20, 'c': 30,})
    2.571428571428571
    """
    simpson_index = simpson_di(data)
    return 1.0 / simpson_index

In [None]:
# Tab-separated `*_corrected_productive.tsv` table that the rest of the notebook analyses.
productive_tsv_path = "/SGRNJ06/randd/USER/cjj/celedev/vdj_bulk/20230630test/Mus_0508spleen_T3lib/05.count_vdj/Mus_0508spleen_T3lib_corrected_productive.tsv"
productive_file = pd.read_csv(productive_tsv_path, sep="\t")

In [None]:
# Preview the table that was just loaded.
productive_file

In [None]:
# Empty placeholder frame; none of the visible cells reads `df` again.
df = pd.DataFrame()

In [None]:
# Distinct barcodes present in the input table.
index_set = set(productive_file["barcode"])
# CDR3 columns available in the table: nucleotide and amino-acid sequence.
CDR3_seq_type = ["nSeqCDR3", "aaSeqCDR3"]
# Accumulators for the per-barcode clonotype tables.
clonotypes_nt, clonotypes_aa = pd.DataFrame(), pd.DataFrame()

In [None]:
# CDR3 column that the clonotype loop in the following cell works on.
seq = "aaSeqCDR3"

In [None]:
# Build one clonotype table per barcode for the CDR3 column named by `seq`,
# then append all of them to the matching accumulator in a single concat.
per_barcode_tables = []
for index in index_set:
    # Count the non-null `umi` entries of every chain + CDR3 combination
    # found for this barcode.
    groupby_elements = ["chain", seq]
    df_clonetypes = (
        productive_file[productive_file["barcode"] == index]
        .groupby(groupby_elements, as_index=False)
        .agg({"umi": "count"})
    )
    # Label each clonotype as "<chain>:<CDR3>".
    df_clonetypes[seq] = df_clonetypes[["chain", seq]].apply(':'.join, axis=1)
    # Rank by abundance so that ClonotypeID 1 is the most frequent clonotype.
    df_clonetypes = (
        df_clonetypes
        .sort_values("umi", ascending=False)
        .reset_index(drop=True)
        .rename(columns={"umi": "Frequency"})
    )
    df_clonetypes["ClonotypeID"] = df_clonetypes.index + 1

    # Share of each clonotype within the barcode, rendered as a percentage string.
    sum_frequency = df_clonetypes["Frequency"].sum()
    proportion = df_clonetypes["Frequency"] / sum_frequency
    df_clonetypes["Proportion"] = proportion.apply(lambda x: str(round(x * 100, 2)) + '%')

    # .copy() so the assignments below act on an independent frame rather
    # than on a column slice.
    df_clonetypes = df_clonetypes[["ClonotypeID", seq, "Frequency", "Proportion"]].copy()
    df_clonetypes["Index"] = index

    # Inverse Simpson index over this barcode's clonotype frequencies
    # (the original read the undefined name `clonetypes` here).
    data = dict(zip(df_clonetypes[seq], df_clonetypes["Frequency"]))
    clonotype_diversity = inverse_simpson_di(data)
    df_clonetypes["diversity"] = clonotype_diversity
    per_barcode_tables.append(df_clonetypes)

# Amino-acid tables go to clonotypes_aa, anything else to clonotypes_nt.
if seq == "aaSeqCDR3":
    clonotypes_aa = pd.concat([clonotypes_aa, *per_barcode_tables])
else:
    clonotypes_nt = pd.concat([clonotypes_nt, *per_barcode_tables])

In [None]:
# Show the accumulated amino-acid clonotype table.
clonotypes_aa

In [None]:
# NOTE(review): in the visible cells `final_df_aa` is first assigned in the
# cell below, so on a fresh kernel this display would run before the name
# exists — TODO confirm and reorder or remove.
final_df_aa

In [None]:
        # Distinct barcodes to process and the two CDR3 columns to tabulate.
        index_set = set(productive_file["barcode"])
        cdr3_types = ["nSeqCDR3", "aaSeqCDR3"]
        # Result accumulators that format_file() extends through `global`.
        final_df_aa, final_df_nt = pd.DataFrame(), pd.DataFrame()
        
        def format_file(cdr3_type, barcode=None):
            """Build one barcode's clonotype table and append it to the result frame.

            Rows of the module-level ``productive_file`` that belong to the
            barcode are grouped by chain and CDR3 sequence. Each clonotype
            gets a rank (``ClonotypeID``), its count (``Frequency``), a
            percentage string (``Proportion``), the barcode (``Index``) and
            the barcode-wide inverse Simpson index rounded to two decimals
            (``diversity``). The table is concatenated onto the global
            ``final_df_aa`` when ``cdr3_type == "aaSeqCDR3"`` and onto the
            global ``final_df_nt`` otherwise.

            :param cdr3_type: name of the CDR3 column to group on (the
                notebook passes ``"nSeqCDR3"`` and ``"aaSeqCDR3"``).
            :type cdr3_type: str
            :param barcode: barcode to process; when omitted, the module-level
                loop variable ``index`` is used, as in the original cell.
            :type barcode: str or None
            """
            global final_df_aa, final_df_nt

            if barcode is None:
                # Backward-compatible fallback: read the driver loop's variable.
                barcode = index

            # Count the non-null `umi` entries of every chain + CDR3 combination.
            groupby_elements = ["chain", cdr3_type]
            df_clonetypes = (
                productive_file[productive_file["barcode"] == barcode]
                .groupby(groupby_elements, as_index=False)
                .agg({"umi": "count"})
            )
            # Label each clonotype as "<chain>:<CDR3>".
            df_clonetypes[cdr3_type] = df_clonetypes[["chain", cdr3_type]].apply(':'.join, axis=1)
            # Rank by abundance so that ClonotypeID 1 is the most frequent clonotype.
            df_clonetypes = (
                df_clonetypes
                .sort_values("umi", ascending=False)
                .reset_index(drop=True)
                .rename(columns={"umi": "Frequency"})
            )
            df_clonetypes["ClonotypeID"] = df_clonetypes.index + 1

            # Share of each clonotype within the barcode, as a percentage string.
            sum_frequency = df_clonetypes["Frequency"].sum()
            proportion = df_clonetypes["Frequency"] / sum_frequency
            df_clonetypes["Proportion"] = proportion.apply(lambda x: str(round(x * 100, 2)) + '%')

            # .copy() so the assignments below act on an independent frame
            # rather than on a column slice.
            df_clonetypes = df_clonetypes[["ClonotypeID", cdr3_type, "Frequency", "Proportion"]].copy()
            df_clonetypes["Index"] = barcode

            # Both columns must be passed to zip(); the original closed the
            # zip() call after the first column, handing dict() two
            # positional arguments.
            data = dict(zip(df_clonetypes[cdr3_type], df_clonetypes["Frequency"]))
            clonotype_diversity = round(inverse_simpson_di(data), 2)
            df_clonetypes["diversity"] = clonotype_diversity

            if cdr3_type == "aaSeqCDR3":
                final_df_aa = pd.concat([final_df_aa, df_clonetypes])
            else:
                final_df_nt = pd.concat([final_df_nt, df_clonetypes])


        # Run format_file() for every barcode and both CDR3 columns.
        # format_file() reads the loop variable `index` as a global, so the
        # outer loop variable has to keep exactly this name.
        for index in index_set:
            for cdr3_type in cdr3_types:
                format_file(cdr3_type)

In [126]:
# Show the combined amino-acid clonotype table (all barcodes stacked).
final_df_aa

Unnamed: 0,ClonotypeID,aaSeqCDR3,Frequency,Proportion,Index,diversity
0,1,TRB:CASGDRLGGSQNTLYF,105,2.34%,ACTTGT,463.14
1,2,TRA:CAASANSGTYQRF,36,0.8%,ACTTGT,463.14
2,3,TRA:CALSRNTGYQNFYF,30,0.67%,ACTTGT,463.14
3,4,TRA:CAASADYGNEKITF,30,0.67%,ACTTGT,463.14
4,5,TRA:CAASVDNYAQGLTF,27,0.6%,ACTTGT,463.14
...,...,...,...,...,...,...
1577,1578,TRB:CASSDRGYQNTLYF,1,0.02%,CAAGTT,677.15
1578,1579,TRB:CASSDVGGRSYEQYF,1,0.02%,CAAGTT,677.15
1579,1580,TRB:CASSDWDWGSEQYF,1,0.02%,CAAGTT,677.15
1580,1581,TRB:CASSDWDWSQNTLYX,1,0.02%,CAAGTT,677.15


In [127]:
# Export the amino-acid clonotype table as comma-separated text without the row index.
test_csv_path = "/SGRNJ06/randd/USER/cjj/celedev/vdj_bulk/20230630test/Mus_0508spleen_T3lib/test.csv"
final_df_aa.to_csv(test_csv_path, sep=",", index=False)

In [129]:
# NOTE: a set literal tries to hash the Series object itself, which raises the
# TypeError recorded below; set(productive_file.barcode), as used for
# index_set, builds the set of its values instead.
{productive_file.barcode}

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [None]:
# Print the barcode of every row (one line per row of the table).
# Note: this rebinds the global `index`, the same name format_file() reads.
for index in productive_file.barcode:
    print(index)

In [None]:
# Work on a copy so the column added in a later cell does not touch productive_file.
in_file = productive_file.copy()

In [None]:
# Preview the working copy.
in_file

In [None]:
# Preview a "<barcode>_<chain>_<CDR3>" key for every row.
key_columns = ["barcode", "chain", "aaSeqCDR3"]
in_file[key_columns].apply("_".join, axis=1)

In [None]:
# Columns that together identify one clonotype within one barcode.
grouped_elements = ["barcode", "chain", "aaSeqCDR3"]

In [None]:
# Number of non-null `umi` entries for each barcode / chain / CDR3 group.
in_file.groupby(grouped_elements)[["umi"]].count()

In [None]:
# NOTE(review): groupby(...).transform is documented to take a callable or a
# function name, not a dict, so this cell likely raises — and if it did run it
# would overwrite the original `umi` column. The following cell
# (['umi'].transform('count') into a new `umis` column) is the working form.
# TODO confirm and remove this cell.
in_file['umi'] = in_file.groupby(grouped_elements).transform({'umi':'count'})

In [None]:
# Attach to every row the number of non-null `umi` entries in its
# barcode / chain / CDR3 group, stored in the new column `umis`.
in_file['umis'] = in_file.groupby(grouped_elements)['umi'].transform('count')

In [124]:
# Show the working copy with the added `umis` column.
in_file

Unnamed: 0,barcode,sequence_id,chain,bestVGene,bestDGene,bestJGene,nSeqCDR3,aaSeqCDR3,umi,umis
0,AACTCC,AACTCC_AAAAAATCGAGAATCG_10,TRA,TRAV8-2,,TRAJ26,TGTGCTACAGATAATAACTATGCCCAGGGATTAACCTTC,CATDNNYAQGLTF,AAAAAATCGAGAATCG,17
1,AACTCC,AACTCC_AAAAGAGCGGGTAATG_78,TRB,TRBV12-2,,TRBJ1-5,TGTGCCAGCTCTCTAGGGTATAACAACCAGGCTCCGCTTTTT,CASSLGYNNQAPLF,AAAAGAGCGGGTAATG,5
2,AACTCC,AACTCC_AAAAGATAAGGGTTAG_81,TRB,TRBV31,,TRBJ2-3,TGTGCCTGGAGTGAGGGTAGTGCAGAAACGCTGTATTTT,CAWSEGSAETLYF,AAAAGATAAGGGTTAG,3
3,AACTCC,AACTCC_AAAAGATAAGGGTTTG_83,TRB,TRBV31,,TRBJ2-3,TGTGCCTGGAGTGAGGGTAGTGCAGAAACGCTGTATTTT,CAWSEGSAETLYF,AAAAGATAAGGGTTTG,3
4,AACTCC,AACTCC_AAAAGGTGAATGTCGG_88,TRA,TRAV9-2,,TRAJ23,TGTGTTTTGAGCGCGGGTTATAACCAGGGGAAGCTTATCTTT,CVLSAGYNQGKLIF,AAAAGGTGAATGTCGG,13
...,...,...,...,...,...,...,...,...,...,...
533875,TTGACT,TTGACT_TTTTGGAGAGGAACAG_3068205,TRA,TRAV13-1,,TRAJ40,TGTGCTTTAAGAGGAAACTACAAATACGTCTTT,CALRGNYKYVF,TTTTGGAGAGGAACAG,9
533876,TTGACT,TTGACT_TTTTGGGGTCACCAGG_3068209,TRB,TRBV3,,TRBJ1-3,TGTGCCAGCAGCTTAGACAGAAATACGCTCTATTTT,CASSLDRNTLYF,TTTTGGGGTCACCAGG,3
533877,TTGACT,TTGACT_TTTTGTACCCACTGGG_3068215,TRB,TRBV13-2,TRBD1,TRBJ2-5,TGTGCCAGCGGTGACAGGGGGCAAGACACCCAGTACTTT,CASGDRGQDTQYF,TTTTGTACCCACTGGG,3
533878,TTGACT,TTGACT_TTTTGTACTCACTGGG_3068217,TRB,TRBV13-2,TRBD1,TRBJ2-5,TGTGCCAGCGGTGACAGGGGGCAAGACACCCAGTACTTT,CASGDRGQDTQYF,TTTTGTACTCACTGGG,3


In [130]:
# Keep the first 100 rows of every barcode (`Index`) group. format_file() sorts
# each barcode's rows by count in descending order before appending them, so
# these are presumably the 100 most frequent clonotypes per barcode.
df_table = final_df_aa.groupby('Index').head(100)

In [131]:
# Display the per-barcode table built in the previous cell (the original line
# was an incomplete assignment, which is a syntax error).
df_table

Unnamed: 0,ClonotypeID,aaSeqCDR3,Frequency,Proportion,Index,diversity
0,1,TRB:CASGDRLGGSQNTLYF,105,2.34%,ACTTGT,463.14
1,2,TRA:CAASANSGTYQRF,36,0.8%,ACTTGT,463.14
2,3,TRA:CALSRNTGYQNFYF,30,0.67%,ACTTGT,463.14
3,4,TRA:CAASADYGNEKITF,30,0.67%,ACTTGT,463.14
4,5,TRA:CAASVDNYAQGLTF,27,0.6%,ACTTGT,463.14
...,...,...,...,...,...,...
95,96,TRA:CALERGSALGRLHF,12,0.18%,CAAGTT,677.15
96,97,TRA:CALGDITGNTGKLIF,12,0.18%,CAAGTT,677.15
97,98,TRA:CAIDLHDSGYNKLTF,12,0.18%,CAAGTT,677.15
98,99,TRA:CALTSNYAQGLTF,12,0.18%,CAAGTT,677.15


In [143]:
# Inspect the per-row diversity values (repeated for every row of a barcode).
final_df_aa.diversity

0       463.14
1       463.14
2       463.14
3       463.14
4       463.14
         ...  
1577    677.15
1578    677.15
1579    677.15
1580    677.15
1581    677.15
Name: diversity, Length: 136858, dtype: float64

In [None]:
# Show the set of distinct barcodes.
index_set

In [148]:
# Per-barcode summary collected as parallel lists: barcode, number of
# clonotypes, total count, and inverse Simpson diversity.
sample_list, type_list, count_list, diversity_list = [], [], [], []
for index in index_set:
    df_tmp = final_df_aa[final_df_aa["Index"] == index]
    sample_list.append(index)
    type_list.append(df_tmp.shape[0])
    count_list.append(df_tmp["Frequency"].sum())
    # The column is named "diversity" (lower case); every row of a barcode
    # carries the same value, so take the first one by position rather than
    # relying on a row labelled 0 being present.
    diversity_list.append(df_tmp["diversity"].iloc[0])

In [151]:
# Show the collected per-barcode diversity values.
diversity_list

[463.14,
 472.86,
 384.69,
 690.26,
 512.53,
 384.27,
 438.1,
 399.81,
 633.76,
 397.58,
 519.56,
 567.18,
 364.24,
 487.95,
 450.07,
 553.04,
 467.41,
 648.92,
 336.26,
 571.78,
 521.31,
 512.54,
 500.27,
 505.64,
 569.93,
 471.78,
 632.15,
 502.46,
 486.16,
 494.67,
 502.25,
 865.14,
 415.03,
 548.62,
 501.58,
 525.52,
 399.52,
 429.1,
 554.88,
 487.64,
 677.0,
 591.8,
 670.7,
 535.92,
 558.38,
 527.32,
 644.79,
 498.14,
 487.44,
 519.34,
 416.25,
 572.88,
 643.82,
 421.3,
 462.91,
 288.42,
 408.67,
 601.85,
 488.19,
 551.35,
 545.29,
 359.11,
 510.64,
 480.54,
 563.48,
 584.3,
 434.67,
 537.27,
 383.05,
 525.95,
 495.83,
 580.5,
 581.02,
 738.05,
 573.14,
 364.37,
 440.09,
 342.5,
 350.84,
 512.67,
 521.68,
 510.9,
 510.51,
 477.76,
 537.69,
 464.23,
 579.53,
 562.69,
 550.16,
 418.69,
 418.15,
 356.76,
 528.91,
 455.89,
 511.28,
 677.15]