In [18]:
import pandas as pd 
import numpy as np 

In [8]:
def simpson_di(data):

    """ Given a hash { 'species': count } , returns the Simpson Diversity Index
    
    >>> simpson_di({'a': 10, 'b': 20, 'c': 30,})
    0.3888888888888889
    """

    def p(n, N):
        """ Relative abundance """
        if n == 0:
            return 0
        else:
            return float(n)/N

    N = sum(data.values())
    
    return sum(p(n, N)**2 for n in data.values() if n != 0)


def inverse_simpson_di(data):
    """ Given a hash { 'species': count } , returns the inverse Simpson Diversity Index
    
    >>> inverse_simpson_di({'a': 10, 'b': 20, 'c': 30,})
    2.571428571428571
    """
    return float(1)/simpson_di(data)

In [9]:
productive_file = pd.read_csv("/SGRNJ06/randd/USER/cjj/celedev/vdj_bulk/20230726_1/Hum_0630PBMC_1w_T3lib/05.count_vdj/Hum_0630PBMC_1w_T3lib_corrected_productive.tsv", sep='\t')
index_set = set(productive_file.barcode)
cdr3_types = ["nSeqCDR3", "aaSeqCDR3"]
final_df_aa = pd.DataFrame()
final_df_nt = pd.DataFrame()

In [10]:
        def format_files(index, cdr3_type):
            """format clonotypes.csv and clonotypes_nt.csv from productive file."""
            global final_df_aa, final_df_nt
            
            groupby_elements = ["chain", cdr3_type]
            df_clonetypes = productive_file[productive_file["barcode"]==index]
            df_clonetypes = df_clonetypes.groupby(groupby_elements, as_index=False).agg({"umi": "count"})
            df_clonetypes[cdr3_type] = df_clonetypes.loc[:, ["chain", cdr3_type]].apply(':'.join, axis=1)
            df_clonetypes = df_clonetypes.sort_values("umi", ascending=False).reset_index()
            df_clonetypes["ClonotypeID"] = pd.Series(df_clonetypes.index) + 1
            df_clonetypes = df_clonetypes.rename(columns={"umi": "Frequency"})
    
            sum_frequency = sum(df_clonetypes["Frequency"])
            df_clonetypes["Proportion"] = df_clonetypes["Frequency"].apply(lambda x : x / sum_frequency)
            df_clonetypes["Proportion"] = df_clonetypes["Proportion"].apply(lambda x: str(round(x*100, 2)) + '%' )
            df_clonetypes = df_clonetypes[["ClonotypeID", cdr3_type, "Frequency", "Proportion"]]
            df_clonetypes["Index"] = index

            # calculate diversity
            data = dict(zip(df_clonetypes[cdr3_type], df_clonetypes["Frequency"]))
            clonotype_diversity = round(inverse_simpson_di(data), 2)
            df_clonetypes["Diversity"] = clonotype_diversity
            # df_clonetypes.loc[df_clonetypes.Index == index, "Diversity"] = clonotype_diversity
            if cdr3_type == "aaSeqCDR3":
                final_df_aa = pd.concat([final_df_aa, df_clonetypes])
            else:
                final_df_nt = pd.concat([final_df_nt, df_clonetypes])

In [11]:
        for index in index_set:
            for cdr3_type in cdr3_types:
                format_files(index, cdr3_type)

In [13]:
        df_table = final_df_aa.groupby('Index').head(100)
        df_table = df_table[["Index", "ClonotypeID", "aaSeqCDR3", "Frequency", "Proportion", "Diversity"]]

In [14]:
df_table

Unnamed: 0,Index,ClonotypeID,aaSeqCDR3,Frequency,Proportion,Diversity
0,TTAAGG,1,TRB:CASSPPVAPSYNEQFF,330,4.02%,146.91
1,TTAAGG,2,TRA:CVVKDTGGFKTIF,192,2.34%,146.91
2,TTAAGG,3,TRA:CAVETLGGYNKLIF,156,1.9%,146.91
3,TTAAGG,4,TRB:CSVELGEQFF,130,1.58%,146.91
4,TTAAGG,5,TRB:CASSPSGNSYNEQFF,126,1.54%,146.91
...,...,...,...,...,...,...
95,GATCAC,96,TRB:CASKRTEAYEQYF,35,0.22%,131.13
96,GATCAC,97,TRB:CASMETSGQETQYF,35,0.22%,131.13
97,GATCAC,98,TRB:CASSPRTSGREYNEQFF,34,0.21%,131.13
98,GATCAC,99,TRA:CAASMKEGTSYDKVIF,34,0.21%,131.13


In [16]:
df_table

Unnamed: 0,Index,ClonotypeID,aaSeqCDR3,Frequency,Proportion,Diversity
0,TTAAGG,1,TRB:CASSPPVAPSYNEQFF,330,4.02%,146.91
1,TTAAGG,2,TRA:CVVKDTGGFKTIF,192,2.34%,146.91
2,TTAAGG,3,TRA:CAVETLGGYNKLIF,156,1.9%,146.91
3,TTAAGG,4,TRB:CSVELGEQFF,130,1.58%,146.91
4,TTAAGG,5,TRB:CASSPSGNSYNEQFF,126,1.54%,146.91
...,...,...,...,...,...,...
95,GATCAC,96,TRB:CASKRTEAYEQYF,35,0.22%,131.13
96,GATCAC,97,TRB:CASMETSGQETQYF,35,0.22%,131.13
97,GATCAC,98,TRB:CASSPRTSGREYNEQFF,34,0.21%,131.13
98,GATCAC,99,TRA:CAASMKEGTSYDKVIF,34,0.21%,131.13


In [21]:
mean_diversity = round(np.mean(df_table.Diversity), 2)

In [22]:
mean_diversity

139.02