In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the clones / mutation-sites table. Despite the .csv extension the
# file is tab-delimited, hence the explicit delimiter.
df = pd.read_csv(
    "TESTDATA/B001-B_S83_L001.assembled-ACGTACGT-IGH_HUMAN-clones-mut-sites.csv",
    delimiter="\t",
)
print(df.shape[0])  # number of rows loaded
df.head()

15151


Unnamed: 0,cdr3pep,V_sub,J_sub,acc.nunique,beforeMID.nunique,mut.count_x.sum,mut.count_x.mean,mut.frac_x.sum,mut.frac_x.mean,mut.count_y.sum,mut.count_y.mean,mut.frac_y.sum,mut.frac_y.mean,nr_sites.sum,nr_sites.mean
0,CARDPNYYDLSGYSYNWFESWGQGTLVT,IGHV3-21,IGHJ5,1016,747,16310.0,16.05315,68.23965,0.067165,2035.0,2.002953,39.938806,0.03931,1012,0.996063
1,CARGNSNGYYSDYWGQGILVT,IGHV4-59,IGHJ4,662,386,4935.0,7.454683,20.505091,0.030974,665.0,1.004532,16.636066,0.02513,10,0.015106
2,CARGSSVGTSPLDYWGQGTLVT,IGHV3-11,IGHJ4,331,265,10612.0,32.060423,41.420112,0.125136,2.0,0.006042,0.05,0.000151,5,0.015106
3,CAKGNSGFYYDYWGQGTLVT,IGHV3-23,IGHJ4,330,202,3480.0,10.545455,14.733139,0.044646,3.0,0.009091,0.075,0.000227,4,0.012121
4,CARDMKGLNTFDYWGQGTLVT,IGHV3-48,IGHJ4,251,168,330.0,1.314741,1.37777,0.005489,2.0,0.007968,0.046512,0.000185,3,0.011952


In [3]:
# Per CDR3 peptide: number of distinct V genes assigned to it and the summed
# 'acc.nunique' (the per-peptide total that reAssign() uses as denominator).
cols = ['cdr3pep', 'V_sub', 'acc.nunique']
# Use the string 'sum' rather than the builtin: passing builtin callables to
# groupby.agg is deprecated in recent pandas and emits a FutureWarning.
cdr3pep_uniq = df[cols].groupby('cdr3pep').agg({'V_sub': 'nunique', 'acc.nunique': 'sum'})
print(len(cdr3pep_uniq))

# get CDR3's with more than one V gene assigned
cdr3pep_uniq = cdr3pep_uniq.loc[cdr3pep_uniq['V_sub'] > 1]
print(len(cdr3pep_uniq))

cdr3pep_uniq.head()

10022
2546


Unnamed: 0_level_0,V_sub,acc.nunique
cdr3pep,Unnamed: 1_level_1,Unnamed: 2_level_1
CAAAAGTTYPYARWGQGTLVT,2,2
CAAELWRGSNYYYGMDVWGQGTTVT,2,5
CAAGGGIAAAGIGYWGQGTLVT,2,2
CAAGGKTPGFWGQGTLVT,3,8
CAAGGLSSGYCYSNWGQGTLVT,5,17


In [4]:
def reAssign(df, peptide, threshold):
    '''
    Description: build a new V gene name for one CDR3 peptide from the V gene(s)
    that carry the majority of its reads.

    V genes are ranked by their total 'acc.nunique' for this peptide (highest
    first) and taken in that order until the cumulative fraction of the
    peptide's total first exceeds `threshold`. The selected gene names are
    sorted alphabetically and joined with "+".

    In:
        df        -- DataFrame with at least the columns 'cdr3pep', 'V_sub'
                     and 'acc.nunique'; may contain rows for many peptides and
                     several rows per (peptide, V gene) pair
        peptide   -- CDR3 peptide (value of 'cdr3pep') to reassign
        threshold -- cumulative fraction (e.g. 0.7) that the selected genes
                     must exceed; if it is never exceeded (e.g. threshold >= 1)
                     all V genes of the peptide are included
    Out: new V gene name (str), e.g. "IGHV3-23" or "IGHV3-21+IGHV3-23"
    Raises: ValueError if `peptide` does not occur in df['cdr3pep']
    '''
    # Total count per V gene for this peptide. Aggregating first matters
    # because the same V gene can appear on several rows (e.g. with different
    # J genes); without it a gene would be ranked piecemeal and could show up
    # twice in the joined name.
    per_gene = (
        df.loc[df['cdr3pep'] == peptide, ['V_sub', 'acc.nunique']]
        .groupby('V_sub', as_index=False)['acc.nunique']
        .sum()
    )
    if per_gene.empty:
        raise ValueError("peptide %r not found in column 'cdr3pep'" % (peptide,))

    # Highest count first; ties are broken on the gene name so the result
    # does not depend on the input row order.
    per_gene = per_gene.sort_values(by=['acc.nunique', 'V_sub'],
                                    ascending=[False, True])

    # Total is computed from the peptide's own rows, so the function does not
    # depend on any notebook-level variable.
    total_freq = per_gene['acc.nunique'].sum()

    # cumulative fraction of the peptide's total, in rank order
    cumsum_frac = per_gene['acc.nunique'].cumsum() / total_freq

    # select genes up to and including the first one that pushes the
    # cumulative fraction over the threshold; fall back to all genes when the
    # threshold is never exceeded
    exceeds = cumsum_frac.gt(threshold).to_numpy()
    if exceeds.any():
        include_up_to = int(exceeds.argmax()) + 1
    else:
        include_up_to = len(per_gene)
    selected = per_gene['V_sub'].iloc[:include_up_to].tolist()

    # concatenate the gene names in alphabetical order
    v_gene = "+".join(sorted(selected))

    return v_gene

In [5]:
# Example inputs for reAssign(): one CDR3 peptide and the cumulative
# fraction cut-off used to pick its V gene(s).
peptide, threshold = 'CAAGGLSSGYCYSNWGQGTLVT', 0.7

In [6]:
# Show the reassigned V gene name for the example peptide.
print(reAssign(df=df, peptide=peptide, threshold=threshold))

IGHV3-23


## To do: replace the V gene of this CDR3 with the name returned by `reAssign`, then loop through all CDR3s that have more than one V gene