In [1]:
import pandas as pd
import numpy as np

In [2]:
# Input
cloneFile = "TESTDATA/B001-B_S83_L001.assembled-ACGTACGT-IGH_HUMAN-clones-mut-sites.csv"
allinfoFile = "final/B001-B_S83_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv"
threshold = 0.7

In [3]:
# Output
outfile = cloneFile.replace("-clones-mut-sites.csv", "-clones-mut-sites-reassigned.csv")

In [4]:
def reAssign(df, peptide, threshold):
    '''
    Description: reassign genes with gene with the majority of the reads (or include more genes up to 70% of the reads)
    In: df with all clones, peptide is the CDR3 peptide, threshold is for how many genes need to be included in the description (70% of the reads)
    Out: new V gene name
    '''
    # get all clones for peptide
    df_tmp = df.loc[df['cdr3pep'] == peptide][['cdr3pep','V_sub','acc.nunique']]
    df_tmp = df_tmp.sort_values(by='acc.nunique', ascending=False)

    # calculate total frequency for this peptide
    total_freq = cdr3pep_uniq['acc.nunique'].loc[peptide]

    # store the cumulative sum of the reads
    df_tmp['cumsum'] = df_tmp['acc.nunique'].cumsum()

    # calculate the cumulative fraction of the reads
    df_tmp['cumsum_frac'] = df_tmp['cumsum'] / total_freq
    
    # select the genes that make up 70% of the reads for this peptide
    include_up_to = [ e > threshold for e in df_tmp['cumsum_frac'].tolist() ].index(True) + 1
    df_select = df_tmp.iloc[0:include_up_to]
    df_select = df_select.sort_values(by='V_sub', ascending=True)
    
    # concatenate the gene names with a plus sign
    v_gene = "+".join(df_select['V_sub'].tolist())
    
    # replace v name with new v name
    df.loc[df['cdr3pep'] == peptide, 'V_sub'] = v_gene

    return(df, v_gene)

In [5]:
# read cloneFile and put it in a dataframe
df = pd.read_csv(cloneFile, sep="\t")
print(len(df))
df.head()

15151


Unnamed: 0,cdr3pep,V_sub,J_sub,acc.nunique,beforeMID.nunique,mut.count_x.sum,mut.count_x.mean,mut.frac_x.sum,mut.frac_x.mean,mut.count_y.sum,mut.count_y.mean,mut.frac_y.sum,mut.frac_y.mean,nr_sites.sum,nr_sites.mean
0,CARDPNYYDLSGYSYNWFESWGQGTLVT,IGHV3-21,IGHJ5,1016,747,16310.0,16.05315,68.23965,0.067165,2035.0,2.002953,39.938806,0.03931,1012,0.996063
1,CARGNSNGYYSDYWGQGILVT,IGHV4-59,IGHJ4,662,386,4935.0,7.454683,20.505091,0.030974,665.0,1.004532,16.636066,0.02513,10,0.015106
2,CARGSSVGTSPLDYWGQGTLVT,IGHV3-11,IGHJ4,331,265,10612.0,32.060423,41.420112,0.125136,2.0,0.006042,0.05,0.000151,5,0.015106
3,CAKGNSGFYYDYWGQGTLVT,IGHV3-23,IGHJ4,330,202,3480.0,10.545455,14.733139,0.044646,3.0,0.009091,0.075,0.000227,4,0.012121
4,CARDMKGLNTFDYWGQGTLVT,IGHV3-48,IGHJ4,251,168,330.0,1.314741,1.37777,0.005489,2.0,0.007968,0.046512,0.000185,3,0.011952


In [6]:
# group by cdr3peptide and count nr of different V genes and reads
cols = ['cdr3pep', 'V_sub', 'acc.nunique']
cdr3pep_uniq = df[cols].groupby('cdr3pep').agg({'V_sub': 'nunique', 'acc.nunique': sum})
print(len(cdr3pep_uniq))

# select CDR3's with more than one V gene assigned
cdr3pep_uniq = cdr3pep_uniq.loc[cdr3pep_uniq['V_sub'] > 1]
print(len(cdr3pep_uniq))

cdr3pep_uniq.head()

10022
2546


Unnamed: 0_level_0,V_sub,acc.nunique
cdr3pep,Unnamed: 1_level_1,Unnamed: 2_level_1
CAAAAGTTYPYARWGQGTLVT,2,2
CAAELWRGSNYYYGMDVWGQGTTVT,2,5
CAAGGGIAAAGIGYWGQGTLVT,2,2
CAAGGKTPGFWGQGTLVT,3,8
CAAGGLSSGYCYSNWGQGTLVT,5,17


## Loop through all CDR3s and re-assign the V gene

In [7]:
for peptide in cdr3pep_uniq.index:
    (df, new_v_gene) = reAssign(df, peptide, threshold)
print("DONE")

DONE


In [8]:
df.loc[df['cdr3pep'] == peptide]

Unnamed: 0,cdr3pep,V_sub,J_sub,acc.nunique,beforeMID.nunique,mut.count_x.sum,mut.count_x.mean,mut.frac_x.sum,mut.frac_x.mean,mut.count_y.sum,mut.count_y.mean,mut.frac_y.sum,mut.frac_y.mean,nr_sites.sum,nr_sites.mean
2590,YARVTDYVSQNYWGQGTLVT,IGHV3-7,IGHJ4,3,1,66.0,22.0,0.276151,0.09205,3.0,1.0,0.076923,0.025641,0,0.0
7284,YARVTDYVSQNYWGQGTLVT,IGHV3-7,IGHJ4,1,1,15.0,15.0,0.072816,0.072816,1.0,1.0,0.025641,0.025641,0,0.0


In [9]:
# Group the re-assigned entries
cols = ['cdr3pep', 'V_sub', 'J_sub']
clones = df.groupby(cols).agg({'acc.nunique': sum, 'beforeMID.nunique': sum, 'mut.count_x.sum': sum, 'mut.count_x.mean': np.mean, 'mut.frac_x.sum': sum, 'mut.frac_x.mean': np.mean, 'mut.count_y.sum': sum, 'mut.count_y.mean': np.mean, 'mut.frac_y.sum': sum, 'mut.frac_y.mean': np.mean, 'nr_sites.sum': sum, 'nr_sites.mean': np.mean})
clones = clones.sort_values(by='acc.nunique', ascending=False)
clones = clones.reset_index()

In [10]:
clones.head()

Unnamed: 0,cdr3pep,V_sub,J_sub,acc.nunique,beforeMID.nunique,mut.count_x.sum,mut.count_x.mean,mut.frac_x.sum,mut.frac_x.mean,mut.count_y.sum,mut.count_y.mean,mut.frac_y.sum,mut.frac_y.mean,nr_sites.sum,nr_sites.mean
0,CARDPNYYDLSGYSYNWFESWGQGTLVT,IGHV3-21,IGHJ5,1257,986,19263.0,13.477084,80.644138,0.056758,2515.0,1.997867,49.365386,0.039224,1124,0.523617
1,CARGNSNGYYSDYWGQGILVT,IGHV4-59,IGHJ4,787,508,6203.0,9.225825,25.640163,0.038121,790.0,1.000453,19.764485,0.025018,21,0.085796
2,CARGSSVGTSPLDYWGQGTLVT,IGHV3-11,IGHJ4,454,385,12487.0,14.419085,49.239703,0.06076,2.0,0.000403,0.05,1e-05,14,0.103894
3,CAKGNSGFYYDYWGQGTLVT,IGHV3-23,IGHJ4,393,264,4178.0,11.443243,17.698483,0.048545,3.0,0.000568,0.075,1.4e-05,9,0.045624
4,CARDMKGLNTFDYWGQGTLVT,IGHV3-48,IGHJ4,346,261,1360.0,10.863727,5.6962,0.046448,3.0,0.00397,0.069767,9.2e-05,7,0.017307


## Check if sum of nr of accessions is the same

In [11]:
print("Sum reads", clones['acc.nunique'].sum())
df['acc.nunique'].sum() == clones['acc.nunique'].sum()

Sum reads 44983


True

In [12]:
print("Sum UMIs", clones['beforeMID.nunique'].sum())
df['beforeMID.nunique'].sum() == clones['beforeMID.nunique'].sum()

Sum UMIs 34848


True

## Get nr of unique UMIs from the allinfo file

In [13]:
# Read allinfo file and apply quality filter
allinfo = pd.read_csv(allinfoFile, sep='\t')
allinfo = allinfo.loc[(allinfo['cdr3_qual_min'] >= 30) & (allinfo['V_sub'] != 'None') & (allinfo['J_sub'] != 'None') & ((allinfo['V_flag'] == '0') | (allinfo['V_flag'] == '16')) & ((allinfo['J_flag'] == '0') | (allinfo['J_flag'] == '16'))]

In [14]:
# Group the original entries by cdr3pep and J-gene
select = ['cdr3pep', 'V_sub', 'J_sub', 'acc', 'beforeMID']
cols = ['cdr3pep', 'J_sub']
clones_orig = allinfo[select].groupby(cols).agg({'beforeMID': 'nunique'})
clones_orig = clones_orig.sort_values(by='beforeMID', ascending=False)

In [15]:
# Reset index and rename the 'beforeMID' column to 'UMIs'
clones_orig = clones_orig.reset_index()
clones_orig = clones_orig.rename(columns={'beforeMID': 'UMIs'})
clones_orig.head()

Unnamed: 0,cdr3pep,J_sub,UMIs
0,CARDPNYYDLSGYSYNWFESWGQGTLVT,IGHJ5,876
1,CARGNSNGYYSDYWGQGILVT,IGHJ4,421
2,CARGSSVGTSPLDYWGQGTLVT,IGHJ4,347
3,CAKGNSGFYYDYWGQGTLVT,IGHJ4,226
4,CARDMKGLNTFDYWGQGTLVT,IGHJ4,208


In [16]:
# Merge clones with clones_orig to get the unique number of UMIs
clones_final = pd.merge(clones, clones_orig, how='inner', left_on=['cdr3pep','J_sub'], right_on=['cdr3pep','J_sub'])
clones_final = clones_final.sort_values(by='acc.nunique', ascending=False)

In [17]:
clones_final.head()

Unnamed: 0,cdr3pep,V_sub,J_sub,acc.nunique,beforeMID.nunique,mut.count_x.sum,mut.count_x.mean,mut.frac_x.sum,mut.frac_x.mean,mut.count_y.sum,mut.count_y.mean,mut.frac_y.sum,mut.frac_y.mean,nr_sites.sum,nr_sites.mean,UMIs
0,CARDPNYYDLSGYSYNWFESWGQGTLVT,IGHV3-21,IGHJ5,1257,986,19263.0,13.477084,80.644138,0.056758,2515.0,1.997867,49.365386,0.039224,1124,0.523617,876
1,CARGNSNGYYSDYWGQGILVT,IGHV4-59,IGHJ4,787,508,6203.0,9.225825,25.640163,0.038121,790.0,1.000453,19.764485,0.025018,21,0.085796,421
2,CARGSSVGTSPLDYWGQGTLVT,IGHV3-11,IGHJ4,454,385,12487.0,14.419085,49.239703,0.06076,2.0,0.000403,0.05,1e-05,14,0.103894,347
3,CAKGNSGFYYDYWGQGTLVT,IGHV3-23,IGHJ4,393,264,4178.0,11.443243,17.698483,0.048545,3.0,0.000568,0.075,1.4e-05,9,0.045624,226
4,CARDMKGLNTFDYWGQGTLVT,IGHV3-48,IGHJ4,346,261,1360.0,10.863727,5.6962,0.046448,3.0,0.00397,0.069767,9.2e-05,7,0.017307,208


In [18]:
print(len(clones), len(clones_orig), len(clones_final))

10260 10260 10260


In [19]:
# Write the clones to disk
clones_final.to_csv(outfile, sep='\t')
print("Wrote", outfile, "to disk")

Wrote TESTDATA/B001-B_S83_L001.assembled-ACGTACGT-IGH_HUMAN-clones-mut-sites-reassigned.csv to disk
