In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the tab-separated clone table, report its row count and preview it.
df = pd.read_csv(
    "TESTDATA/B001-B_S83_L001.assembled-ACGTACGT-IGH_HUMAN-clones-mut-sites.csv",
    sep="\t",
)
print(df.shape[0])
df.head()

In [None]:
# Per CDR3 peptide: count the distinct V genes and total the reads.
# The aggregation is named by string ('sum'); handing the builtin `sum` to
# agg() is deprecated in recent pandas and emits a FutureWarning.
cols = ['cdr3pep', 'V_sub', 'acc.nunique']
cdr3pep_uniq = df[cols].groupby('cdr3pep').agg({'V_sub': 'nunique', 'acc.nunique': 'sum'})
print(len(cdr3pep_uniq))

# get CDR3's with more than one V gene assigned
cdr3pep_uniq = cdr3pep_uniq.loc[cdr3pep_uniq['V_sub'] > 1]
print(len(cdr3pep_uniq))

cdr3pep_uniq.head()

In [None]:
def reAssign(df, peptide, threshold):
    '''
    Description: re-assign the V gene of one CDR3 to the gene with the majority of the reads,
                 or to several genes joined with "+" (most reads first, added until the
                 cumulative fraction of the reads exceeds threshold)
    In: df (DataFrame with the columns cdr3pep, V_sub and acc.nunique; its V_sub column is
            overwritten in place for the rows of this peptide),
        peptide (value of cdr3pep to process),
        threshold (fraction of the reads that must be exceeded, e.g. 0.7)
    Out: tuple (df, new V gene name)
    Raises: KeyError when df has no row with a V gene for this peptide
    '''
    is_peptide = df['cdr3pep'] == peptide

    # Total the reads per V gene. A gene can occupy several rows of df, so ranking
    # the raw rows could list the same gene twice in the new name and would not
    # compare the genes by their overall read count.
    reads_per_gene = df.loc[is_peptide].groupby('V_sub')['acc.nunique'].sum()
    if reads_per_gene.empty:
        raise KeyError("no rows with a V gene for peptide %r" % (peptide,))

    # Most abundant gene first; the stable sort keeps ties in alphabetical order
    # (groupby returns the genes sorted by name), so the result is deterministic.
    reads_per_gene = reads_per_gene.sort_values(ascending=False, kind='mergesort')

    # The total comes from the rows themselves, so the function does not depend
    # on any summary table computed elsewhere in the notebook.
    cumsum_frac = reads_per_gene.cumsum() / reads_per_gene.sum()

    # Include genes up to and including the first one that pushes the cumulative
    # fraction above the threshold; if none does (e.g. threshold >= 1), keep all.
    exceeds = (cumsum_frac > threshold).tolist()
    include_up_to = exceeds.index(True) + 1 if True in exceeds else len(exceeds)

    # concatenate the gene names in alphabetical order
    v_gene = "+".join(sorted(reads_per_gene.index[:include_up_to]))

    # replace v name with new v name
    df.loc[is_peptide, 'V_sub'] = v_gene

    return (df, v_gene)

## Loop through all CDR3s and re-assign the V gene

In [None]:
# Fraction of the reads handed to reAssign for every CDR3 listed in cdr3pep_uniq.
threshold = 0.7
for peptide in cdr3pep_uniq.index:
    # df is replaced by the frame returned from each call
    df, new_v_gene = reAssign(df, peptide, threshold)
print("DONE")

In [None]:
# Show the rows of the peptide left in `peptide` by the re-assignment loop.
df[df['cdr3pep'] == peptide]

In [None]:
# Group the re-assigned entries by CDR3 peptide, V gene and J gene.
# Aggregations are named by string ('sum' / 'mean'); handing the builtin `sum`
# or `np.mean` to agg() is deprecated in recent pandas and emits a FutureWarning.
cols = ['cdr3pep', 'V_sub', 'J_sub']
clones = df.groupby(cols).agg({
    'acc.nunique': 'sum',
    'beforeMID.nunique': 'sum',
    'mut.count_x.sum': 'sum',
    'mut.count_x.mean': 'mean',
    'mut.frac_x.sum': 'sum',
    'mut.frac_x.mean': 'mean',
    'mut.count_y.sum': 'sum',
    'mut.count_y.mean': 'mean',
    'mut.frac_y.sum': 'sum',
    'mut.frac_y.mean': 'mean',
    'nr_sites.sum': 'sum',
    'nr_sites.mean': 'mean',
})
# Largest clones (by summed acc.nunique) first, with the group keys back as columns.
clones = clones.sort_values(by='acc.nunique', ascending=False)
clones = clones.reset_index()

In [None]:
# Preview the first rows of the grouped clones.
clones.head(n=5)

## Check that the total number of accessions is unchanged after grouping

In [None]:
# Print the summed acc.nunique of the grouped table, then display whether it
# equals the sum over the ungrouped table.
print("Sum reads", clones['acc.nunique'].sum())
clones['acc.nunique'].sum() == df['acc.nunique'].sum()

In [None]:
# Print the summed beforeMID.nunique of the grouped table, then display whether
# it equals the sum over the ungrouped table.
print("Sum UMIs", clones['beforeMID.nunique'].sum())
clones['beforeMID.nunique'].sum() == df['beforeMID.nunique'].sum()

## Get nr of unique UMIs from the allinfo file

In [None]:
# Read allinfo file and apply quality filter
allinfo = pd.read_csv("final/B001-B_S83_L001.assembled-ACGTACGT-IGH_HUMAN-all_info.csv", sep='\t')

# Keep rows with cdr3_qual_min >= 30, an assigned V and J gene, and V/J flags of 0 or 16.
# - The flags are converted with to_numeric so the test works whether read_csv
#   inferred the flag columns as text ('0', '16') or as numbers (0, 16); values
#   that are not numbers become NaN and are rejected.
# - Unassigned genes are rejected both as the text 'None' and as missing values,
#   because read_csv may parse the text None as NaN depending on the pandas version.
allinfo = allinfo.loc[
    (allinfo['cdr3_qual_min'] >= 30)
    & allinfo['V_sub'].notna() & (allinfo['V_sub'] != 'None')
    & allinfo['J_sub'].notna() & (allinfo['J_sub'] != 'None')
    & pd.to_numeric(allinfo['V_flag'], errors='coerce').isin([0, 16])
    & pd.to_numeric(allinfo['J_flag'], errors='coerce').isin([0, 16])
]

In [None]:
# Count the distinct beforeMID values per (cdr3pep, J_sub) in the filtered
# allinfo table and order the groups from most to fewest.
select = ['cdr3pep', 'V_sub', 'J_sub', 'acc', 'beforeMID']
cols = ['cdr3pep', 'J_sub']
clones_orig = (
    allinfo[select]
    .groupby(cols)
    .agg({'beforeMID': 'nunique'})
    .sort_values(by='beforeMID', ascending=False)
)

In [None]:
# Move the group keys back into columns and name the count column 'UMIs'.
clones_orig = clones_orig.reset_index().rename(columns={'beforeMID': 'UMIs'})
clones_orig.head()

In [None]:
# Attach the UMI count from clones_orig to every clone with the same
# (cdr3pep, J_sub); the inner join keeps only keys present in both tables.
clones_final = (
    pd.merge(clones, clones_orig, how='inner', on=['cdr3pep', 'J_sub'])
    .sort_values(by='acc.nunique', ascending=False)
)

In [None]:
# Preview the first rows of the merged table.
clones_final.head(n=5)

In [None]:
# Row counts before and after the merge: clones, clones_orig, clones_final.
print(*(len(table) for table in (clones, clones_orig, clones_final)))