In [1]:
import pandas as pd
import numpy as np

## Input

In [2]:
# Per-read table (CDR3, V/J assignment and quality columns); read below as a
# tab-separated file.
allinfo_file = "final/correct-mid/Canomad-1-BuH_S231_L001.assembled-ATGCATGC-IGH_HUMAN-all_info.csv"
# Per-alignment tables for the V and J genes; read below as space-separated files.
# NOTE(review): unlike allinfo_file, these two paths carry no directory prefix, so
# they resolve against the current working directory - confirm this is intended.
v_file = "Canomad-1-BuH_S231_L001.assembled-ATGCATGC-IGHV_human-e-clean.sam.mut.txt"
j_file = "Canomad-1-BuH_S231_L001.assembled-ATGCATGC-IGHJ_human-e-clean.sam.mut.txt"

## Output

In [3]:
# Derive the output path from the input path by swapping the file-name suffix.
# A plain str.replace() returns the input path unchanged when the suffix is
# missing, in which case writing the result would overwrite the input file. The
# suffix is therefore verified first and only the trailing occurrence is replaced.
input_suffix = "-all_info.csv"
output_suffix = "-clones-mut-sites.csv"
if not allinfo_file.endswith(input_suffix):
    raise ValueError(
        "allinfo_file must end with '" + input_suffix + "', got: " + allinfo_file
    )
outfile = allinfo_file[:-len(input_suffix)] + output_suffix
print(outfile)

final/correct-mid/Canomad-1-BuH_S231_L001.assembled-ATGCATGC-IGH_HUMAN-clones-mut-sites.csv


## Read files

In [4]:
# Load the tab-separated per-read table into a DataFrame.
allinfo = pd.read_csv(allinfo_file, delimiter='\t')
# Uncomment to inspect the loaded table:
#print("allinfo entries:", len(allinfo))
#allinfo.head()

In [5]:
# Check if acc in allinfo is unique. ANSWER: no
#allinfo['acc'].nunique()

In [6]:
# Check properties of accs in allinfo that are not unique
#count_acc_allinfo = allinfo.groupby('acc').agg(['nunique'])
#col = 'V_sub'
#count_acc_allinfo.sort_values((col,'nunique'), ascending=False)[col].head()

In [7]:
# Convert the nr_sites column to integers, counting a missing value as 0.
# Depending on the pandas version, read_csv delivers the text "None" either as the
# string 'None' or as NaN (recent releases list "None" among the default NA
# markers). int(NaN) raises ValueError, so both spellings are mapped to 0 here.
def _site_count(value):
    """Return value as an int; the string 'None' and missing values become 0."""
    if pd.isna(value) or value == 'None':
        return 0
    return int(value)

allinfo['nr_sites'] = allinfo['nr_sites'].apply(_site_count)

In [8]:
# Load the space-separated V alignment table.
v = pd.read_csv(v_file, delimiter=' ')
# Uncomment to inspect the loaded table:
#print("v entries:", len(v))
#v.head()

In [9]:
# Check if acc in v is unique. ANSWER: nope
#v['acc'].nunique()

In [10]:
# Load the space-separated J alignment table.
j = pd.read_csv(j_file, delimiter=' ')
# Uncomment to inspect the loaded table:
#print("j entries:", len(j))
#j.head()

In [11]:
# Check if acc in j is unique. ANSWER: nope
#j['acc'].nunique()

In [12]:
def clean_name(raw_gene):
    """Return the second '|'-separated field of a raw gene identifier."""
    return raw_gene.split("|")[1]

# Overwrite the gene column of both alignment tables with the shortened names;
# the merge step matches them against V_gene / J_gene.
v['gene'] = [clean_name(raw_gene) for raw_gene in v['gene']]
j['gene'] = [clean_name(raw_gene) for raw_gene in j['gene']]

## Combine files

In [13]:
# Attach the V alignment columns, then the J alignment columns, to every read.
# Both joins are left joins, so reads without a matching alignment row are kept.
# The V and J tables share column names; after the second merge the V columns
# carry the suffix _x and the J columns the suffix _y.
df = (
    allinfo
    .merge(v, how='left', left_on=['acc', 'V_gene'], right_on=['acc', 'gene'])
    .merge(j, how='left', left_on=['acc', 'J_gene'], right_on=['acc', 'gene'])
)
#print("df merged entries:", len(df))
#df.head()

## Filter data

In [14]:
# Keep only rows that pass every criterion:
#   * minimum CDR3 quality of at least 30
#   * a V and a J sub-group is present (neither missing nor the placeholder 'None')
#   * the V and J flags are 0 or 16 (presumably SAM flags for forward / reverse
#     strand alignments - TODO confirm)
# The checks are written to be independent of how read_csv inferred the column
# types: flags are compared numerically, so '0', 0 and 0.0 all match, and a missing
# sub-group is rejected whether it arrived as the string 'None' or as NaN.
# Comparing a numerically parsed flag column with the string '0' would match
# nothing and silently produce an empty table.
min_cdr3_quality = 30
accepted_flags = [0, 16]

quality_ok = df['cdr3_qual_min'] >= min_cdr3_quality
has_v_sub = df['V_sub'].notna() & (df['V_sub'] != 'None')
has_j_sub = df['J_sub'].notna() & (df['J_sub'] != 'None')
v_flag_ok = pd.to_numeric(df['V_flag'], errors='coerce').isin(accepted_flags)
j_flag_ok = pd.to_numeric(df['J_flag'], errors='coerce').isin(accepted_flags)

df = df.loc[quality_ok & has_v_sub & has_j_sub & v_flag_ok & j_flag_ok]
#print("df filtered entries:", len(df))
#df.head()

In [15]:
# Remove entries where the V (_x) and J (_y) alignments overlap each other.
# Two intervals overlap when each one starts before the other one ends. Testing
# only whether a J endpoint lies strictly inside the V interval would miss a J
# alignment that covers the whole V alignment or shares both of its boundaries.
# Alignments that merely touch (one starts where the other ends) are kept.
# Assumes start.pos <= end.pos for every alignment - TODO confirm.
# Rows lacking V or J coordinates (NaN) compare False and are therefore kept.
v_start, v_end = df['start.pos_x'], df['end.pos_x']
j_start, j_end = df['start.pos_y'], df['end.pos_y']
overlaps = (j_start < v_end) & (j_end > v_start)
df = df.loc[~overlaps]

In [16]:
# For every read (acc), determine the longest V alignment length.
# The named group-wise max() is used because passing the Python builtin max to
# agg() is deprecated in recent pandas releases.
# reset_index() turns the result into a two-column table (acc, align.length_x)
# that can be joined back onto df.
longest_alignment = df.groupby('acc')['align.length_x'].max().reset_index()
#longest_alignment.head()

In [17]:
# Keep only the rows whose V alignment length equals the per-read maximum; rows
# that tie for the maximum are all retained.
df = df.merge(longest_alignment, how='inner', on=['acc', 'align.length_x'])

In [18]:
# acc in df unique?
#print('Entries:', len(df))
#print('Unique:', df['acc'].nunique())

In [19]:
# Check why accessions are not unique
#tmp = df.groupby('acc').agg('nunique')
#tmp.apply(sum)

In [20]:
# Sort on nr of different cigar strings (descending)
#tmp = tmp.sort_values('cigar_y', ascending=False).head()

In [21]:
# Show first entry for inspection
#df.loc[df['acc']==tmp.index[0],['start.pos_x','end.pos_x','cigar_y','start.pos_y','end.pos_y','mut.count_y','align.length_y','align.seq_y']]

## Group data per clone (CDR3pep)

In [22]:
# Show the column names of the merged table. The _x / _y suffixes were added by the
# second merge, where the V alignment columns (already in df) and the J alignment
# columns share the same names: _x is V, _y is J.
df.columns

Index(['acc', 'beforeMID', 'MID', 'afterMID', 'readingframe', 'cdr3pep',
       'cdr3nuc', 'cdr3_qual_min', 'cdr3_qual_max', 'cdr3_qual_avg',
       'cdr3_qual', 'nt_start', 'nt_end', 'seq_length', 'V_flag', 'V_gene',
       'J_flag', 'J_gene', 'readingframe_seq', 'seq', 'pep', 'qual', 'V_sub',
       'J_sub', 'V_main', 'acc:1', 'nr_v_mains', 'nr_v_subs', 'nr_v_alleles',
       'nr_j_subs', 'nr_j_alleles', 'acc:2', 'readingframe:1', 'nr_sites',
       'gene_x', 'cigar_x', 'start.pos_x', 'end.pos_x', 'mut.count_x',
       'mut.frac_x', 'align.length_x', 'align.seq_x', 'gene_y', 'cigar_y',
       'start.pos_y', 'end.pos_y', 'mut.count_y', 'mut.frac_y',
       'align.length_y', 'align.seq_y'],
      dtype='object')

In [23]:
# Summarise the reads per clone, where a clone is a unique combination of CDR3
# peptide, V sub-group and J sub-group:
#   * acc / beforeMID     - number of distinct values
#   * mut.count / mut.frac (V = _x, J = _y) and nr_sites - sum and mean over rows
# Aggregations are given by name: passing the builtin sum or np.mean to agg() is
# deprecated in recent pandas releases, and the names produce the same column
# labels ('sum', 'mean').
summary_stats = ['sum', 'mean']
clones = df.groupby(['cdr3pep', 'V_sub', 'J_sub']).agg({
    'acc': 'nunique',
    'beforeMID': 'nunique',
    'mut.count_x': summary_stats,
    'mut.frac_x': summary_stats,
    'mut.count_y': summary_stats,
    'mut.frac_y': summary_stats,
    'nr_sites': summary_stats,
})
# Largest clones (most distinct reads) first.
clones = clones.sort_values(by=('acc', 'nunique'), ascending=False)
#print("Clones entries:", len(clones))
#clones.head()

In [24]:
# Flatten the two-level (column, statistic) header into single names such as
# 'mut.count_x.mean'.
flat_names = []
for column_key in clones.columns.values:
    flat_names.append('.'.join(column_key).strip())
clones.columns = flat_names
#clones.head()

In [25]:
# Write the per-clone summary as a tab-separated file and report where it went.
clones.to_csv(path_or_buf=outfile, sep='\t')
print("Wrote", outfile, "to disk")

Wrote final/correct-mid/Canomad-1-BuH_S231_L001.assembled-ATGCATGC-IGH_HUMAN-clones-mut-sites.csv to disk
