In [None]:
import pandas as pd
import numpy as np

## Input

In [None]:
# All three inputs belong to the same sample, so the shared file-name prefix is
# defined once; switching to another sample means editing a single line.
sample_prefix = "B001-B_S83_L001.assembled-ATGCATGC"

# Combined table; read as tab-separated further down.
allinfo_file = "final/" + sample_prefix + "-IGH_HUMAN-all_info.csv"
# V- and J-gene tables; read as space-separated further down.
v_file = sample_prefix + "-IGHV_human-e-clean.sam.mut.txt"
j_file = sample_prefix + "-IGHJ_human-e-clean.sam.mut.txt"

## Output

In [None]:
# Derive the output path from the input path by swapping the file-name suffix.
outfile = allinfo_file.replace("-all_info.csv", "-clones-mut-sites.csv")
# str.replace returns the string unchanged when the suffix is absent; writing
# to that path at the end of the notebook would overwrite the input table.
if outfile == allinfo_file:
    raise ValueError(
        "allinfo_file does not contain '-all_info.csv'; "
        "refusing to use the input path as output path: " + allinfo_file
    )
print(outfile)

## Read files

In [None]:
# Load the combined table; the file is tab-separated despite its .csv extension.
allinfo = pd.read_csv(allinfo_file, delimiter='\t')
#print("allinfo entries:", len(allinfo))
#allinfo.head()

In [None]:
# Check if acc in allinfo is unique. ANSWER: no
#allinfo['acc'].nunique()

In [None]:
# Check properties of accs in allinfo that are not unique
#count_acc_allinfo = allinfo.groupby('acc').agg(['nunique'])
#col = 'V_sub'
#count_acc_allinfo.sort_values((col,'nunique'), ascending=False)[col].head()

In [None]:
# 'None' placeholders in nr_sites are counted as 0; afterwards every value is
# passed through int() so the column can be summed and averaged per clone.
nr_sites_filled = allinfo['nr_sites'].replace('None', 0)
allinfo['nr_sites'] = nr_sites_filled.apply(int)

In [None]:
# Load the V-gene table (space-separated).
v = pd.read_csv(v_file, delimiter=' ')
#print("v entries:", len(v))
#v.head()

In [None]:
# Check if acc in v is unique. ANSWER: nope
#v['acc'].nunique()

In [None]:
# Load the J-gene table (space-separated).
j = pd.read_csv(j_file, delimiter=' ')
#print("j entries:", len(j))
#j.head()

In [None]:
# Check if acc in j is unique. ANSWER: nope
#j['acc'].nunique()

In [None]:
def clean_name(x):
    """Return the second '|'-separated field of a gene label.

    Labels without a '|' are returned unchanged, so running this cell again on
    already-cleaned names is a no-op instead of raising IndexError.
    """
    return x.split("|")[1] if "|" in x else x


# Clean up the gene names in both alignment tables so they can be matched
# against V_gene / J_gene when the tables are merged.
v['gene'] = [clean_name(g) for g in v['gene']]
j['gene'] = [clean_name(g) for g in j['gene']]

## Combine files

In [None]:
# Left-join the V table and then the J table onto allinfo, matching on the
# accession plus the assigned gene. Columns that v and j share receive the
# pandas default suffixes: '_x' for the V side and '_y' for the J side.
df = (
    allinfo
    .merge(v, how='left', left_on=['acc', 'V_gene'], right_on=['acc', 'gene'])
    .merge(j, how='left', left_on=['acc', 'J_gene'], right_on=['acc', 'gene'])
)
#print("df merged entries:", len(df))
#df.head()

## Filter data

In [None]:
# A row is kept only when all of the following hold:
#   - cdr3_qual_min is at least 30
#   - V_sub and J_sub are not the placeholder string 'None'
#   - V_flag and J_flag are '0' or '16' (compared as strings;
#     presumably SAM flags for forward / reverse strand -- TODO confirm)
cdr3_quality_ok = df['cdr3_qual_min'] >= 30
has_v_sub = df['V_sub'] != 'None'
has_j_sub = df['J_sub'] != 'None'
v_flag_ok = (df['V_flag'] == '0') | (df['V_flag'] == '16')
j_flag_ok = (df['J_flag'] == '0') | (df['J_flag'] == '16')

df = df.loc[cdr3_quality_ok & has_v_sub & has_j_sub & v_flag_ok & j_flag_ok]
#print("df filtered entries:", len(df))
#df.head()

In [None]:
# Remove entries where the V ('_x') and J ('_y') alignments overlap each other:
# a row is dropped when the J start or the J end lies strictly inside the
# V alignment's (start, end) range.
# NOTE(review): a J alignment that spans the whole V alignment has neither of
# its ends inside that range and is therefore kept -- confirm this is intended.
j_start_inside_v = (df['start.pos_y'] > df['start.pos_x']) & (df['start.pos_y'] < df['end.pos_x'])
j_end_inside_v = (df['end.pos_y'] > df['start.pos_x']) & (df['end.pos_y'] < df['end.pos_x'])
overlapping_rows = df.loc[j_start_inside_v | j_end_inside_v].index
df = df.drop(index=overlapping_rows)

In [None]:
# Select the alignment with the longest alignment length (V gene): compute the
# maximum 'align.length_x' per accession, with 'acc' turned back into a regular
# column so the result can be merged onto df.
# The aggregation is given as the string 'max': passing the builtin `max` is
# deprecated in pandas >= 2.1 and will stop mapping to the groupby max.
longest_alignment = (
    df.groupby('acc')
    .agg({'align.length_x': 'max'})
    .reset_index()
)
#longest_alignment.head()

In [None]:
# Keep only the rows whose V alignment length equals the per-accession maximum.
# Ties survive, so an accession can still occur on several rows afterwards.
df = df.merge(longest_alignment, how='inner', on=['acc', 'align.length_x'])

In [None]:
# acc in df unique?
#print('Entries:', len(df))
#print('Unique:', df['acc'].nunique())

In [None]:
# Check why accessions are not unique
#tmp = df.groupby('acc').agg('nunique')
#tmp.apply(sum)

In [None]:
# Sort on nr of different cigar strings (descending)
#tmp = tmp.sort_values('cigar_y', ascending=False).head()

In [None]:
# Show first entry for inspection
#df.loc[df['acc']==tmp.index[0],['start.pos_x','end.pos_x','cigar_y','start.pos_y','end.pos_y','mut.count_y','align.length_y','align.seq_y']]

## Group data per clone (CDR3pep)

In [None]:
# Display the column labels of the merged and filtered frame; the aggregation
# in the next cell refers to columns by these labels.
df.columns

In [None]:
# One row per clone, where a clone is a unique (cdr3pep, V_sub, J_sub) triple.
# 'acc' and 'beforeMID' are counted as distinct values; the mutation and site
# columns get both their total and their average.
# Aggregations are named by string: passing the builtin `sum` or `np.mean` is
# deprecated in pandas >= 2.1. The resulting column labels ('sum', 'mean') are
# the same as before.
total_and_mean = ['sum', 'mean']
clones = df.groupby(['cdr3pep', 'V_sub', 'J_sub']).agg({
    'acc': 'nunique',
    'beforeMID': 'nunique',
    'mut.count_x': total_and_mean,
    'mut.frac_x': total_and_mean,
    'mut.count_y': total_and_mean,
    'mut.frac_y': total_and_mean,
    'nr_sites': total_and_mean,
})
# Largest clones (most distinct accessions) first.
clones = clones.sort_values(by=('acc', 'nunique'), ascending=False)
#print("Clones entries:", len(clones))
#clones.head()

In [None]:
# Flatten the two-level column labels into 'column.aggregation' strings,
# e.g. ('acc', 'nunique') -> 'acc.nunique'.
# Only tuple labels are joined: if this cell is run again the labels are already
# plain strings, and '.'.join on a string would put a '.' between every character.
clones.columns = [
    '.'.join(label).strip() if isinstance(label, tuple) else label
    for label in clones.columns.values
]
#clones.head()

In [None]:
# Write the per-clone table as tab-separated text. The grouping keys form the
# index, so they are written as the leading columns.
clones.to_csv(path_or_buf=outfile, sep='\t')
print("Wrote", outfile, "to disk")