## import stuff

In [1]:
#importing things
import pandas as pd
import numpy as np
import csv
import pylab as pl
import matplotlib.pyplot as plt
import copy
import scipy
from scipy.stats import linregress
from scipy.stats import t
from scipy.stats import sem
from scipy.stats import chi2
from scipy.stats import ttest_ind_from_stats
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import ttest_ind_from_stats
from collections import OrderedDict
import seaborn as sns

## fetch the data

In [2]:
#read in data, remove genotypes that are missing from ch65a
ch65a = pd.DataFrame(pd.read_csv('Kds_CH65a_f3jrem.csv',delimiter=',',dtype={'geno': str})) #here spurious counts for the missing fragment are removed
ch65b = pd.DataFrame(pd.read_csv('Kds_CH65b.tsv',delimiter='\t',dtype={'geno': str}))
ch65d = pd.DataFrame(pd.read_csv('Kds_CH65d.tsv',delimiter='\t',dtype={'geno': str}))
ch65e = pd.DataFrame(pd.read_csv('Kds_CH65e.tsv',delimiter='\t',dtype={'geno': str}))
ch65g = pd.DataFrame(pd.read_csv('Kds_CH65g.tsv',delimiter='\t',dtype={'geno': str}))
ch65h = pd.DataFrame(pd.read_csv('Kds_CH65h.tsv',delimiter='\t',dtype={'geno': str}))

## merging & averaging unfiltered data

In [4]:
#merge by antigen
#MA90
MA90 = pd.DataFrame()
MA90 = pd.merge(ch65a, ch65b, on='geno',how='outer')
#SI06
SI06 = pd.DataFrame()
SI06 = pd.merge(ch65d, ch65e, on='geno',how='outer')
#G189E
G189E = pd.DataFrame()
G189E = pd.merge(ch65g, ch65h, on='geno',how='outer')

In [5]:
#cleanup & calculate mean and SEM
#MA90
MA90_abbrev = pd.DataFrame()
MA90_abbrev = MA90[['geno','log10Kd_x','log10Kd_y']].copy()
MA90_abbrev['mean'] = MA90_abbrev.mean(axis=1)
MA90_abbrev['sem'] = MA90_abbrev.sem(axis=1)
MA90_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb","mean": "MA90_mean", "sem": "MA90_sem"},inplace=True)
MA90_abbrev.head()

  """
  


Unnamed: 0,geno,repa,repb,MA90_mean,MA90_sem
0,0,8.612721,8.47232,8.542521,0.04053
1,1,8.63198,8.516378,8.574179,0.033371
2,10,8.340245,8.29079,8.315518,0.014276
3,11,8.511183,8.436571,8.473877,0.021539
4,100,8.552056,8.523392,8.537724,0.008275


In [6]:
#SI06
SI06_abbrev = pd.DataFrame()
SI06_abbrev = SI06[['geno','log10Kd_x','log10Kd_y']].copy()
SI06_abbrev['mean'] = SI06_abbrev.mean(axis=1)
SI06_abbrev['sem'] = SI06_abbrev.sem(axis=1)
SI06_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb", "mean": "SI06_mean", "sem": "SI06_sem"},inplace=True)
SI06_abbrev.head()

  after removing the cwd from sys.path.
  """


Unnamed: 0,geno,repa,repb,SI06_mean,SI06_sem
0,0,5.0,5.0,5.0,2.28206e-14
1,1,5.0,5.0,5.0,0.0
2,10,5.0,5.0,5.0,0.0
3,11,5.0,5.0,5.0,1.341254e-11
4,100,5.0,5.0,5.0,3.854462e-15


In [7]:
#G189E
G189E_abbrev = pd.DataFrame()
G189E_abbrev = G189E[['geno','log10Kd_x','log10Kd_y']].copy()
G189E_abbrev['mean'] = G189E_abbrev.mean(axis=1)
G189E_abbrev['sem'] = G189E_abbrev.sem(axis=1)
G189E_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb", "mean": "G189E_mean", "sem": "G189E_sem"},inplace=True)
G189E_abbrev.head()

  after removing the cwd from sys.path.
  """


Unnamed: 0,geno,repa,repb,G189E_mean,G189E_sem
0,0,6.651893,6.651835,6.651864,1.7e-05
1,1,5.0,6.171406,5.585703,0.338156
2,10,5.0,5.376927,5.188464,0.108809
3,11,6.415592,6.278565,6.347078,0.039556
4,100,6.039586,6.095005,6.067296,0.015998


## Pinning Kd to boundaries, removing poor fits, merging replicates

In [11]:
#CH65A -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65a_filt = ch65a.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65a_filt.log10Kd < 6)|((ch65a_filt['mean_log10PE1'] < (1 + ch65a_filt['mean_log10PE0'])))
ch65a_filt.loc[boolean_condition, column_name] = new_value
ch65a_filt_out = ch65a_filt.loc[((ch65a_filt['log10Kd'] > 6) & (ch65a_filt['sigma'] <= 1) & (ch65a_filt['r2'] >= 0.8))|(ch65a_filt['log10Kd'] <= 6)]
print(ch65a['log10Kd'].mean(),ch65a_filt_out['log10Kd'].mean())
ch65a_filt_out.shape[0]

9.395241171076515 9.395160268993848


61424

In [12]:
#CH65B -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65b_filt = ch65b.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65b_filt.log10Kd < 6)|((ch65b_filt['mean_log10PE1'] < (1 + ch65b_filt['mean_log10PE0'])))
ch65b_filt.loc[boolean_condition, column_name] = new_value
ch65b_filt_out = ch65b_filt.loc[((ch65b_filt['log10Kd'] > 6) & (ch65b_filt['sigma'] <= 1) & (ch65b_filt['r2'] >= 0.8))|(ch65b_filt['log10Kd'] <= 6)]
print(ch65b['log10Kd'].mean(),ch65b_filt_out['log10Kd'].mean())
ch65b_filt_out.shape[0]

9.379108218013254 9.379099644042324


65522

In [13]:
#CH65D -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65d_filt = ch65d.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65d_filt.log10Kd < 6)|((ch65d_filt['mean_log10PE1'] < (1 + ch65d_filt['mean_log10PE0'])))
ch65d_filt.loc[boolean_condition, column_name] = new_value
ch65d_filt_out = ch65d_filt.loc[((ch65d_filt['log10Kd'] > 6) & (ch65d_filt['sigma'] <= 1) & (ch65d_filt['r2'] >= 0.8))|(ch65d_filt['log10Kd'] <= 6)]
print(ch65d['log10Kd'].mean(),ch65d_filt_out['log10Kd'].mean())
ch65d_filt_out.shape[0]

6.117678780048976 6.6740596010091


65397

In [14]:
#CH65E -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65e_filt = ch65e.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65e_filt.log10Kd < 6)|((ch65e_filt['mean_log10PE1'] < (1 + ch65e_filt['mean_log10PE0'])))
ch65e_filt.loc[boolean_condition, column_name] = new_value
ch65e_filt_out = ch65e_filt.loc[((ch65e_filt['log10Kd'] > 6) & (ch65e_filt['sigma'] <= 1) & (ch65e_filt['r2'] >= 0.8))|(ch65e_filt['log10Kd'] <= 6)]
print(ch65e['log10Kd'].mean(),ch65e_filt_out['log10Kd'].mean())
ch65e_filt_out.shape[0]

6.168214719931285 6.715010187463439


65460

In [15]:
#ch65g -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65g_filt = ch65g.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65g_filt.log10Kd < 6)|((ch65g_filt['mean_log10PE1'] < (1 + ch65g_filt['mean_log10PE0'])))
ch65g_filt.loc[boolean_condition, column_name] = new_value
ch65g_filt_out = ch65g_filt.loc[((ch65g_filt['log10Kd'] > 6) & (ch65g_filt['sigma'] <= 1) & (ch65g_filt['r2'] >= 0.8))|(ch65g_filt['log10Kd'] <= 6)]
print(ch65g['log10Kd'].mean(),ch65g_filt_out['log10Kd'].mean())
ch65g_filt_out.shape[0]

8.293162365445946 8.25452675972862


64677

In [16]:
#ch65h -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65h_filt = ch65h.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65h_filt.log10Kd < 6)|((ch65h_filt['mean_log10PE1'] < (1 + ch65h_filt['mean_log10PE0'])))
ch65h_filt.loc[boolean_condition, column_name] = new_value
ch65h_filt_out = ch65h_filt.loc[((ch65h_filt['log10Kd'] > 6) & (ch65h_filt['sigma'] <= 1) & (ch65h_filt['r2'] >= 0.8))|(ch65h_filt['log10Kd'] <= 6)]
print(ch65h['log10Kd'].mean(),ch65h_filt_out['log10Kd'].mean())
ch65h_filt_out.shape[0]

8.352648066440839 8.282124527815451


64429

In [17]:
#MA90 merging
MA90_filt = pd.DataFrame()
MA90_filt = pd.merge(ch65a_filt_out, ch65b_filt_out, on='geno',how='outer')
MA90_filt_abbrev = MA90_filt[['geno','log10Kd_x','log10Kd_y']].copy()
MA90_filt_abbrev['mean'] = MA90_filt_abbrev.mean(axis=1)
MA90_filt_abbrev['sem'] = MA90_filt_abbrev.sem(axis=1)
MA90_filt_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb", "mean": "MA90_mean", "sem": "MA90_sem"},inplace=True)
MA90_filt_abbrev.head()

  """
  


Unnamed: 0,geno,repa,repb,MA90_mean,MA90_sem
0,0,8.612721,8.47232,8.542521,0.04053
1,1,8.63198,8.516378,8.574179,0.033371
2,10,8.340245,8.29079,8.315518,0.014276
3,11,8.511183,8.436571,8.473877,0.021539
4,100,8.552056,8.523392,8.537724,0.008275


In [18]:
#SI06 merging
SI06_filt = pd.DataFrame()
SI06_filt = pd.merge(ch65d_filt_out, ch65e_filt_out, on='geno',how='outer')
SI06_filt_abbrev = SI06_filt[['geno','log10Kd_x','log10Kd_y']].copy()
SI06_filt_abbrev['mean'] = SI06_filt_abbrev.mean(axis=1)
SI06_filt_abbrev['sem'] = SI06_filt_abbrev.sem(axis=1)
SI06_filt_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb", "mean": "SI06_mean", "sem": "SI06_sem"},inplace=True)
SI06_filt_abbrev.head()

  """
  


Unnamed: 0,geno,repa,repb,SI06_mean,SI06_sem
0,0,6.0,6.0,6.0,0.0
1,1,6.0,6.0,6.0,0.0
2,10,6.0,6.0,6.0,0.0
3,11,6.0,6.0,6.0,0.0
4,100,6.0,6.0,6.0,0.0


In [19]:
#G189E merging
G189E_filt = pd.DataFrame()
G189E_filt = pd.merge(ch65g_filt_out, ch65h_filt_out, on='geno',how='outer')
G189E_filt_abbrev = G189E_filt[['geno','log10Kd_x','log10Kd_y']].copy()
G189E_filt_abbrev['mean'] = G189E_filt_abbrev.mean(axis=1)
G189E_filt_abbrev['sem'] = G189E_filt_abbrev.sem(axis=1)
G189E_filt_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb", "mean": "G189E_mean", "sem": "G189E_sem"},inplace=True)
G189E_filt_abbrev.head()

  """
  


Unnamed: 0,geno,repa,repb,G189E_mean,G189E_sem
0,0,6.0,6.0,6.0,0.0
1,1,6.0,6.0,6.0,0.0
2,10,6.0,6.0,6.0,0.0
3,11,6.0,6.0,6.0,0.0
4,100,6.0,6.0,6.0,0.0


## functions for annotating dataframe

In [23]:
#define categories based on mutation position
def find_pos_1(variant):
    pos = [mut for mut in variant]
    if pos[0] == "1":
        return 1
    else:
        return 0
def find_pos_2(variant):
    pos = [mut for mut in variant]
    if pos[1] == "1":
        return 1
    else:
        return 0
def find_pos_3(variant):
    pos = [mut for mut in variant]
    if pos[2] == "1":
        return 1
    else:
        return 0
def find_pos_4(variant):
    pos = [mut for mut in variant]
    if pos[3] == "1":
        return 1
    else:
        return 0
def find_pos_5(variant):
    pos = [mut for mut in variant]
    if pos[4] == "1":
        return 1
    else:
        return 0
def find_pos_6(variant):
    pos = [mut for mut in variant]
    if pos[5] == "1":
        return 1
    else:
        return 0
def find_pos_7(variant):
    pos = [mut for mut in variant]
    if pos[6] == "1":
        return 1
    else:
        return 0
def find_pos_8(variant):
    pos = [mut for mut in variant]
    if pos[7] == "1":
        return 1
    else:
        return 0
def find_pos_9(variant):
    pos = [mut for mut in variant]
    if pos[8] == "1":
        return 1
    else:
        return 0
def find_pos_10(variant):
    pos = [mut for mut in variant]
    if pos[9 ] == "1":
        return 1
    else:
        return 0
def find_pos_11(variant):
    pos = [mut for mut in variant]
    if pos[10] == "1":
        return 1
    else:
        return 0
def find_pos_12(variant):
    pos = [mut for mut in variant]
    if pos[11] == "1":
        return 1
    else:
        return 0
def find_pos_13(variant):
    pos = [mut for mut in variant]
    if pos[12] == "1":
        return 1
    else:
        return 0
def find_pos_14(variant):
    pos = [mut for mut in variant]
    if pos[13] == "1":
        return 1
    else:
        return 0
def find_pos_15(variant):
    pos = [mut for mut in variant]
    if pos[14] == "1":
        return 1
    else:
        return 0
def find_pos_16(variant):
    pos = [mut for mut in variant]
    if pos[15] == "1":
        return 1
    else:
        return 0
    
#add a column with the number of mutations and save the file
def sum_digits(digit):
    return sum(int(x) for x in digit if x.isdigit())

## merging antigens for output: unfiltered, unadjusted data

In [24]:
#merge all antigens
CH65_all_unf = pd.DataFrame()
CH65_all_unf = pd.merge(MA90_abbrev, SI06_abbrev, on='geno',how='outer')
CH65_all_unf = pd.merge(CH65_all_unf, G189E_abbrev, on='geno',how='outer')

In [25]:
#define categories based on mutation position
CH65_all_unf['pos1'] = CH65_all_unf['geno'].apply(find_pos_1)
CH65_all_unf['pos2'] = CH65_all_unf['geno'].apply(find_pos_2)
CH65_all_unf['pos3'] = CH65_all_unf['geno'].apply(find_pos_3)
CH65_all_unf['pos4'] = CH65_all_unf['geno'].apply(find_pos_4)
CH65_all_unf['pos5'] = CH65_all_unf['geno'].apply(find_pos_5)
CH65_all_unf['pos6'] = CH65_all_unf['geno'].apply(find_pos_6)
CH65_all_unf['pos7'] = CH65_all_unf['geno'].apply(find_pos_7)
CH65_all_unf['pos8'] = CH65_all_unf['geno'].apply(find_pos_8)
CH65_all_unf['pos9'] = CH65_all_unf['geno'].apply(find_pos_9)
CH65_all_unf['pos10'] = CH65_all_unf['geno'].apply(find_pos_10)
CH65_all_unf['pos11'] = CH65_all_unf['geno'].apply(find_pos_11)
CH65_all_unf['pos12'] = CH65_all_unf['geno'].apply(find_pos_12)
CH65_all_unf['pos13'] = CH65_all_unf['geno'].apply(find_pos_13)
CH65_all_unf['pos14'] = CH65_all_unf['geno'].apply(find_pos_14)
CH65_all_unf['pos15'] = CH65_all_unf['geno'].apply(find_pos_15)
CH65_all_unf['pos16'] = CH65_all_unf['geno'].apply(find_pos_16)

#add a column with the number of mutations and save the file
CH65_all_unf['som_mut'] = CH65_all_unf['geno'].apply(sum_digits)
CH65_all_unf.to_csv('../Kd_processed/20220601_CH65_unfilt.csv', index=False)

## merging antigens for output: filtered, adjusted data

In [26]:
#merge all antigens
CH65_all_filt = pd.DataFrame()
CH65_all_filt = pd.merge(MA90_filt_abbrev, SI06_filt_abbrev, on='geno',how='outer')
CH65_all_filt = pd.merge(CH65_all_filt, G189E_filt_abbrev, on='geno',how='outer')

In [27]:
#define categories based on mutation position
CH65_all_filt['pos1'] = CH65_all_filt['geno'].apply(find_pos_1)
CH65_all_filt['pos2'] = CH65_all_filt['geno'].apply(find_pos_2)
CH65_all_filt['pos3'] = CH65_all_filt['geno'].apply(find_pos_3)
CH65_all_filt['pos4'] = CH65_all_filt['geno'].apply(find_pos_4)
CH65_all_filt['pos5'] = CH65_all_filt['geno'].apply(find_pos_5)
CH65_all_filt['pos6'] = CH65_all_filt['geno'].apply(find_pos_6)
CH65_all_filt['pos7'] = CH65_all_filt['geno'].apply(find_pos_7)
CH65_all_filt['pos8'] = CH65_all_filt['geno'].apply(find_pos_8)
CH65_all_filt['pos9'] = CH65_all_filt['geno'].apply(find_pos_9)
CH65_all_filt['pos10'] = CH65_all_filt['geno'].apply(find_pos_10)
CH65_all_filt['pos11'] = CH65_all_filt['geno'].apply(find_pos_11)
CH65_all_filt['pos12'] = CH65_all_filt['geno'].apply(find_pos_12)
CH65_all_filt['pos13'] = CH65_all_filt['geno'].apply(find_pos_13)
CH65_all_filt['pos14'] = CH65_all_filt['geno'].apply(find_pos_14)
CH65_all_filt['pos15'] = CH65_all_filt['geno'].apply(find_pos_15)
CH65_all_filt['pos16'] = CH65_all_filt['geno'].apply(find_pos_16)

#add a column with the number of mutations and save the file
CH65_all_filt['som_mut'] = CH65_all_filt['geno'].apply(sum_digits)
CH65_all_filt.to_csv('../Kd_processed/20220601_CH65_filt.csv', index=False)