## import stuff

In [1]:
#importing things
import pandas as pd
import numpy as np
import csv
import pylab as pl
import matplotlib.pyplot as plt
import copy
import scipy
from scipy.stats import linregress
from scipy.stats import t
from scipy.stats import sem
from scipy.stats import chi2
from scipy.stats import ttest_ind_from_stats
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import ttest_ind_from_stats
from collections import OrderedDict
import seaborn as sns

## fetch the data

In [2]:
#read in data, remove genotypes that are missing from ch65a
ch65a = pd.DataFrame(pd.read_csv('Kds_CH65a_f3jrem.csv',delimiter=',',dtype={'geno': str})) #here spurious counts for the missing fragment are removed
ch65b = pd.DataFrame(pd.read_csv('Kds_CH65b.tsv',delimiter='\t',dtype={'geno': str}))
ch65d = pd.DataFrame(pd.read_csv('Kds_CH65d.tsv',delimiter='\t',dtype={'geno': str}))
ch65e = pd.DataFrame(pd.read_csv('Kds_CH65e.tsv',delimiter='\t',dtype={'geno': str}))
ch65g = pd.DataFrame(pd.read_csv('Kds_CH65g.tsv',delimiter='\t',dtype={'geno': str}))
ch65h = pd.DataFrame(pd.read_csv('Kds_CH65h.tsv',delimiter='\t',dtype={'geno': str}))

## Call nonbinders & remove poor fits for each replicate

In [3]:
#CH65A -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65a_filt = ch65a.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65a_filt.log10Kd < 6)|((ch65a_filt['mean_log10PE1'] < (1 + ch65a_filt['mean_log10PE0'])))
ch65a_filt.loc[boolean_condition, column_name] = new_value
ch65a_filt_out = ch65a_filt.loc[((ch65a_filt['log10Kd'] > 6) & (ch65a_filt['sigma'] <= 1) & (ch65a_filt['r2'] >= 0.8))|(ch65a_filt['log10Kd'] <= 6)]
print(ch65a['log10Kd'].mean(),ch65a_filt_out['log10Kd'].mean())
ch65a_filt_out.shape[0]

9.395241171076515 9.395160268993848


61424

In [4]:
#CH65B -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65b_filt = ch65b.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65b_filt.log10Kd < 6)|((ch65b_filt['mean_log10PE1'] < (1 + ch65b_filt['mean_log10PE0'])))
ch65b_filt.loc[boolean_condition, column_name] = new_value
ch65b_filt_out = ch65b_filt.loc[((ch65b_filt['log10Kd'] > 6) & (ch65b_filt['sigma'] <= 1) & (ch65b_filt['r2'] >= 0.8))|(ch65b_filt['log10Kd'] <= 6)]
print(ch65b['log10Kd'].mean(),ch65b_filt_out['log10Kd'].mean())
ch65b_filt_out.shape[0]

9.376441104947144 9.376429655575732


65523

In [5]:
#CH65D -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65d_filt = ch65d.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65d_filt.log10Kd < 6)|((ch65d_filt['mean_log10PE1'] < (1 + ch65d_filt['mean_log10PE0'])))
ch65d_filt.loc[boolean_condition, column_name] = new_value
ch65d_filt_out = ch65d_filt.loc[((ch65d_filt['log10Kd'] > 6) & (ch65d_filt['sigma'] <= 1) & (ch65d_filt['r2'] >= 0.8))|(ch65d_filt['log10Kd'] <= 6)]
print(ch65d['log10Kd'].mean(),ch65d_filt_out['log10Kd'].mean())
ch65d_filt_out.shape[0]

6.713479263377589 6.870817369774956


64956

In [6]:
#CH65E -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65e_filt = ch65e.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65e_filt.log10Kd < 6)|((ch65e_filt['mean_log10PE1'] < (1 + ch65e_filt['mean_log10PE0'])))
ch65e_filt.loc[boolean_condition, column_name] = new_value
ch65e_filt_out = ch65e_filt.loc[((ch65e_filt['log10Kd'] > 6) & (ch65e_filt['sigma'] <= 1) & (ch65e_filt['r2'] >= 0.8))|(ch65e_filt['log10Kd'] <= 6)]
print(ch65e['log10Kd'].mean(),ch65e_filt_out['log10Kd'].mean())
ch65e_filt_out.shape[0]

6.680042912309914 6.90658643574531


65295

In [7]:
#ch65g -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65g_filt = ch65g.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65g_filt.log10Kd < 6)|((ch65g_filt['mean_log10PE1'] < (1 + ch65g_filt['mean_log10PE0'])))
ch65g_filt.loc[boolean_condition, column_name] = new_value
ch65g_filt_out = ch65g_filt.loc[((ch65g_filt['log10Kd'] > 6) & (ch65g_filt['sigma'] <= 1) & (ch65g_filt['r2'] >= 0.8))|(ch65g_filt['log10Kd'] <= 6)]
print(ch65g['log10Kd'].mean(),ch65g_filt_out['log10Kd'].mean())
ch65g_filt_out.shape[0]

8.611806707898335 8.375193632260613


65527

In [8]:
#ch65h -- replacing Kd below the boundary & nonbinders (shallow curve) with the boundary & then filter out all poor fits for binders
ch65h_filt = ch65h.copy()
column_name = "log10Kd"
new_value = 6
boolean_condition = (ch65h_filt.log10Kd < 6)|((ch65h_filt['mean_log10PE1'] < (1 + ch65h_filt['mean_log10PE0'])))
ch65h_filt.loc[boolean_condition, column_name] = new_value
ch65h_filt_out = ch65h_filt.loc[((ch65h_filt['log10Kd'] > 6) & (ch65h_filt['sigma'] <= 1) & (ch65h_filt['r2'] >= 0.8))|(ch65h_filt['log10Kd'] <= 6)]
print(ch65h['log10Kd'].mean(),ch65h_filt_out['log10Kd'].mean())
ch65h_filt_out.shape[0]

8.637615326664527 8.392767180662995


65461

In [9]:
#MA90 merging
MA90_filt = pd.DataFrame()
MA90_filt = pd.merge(ch65a_filt_out, ch65b_filt_out, on='geno',how='outer')
MA90_filt_abbrev = MA90_filt[['geno','log10Kd_x','log10Kd_y']].copy()
MA90_filt_abbrev['mean'] = MA90_filt_abbrev[['log10Kd_x','log10Kd_y']].mean(axis=1)
MA90_filt_abbrev['sem'] = MA90_filt_abbrev[['log10Kd_x','log10Kd_y']].sem(axis=1)
MA90_filt_abbrev['sem'] = MA90_filt_abbrev['sem'].fillna(0)
MA90_filt_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb", "mean": "MA90_mean", "sem": "MA90_sem"},inplace=True)
MA90_filt_abbrev

Unnamed: 0,geno,repa,repb,MA90_mean,MA90_sem
0,0000000000000000,8.612721,8.490826,8.551773,0.060948
1,0000000000000001,8.631980,8.533384,8.582682,0.049298
2,0000000000000010,8.340245,8.305612,8.322928,0.017317
3,0000000000000011,8.511183,8.454330,8.482757,0.028426
4,0000000000000100,8.552056,8.546097,8.549076,0.002980
...,...,...,...,...,...
65528,1111110101111011,,9.985969,9.985969,0.000000
65529,1111110101111100,,9.893202,9.893202,0.000000
65530,1111110101111101,,9.875471,9.875471,0.000000
65531,1111110101111110,,9.874861,9.874861,0.000000


In [10]:
#SI06 merging
SI06_filt = pd.DataFrame()
SI06_filt = pd.merge(ch65d_filt_out, ch65e_filt_out, on='geno',how='outer')
SI06_filt_abbrev = SI06_filt[['geno','log10Kd_x','log10Kd_y']].copy()
SI06_filt_abbrev['mean'] = SI06_filt_abbrev[['log10Kd_x','log10Kd_y']].mean(axis=1)
SI06_filt_abbrev['sem'] = SI06_filt_abbrev[['log10Kd_x','log10Kd_y']].sem(axis=1)
SI06_filt_abbrev['sem'] = SI06_filt_abbrev['sem'].fillna(0)
SI06_filt_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb", "mean": "SI06_mean", "sem": "SI06_sem"},inplace=True)
SI06_filt_abbrev.head()

Unnamed: 0,geno,repa,repb,SI06_mean,SI06_sem
0,0,6.0,6.0,6.0,0.0
1,1,6.0,6.0,6.0,0.0
2,10,6.0,6.0,6.0,0.0
3,11,6.0,6.0,6.0,0.0
4,100,6.0,6.0,6.0,0.0


In [11]:
#G189E merging
G189E_filt = pd.DataFrame()
G189E_filt = pd.merge(ch65g_filt_out, ch65h_filt_out, on='geno',how='outer')
G189E_filt_abbrev = G189E_filt[['geno','log10Kd_x','log10Kd_y']].copy()
G189E_filt_abbrev['mean'] = G189E_filt_abbrev[['log10Kd_x','log10Kd_y']].mean(axis=1)
G189E_filt_abbrev['sem'] = G189E_filt_abbrev[['log10Kd_x','log10Kd_y']].sem(axis=1)
G189E_filt_abbrev['sem'] = G189E_filt_abbrev['sem'].fillna(0)
G189E_filt_abbrev.rename(columns={"log10Kd_x": "repa", "log10Kd_y": "repb", "mean": "G189E_mean", "sem": "G189E_sem"},inplace=True)
G189E_filt_abbrev.head()

Unnamed: 0,geno,repa,repb,G189E_mean,G189E_sem
0,0,6.0,6.0,6.0,0.0
1,1,6.0,6.0,6.0,0.0
2,10,6.0,6.0,6.0,0.0
3,11,6.0,6.0,6.0,0.0
4,100,6.0,6.0,6.0,0.0


## functions for annotating dataframe

In [12]:
#define categories based on mutation position
def find_pos_1(variant):
    pos = [mut for mut in variant]
    if pos[0] == "1":
        return 1
    else:
        return 0
def find_pos_2(variant):
    pos = [mut for mut in variant]
    if pos[1] == "1":
        return 1
    else:
        return 0
def find_pos_3(variant):
    pos = [mut for mut in variant]
    if pos[2] == "1":
        return 1
    else:
        return 0
def find_pos_4(variant):
    pos = [mut for mut in variant]
    if pos[3] == "1":
        return 1
    else:
        return 0
def find_pos_5(variant):
    pos = [mut for mut in variant]
    if pos[4] == "1":
        return 1
    else:
        return 0
def find_pos_6(variant):
    pos = [mut for mut in variant]
    if pos[5] == "1":
        return 1
    else:
        return 0
def find_pos_7(variant):
    pos = [mut for mut in variant]
    if pos[6] == "1":
        return 1
    else:
        return 0
def find_pos_8(variant):
    pos = [mut for mut in variant]
    if pos[7] == "1":
        return 1
    else:
        return 0
def find_pos_9(variant):
    pos = [mut for mut in variant]
    if pos[8] == "1":
        return 1
    else:
        return 0
def find_pos_10(variant):
    pos = [mut for mut in variant]
    if pos[9] == "1":
        return 1
    else:
        return 0
def find_pos_11(variant):
    pos = [mut for mut in variant]
    if pos[10] == "1":
        return 1
    else:
        return 0
def find_pos_12(variant):
    pos = [mut for mut in variant]
    if pos[11] == "1":
        return 1
    else:
        return 0
def find_pos_13(variant):
    pos = [mut for mut in variant]
    if pos[12] == "1":
        return 1
    else:
        return 0
def find_pos_14(variant):
    pos = [mut for mut in variant]
    if pos[13] == "1":
        return 1
    else:
        return 0
def find_pos_15(variant):
    pos = [mut for mut in variant]
    if pos[14] == "1":
        return 1
    else:
        return 0
def find_pos_16(variant):
    pos = [mut for mut in variant]
    if pos[15] == "1":
        return 1
    else:
        return 0
    
#add a column with the number of mutations and save the file
def sum_digits(digit):
    return sum(int(x) for x in digit if x.isdigit())

## merging antigens for output: filtered, adjusted data

In [17]:
#merge all antigens
CH65_all_filt = pd.DataFrame()
CH65_all_filt = pd.merge(MA90_filt_abbrev, SI06_filt_abbrev, on='geno',how='outer')
CH65_all_filt = pd.merge(CH65_all_filt, G189E_filt_abbrev, on='geno',how='outer')

In [18]:
#define categories based on mutation position
CH65_all_filt['pos1'] = CH65_all_filt['geno'].apply(find_pos_1)
CH65_all_filt['pos2'] = CH65_all_filt['geno'].apply(find_pos_2)
CH65_all_filt['pos3'] = CH65_all_filt['geno'].apply(find_pos_3)
CH65_all_filt['pos4'] = CH65_all_filt['geno'].apply(find_pos_4)
CH65_all_filt['pos5'] = CH65_all_filt['geno'].apply(find_pos_5)
CH65_all_filt['pos6'] = CH65_all_filt['geno'].apply(find_pos_6)
CH65_all_filt['pos7'] = CH65_all_filt['geno'].apply(find_pos_7)
CH65_all_filt['pos8'] = CH65_all_filt['geno'].apply(find_pos_8)
CH65_all_filt['pos9'] = CH65_all_filt['geno'].apply(find_pos_9)
CH65_all_filt['pos10'] = CH65_all_filt['geno'].apply(find_pos_10)
CH65_all_filt['pos11'] = CH65_all_filt['geno'].apply(find_pos_11)
CH65_all_filt['pos12'] = CH65_all_filt['geno'].apply(find_pos_12)
CH65_all_filt['pos13'] = CH65_all_filt['geno'].apply(find_pos_13)
CH65_all_filt['pos14'] = CH65_all_filt['geno'].apply(find_pos_14)
CH65_all_filt['pos15'] = CH65_all_filt['geno'].apply(find_pos_15)
CH65_all_filt['pos16'] = CH65_all_filt['geno'].apply(find_pos_16)

#add a column with the number of mutations and save the file
CH65_all_filt['som_mut'] = CH65_all_filt['geno'].apply(sum_digits)
CH65_all_filt.to_csv('../Kd_processed/20221008_CH65_QCfilt.csv', index=False)

## remove genotypes with poor replicate data

In [19]:
## if SEM > 0.5, throw out (> 1-log difference in Kd)
MA90_filt_abbrev_rep = MA90_filt_abbrev[MA90_filt_abbrev['MA90_sem']<0.5].copy()
print(MA90_filt.shape[0],MA90_filt_abbrev_rep.shape[0])
SI06_filt_abbrev_rep = SI06_filt_abbrev[SI06_filt_abbrev['SI06_sem']<0.5].copy()
print(SI06_filt.shape[0],SI06_filt_abbrev_rep.shape[0])
G189E_filt_abbrev_rep = G189E_filt_abbrev[G189E_filt_abbrev['G189E_sem']<0.5].copy()
print(G189E_filt.shape[0],G189E_filt_abbrev_rep.shape[0])

65533 65530
65514 64619
65536 63840


# merge antigens after replicate-based filtering

In [20]:
CH65_all_filt_rep = pd.DataFrame()
CH65_all_filt_rep = pd.merge(MA90_filt_abbrev_rep, SI06_filt_abbrev_rep, on='geno',how='outer')
CH65_all_filt_rep = pd.merge(CH65_all_filt_rep, G189E_filt_abbrev_rep, on='geno',how='outer')
CH65_all_filt_rep

Unnamed: 0,geno,repa_x,repb_x,MA90_mean,MA90_sem,repa_y,repb_y,SI06_mean,SI06_sem,repa,repb,G189E_mean,G189E_sem
0,0000000000000000,8.612721,8.490826,8.551773,0.060948,6.0,6.0,6.0,0.0,6.000000,6.000000,6.000000,0.000000
1,0000000000000001,8.631980,8.533384,8.582682,0.049298,6.0,6.0,6.0,0.0,6.000000,6.000000,6.000000,0.000000
2,0000000000000010,8.340245,8.305612,8.322928,0.017317,6.0,6.0,6.0,0.0,6.000000,6.000000,6.000000,0.000000
3,0000000000000011,8.511183,8.454330,8.482757,0.028426,6.0,6.0,6.0,0.0,6.000000,6.000000,6.000000,0.000000
4,0000000000000100,8.552056,8.546097,8.549076,0.002980,6.0,6.0,6.0,0.0,6.000000,6.000000,6.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
65530,1100100111010000,,,,,6.0,6.0,6.0,0.0,7.967819,7.593409,7.780614,0.187205
65531,1100110111111010,,,,,6.0,6.0,6.0,0.0,9.823908,,9.823908,0.000000
65532,1111010111011110,,,,,6.0,6.0,6.0,0.0,6.000000,6.000000,6.000000,0.000000
65533,0001010111101111,,,,,,,,,9.054181,,9.054181,0.000000


In [21]:
#define categories based on mutation position
CH65_all_filt_rep['pos1'] = CH65_all_filt_rep['geno'].apply(find_pos_1)
CH65_all_filt_rep['pos2'] = CH65_all_filt_rep['geno'].apply(find_pos_2)
CH65_all_filt_rep['pos3'] = CH65_all_filt_rep['geno'].apply(find_pos_3)
CH65_all_filt_rep['pos4'] = CH65_all_filt_rep['geno'].apply(find_pos_4)
CH65_all_filt_rep['pos5'] = CH65_all_filt_rep['geno'].apply(find_pos_5)
CH65_all_filt_rep['pos6'] = CH65_all_filt_rep['geno'].apply(find_pos_6)
CH65_all_filt_rep['pos7'] = CH65_all_filt_rep['geno'].apply(find_pos_7)
CH65_all_filt_rep['pos8'] = CH65_all_filt_rep['geno'].apply(find_pos_8)
CH65_all_filt_rep['pos9'] = CH65_all_filt_rep['geno'].apply(find_pos_9)
CH65_all_filt_rep['pos10'] = CH65_all_filt_rep['geno'].apply(find_pos_10)
CH65_all_filt_rep['pos11'] = CH65_all_filt_rep['geno'].apply(find_pos_11)
CH65_all_filt_rep['pos12'] = CH65_all_filt_rep['geno'].apply(find_pos_12)
CH65_all_filt_rep['pos13'] = CH65_all_filt_rep['geno'].apply(find_pos_13)
CH65_all_filt_rep['pos14'] = CH65_all_filt_rep['geno'].apply(find_pos_14)
CH65_all_filt_rep['pos15'] = CH65_all_filt_rep['geno'].apply(find_pos_15)
CH65_all_filt_rep['pos16'] = CH65_all_filt_rep['geno'].apply(find_pos_16)

#add a column with the number of mutations and save the file
CH65_all_filt_rep['som_mut'] = CH65_all_filt_rep['geno'].apply(sum_digits)
CH65_all_filt_rep.to_csv('../Kd_processed/20221008_CH65_QCfilt_REPfilt.csv', index=False)