## import stuff

In [1]:
#importing things
import pandas as pd
import numpy as np
import csv
import pylab as pl
import matplotlib.pyplot as plt
import copy
import scipy
from scipy.stats import linregress
from scipy.stats import t
from scipy.stats import sem
from scipy.stats import chi2
from scipy.stats import ttest_ind_from_stats
from mpl_toolkits.mplot3d import Axes3D
from scipy.stats import ttest_ind_from_stats
from collections import OrderedDict
import seaborn as sns

## fetch the data

In [2]:
# Load one Kd table per replicate. 'geno' is forced to str so pandas does not infer a
# numeric type for the genotype code.
# Original author's notes: the CH65a file is the variant in which spurious counts for the
# missing fragment have been removed; genotypes missing from ch65a are meant to be removed
# (no such removal is performed in this cell itself).
ch65a = pd.read_csv('Kds_CH65a_f3jrem.csv', sep=',', dtype={'geno': str})
ch65b = pd.read_csv('Kds_CH65b.tsv', sep='\t', dtype={'geno': str})
ch65d = pd.read_csv('Kds_CH65d.tsv', sep='\t', dtype={'geno': str})
ch65e = pd.read_csv('Kds_CH65e.tsv', sep='\t', dtype={'geno': str})
ch65g = pd.read_csv('Kds_CH65g.tsv', sep='\t', dtype={'geno': str})
ch65h = pd.read_csv('Kds_CH65h.tsv', sep='\t', dtype={'geno': str})

## Pinning Kd to boundaries and removing poor fits (per replicate)

In [3]:
#CH65A -- pin values to the lower log10Kd boundary, then drop poorly fit binders
ch65a_filt = ch65a.copy()  # work on a copy so the raw table stays available for the comparison below
column_name = "log10Kd"
new_value = 6
# Pin a row when its fit is below the boundary, or when mean_log10PE1 is less than
# 1 above mean_log10PE0.
boolean_condition = (
    ch65a_filt[column_name].lt(new_value)
    | ch65a_filt['mean_log10PE1'].lt(ch65a_filt['mean_log10PE0'] + 1)
)
ch65a_filt.loc[boolean_condition, column_name] = new_value
# Keep every row at (or below) the boundary; rows above it must also have
# sigma <= 1 and r2 >= 0.8. Rows with a missing log10Kd match neither branch and are dropped.
ch65a_filt_out = ch65a_filt.loc[
    (ch65a_filt[column_name] <= new_value)
    | (
        (ch65a_filt[column_name] > new_value)
        & (ch65a_filt['sigma'] <= 1)
        & (ch65a_filt['r2'] >= 0.8)
    )
]
# Mean log10Kd before vs. after pinning + filtering, then the number of rows kept.
print(ch65a[column_name].mean(), ch65a_filt_out[column_name].mean())
len(ch65a_filt_out)

9.395241171076515 9.395160268993848


61424

In [4]:
#CH65B -- pin values to the lower log10Kd boundary, then drop poorly fit binders
ch65b_filt = ch65b.copy()  # work on a copy so the raw table stays available for the comparison below
column_name = "log10Kd"
new_value = 6
# Pin a row when its fit is below the boundary, or when mean_log10PE1 is less than
# 1 above mean_log10PE0.
boolean_condition = (
    ch65b_filt[column_name].lt(new_value)
    | ch65b_filt['mean_log10PE1'].lt(ch65b_filt['mean_log10PE0'] + 1)
)
ch65b_filt.loc[boolean_condition, column_name] = new_value
# Keep every row at (or below) the boundary; rows above it must also have
# sigma <= 1 and r2 >= 0.8. Rows with a missing log10Kd match neither branch and are dropped.
ch65b_filt_out = ch65b_filt.loc[
    (ch65b_filt[column_name] <= new_value)
    | (
        (ch65b_filt[column_name] > new_value)
        & (ch65b_filt['sigma'] <= 1)
        & (ch65b_filt['r2'] >= 0.8)
    )
]
# Mean log10Kd before vs. after pinning + filtering, then the number of rows kept.
print(ch65b[column_name].mean(), ch65b_filt_out[column_name].mean())
len(ch65b_filt_out)

9.379108218013254 9.379099644042324


65522

In [5]:
#CH65D -- pin values to the lower log10Kd boundary, then drop poorly fit binders
ch65d_filt = ch65d.copy()  # work on a copy so the raw table stays available for the comparison below
column_name = "log10Kd"
new_value = 6
# Pin a row when its fit is below the boundary, or when mean_log10PE1 is less than
# 1 above mean_log10PE0.
boolean_condition = (
    ch65d_filt[column_name].lt(new_value)
    | ch65d_filt['mean_log10PE1'].lt(ch65d_filt['mean_log10PE0'] + 1)
)
ch65d_filt.loc[boolean_condition, column_name] = new_value
# Keep every row at (or below) the boundary; rows above it must also have
# sigma <= 1 and r2 >= 0.8. Rows with a missing log10Kd match neither branch and are dropped.
ch65d_filt_out = ch65d_filt.loc[
    (ch65d_filt[column_name] <= new_value)
    | (
        (ch65d_filt[column_name] > new_value)
        & (ch65d_filt['sigma'] <= 1)
        & (ch65d_filt['r2'] >= 0.8)
    )
]
# Mean log10Kd before vs. after pinning + filtering, then the number of rows kept.
print(ch65d[column_name].mean(), ch65d_filt_out[column_name].mean())
len(ch65d_filt_out)

6.117678780048976 6.6740596010091


65397

In [6]:
#CH65E -- pin values to the lower log10Kd boundary, then drop poorly fit binders
ch65e_filt = ch65e.copy()  # work on a copy so the raw table stays available for the comparison below
column_name = "log10Kd"
new_value = 6
# Pin a row when its fit is below the boundary, or when mean_log10PE1 is less than
# 1 above mean_log10PE0.
boolean_condition = (
    ch65e_filt[column_name].lt(new_value)
    | ch65e_filt['mean_log10PE1'].lt(ch65e_filt['mean_log10PE0'] + 1)
)
ch65e_filt.loc[boolean_condition, column_name] = new_value
# Keep every row at (or below) the boundary; rows above it must also have
# sigma <= 1 and r2 >= 0.8. Rows with a missing log10Kd match neither branch and are dropped.
ch65e_filt_out = ch65e_filt.loc[
    (ch65e_filt[column_name] <= new_value)
    | (
        (ch65e_filt[column_name] > new_value)
        & (ch65e_filt['sigma'] <= 1)
        & (ch65e_filt['r2'] >= 0.8)
    )
]
# Mean log10Kd before vs. after pinning + filtering, then the number of rows kept.
print(ch65e[column_name].mean(), ch65e_filt_out[column_name].mean())
len(ch65e_filt_out)

6.168214719931285 6.715010187463439


65460

In [7]:
#CH65G -- pin values to the lower log10Kd boundary, then drop poorly fit binders
ch65g_filt = ch65g.copy()  # work on a copy so the raw table stays available for the comparison below
column_name = "log10Kd"
new_value = 6
# Pin a row when its fit is below the boundary, or when mean_log10PE1 is less than
# 1 above mean_log10PE0.
boolean_condition = (
    ch65g_filt[column_name].lt(new_value)
    | ch65g_filt['mean_log10PE1'].lt(ch65g_filt['mean_log10PE0'] + 1)
)
ch65g_filt.loc[boolean_condition, column_name] = new_value
# Keep every row at (or below) the boundary; rows above it must also have
# sigma <= 1 and r2 >= 0.8. Rows with a missing log10Kd match neither branch and are dropped.
ch65g_filt_out = ch65g_filt.loc[
    (ch65g_filt[column_name] <= new_value)
    | (
        (ch65g_filt[column_name] > new_value)
        & (ch65g_filt['sigma'] <= 1)
        & (ch65g_filt['r2'] >= 0.8)
    )
]
# Mean log10Kd before vs. after pinning + filtering, then the number of rows kept.
print(ch65g[column_name].mean(), ch65g_filt_out[column_name].mean())
len(ch65g_filt_out)

8.293162365445946 8.25452675972862


64677

In [8]:
#CH65H -- pin values to the lower log10Kd boundary, then drop poorly fit binders
ch65h_filt = ch65h.copy()  # work on a copy so the raw table stays available for the comparison below
column_name = "log10Kd"
new_value = 6
# Pin a row when its fit is below the boundary, or when mean_log10PE1 is less than
# 1 above mean_log10PE0.
boolean_condition = (
    ch65h_filt[column_name].lt(new_value)
    | ch65h_filt['mean_log10PE1'].lt(ch65h_filt['mean_log10PE0'] + 1)
)
ch65h_filt.loc[boolean_condition, column_name] = new_value
# Keep every row at (or below) the boundary; rows above it must also have
# sigma <= 1 and r2 >= 0.8. Rows with a missing log10Kd match neither branch and are dropped.
ch65h_filt_out = ch65h_filt.loc[
    (ch65h_filt[column_name] <= new_value)
    | (
        (ch65h_filt[column_name] > new_value)
        & (ch65h_filt['sigma'] <= 1)
        & (ch65h_filt['r2'] >= 0.8)
    )
]
# Mean log10Kd before vs. after pinning + filtering, then the number of rows kept.
print(ch65h[column_name].mean(), ch65h_filt_out[column_name].mean())
len(ch65h_filt_out)

8.352648066440839 8.282124527815451


64429

## merging and averaging

In [9]:
# Merge the two MA90 replicates (CH65a, CH65b) on genotype. The outer join keeps genotypes
# present in only one filtered replicate; their columns for the other replicate are NaN.
MA90_filt = pd.merge(ch65a_filt_out, ch65b_filt_out, on='geno', how='outer', suffixes=("_a", "_b"))

# Row-wise mean and standard error across the replicates. Both skip NaN, so a genotype
# seen in a single replicate keeps that replicate's log10Kd.
MA90_filt['log10Kd'] = MA90_filt[['log10Kd_a', 'log10Kd_b']].mean(axis=1)
MA90_filt['err_log10Kd'] = MA90_filt[['log10Kd_a', 'log10Kd_b']].sem(axis=1)
# Fix: this column was previously computed with .sem(axis=1) (copy-paste of the line above),
# which stored the standard error of the two sigmas instead of their mean.
MA90_filt['mean_sigma'] = MA90_filt[['sigma_a', 'sigma_b']].mean(axis=1)
# The SEM of a single value is NaN; store 0 so single-replicate genotypes pass the cut below.
MA90_filt['err_log10Kd'] = MA90_filt['err_log10Kd'].fillna(0)
MA90_filt['log10Kd_pinned'] = MA90_filt['log10Kd']

# Expression of each replicate relative to that replicate's mean over the merged table,
# then averaged. Plain addition propagates NaN, so 'expression_norm' is NaN whenever one
# replicate is missing (unlike 'log10Kd' above).
MA90_filt['expression_norm_a'] = MA90_filt['Mean fluorescence expression_a'] / MA90_filt['Mean fluorescence expression_a'].mean()
MA90_filt['expression_norm_b'] = MA90_filt['Mean fluorescence expression_b'] / MA90_filt['Mean fluorescence expression_b'].mean()
MA90_filt['expression_norm'] = (MA90_filt['expression_norm_a'] + MA90_filt['expression_norm_b']) / 2

# Keep genotypes whose replicates agree. For two values SEM = |a - b| / 2, so SEM < 0.5
# means the two log10Kd values differ by less than 1.
MA90_filt_rep = MA90_filt[MA90_filt['err_log10Kd'] < 0.5].copy()
print(MA90_filt.shape[0], MA90_filt_rep.shape[0])
# NOTE(review): this cell exports the replicate-filtered frame (MA90_filt_rep), whereas the
# SI06 and G189E cells export their unfiltered frames -- confirm which is intended.
MA90_filt_rep.to_csv('../Kd_processed/20221008_CH65_MA90_browser.tsv', index=False, sep="\t")

65533 65530


In [10]:
# Merge the two SI06 replicates (CH65d, CH65e) on genotype. The outer join keeps genotypes
# present in only one filtered replicate; their columns for the other replicate are NaN.
SI06_filt = pd.merge(ch65d_filt_out, ch65e_filt_out, on='geno', how='outer', suffixes=("_d", "_e"))

# Row-wise mean and standard error across the replicates. Both skip NaN, so a genotype
# seen in a single replicate keeps that replicate's log10Kd.
SI06_filt['log10Kd'] = SI06_filt[['log10Kd_d', 'log10Kd_e']].mean(axis=1)
SI06_filt['err_log10Kd'] = SI06_filt[['log10Kd_d', 'log10Kd_e']].sem(axis=1)
# Fix: this column was previously computed with .sem(axis=1) (copy-paste of the line above),
# which stored the standard error of the two sigmas instead of their mean.
SI06_filt['mean_sigma'] = SI06_filt[['sigma_d', 'sigma_e']].mean(axis=1)
# The SEM of a single value is NaN; store 0 so single-replicate genotypes pass the cut below.
SI06_filt['err_log10Kd'] = SI06_filt['err_log10Kd'].fillna(0)
SI06_filt['log10Kd_pinned'] = SI06_filt['log10Kd']

# Expression of each replicate relative to that replicate's mean over the merged table,
# then averaged. Plain addition propagates NaN, so 'expression_norm' is NaN whenever one
# replicate is missing (unlike 'log10Kd' above).
SI06_filt['expression_norm_d'] = SI06_filt['Mean fluorescence expression_d'] / SI06_filt['Mean fluorescence expression_d'].mean()
SI06_filt['expression_norm_e'] = SI06_filt['Mean fluorescence expression_e'] / SI06_filt['Mean fluorescence expression_e'].mean()
SI06_filt['expression_norm'] = (SI06_filt['expression_norm_d'] + SI06_filt['expression_norm_e']) / 2

# Genotypes whose replicates agree. For two values SEM = |a - b| / 2, so SEM < 0.5
# means the two log10Kd values differ by less than 1.
SI06_filt_rep = SI06_filt[SI06_filt['err_log10Kd'] < 0.5].copy()
print(SI06_filt.shape[0], SI06_filt_rep.shape[0])
# NOTE(review): the unfiltered frame is exported here, whereas the MA90 cell exports its
# replicate-filtered frame (MA90_filt_rep) -- confirm which is intended.
SI06_filt.to_csv('../Kd_processed/20221008_CH65_SI06_browser.tsv', index=False, sep="\t")

65533 65389


In [11]:
# Merge the two G189E replicates (CH65g, CH65h) on genotype. The outer join keeps genotypes
# present in only one filtered replicate; their columns for the other replicate are NaN.
G189E_filt = pd.merge(ch65g_filt_out, ch65h_filt_out, on='geno', how='outer', suffixes=("_g", "_h"))

# Row-wise mean and standard error across the replicates. Both skip NaN, so a genotype
# seen in a single replicate keeps that replicate's log10Kd.
G189E_filt['log10Kd'] = G189E_filt[['log10Kd_g', 'log10Kd_h']].mean(axis=1)
G189E_filt['err_log10Kd'] = G189E_filt[['log10Kd_g', 'log10Kd_h']].sem(axis=1)
# Fix: this column was previously computed with .sem(axis=1) (copy-paste of the line above),
# which stored the standard error of the two sigmas instead of their mean.
G189E_filt['mean_sigma'] = G189E_filt[['sigma_g', 'sigma_h']].mean(axis=1)
# The SEM of a single value is NaN; store 0 so single-replicate genotypes pass the cut below.
G189E_filt['err_log10Kd'] = G189E_filt['err_log10Kd'].fillna(0)
G189E_filt['log10Kd_pinned'] = G189E_filt['log10Kd']

# Expression normalisation is disabled for this antigen.
# NOTE(review): presumably the g/h tables lack the 'Mean fluorescence expression' column --
# confirm, then either restore or delete these lines.
# G189E_filt['expression_norm_g'] = G189E_filt['Mean fluorescence expression_g']/np.mean(G189E_filt['Mean fluorescence expression_g'])
# G189E_filt['expression_norm_h'] = G189E_filt['Mean fluorescence expression_h']/np.mean(G189E_filt['Mean fluorescence expression_h'])
# G189E_filt['expression_norm'] = (G189E_filt['expression_norm_g']+G189E_filt['expression_norm_h'])/2

# Genotypes whose replicates agree. For two values SEM = |a - b| / 2, so SEM < 0.5
# means the two log10Kd values differ by less than 1.
G189E_filt_rep = G189E_filt[G189E_filt['err_log10Kd'] < 0.5].copy()
print(G189E_filt.shape[0], G189E_filt_rep.shape[0])
# NOTE(review): the unfiltered frame is exported here, whereas the MA90 cell exports its
# replicate-filtered frame (MA90_filt_rep) -- confirm which is intended.
G189E_filt.to_csv('../Kd_processed/20221008_CH65_G189E_browser.tsv', index=False, sep="\t")

65285 64142
