In [1]:
import numpy as np
import csv
# import sys
import random
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
from sklearn import linear_model
import matplotlib.pyplot as plt 
import gc
from tqdm.notebook import tqdm

# Choose which epistasis encoding the downstream cells use.
# With 'stat' the genotype matrix is later recoded as 2*(g - 0.5)
# (0 -> -1, 1 -> +1); with any other value it is left as read from the file.
#ep_type = 'biochem' 
ep_type = 'stat'

# Input table and the 0-based CSV column holding the G189E phenotype.
# NOTE(review): column 11 is presumably the G189E measurement -- confirm
# against the CSV header row.
KD_CSV_PATH = '../../Kd_Inference/results_CH65/Kd_processed/20221008_CH65_QCfilt_REPfilt.csv'
PHENO_COLUMN_G189E = 11

# read in data
geno_vectors_G189E = []   # one float vector per variant that has a phenotype
phenos_G189E = []         # matching phenotype values, same order

mutations_H1 = [str(x) for x in range(1,17)]

# newline='' is what the csv module asks for on file objects it reads.
# The `with` statement closes the file, so no explicit close() is needed.
with open(KD_CSV_PATH, 'r', newline='') as readfile:
    kd_reader = csv.reader(readfile)
    header = next(kd_reader)
    for row in kd_reader:
        # Column 0 is a string with one character per site; each character
        # becomes one float entry of the genotype vector.
        geno = row[0]
        geno_vec = np.array([float(x) for x in geno])

        pheno_G189E = row[PHENO_COLUMN_G189E]

        # Rows with an empty phenotype field are skipped.
        if len(pheno_G189E) != 0:
            geno_vectors_G189E.append(geno_vec)
            phenos_G189E.append(float(pheno_G189E))



In [2]:


# Turn the per-variant Python lists into arrays: a 1-D phenotype vector and a
# 2-D genotype matrix with one row per variant and one column per site.
phenos_G189E = np.array(phenos_G189E)

genos_G189E = np.array(geno_vectors_G189E, dtype=float).reshape(
    len(phenos_G189E), len(geno_vectors_G189E[0])
)

# 'stat' encoding: shift and scale every entry (0 -> -1, 1 -> +1).
if ep_type == 'stat':
    genos_G189E = (genos_G189E - 0.5) * 2


# Validation settings: number of random train/test repetitions and the
# highest interaction order to fit (orders 0..max_order inclusive).
num_folds = 8
max_order = 7

# fraction of the variants held out for testing in each repetition
prop_test = 0.1

size_test_G189E = int(prop_test * genos_G189E.shape[0])
size_train_G189E = genos_G189E.shape[0] - size_test_G189E

# R^2 results, indexed as [repetition, order]
rsq_train_list_G189E = np.zeros((num_folds, max_order+1))
rsq_test_list_G189E = np.zeros_like(rsq_train_list_G189E)



# Repeated random hold-out validation.
# Each "fold" draws size_test_G189E row indices without replacement as the
# test set and trains on all remaining rows. The draws are independent, so
# test sets of different folds may overlap (this is not a k-fold partition).

# Seed the stdlib RNG so the splits, and therefore the R^2 tables, are
# reproducible across kernel restarts. The original draw was unseeded; the
# value matches the seed the final-fit cell passes to np.random.seed.
random.seed(2112)

for f in tqdm(range(num_folds)):
    # randomly select the rows held out for testing in this fold
    indices_permuted_G189E = random.sample(range(0,len(genos_G189E)), size_test_G189E)

    # np.delete and list-based indexing both return new arrays, so no extra
    # .copy() is needed to protect the full data set.
    genos_train_G189E = np.delete(genos_G189E, indices_permuted_G189E, 0)
    genos_test_G189E = genos_G189E[indices_permuted_G189E]
    phenos_train_G189E = np.delete(phenos_G189E, indices_permuted_G189E, 0)
    phenos_test_G189E = phenos_G189E[indices_permuted_G189E]

    # fit models of increasing order
    for order in range(0,max_order+1):
        reg_G189E_current = linear_model.Ridge(alpha=0.01, solver='lsqr', fit_intercept=False)
        # interaction_only=True keeps products of distinct sites only; the
        # bias column generated here plays the role of the intercept, which
        # is why the regression itself uses fit_intercept=False.
        poly_G189E_current = PolynomialFeatures(order,interaction_only=True)
        genos_train_G189E_current = poly_G189E_current.fit_transform(genos_train_G189E)
        # The expansion is fitted on the training rows only; held-out rows
        # are just transformed.
        genos_test_G189E_current = poly_G189E_current.transform(genos_test_G189E)
        reg_G189E_current.fit(genos_train_G189E_current, phenos_train_G189E)
        reg_G189E_coefs_current  = reg_G189E_current.coef_

        # R^2 = 1 - SS_residual / SS_total, each computed against the mean of
        # the corresponding (train or test) phenotype subset.
        rsquared_train_G189E_current = 1-np.sum((phenos_train_G189E-reg_G189E_current.predict(genos_train_G189E_current))**2)/np.sum((phenos_train_G189E-np.mean(phenos_train_G189E))**2)
        rsquared_test_G189E_current = 1-np.sum((phenos_test_G189E-reg_G189E_current.predict(genos_test_G189E_current))**2)/np.sum((phenos_test_G189E-np.mean(phenos_test_G189E))**2)
        rsq_train_list_G189E[f, order] = rsquared_train_G189E_current
        rsq_test_list_G189E[f, order] = rsquared_test_G189E_current

    # Drop this fold's temporaries before the next split is built. The
    # expanded design matrices are by far the largest objects here, so they
    # are released as well.
    del reg_G189E_current
    del indices_permuted_G189E
    del genos_train_G189E
    del genos_test_G189E
    del genos_train_G189E_current
    del genos_test_G189E_current
    del phenos_train_G189E
    del phenos_test_G189E
    del reg_G189E_coefs_current
    del poly_G189E_current
    gc.collect()
        


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [5]:
import pandas as pd
# Flatten the [fold, order] R^2 grids into long format, one record per
# (fold, order) pair, and save them. The file name carries ep_type so runs
# with different encodings do not overwrite each other.
lst = [
    (f, o, rsq_train_list_G189E[f, o], rsq_test_list_G189E[f, o])
    for f in range(num_folds)
    for o in range(0, max_order+1)
]
df = pd.DataFrame(lst, columns=["fold_nb", "order", "train", "test"])
df.to_csv(f"r2_CV_{ep_type}_G189E.csv", index=False)

In [6]:
# Mean train/test R^2 over the folds, one row per interaction order.
df.groupby("order")[["train", "test"]].mean()

Unnamed: 0_level_0,train,test
order,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-1.48806e-12,-0.00024
1,0.7373416,0.735049
2,0.9158432,0.915404
3,0.9525397,0.952038
4,0.9704642,0.968379
5,0.9820787,0.977268
6,0.9882465,0.978295
7,0.9927836,0.973885


In [7]:
# Fold-to-fold standard deviation of train/test R^2, one row per order.
df.groupby("order")[["train", "test"]].std()

Unnamed: 0_level_0,train,test
order,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2.84588e-15,0.000288
1,0.0004263578,0.003907
2,0.0001616847,0.001479
3,0.000169304,0.00155
4,0.0001587945,0.001474
5,0.0001097254,0.001037
6,8.695884e-05,0.000986
7,6.894486e-05,0.00108


In [None]:
import numpy as np
import csv
# import sys
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm

# Choose which epistasis encoding is used for the final fits.
# With 'stat' the genotype matrix is recoded below as 2*(g - 0.5)
# (0 -> -1, 1 -> +1); with any other value it is left as read from the file.
# ep_type = 'biochem' 
ep_type = 'stat'

# Input table and the 0-based CSV column holding the G189E phenotype.
# NOTE(review): column 11 is presumably the G189E measurement -- confirm
# against the CSV header row.
KD_CSV_PATH = '../../Kd_Inference/results_CH65/Kd_processed/20221008_CH65_QCfilt_REPfilt.csv'
PHENO_COLUMN_G189E = 11

# read in data
geno_vectors_G189E = []   # one float vector per variant that has a phenotype
phenos_G189E = []         # matching phenotype values, same order

# Site labels handed to PolynomialFeatures when coefficient names are built.
mutations_G189E = [str(x) for x in range(1,17)]

# newline='' is what the csv module asks for on file objects it reads.
# The `with` statement closes the file, so no explicit close() is needed.
with open(KD_CSV_PATH, 'r', newline='') as readfile:
    kd_reader = csv.reader(readfile)
    header = next(kd_reader)
    for row in kd_reader:
        # Column 0 is a string with one character per site; each character
        # becomes one float entry of the genotype vector.
        geno = row[0]
        geno_vec = np.array([float(x) for x in geno])

        pheno_G189E = row[PHENO_COLUMN_G189E]

        # Rows with an empty phenotype field are skipped.
        if len(pheno_G189E) != 0:
            geno_vectors_G189E.append(geno_vec)
            phenos_G189E.append(float(pheno_G189E))

phenos_G189E = np.array(phenos_G189E)

# Stack the per-variant vectors into one (variants x sites) float matrix in a
# single call instead of copying row by row; the reshape pins the expected
# shape so a malformed row cannot go unnoticed.
genos_G189E = np.array(geno_vectors_G189E, dtype=float).reshape(
    len(phenos_G189E), len(geno_vectors_G189E[0])
)

if ep_type == 'stat':
    genos_G189E = 2*(genos_G189E-0.5)

# print(genos_G189E.shape,phenos_G189E.shape)


# # Fit final models
#
# For every interaction order from 1 up to optimal_G189E_order, fit an OLS
# model on the full (row-shuffled) data set and write its coefficients,
# standard errors, p-values and confidence intervals to a tab-separated file.
#
# NOTE(review): `optimal_G189E_order` is never assigned in the visible part
# of this notebook, so this cell raises NameError on a fresh kernel unless it
# is defined elsewhere. Presumably it should be chosen from the
# cross-validation R^2 table -- confirm and define it before running.
# NOTE(review): the output directory is always 'statistical/' even though the
# file name embeds ep_type -- confirm this is intended for 'biochem' runs.

np.random.seed(2112)
indices_permuted_G189E = np.random.permutation(np.arange(len(genos_G189E)))

# The shuffled data do not depend on the model order, so build them once
# instead of on every loop iteration.
genos_G189E_permuted = genos_G189E[indices_permuted_G189E]
phenos_G189E_permuted = phenos_G189E[indices_permuted_G189E]

# fit models of increasing order
for order in range(1,optimal_G189E_order+1):
    poly_G189E_current = PolynomialFeatures(order,interaction_only=True)
    genos_G189E_current = poly_G189E_current.fit_transform(genos_G189E_permuted)

    # fit
    reg_G189E_current = sm.OLS(phenos_G189E_permuted,genos_G189E_current).fit()
    reg_G189E_coefs_current = reg_G189E_current.params

    # Bonferroni correction: the 0.05 level is divided by the number of
    # fitted parameters. It is used both for the confidence intervals and
    # for counting significant terms.
    alpha_bonferroni_G189E = 0.05/float(len(reg_G189E_coefs_current))
    reg_G189E_CIs_current = reg_G189E_current.conf_int(alpha=alpha_bonferroni_G189E, cols=None)
    reg_G189E_stderr = reg_G189E_current.bse
    reg_G189E_pvalues = reg_G189E_current.pvalues

    num_sig = len(np.where(reg_G189E_pvalues < alpha_bonferroni_G189E)[0])

    predicted_phenos_permuted_G189E = reg_G189E_current.predict(genos_G189E_current)
    rsquared_G189E_current = reg_G189E_current.rsquared

    # write model to file
    # get_feature_names() was removed in scikit-learn 1.2; use its
    # replacement when available and fall back on older installations.
    if hasattr(poly_G189E_current, 'get_feature_names_out'):
        coef_names = poly_G189E_current.get_feature_names_out(input_features = mutations_G189E)
    else:
        coef_names = poly_G189E_current.get_feature_names(input_features = mutations_G189E)

    # newline='' lets the csv writer control line endings itself; the `with`
    # statement closes the file, so no explicit close() is needed.
    with open('statistical/CH65_G189E_newdata_'+str(order)+'order_'+ep_type+'.txt','w', newline='') as writefile:
        coef_writer = csv.writer(writefile,delimiter='\t')
        coef_writer.writerow(['Params: ',len(reg_G189E_coefs_current)])
        coef_writer.writerow(['Performance: ',rsquared_G189E_current])
        # The CI columns hold the Bonferroni-corrected intervals computed
        # above, not per-coefficient 95% intervals.
        coef_writer.writerow(['Term','Coefficient','Standard Error','p-value','95% CI lower','95% CI upper'])
        # The intercept row carries only the coefficient (two fields), as in
        # the original file layout.
        coef_writer.writerow(['Intercept',reg_G189E_coefs_current[0]])
        for i in range(1,len(reg_G189E_coefs_current)):
            # Feature names join the participating sites with spaces; they
            # are written comma-separated instead.
            coef_writer.writerow([','.join(coef_names[i].split(' ')),reg_G189E_coefs_current[i],reg_G189E_stderr[i],
                                  reg_G189E_pvalues[i],reg_G189E_CIs_current[i][0],reg_G189E_CIs_current[i][1]])




