In [15]:
import numpy as np
import csv
# import sys
import random
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm
from sklearn import linear_model
import matplotlib.pyplot as plt 
import gc
from tqdm.notebook import tqdm

# choose statistical or biochemical epistasis
ep_type = 'biochem' 
# ep_type = 'stat'

# read in data
geno_vectors_SI06 = []
phenos_SI06 = []

mutations_H1 = [str(x) for x in range(1,17)]


with open('../../Kd_Inference/results_CH65/Kd_processed/20221008_CH65_QCfilt_REPfilt.csv','r') as readfile:
    kd_reader = csv.reader(readfile)
    header = next(kd_reader)
    for row in kd_reader:
        geno = row[0]
        
        geno_vec = np.array([float(x) for ii, x in enumerate(geno) if ii != 10])

        pheno_SI06 = row[7] # row for SI06
        
        if len(pheno_SI06) != 0 and row[23] == '1':  
            geno_vectors_SI06.append(geno_vec)
            phenos_SI06.append(float(pheno_SI06))

    readfile.close()



In [16]:
phenos_SI06 = np.array(phenos_SI06)

genos_SI06 = np.empty((len(phenos_SI06),len(geno_vectors_SI06[0])))
for i in range(len(phenos_SI06)):
    genos_SI06[i] = geno_vectors_SI06[i][:]
    
if ep_type == 'stat':
    genos_SI06 = 2*(genos_SI06-0.5)  

num_folds = 8
max_order = 7

# proportion of data to be tested 
prop_test = 0.1

size_test_SI06 = int(prop_test*len(genos_SI06))
size_train_SI06 = len(genos_SI06)-size_test_SI06

# lists to store r squared values
rsq_train_list_SI06 = np.zeros((num_folds, max_order+1))
rsq_test_list_SI06 = np.zeros((num_folds, max_order+1))



# loop over CV folds
for f in tqdm(range(num_folds)):
    #randomly selects 
    indices_permuted_SI06 = random.sample(range(0,len(genos_SI06)), size_test_SI06)

    genos_train_SI06 = np.delete(genos_SI06.copy(), indices_permuted_SI06, 0)
    genos_test_SI06 = genos_SI06[indices_permuted_SI06].copy()
    phenos_train_SI06 = np.delete(phenos_SI06, indices_permuted_SI06, 0)
    phenos_test_SI06 = phenos_SI06[indices_permuted_SI06].copy()

    # fit models of increasing order
    for order in range(0,max_order+1):
        reg_SI06_current = linear_model.Ridge(alpha=0.0001, solver='lsqr', fit_intercept=False)
        poly_SI06_current = PolynomialFeatures(order,interaction_only=True)
        genos_train_SI06_current = poly_SI06_current.fit_transform(genos_train_SI06)
        genos_test_SI06_current = poly_SI06_current.fit_transform(genos_test_SI06)
        reg_SI06_current.fit(genos_train_SI06_current, phenos_train_SI06)
        reg_SI06_coefs_current  = reg_SI06_current.coef_

        #reg_SI06_current_predict = reg_SI06_coefs_current
        rsquared_train_SI06_current = 1-np.sum((phenos_train_SI06-reg_SI06_current.predict(genos_train_SI06_current))**2)/np.sum((phenos_train_SI06-np.mean(phenos_train_SI06))**2)
        rsquared_test_SI06_current = 1-np.sum((phenos_test_SI06-reg_SI06_current.predict(genos_test_SI06_current))**2)/np.sum((phenos_test_SI06-np.mean(phenos_test_SI06))**2)
        rsq_train_list_SI06[f, order] = rsquared_train_SI06_current
        rsq_test_list_SI06[f, order] = rsquared_test_SI06_current
        
        #print(rsquared_train_SI06_current)
        #print(rsquared_test_SI06_current)              
    del reg_SI06_current
    del indices_permuted_SI06
    del genos_train_SI06
    del genos_test_SI06
    del phenos_train_SI06
    del phenos_test_SI06
    del reg_SI06_coefs_current
    del poly_SI06_current
    gc.collect()
        


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [22]:
import pandas as pd
lst = []
num_folds = 8
df = pd.DataFrame()
for f in range(num_folds):
    for o in range(0,max_order+1):
        lst += [(f, o, rsq_train_list_SI06[f, o], rsq_test_list_SI06[f,o])]
df = pd.DataFrame(lst, columns=["fold_nb", "order", "train", "test"])
df.to_csv(f"r2_CV_{ep_type}_SI06.csv", index=False)

In [25]:
df = pd.read_csv(f"r2_CV_biochem_SI06.csv")
df.groupby("order").agg({"train":"mean", "test":"mean"})

Unnamed: 0_level_0,train,test
order,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-8.881784e-16,-0.000553
1,0.6367044,0.628192
2,0.8788008,0.873885
3,0.9311445,0.92568
4,0.9497707,0.939864
5,0.96064,0.944228
6,0.9641763,0.942972
7,0.9661865,0.941879


In [13]:
import numpy as np
import csv
# import sys
from sklearn.preprocessing import PolynomialFeatures
import statsmodels.api as sm

# choose statistical or biochemical epistasis
ep_type = 'biochem' 
# ep_type = 'stat'

# read in data
geno_vectors_SI06 = []
phenos_SI06 = []

mutations_SI06 = [str(x) for x in range(1,17)]

with open('../../Kd_Inference/results_CH65/Kd_processed/20221008_CH65_QCfilt_REPfilt.csv','r') as readfile:
    kd_reader = csv.reader(readfile)
    header = next(kd_reader)
    for row in kd_reader:
        geno = row[0]
        
        geno_vec = np.array([float(x) for ii, x in enumerate(geno) if ii != 10])

        pheno_SI06 = row[7]
        
            
        if len(pheno_SI06) != 0 and row[23] == '1':  
            geno_vectors_SI06.append(geno_vec)
            phenos_SI06.append(float(pheno_SI06))
    readfile.close()

phenos_SI06 = np.array(phenos_SI06)

genos_SI06 = np.empty((len(phenos_SI06),len(geno_vectors_SI06[0])))
for i in range(len(phenos_SI06)):
    genos_SI06[i] = geno_vectors_SI06[i][:]
    


if ep_type == 'stat':
    genos_SI06 = 2*(genos_SI06-0.5)

# print(genos_SI06.shape,phenos_SI06.shape)

optimal_SI06_order = 5
# # Fit final models



In [14]:
np.random.seed(2112)
indices_permuted_SI06 = np.random.permutation(np.arange(len(genos_SI06)))

# fit models of increasing order
for order in range(1,optimal_SI06_order+1):
# for order in range(1,2):
    # print(order)
    genos_SI06_permuted = genos_SI06[indices_permuted_SI06]
    # small noise to avoid numeric issues
    genos_SI06_permuted += 0.0001 * np.random.normal(size=genos_SI06_permuted.shape)
    phenos_SI06_permuted = phenos_SI06[indices_permuted_SI06]
    # print('Order: ',str(order),file=sys.stdout,flush=True)
    poly_SI06_current = PolynomialFeatures(order,interaction_only=True)
    genos_SI06_current = poly_SI06_current.fit_transform(genos_SI06_permuted)

    # fit
    reg_SI06_current = sm.OLS(phenos_SI06_permuted,genos_SI06_current).fit()
    reg_SI06_coefs_current = reg_SI06_current.params
    reg_SI06_CIs_current = reg_SI06_current.conf_int(alpha=0.05/float(len(reg_SI06_coefs_current)), cols=None)
    reg_SI06_stderr = reg_SI06_current.bse
    reg_SI06_pvalues = reg_SI06_current.pvalues
    
    num_sig = len(np.where(reg_SI06_pvalues < 0.05/float(len(reg_SI06_coefs_current)))[0])

    predicted_phenos_permuted_SI06 = reg_SI06_current.predict(genos_SI06_current)
    rsquared_SI06_current = reg_SI06_current.rsquared
    # print('Params: ',len(reg_SI06_coefs_current),file=sys.stdout,flush=True)
    # print('Performance: ',rsquared_SI06_current,file=sys.stdout,flush=True)
    # print(num_sig,file=sys.stdout,flush=True)
	 

    # write model to file
    coef_names = poly_SI06_current.get_feature_names(input_features = mutations_SI06)
    with open('statistical/CH65_SI06_newdata_'+str(order)+'order_'+ep_type+'.txt','w') as writefile:
        coef_writer = csv.writer(writefile,delimiter='\t')
        coef_writer.writerow(['Params: ',len(reg_SI06_coefs_current)])
        coef_writer.writerow(['Performance: ',rsquared_SI06_current])
        coef_writer.writerow(['Term','Coefficient','Standard Error','p-value','95% CI lower','95% CI upper'])
        coef_writer.writerow(['Intercept',reg_SI06_coefs_current[0]])
        for i in range(1,len(reg_SI06_coefs_current)):
            coef_writer.writerow([','.join(coef_names[i].split(' ')),reg_SI06_coefs_current[i],reg_SI06_stderr[i],
                                  reg_SI06_pvalues[i],reg_SI06_CIs_current[i][0],reg_SI06_CIs_current[i][1]])
        writefile.close()






In [None]:
phenos_SI06

In [16]:
len(phenos_SI06)

31924

In [17]:
len(genos)

NameError: name 'genos' is not defined

In [18]:
len(geno_vectors_SI06)

31924

In [23]:
!wc -l  '../../Kd_Inference/results_CH65/Kd_processed/20221008_CH65_QCfilt_REPfilt.csv'

65536 ../../Kd_Inference/results_CH65/Kd_processed/20221008_CH65_QCfilt_REPfilt.csv


In [None]:
break