In [2]:
import sys
sys.path.append('code/transethnic_prs-main/')
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import transethnic_prs.model1.Model1Blk as model1blk
from scipy import optimize
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [3]:
gene = 'ENSG00000075234.12'
gene_name = 'TTC38'

pheno_total = pd.read_csv('data/GD462.GeneQuantRPKM.50FN.samplename.resk10.txt', sep = '\t', index_col = 'TargetID')
target_pheno_total = pheno_total.loc[gene]

eur_sample = pd.read_csv('data/clean/'+gene_name+'_genotype/eur_'+gene_name+'_genotype.012.indv', sep = '\t', header = None)
afr_sample = pd.read_csv('data/clean/'+gene_name+'_genotype/afr_'+gene_name+'_genotype.012.indv', sep = '\t',header = None)

target_pheno_eur = pd.merge(target_pheno_total, eur_sample, left_index = True, right_on = 0)
target_pheno_afr = pd.merge(target_pheno_total, afr_sample, left_index = True, right_on = 0)

eur_genotype = pd.read_csv('data/clean/'+gene_name+'_genotype/eur_'+gene_name+'_genotype.012', sep = '\t', header = None, index_col = 0)
afr_genotype = pd.read_csv('data/clean/'+gene_name+'_genotype/afr_'+gene_name+'_genotype.012', sep = '\t', header = None, index_col = 0)

pa = target_pheno_afr.set_index(0)
pe = target_pheno_eur.set_index(0)

#make sure the genotype matrix's order is the same as that of phenotype vector
sorted_eur_pheno = pd.merge(eur_sample, pe, left_on = 0 , right_index = True, how = 'left')
sorted_afr_pheno = pd.merge(afr_sample, pa, left_on = 0 , right_index = True, how = 'left')

In [4]:
#original matrix(before standardization)
X1o = np.array(eur_genotype,dtype = np.float64,order = 'C')
X2o = np.array(afr_genotype,dtype = np.float64,order = 'C')
y1o = np.array(sorted_eur_pheno[gene],dtype = np.float64,order = 'C')
y2o = np.array(sorted_afr_pheno[gene],dtype = np.float64,order = 'C')

In [5]:
def SNP_var_check(X):
    col_valid = []
    count = 0
    for col,x in enumerate(np.std(X, axis = 0)):
        if x==0:
            count +=1
        else:
            col_valid.append(col)
    return col_valid, count

In [6]:
def standardization(x):
    x=np.array(x,dtype = np.float64,order = 'C')
    x_center = x - np.mean(x,axis = 0)
    return x_center/np.std(x, axis = 0)

In [7]:
X2o.shape

(89, 284)

In [8]:
col_valid1,count1=SNP_var_check(X2o)
X2o=X2o[:,col_valid1]
X2o.shape

(89, 226)

In [9]:
X_train,X_test,y_train,y_test = train_test_split(X2o,y2o,test_size = 0.2, random_state = 9,shuffle = False)
print(X_train.shape,X_test.shape)

(71, 226) (18, 226)


In [10]:
col_valid2,count2 = SNP_var_check(X_test)

In [11]:
kf = KFold(n_splits=5)#without shuffling, the random state is immutable

In [12]:
i = 1
for train_index, test_index in kf.split(X_train):
    train_index = list(train_index)
    test_index = list(test_index)
    X_to,X_vo = X_train[train_index,:],X_train[test_index,:]
    y_to, y_vo = y_train[train_index],y_train[test_index]
    col_valid_train,count_train = SNP_var_check(X_to)
    col_valid_valid,count_valid = SNP_var_check(X_vo)
    print("count_train%s:" % i,count_train,"count_validation%s:" % i,count_valid)
    col_valid3 = list(set(col_valid_train).intersection(set(col_valid_valid)))
    globals()['col_set%s' % i] = col_valid3
    i+=1

valid_col_trainset = list(set(col_set1).intersection(col_set2).intersection(col_set3).intersection(col_set4).intersection(col_set5))

count_train1: 19 count_validation1: 20
count_train2: 18 count_validation2: 21
count_train3: 18 count_validation3: 37
count_train4: 18 count_validation4: 21
count_train5: 17 count_validation5: 27


In [13]:
col_valid_final = list(set(col_valid1).intersection(col_valid2).intersection(valid_col_trainset))
len(col_valid_final)

139

In [14]:
X1o = X1o[:,col_valid_final]
X1,y1 = standardization(X1o), standardization(y1o)
A1 = X1.T @ X1
b1 = X1.T @ y1

In [15]:
X_test = X_test[:,col_valid_final]
X_test.shape

(18, 139)

In [16]:
X_test_std = standardization(X_test)
y_test_std = standardization(y_test)

In [17]:
Max1 = 0
for train_index, test_index in kf.split(X_train):
    X_to, X_vo = X_train[:,col_valid_final][train_index,:], X_train[:,col_valid_final][test_index,:]
    y_to, y_vo = y_train[train_index], y_train[test_index]
    X_t_std, X_v_std, y_t_std, y_v_std = standardization(X_to), standardization(X_vo), standardization(y_to), standardization(y_vo)
    mod = model1blk.Model1Blk([A1],[b1],[X_t_std],y_t_std)
    beta_mat_en, lambda_seq_en, niters_en, tols_en, convs_en = mod.solve_path(alpha=0.1)
    Max2 = 0
    for i in range(100):
        beta_hat = beta_mat_en[:,i]
        y_v_hat = X_v_std @ beta_hat
        correlation_matrix = np.corrcoef(y_v_hat, y_v_std)
        correlation_yvhat_yv = correlation_matrix[0,1]
        r2 = correlation_yvhat_yv**2
        if r2>Max2:
            Max2=r2
            idx = i
    print(Max2)
    if Max2>Max1:
        Max1=Max2
        lam=lambda_seq_en[idx]
        beta_h=beta_mat_en[:,idx]

print(Max1,idx,lam,beta_h)

  c /= stddev[:, None]
  c /= stddev[None, :]


0.7703807191527408
0.45845245317436406
0.8469820247785971
0.8312186669562153
0.8629786235569479
0.8629786235569479 1 3286.779882122841 [-0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.          0.00170769
  0.00170665 -0.          0.0017051  -0.         -0.          0.00216092
 -0.

In [62]:
y_hat_test= X_test_std @ beta_h
y_hat_test = (y_hat_test-y_hat_test.mean())/y_hat_test.std()
correlation_test_matrix = np.corrcoef(y_hat_test, y_test_std)
correlation_test = correlation_test_matrix[0,1]
r2 = correlation_test**2


m, b = np.polyfit(y_hat_test,y_test_std, 1)
print(y_hat_test.std())
print(correlation_test,r2,m)
y_test_std.std()

0.9999999999999998
0.7259192145055929 0.5269587059884171 0.7259192145055935


0.9999999999999999

In [58]:
from scipy.stats import pearsonr
r,_ = pearsonr(y_hat_test,y_test_std)
np.corrcoef(y_hat_test,y_test_std)

array([[1.        , 0.72591921],
       [0.72591921, 1.        ]])

In [63]:
from scipy.stats import spearmanr
sp_r, _ = spearmanr(y_hat_test,y_test_std)
sp_r

0.6838825616055229

In [59]:
A = np.vstack([y_test_std, np.ones(len(y_test_std))]).T
m, c = np.linalg.lstsq(A, y_hat_test, rcond=None)[0]
m

0.007529084129845443