In [1]:
%matplotlib inline
import pylab as pl
import numpy as np
import pandas as pd

#Read data

X = []
samples_x = []

f = open("genotypes.csv","r")

for i,line in enumerate(f):
    if(i == 0):
       continue
    
    sv = line.strip().split(",")
    samples_x.append(sv[0])
    
    row = []
    for element in sv[1:]:
        row.append(element)
    X.append(row)
    
f.close()

X = np.array(X)
samples_x = np.array(samples_x)

print(samples_x)
print(X)


['9597' '9143' '8967' ... '1954' '9707' '6120']
[['G' 'G' 'G' ... '?' 'T' 'A']
 ['G' 'G' 'G' ... 'A' 'T' 'A']
 ['A' 'G' 'A' ... 'A' '?' 'A']
 ...
 ['G' 'A' 'G' ... 'T' 'T' 'A']
 ['G' 'G' 'A' ... 'A' 'T' 'A']
 ['G' 'G' 'G' ... 'A' 'T' 'C']]


In [2]:
#Replace "?" with np.nan

X = np.where(X == "?", np.nan, X)

print(X)

[['G' 'G' 'G' ... 'nan' 'T' 'A']
 ['G' 'G' 'G' ... 'A' 'T' 'A']
 ['A' 'G' 'A' ... 'A' 'nan' 'A']
 ...
 ['G' 'A' 'G' ... 'T' 'T' 'A']
 ['G' 'G' 'A' ... 'A' 'T' 'A']
 ['G' 'G' 'G' ... 'A' 'T' 'C']]


In [3]:
#Encode nucleotide values

for j in range (X.shape[1]):
    
    values, counts = np.unique(X[:,j], return_counts=True)
    values=values[:2]
    counts=counts[:2]

    #print(values[np.argmax(counts)])
    #print(values[np.argmin(counts)])

    X[:,j] = np.where(X[:,j] == values[np.argmax(counts)], 0.0, X[:,j])
    X[:,j] = np.where(X[:,j] == values[np.argmin(counts)], 2.0, X[:,j])

#Convert array from string to float
X = X.astype(np.float_)

print(X)
    

[[ 0.  0.  0. ... nan  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 2.  0.  2. ...  0. nan  0.]
 ...
 [ 0.  2.  0. ...  2.  0.  0.]
 [ 0.  0.  2. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  2.]]


In [4]:
# General analysis data:

print("No Samples:\t\t %d" % X.shape[0])
print("No Features:\t\t %d" % X.shape[1])

missing = ((np.isnan(X)).sum()/X.size*100)
print("Missing Values:\t\t %.2f%%" % missing)

No Samples:		 1826
No Features:		 5000
Missing Values:		 10.00%


In [5]:
# Assess missing values per sample


sample_indices_to_remove = []
for i in range(X.shape[0]):
    missing = np.isnan(X[i,:]).sum()/X.shape[1]*100
    if missing>50:
        print("Sample %d has %.2f missing values" % (i,missing))
        sample_indices_to_remove.append(i)

if (len(sample_indices_to_remove) == 0):
    print("All samples have less than 50 % missing values")

All samples have less than 50 % missing values


In [7]:
# Read phenotype values

y = []
samples_y = []

f = open("phenotype_values.csv","r")

for i,line in enumerate(f):
    if(i == 0):
       continue
    sv = line.strip().split(",")
    samples_y.append(sv[0])
    y.append(float(sv[1]))
    
f.close()

y = np.array(y)
samples_y = np.array(samples_y)

print (samples_y)
print (y)

['9608' '9801' '9647' ... '9894' '6200' '9917']
[53.43 58.81 53.57 ... 53.68 54.62 53.35]


In [8]:
# Match genotypes with phenotype values
truth_table = (samples_x[:,np.newaxis]==samples_y)
ind = np.where(truth_table==True)

samples_x = samples_x[ind[0]]
samples_y = samples_y[ind[1]]

X = X[ind[0],:]
y = y[ind[1]]

print("No Samples X:\t\t%d" % X.shape[0])
print("No Samples y:\t\t%d" % y.shape[0])

print("Samples correct order:\t" + str(np.any(samples_x==samples_y)))


No Samples X:		1826
No Samples y:		1826
Samples correct order:	True


In [9]:
# Split into train data and test data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=73)

print("Full Data:\t" + str(X.shape))
print("Train Data:\t" + str(X_train.shape))
print("Test Data:\t" + str(X_test.shape))



Full Data:	(1826, 5000)
Train Data:	(1460, 5000)
Test Data:	(366, 5000)


In [10]:
# Impute missing values on training data

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")
X_train = imputer.fit_transform(X_train) 
X_test = imputer.fit_transform(X_test) 

print(X_train)


[[0. 0. 0. ... 0. 2. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [2. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 2. ... 0. 0. 2.]
 [0. 0. 0. ... 0. 0. 2.]]


In [11]:
# Linear Regression

import sklearn.metrics as metrics
from sklearn import linear_model


model1 = linear_model.LinearRegression()
model1.fit(X_train, y_train)
predictions_training = model1.predict(X_train)
predictions_testing = model1.predict(X_test)

print("MSE (Train):\t%.2f" % metrics.mean_squared_error(y_train, predictions_training))
print("R2 (Train):\t%.2f" % metrics.r2_score(y_train, predictions_training))
print("MSE (Test):\t%.2f" % metrics.mean_squared_error(y_test, predictions_testing))
print("R2 (Test):\t%.2f" % metrics.r2_score(y_test, predictions_testing))

MSE (Train):	0.00
R2 (Train):	1.00
MSE (Test):	4.50
R2 (Test):	0.55


In [12]:
from sklearn import linear_model
model2=linear_model.Ridge(alpha=5000)
model2.fit(X_train, y_train)
predictions_training = model2.predict(X_train)
predictions_testing = model2.predict(X_test)

print("MSE (Train):\t%.2f" % metrics.mean_squared_error(y_train, predictions_training))
print("R2 (Train):\t%.2f" % metrics.r2_score(y_train, predictions_training))
print("MSE (Test):\t%.2f" % metrics.mean_squared_error(y_test, predictions_testing))
print("R2 (Test):\t%.2f" % metrics.r2_score(y_test, predictions_testing))

MSE (Train):	2.79
R2 (Train):	0.73
MSE (Test):	4.63
R2 (Test):	0.54


In [52]:
from sklearn import linear_model
model3=linear_model.ElasticNet(alpha=0.13135893963030046,l1_ratio=0.09,max_iter=7000)
model3.fit(X_train, y_train)
predictions_training = model3.predict(X_train)
predictions_testing = model3.predict(X_test)

print("MSE (Train):\t%.2f" % metrics.mean_squared_error(y_train, predictions_training))
print("R2 (Train):\t%.2f" % metrics.r2_score(y_train, predictions_training))
print("MAE (Train):\t %.2f" %metrics.mean_absolute_error(y_train, predictions_training))
print("MSE (Test):\t%.2f" % metrics.mean_squared_error(y_test, predictions_testing))
print("R2 (Test):\t%.2f" % metrics.r2_score(y_test, predictions_testing))
print("MAE (Test):\t %.2f" %metrics.mean_absolute_error(y_test, predictions_testing))

MSE (Train):	0.65
R2 (Train):	0.94
MAE (Train):	 0.63
MSE (Test):	3.78
R2 (Test):	0.62
MAE (Test):	 1.54


In [49]:
#ELASTIC NET WITH CROSS VALIDATION
from sklearn import linear_model
l1_ratio_list=np.arange(0.01,0.1,0.01)
model4=linear_model.ElasticNetCV(cv=5,l1_ratio=l1_ratio_list,max_iter=5000,random_state=73)
model4.fit(X_train, y_train)
predictions_training = model4.predict(X_train)
predictions_testing = model4.predict(X_test)

print("MSE (Train):\t%.2f" % metrics.mean_squared_error(y_train, predictions_training))
print("R2 (Train):\t%.2f" % metrics.r2_score(y_train, predictions_training))
print("MSE (Test):\t%.2f" % metrics.mean_squared_error(y_test, predictions_testing))
print("R2 (Test):\t%.2f" % metrics.r2_score(y_test, predictions_testing))


MSE (Train):	0.65
R2 (Train):	0.94
MSE (Test):	3.78
R2 (Test):	0.62


  positive)


In [50]:
print(model4.alpha_, model4.l1_ratio_, model4.coef_,model4.intercept_)


0.13135893963030046 0.09 [ 0.00000000e+00 -1.96800824e-02  3.10391780e-03 -0.00000000e+00
  3.50625518e-02 -0.00000000e+00  2.95573053e-02  2.52422158e-02
  0.00000000e+00 -6.09974299e-03 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  2.38444955e-02 -0.00000000e+00  1.27350776e-02
 -4.58944540e-03  1.71756911e-02 -3.45962960e-02  1.77333474e-02
 -0.00000000e+00  1.89492038e-02  0.00000000e+00 -0.00000000e+00
  3.39287454e-03  3.14433223e-03 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  3.71415452e-02  9.23278437e-02
  0.00000000e+00  1.01567566e-01 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  1.73721087e-02
 -6.04289066e-02  7.95711242e-03 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -3.93477383e-03


In [51]:
np.set_printoptions(edgeitems=127)
#print(model1.intercept_,model1.coef_)
#print(model2.intercept_,model2.coef_)
#print(model3.intercept_,model3.coef_[model3.coef_!=0.].size,model3.coef_)
print(model4.intercept_,model4.coef_[model3.coef_!=0.].size,model4.coef_)

51.94138084221841 1498 [ 0.00000000e+00 -1.96800824e-02  3.10391780e-03 -0.00000000e+00
  3.50625518e-02 -0.00000000e+00  2.95573053e-02  2.52422158e-02
  0.00000000e+00 -6.09974299e-03 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00  0.00000000e+00  0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  0.00000000e+00  0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  2.38444955e-02 -0.00000000e+00  1.27350776e-02
 -4.58944540e-03  1.71756911e-02 -3.45962960e-02  1.77333474e-02
 -0.00000000e+00  1.89492038e-02  0.00000000e+00 -0.00000000e+00
  3.39287454e-03  3.14433223e-03 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  3.71415452e-02  9.23278437e-02
  0.00000000e+00  1.01567566e-01 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00  1.73721087e-02
 -6.04289066e-02  7.95711242e-03 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00  0.00000000e+00 -3.93477383e-03
  