In [26]:
%matplotlib inline
import pylab as pl
import numpy as np
import pandas as pd

# Read genotype data.
# For every non-header row of genotypes.csv the first field is stored in
# samples_x and the remaining fields become one row of X (kept as strings).

X = []
samples_x = []

# The context manager closes the file even if parsing a row raises.
with open("genotypes.csv", "r") as f:
    next(f, None)  # skip the header row (no error on an empty file)
    for line in f:
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line would otherwise add an empty row
            # and make X ragged.
            continue

        sv = stripped.split(",")
        samples_x.append(sv[0])
        X.append(sv[1:])

X = np.array(X)
samples_x = np.array(samples_x)

print(samples_x)
print(X)


['9597' '9143' '8967' ... '1954' '9707' '6120']
[['G' 'G' 'G' ... '?' 'T' 'A']
 ['G' 'G' 'G' ... 'A' 'T' 'A']
 ['A' 'G' 'A' ... 'A' '?' 'A']
 ...
 ['G' 'A' 'G' ... 'T' 'T' 'A']
 ['G' 'G' 'A' ... 'A' 'T' 'A']
 ['G' 'G' 'G' ... 'A' 'T' 'C']]


In [27]:
# Swap the "?" placeholder for np.nan.
# X is still a string array at this point, so the entries show up as the
# text 'nan' (see the printed array); they only become real NaNs once the
# array is cast to float.

missing_mask = (X == "?")
X = np.where(missing_mask, np.nan, X)

print(X)

[['G' 'G' 'G' ... 'nan' 'T' 'A']
 ['G' 'G' 'G' ... 'A' 'T' 'A']
 ['A' 'G' 'A' ... 'A' 'nan' 'A']
 ...
 ['G' 'A' 'G' ... 'T' 'T' 'A']
 ['G' 'G' 'A' ... 'A' 'T' 'A']
 ['G' 'G' 'G' ... 'A' 'T' 'C']]


In [28]:
# Encode nucleotide values column by column:
#   most frequent allele -> 0.0, the other allele -> 2.0, missing stays NaN.
# X is expected to be the string array in which missing calls are the text 'nan'.

for j in range(X.shape[1]):

    # Work on a snapshot so comparisons are not affected by the in-place writes.
    column = X[:, j].copy()

    # Count alleles over observed calls only. Otherwise a column with a single
    # allele plus missing calls would treat 'nan' as the second "allele" and
    # encode missing values as 0.0 / 2.0.
    values, counts = np.unique(column[column != "nan"], return_counts=True)
    values = values[0:2]
    counts = counts[0:2]

    if values.size == 0:
        # No observed calls in this column; everything stays 'nan'.
        continue

    major = int(np.argmax(counts))
    X[column == values[major], j] = "0.0"

    if values.size == 2:
        # Pick the other allele explicitly: with tied counts argmax and argmin
        # both return index 0, which would leave the second allele unencoded
        # and make the float conversion below fail.
        X[column == values[1 - major], j] = "2.0"

# Convert array from string to float ('nan' text becomes a real NaN).
# np.float64 is used because the np.float_ alias was removed in NumPy 2.0.
# Any value that was not encoded above (e.g. a third allele) raises ValueError here.
X = X.astype(np.float64)

print(X)
    

[[ 0.  0.  0. ... nan  0.  0.]
 [ 0.  0.  0. ...  0.  0.  0.]
 [ 2.  0.  2. ...  0. nan  0.]
 ...
 [ 0.  2.  0. ...  2.  0.  0.]
 [ 0.  0.  2. ...  0.  0.  0.]
 [ 0.  0.  0. ...  0.  0.  2.]]


In [29]:
# Overview of the encoded genotype matrix: row count, column count and the
# overall share of NaN entries.

n_samples, n_features = X.shape
print("No Samples:\t\t %d" % n_samples)
print("No Features:\t\t %d" % n_features)

# Share of NaN cells across the whole matrix, expressed in percent.
nan_count = np.isnan(X).sum()
missing = nan_count / X.size * 100
print("Missing Values:\t\t %.2f%%" % missing)

No Samples:		 1826
No Features:		 5000
Missing Values:		 10.00%


In [30]:
# Assess missing values per sample.
# Collects the row indices of X whose share of NaN entries exceeds the threshold.
# NOTE(review): the collected indices are not used to drop rows in the cells
# visible here - confirm whether removal is intended.

MAX_MISSING_PERCENT = 50

# Percentage of NaN entries in each row, computed for all rows at once.
missing_per_sample = np.isnan(X).mean(axis=1) * 100

sample_indices_to_remove = np.flatnonzero(missing_per_sample > MAX_MISSING_PERCENT).tolist()

for i in sample_indices_to_remove:
    # The reported number is a percentage, so print it with a percent sign.
    print("Sample %d has %.2f%% missing values" % (i, missing_per_sample[i]))

if not sample_indices_to_remove:
    print("All samples have less than 50 % missing values")

All samples have less than 50 %% missing values


In [31]:
# Read phenotype values.
# For every non-header row of phenotype_values.csv the first field is stored in
# samples_y and the second field is parsed as a float into y.

y = []
samples_y = []

# The context manager closes the file even if float() raises on a bad row.
with open("phenotype_values.csv", "r") as f:
    next(f, None)  # skip the header row (no error on an empty file)
    for line in f:
        stripped = line.strip()
        if not stripped:
            # A blank (e.g. trailing) line has no second field and would
            # otherwise raise IndexError.
            continue

        sv = stripped.split(",")
        samples_y.append(sv[0])
        y.append(float(sv[1]))

y = np.array(y)
samples_y = np.array(samples_y)

print(samples_y)
print(y)

['9608' '9801' '9647' ... '9894' '6200' '9917']
[53.43 58.81 53.57 ... 53.68 54.62 53.35]


In [32]:
# Match genotypes with phenotype values.
# truth_table[i, k] is True where samples_x[i] equals samples_y[k]; the index
# pairs of the True cells are used to reorder both data sets consistently.
truth_table = (samples_x[:, np.newaxis] == samples_y)
ind = np.where(truth_table)

samples_x = samples_x[ind[0]]
samples_y = samples_y[ind[1]]

X = X[ind[0], :]
y = y[ind[1]]

print("No Samples X:\t\t%d" % X.shape[0])
print("No Samples y:\t\t%d" % y.shape[0])

# Every position must agree; np.any would report True as soon as a single
# pair of ids matched.
print("Samples correct order:\t" + str(np.all(samples_x == samples_y)))


No Samples X:		1826
No Samples y:		1826
Samples correct order:	True


In [33]:
# Split into train data and test data (test_size=0.2 holds out 20% of the rows).

from sklearn.model_selection import train_test_split

# Fixed seed so the split, and every metric computed from it, is the same on
# each Restart & Run All.
RANDOM_STATE = 42

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

print("Full Data:\t" + str(X.shape))
print("Train Data:\t" + str(X_train.shape))
print("Test Data:\t" + str(X_test.shape))



Full Data:	(1826, 5000)
Train Data:	(1460, 5000)
Test Data:	(366, 5000)


In [38]:
# Fill in missing values of the training split.
# The imputer is fitted on the training data only and replaces each NaN with
# the most frequent value of its column.

from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="most_frequent")

imputer.fit(X_train)
X_train = imputer.transform(X_train)

print(X_train)


[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 2. 0.]
 [2. 0. 0. ... 2. 2. 0.]
 ...
 [0. 0. 0. ... 2. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 2. ... 0. 0. 0.]]


In [35]:
# Impute missing values on test data based on training data.
# The already-fitted imputer is applied exactly once; transforming the
# already-imputed array a second time is redundant.

X_test = imputer.transform(X_test)

print(X_test)

[[0. 0. 0. ... 0. 0. 0.]
 [2. 0. 2. ... 2. 0. 2.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 2. 2. ... 0. 0. 0.]]


In [36]:
# Ordinary least-squares regression: fit on the training split, then report
# mean squared error and R2 for both the training and the test split.

import sklearn.metrics as metrics
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
lr.fit(X_train, y_train)

predictions_training = lr.predict(X_train)
predictions_testing = lr.predict(X_test)

# Compute all scores first, then print them in the train -> test order.
mse_train = metrics.mean_squared_error(y_train, predictions_training)
r2_train = metrics.r2_score(y_train, predictions_training)
mse_test = metrics.mean_squared_error(y_test, predictions_testing)
r2_test = metrics.r2_score(y_test, predictions_testing)

print("MSE (Train):\t%.2f" % mse_train)
print("R2 (Train):\t%.2f" % r2_train)
print("MSE (Test):\t%.2f" % mse_test)
print("R2 (Test):\t%.2f" % r2_test)

MSE (Train):	0.00
R2 (Train):	1.00
MSE (Test):	5.28
R2 (Test):	0.54
