# Model Genotypes

Model the genotypes matrix produced in GetSNPMatrix.ipynb

In [48]:
import sgkit as sg
from sgkit.io import plink
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import pandas as pd
import numpy as np
import dask
import pickle
import xarray as xr

In [2]:
ds = plink.read_plink(bed_path = 'ratgenes_pruned/ratgenes_pruned.bed', bim_path = 'ratgenes_pruned/ratgenes_pruned.bim', fam_path = 'ratgenes_pruned/ratgenes_pruned.fam')
ds = ds.set_index({"samples": "sample_id"})
ds = ds.set_index({"variants": "variant_id"})
call_g_mask = ds["call_genotype_mask"].any(dim = "ploidy")
call_g = xr.where(call_g_mask, -1, ds["call_genotype"].sum(dim = "ploidy"))
genotypes_matrix = call_g.values
genotypes_matrix = np.transpose(genotypes_matrix)
genotypes_matrix.shape

(13526, 96654)

In [3]:
genotypes_matrix[:5]

array([[ 2,  0,  0, ...,  2,  2,  2],
       [ 0,  2,  2, ...,  1,  1,  1],
       [-1,  1,  1, ...,  2,  2,  2],
       [ 1,  1,  1, ...,  0,  0,  0],
       [-1,  0,  0, ...,  2,  2,  2]])

## Train a PCA to dimensionality reduce to 230 principle components

In [4]:
pca = PCA(n_components = 230, svd_solver = 'randomized', random_state = 42)
pca.fit(genotypes_matrix)

PCA(n_components=230, random_state=42, svd_solver='randomized')

In [5]:
sum(pca.explained_variance_ratio_)

0.49436604670396506

In [6]:
genotypes_new = pca.transform(genotypes_matrix)

In [7]:
genotypes_new.shape

(13526, 230)

## Load in 'loco' phenotypes dataset, filter accordingly

In [21]:
loco = pd.read_csv("pheno_loco_clean.txt", sep = '\t')
loco.head()

Unnamed: 0,rfid,loco_maxcent,loco_cent16_5,loco_cent13_5,loco_cent10_5,loco_cent7_5,loco_cent4_5,loco_cent1_5,loco_maxrear,loco_rear16_5,...,loco_act13_5,loco_act10_5,loco_act7_5,loco_act4_5,loco_act1_5,loco_center,loco_rear,loco_distance,loco_activity,loco_age
0,00077E61F3,171.3,171.3,59.4,83.9,84.8,55.0,49.7,19.0,17.0,...,81.0,79.0,78.0,85.0,109.0,504.1,86.0,2367.0,470.0,73.0
1,00077E6207,115.4,40.8,115.4,12.4,7.9,80.9,45.5,26.0,10.0,...,65.0,54.0,74.0,100.0,135.0,302.9,69.0,2407.0,477.0,73.0
2,00077E6232,48.1,32.9,28.0,48.1,33.1,37.2,29.0,20.0,10.0,...,73.0,81.0,86.0,94.0,124.0,208.3,66.0,2690.0,527.0,77.0
3,00077E6239,147.1,128.2,88.7,147.1,118.0,127.8,109.0,25.0,0.0,...,25.0,65.0,72.0,85.0,97.0,718.8,55.0,1536.0,371.0,81.0
4,00077E62D2,89.4,0.0,0.0,0.0,7.4,89.4,12.0,22.0,0.0,...,29.0,42.0,59.0,88.0,96.0,108.8,40.0,1775.0,352.0,77.0


In [22]:
print(loco.columns)

Index(['rfid', 'loco_maxcent', 'loco_cent16_5', 'loco_cent13_5',
       'loco_cent10_5', 'loco_cent7_5', 'loco_cent4_5', 'loco_cent1_5',
       'loco_maxrear', 'loco_rear16_5', 'loco_rear13_5', 'loco_rear10_5',
       'loco_rear7_5', 'loco_rear4_5', 'loco_rear1_5', 'loco_maxdis',
       'loco_dis16_5', 'loco_dis13_5', 'loco_dis10_5', 'loco_dis7_5',
       'loco_dis4_5', 'loco_dis1_5', 'loco_maxact', 'loco_act16_5',
       'loco_act13_5', 'loco_act10_5', 'loco_act7_5', 'loco_act4_5',
       'loco_act1_5', 'loco_center', 'loco_rear', 'loco_distance',
       'loco_activity', 'loco_age'],
      dtype='object')


### Only use the 'max' loco phenotypes

In [23]:
pheno_select = ['rfid', 'loco_maxcent', 'loco_maxdis', 'loco_maxrear', 'loco_maxact']

In [24]:
loco_select = loco[pheno_select]
loco_select = loco_select.set_index('rfid')

In [25]:
loco_select.head()

Unnamed: 0_level_0,loco_maxcent,loco_maxdis,loco_maxrear,loco_maxact
rfid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
00077E61F3,171.3,568.0,19.0,109.0
00077E6207,115.4,697.0,26.0,135.0
00077E6232,48.1,652.0,20.0,124.0
00077E6239,147.1,466.0,25.0,97.0
00077E62D2,89.4,599.0,22.0,96.0


### Filter genotypes_new to only have rats in the loco phenotypes dataset

In [26]:
rat_ids = ds["samples"].values
genotypes_new_df = pd.DataFrame(data = genotypes_new, index = rat_ids)
genotypes_new_df.index.name = 'rfid'
genotypes_new_df.head(10)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,220,221,222,223,224,225,226,227,228,229
rfid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00077E7A10,21.709824,12.072828,-14.469518,10.050205,12.191051,25.037595,-4.342536,3.936908,-0.570887,3.893769,...,-1.01127,13.147396,-5.451745,10.405451,-5.676592,-6.532274,-7.261003,-1.21777,1.828908,-3.812544
00077E7AB7,10.251807,-20.152089,12.094699,32.042794,35.683056,4.597834,4.988165,1.998382,5.51107,19.087811,...,-4.685307,1.948354,-2.32117,1.964835,-1.139532,11.669304,-2.024051,-0.691334,6.93191,3.262146
00077E9536,15.351759,-3.47692,-14.406917,17.25906,-10.742299,19.919575,3.433845,7.161555,14.531464,-10.262331,...,1.911253,-1.141637,-2.067857,-1.84389,-9.586,-1.204874,6.48774,-5.035391,3.982582,0.055638
00077E9B37,8.834164,-17.780039,9.199224,12.695393,8.417267,22.016319,-6.350907,7.009582,-3.883824,-24.632733,...,-2.834063,10.35942,6.056744,1.970687,-1.481837,-2.343528,-1.500875,1.351971,0.085634,2.519491
00077EA639,9.803076,-7.746949,4.268591,-23.002719,19.123769,-16.657941,0.266084,26.214961,-18.288304,-15.74396,...,-2.764159,8.161467,3.382217,-6.04389,-7.473324,-11.946652,-6.862979,-9.360968,5.631419,-0.069384
00077EAA6C,0.951847,-39.84808,8.224686,29.325067,-22.573166,-16.900857,2.526914,-7.386044,1.414387,9.050317,...,0.437416,-2.656478,-5.185582,5.520276,-0.254113,5.486362,-7.230508,0.962354,4.592058,-4.211326
00077EAF65,-4.694665,-32.266892,11.586474,-30.461401,22.723234,-3.508425,3.701445,13.203889,2.39444,-0.061927,...,3.097625,-7.556109,-4.634019,13.587611,-1.706229,14.081353,-6.685088,-1.12316,-2.315727,-0.304997
00078A1795,16.207063,-15.204335,6.120047,-20.563964,-21.978036,18.227382,4.015749,0.091782,-10.522078,-0.404186,...,0.023156,1.919849,-1.308717,2.311879,0.063058,3.24869,5.004681,-0.103311,9.366129,5.51347
933000120117306,-6.191658,-4.203107,11.725222,25.356851,-21.140272,-21.431007,-10.121674,-8.580559,1.036483,12.148989,...,-0.936276,1.992044,-1.562111,0.717499,-14.37692,-2.026891,-1.891763,-14.290164,-10.651457,-2.432428
933000120117307,7.090934,-18.017996,-31.194238,2.288079,-10.316459,8.467924,-3.626839,-8.825813,-15.588856,-0.204784,...,13.217992,0.939948,-5.242698,2.577477,3.964434,9.601199,8.753398,-1.162443,-2.275937,-2.975999


In [27]:
geno_with_pheno = pd.merge(genotypes_new_df, loco_select, left_index=True, right_index=True)

In [30]:
print(geno_with_pheno.shape)
geno_with_pheno.head()

(2368, 234)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,224,225,226,227,228,229,loco_maxcent,loco_maxdis,loco_maxrear,loco_maxact
rfid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00077E61F3,12.132373,-27.873886,-14.636472,-26.720286,26.233786,3.293454,30.755103,7.669595,13.660288,15.109918,...,-0.14111,-1.339633,-1.669866,1.930249,-0.369967,10.188985,171.3,568.0,19.0,109.0
00077E6207,24.160556,-3.36057,-16.108081,-9.779261,6.075024,4.60952,5.747501,22.148976,-8.586358,41.392904,...,-3.290691,4.556553,-2.546501,10.478027,4.803143,2.155855,115.4,697.0,26.0,135.0
00077E6232,-6.942274,-32.741051,9.209822,-1.429617,1.62343,2.734864,-2.24812,12.853267,19.937163,39.250381,...,7.420198,-2.230633,-4.45979,3.744106,-1.72106,-4.232724,48.1,652.0,20.0,124.0
00077E6239,37.44749,10.638265,21.291678,-31.275648,-9.963302,14.1888,10.224169,-18.771409,2.581385,-9.787773,...,1.767202,-1.18313,3.834784,2.807185,-0.567963,-1.476755,147.1,466.0,25.0,97.0
00077E62D2,46.597835,-1.252157,14.851641,18.114037,-5.990133,-3.146445,-18.604235,-11.121621,-5.681867,-1.117822,...,-2.86128,1.916741,-9.818025,13.642586,6.387523,-12.712059,89.4,599.0,22.0,96.0


In [31]:
# Split the geno_with_pheno into X and y datasets

phenos = ['loco_maxcent', 'loco_maxdis', 'loco_maxrear', 'loco_maxact']
y_pheno = geno_with_pheno[phenos]
y_pheno = y_pheno.to_numpy()

X_geno = geno_with_pheno.drop(columns = phenos)
X_geno = X_geno.to_numpy()

In [33]:
print(X_geno[:5])
print(y_pheno[:5])

[[ 12.13237294 -27.87388605 -14.63647192 ...   1.93024852  -0.36996715
   10.18898483]
 [ 24.1605559   -3.36057013 -16.10808058 ...  10.47802697   4.80314287
    2.1558552 ]
 [ -6.94227425 -32.74105067   9.20982153 ...   3.74410579  -1.7210603
   -4.23272375]
 [ 37.44748968  10.63826537  21.29167791 ...   2.80718509  -0.56796325
   -1.47675537]
 [ 46.59783525  -1.25215659  14.85164131 ...  13.64258641   6.38752332
  -12.71205882]]
[[171.3 568.   19.  109. ]
 [115.4 697.   26.  135. ]
 [ 48.1 652.   20.  124. ]
 [147.1 466.   25.   97. ]
 [ 89.4 599.   22.   96. ]]


## Train and Test Random Forrest Model to Predict loco phenotypes from genotypes

### Train Model

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_geno, y_pheno, test_size=0.20, random_state=42)

In [36]:
regr = RandomForestRegressor(n_jobs = -1, random_state = 42)
regr.fit(X_train, y_train)

RandomForestRegressor(n_jobs=-1, random_state=42)

### Create Predictions from Train and Test

In [37]:
y_train_pred = regr.predict(X_train)
y_train_pred[:5]

array([[ 61.207, 628.01 ,  32.85 , 104.97 ],
       [ 47.188, 705.82 ,  32.91 ,  99.99 ],
       [ 52.488, 732.09 ,  38.72 ,  88.72 ],
       [133.301, 620.91 ,  32.03 ,  92.86 ],
       [ 42.676, 733.8  ,  35.51 ,  95.67 ]])

In [39]:
y_test_pred = regr.predict(X_test)
y_test_pred[:5]

array([[ 73.759, 622.74 ,  30.59 ,  89.72 ],
       [ 66.577, 614.11 ,  31.13 ,  89.01 ],
       [ 65.613, 608.94 ,  30.27 ,  89.51 ],
       [ 71.465, 626.54 ,  31.54 ,  90.48 ],
       [ 70.697, 620.41 ,  30.12 ,  91.01 ]])

### Evaluate Performance of the Model

(1) MAE on the training set

In [41]:
print(mean_absolute_error(y_train, y_train_pred, multioutput='raw_values'))

[11.65828353 30.09721225  2.14914467  5.5874868 ]


(2) MAE on the test set

In [42]:
print(mean_absolute_error(y_test, y_test_pred, multioutput='raw_values'))

[33.61259705 84.64936709  5.46976793 14.64696203]


(3) MSE on the training set

In [43]:
print(mean_squared_error(y_train, y_train_pred, multioutput='raw_values'))

[ 218.95067334 1471.96929155    7.48497149   50.50932841]


(4) MSE on the test set

In [44]:
print(mean_squared_error(y_test, y_test_pred, multioutput='raw_values'))

[ 1802.66153938 11347.82035992    46.65766941   334.51584388]


(5) R2 Scores on training set

In [45]:
print(r2_score(y_train, y_train_pred, multioutput = 'raw_values'))

[0.85837491 0.86298305 0.85766716 0.86150008]


(6) R2 scores on test set

In [46]:
print(r2_score(y_test, y_test_pred, multioutput = 'raw_values'))

[-0.00217703  0.01287462 -0.01732371  0.01998775]


## Train and Test Linear Regession Model to Predict loco phenotypes from genotypes

### Train

In [49]:
reg = LinearRegression(n_jobs = -1).fit(X_train, y_train)

In [50]:
y_train_pred = reg.predict(X_train)
y_train_pred[:5]

array([[ 77.02446998, 634.55461685,  32.27292208,  96.23084067],
       [ 71.3899733 , 690.17829955,  32.44017978,  97.29991705],
       [ 63.70563575, 659.63216348,  31.56017019,  91.18909365],
       [ 94.38500214, 644.13875545,  36.52573319,  99.07705304],
       [ 37.21233347, 739.85678578,  32.32098624,  93.87478719]])

In [51]:
y_test_pred = reg.predict(X_test)
y_test_pred[:5]

array([[ 79.28846957, 640.14014722,  33.38259325,  84.40187856],
       [ 61.85934165, 600.71578966,  30.36547851,  89.45831308],
       [ 78.94165537, 605.78865075,  25.32390345,  96.35555927],
       [ 79.8104754 , 610.02809602,  32.24912475,  86.0270392 ],
       [ 75.57745524, 564.95952982,  31.3779924 ,  94.17122312]])

In [53]:
print("MAE Train:", mean_absolute_error(y_train, y_train_pred, multioutput='raw_values'))
print("MAE Test:", mean_absolute_error(y_test, y_test_pred, multioutput='raw_values'))
print("MSE Train:", mean_squared_error(y_train, y_train_pred, multioutput='raw_values'))
print("MSE Test:", mean_squared_error(y_test, y_test_pred, multioutput='raw_values'))
print("R2 Train:", r2_score(y_train, y_train_pred, multioutput = 'raw_values'))
print("R2 Test:", r2_score(y_test, y_test_pred, multioutput = 'raw_values'))

MAE Train: [28.36016736 72.01321446  5.22939301 13.55840803]
MAE Test: [34.37543513 84.83794233  5.60771477 15.48963802]
MSE Train: [1305.60057129 8215.95806867   43.50531684  290.76037606]
MSE Test: [ 1954.10209088 11559.204642      50.82466153   362.93474862]
R2 Train: [0.15549106 0.23522484 0.17271093 0.20271583]
R2 Test: [-0.08636934 -0.0055133  -0.10818079 -0.06326952]


### Overall, both of the models suck. Random Forrest went berserk with overfitting and linear regression didn't learn shit.