In [124]:
# Imports

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.naive_bayes import GaussianNB
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn import tree

from sklearn import metrics
from sklearn.model_selection import train_test_split

from sktime.classification.shapelet_based import MrSEQLClassifier
from sktime.datasets import load_arrow_head, load_basic_motions

from sktime.datatypes._panel._convert import from_2d_array_to_nested
from sktime.transformations.panel.rocket import Rocket
from sklearn.linear_model import RidgeClassifierCV

from sktime.datasets import load_arrow_head 

In [125]:
# Opening up file containing GWAS data and reading in

file = open("GWAS_Add.raw", "r")
lines = file.readlines()
len(lines)

90

In [126]:
# Loading GWAS data into DF in the correct format

columns = lines[0].strip("\n").split(" ")
columns = [n.split("_", 1)[0] for n in columns]
df = pd.DataFrame(columns=columns)

data = []
for line in lines[1:]:
    newRow = line.strip("\n").split(" ")
    data.append(newRow)

df = pd.DataFrame(data, columns=columns)

In [127]:
# Only selecting SNPs located on chromosome 2 

dfAlleles = df.loc[:, "rs1947487":"rs10496679"]
dfInfo = df.loc[:, :"PHENOTYPE"]
df = dfInfo.join(dfAlleles)
df["PHENOTYPE"]

0     1
1     1
2     2
3     1
4     1
     ..
84    1
85    2
86    2
87    2
88    2
Name: PHENOTYPE, Length: 89, dtype: object

In [128]:
# Re-encoding SNP of interest

df.loc[df.PHENOTYPE == 2, ['rs2222162']] = 2
df.loc[df.PHENOTYPE == 1, ['rs2222162']] = 0

In [129]:
df

Unnamed: 0,FID,IID,PAT,MAT,SEX,PHENOTYPE,rs1947487,rs4375873,rs729446,rs360233,...,rs2222902,rs1251207,rs837894,rs837855,rs755570,rs837884,rs944907,rs6760375,rs4662934,rs10496679
0,HCB181,1,0,0,1,1,0,0,0,0,...,0,0,0,1,0,1,1,0,1,1
1,HCB182,1,0,0,1,1,0,0,0,0,...,2,1,1,1,0,2,0,0,0,0
2,HCB183,1,0,0,1,2,0,0,0,0,...,1,1,2,0,0,2,0,0,0,0
3,HCB184,1,0,0,1,1,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
4,HCB185,1,0,0,1,1,0,0,0,0,...,2,0,0,2,0,1,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84,JPT265,1,0,0,1,1,0,0,0,0,...,1,0,2,0,1,0,1,0,1,1
85,JPT266,1,0,0,1,2,0,0,0,0,...,0,0,1,1,0,2,0,0,0,0
86,JPT267,1,0,0,1,2,1,1,0,1,...,2,1,0,2,1,2,0,0,0,0
87,JPT268,1,0,0,1,2,1,1,0,0,...,0,0,1,1,0,1,0,0,1,1


In [130]:
# Removing ID columns and 

phenotype = df.pop("PHENOTYPE")
dfTest = df.drop(columns=["FID","IID","PAT","MAT","SEX"])

In [131]:
# Replacing missing SNP entries and replacing with most frequent value for that SNP of other entries

imp = SimpleImputer(missing_values="NA", strategy='most_frequent')
idf = pd.DataFrame(imp.fit_transform(dfTest))

In [132]:
# Splitting data into training data and testing data

xTrain, xTtest, yTrain, yTest = train_test_split(idf, phenotype, train_size=0.6, random_state=1)

In [133]:
# Classifying GWAS data using decision tree and printing results

dtc = DecisionTreeClassifier(criterion='entropy', max_depth=3)
dtc.fit(xTrain, yTrain)
yPredicted = dtc.predict(xTrain)
print("Training accuracy score: {}".format(accuracy_score(yTrain, yPredicted)))
yPredicted = dtc.predict(xTtest)
print("Testing accuracy score: {}".format(accuracy_score(yTest, yPredicted)))

Training accuracy score: 0.8490566037735849
Testing accuracy score: 0.5555555555555556


In [134]:
# Classifying GWAS data using KNN and printing results

knnClf = KNeighborsClassifier() 
knnClf.fit(xTrain, yTrain)
yPredicted = knnClf.predict(xTrain)
print("Training accuracy score: {}".format(accuracy_score(yTrain, yPredicted)))
yPredicted = knnClf.predict(xTtest)
print("Testing accuracy score: {}".format(accuracy_score(yTest, yPredicted)))

Training accuracy score: 0.7169811320754716
Testing accuracy score: 0.5


In [135]:
# Classifying GWAS data using Gaussian Naive Bayes and printing results

gnbClf = GaussianNB() 
gnbClf.fit(xTrain, yTrain)
yPredicted = gnbClf.predict(xTrain)
print("Training accuracy score: {}".format(accuracy_score(yTrain, yPredicted)))
yPredicted = gnbClf.predict(xTtest)
print("Testing accuracy score: {}".format(accuracy_score(yTest, yPredicted)))

Training accuracy score: 0.5660377358490566
Testing accuracy score: 0.4722222222222222


In [136]:
# Converting df columns into 2d numpy array time series

idf = idf.apply(pd.to_numeric)
idfSeries = from_2d_array_to_nested(idf)

In [137]:
# Splitting time seriesdata into training data and testing data

xTrain, xTest, yTrain, y_test = train_test_split(idfSeries, phenotype, train_size=0.6, random_state=1)

In [138]:
# Extracting values from training split
yTrain = yTrain.values
y_test = y_test.values

xTrain = xTrain.reset_index()
xTrain = xTrain.drop(columns=['index'])

xTest = xTest.reset_index()
xTest = xTest.drop(columns=['index'])

In [139]:
# Running rocket kernal transformation on training data

rocket = Rocket()
rocket.fit(xTrain)
xTrainTransform = rocket.transform(xTrain)

In [140]:
# Classifying GWAS Data from rocket transformation using Ridge Classifier

classifier = RidgeClassifierCV(alphas=np.logspace(-3, 3, 10), normalize=True)
classifier.fit(xTrainTransform, yTrain)
xTestTransform = rocket.transform(xTest)
classifier.score(xTestTransform, yTest)

0.5555555555555556

In [141]:
# Classifying GWAS data using MrSEQL
ms = MrSEQLClassifier(seql_mode="clf")
ms.fit(xTrain, yTrain)
predicted = ms.predict(xTest)
print("Accuracy with mr-seql: %2.3f" % metrics.accuracy_score(yTest, predicted))

  ms = MrSEQLClassifier(seql_mode="clf")
  ms = MrSEQLClassifier(seql_mode="clf")


Accuracy with mr-seql: 0.417
