In [53]:
# Imports

import warnings

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import RidgeClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sktime.datatypes._panel._convert import from_2d_array_to_nested
from sktime.transformations.panel.rocket import MiniRocket
from sktime.transformations.panel.rocket import Rocket

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Let pandas display up to 100 rows before truncating the output.
pd.set_option("display.max_rows", 100)

In [43]:
# Opening up file containing GWAS data and reading in

# The context manager closes the handle as soon as the lines have been read,
# instead of leaving it open for the lifetime of the kernel.
with open("Second_GWAS/Second_GWAS_Add.raw", "r") as file:
    lines = file.readlines()

# Number of lines read (the first one is the header row).
len(lines)

110

In [44]:
# Loading GWAS data into DF in the correct format

# Header row: keep only the part of each column name before the first "_"
# ("<prefix>_<rest>" -> "<prefix>"); names without "_" are left unchanged.
columns = [name.split("_", 1)[0] for name in lines[0].strip("\n").split(" ")]

# Swap the codes "0" and "2"; every other value (e.g. "1" or "NA") is kept as is.
# NOTE: the swap is applied to every field of a row, not only to the SNP
# columns, so "0"/"2" values in PHENOTYPE, FID, IID, PAT, MAT and SEX are
# swapped as well.
SWAPPED_CODES = {"0": "2", "2": "0"}

# One list of string fields per data line (everything after the header).
data = [
    [SWAPPED_CODES.get(value, value) for value in line.strip("\n").split(" ")]
    for line in lines[1:]
]

# Build the frame once from the collected rows.
df = pd.DataFrame(data, columns=columns)

In [45]:
# Checking to see split between cases and controls
# (counts how many rows carry each PHENOTYPE label)

df["PHENOTYPE"].value_counts()

1    55
0    54
Name: PHENOTYPE, dtype: int64

In [None]:
# (rows, columns) of the loaded frame
df.shape

In [None]:
# Column labels of the loaded frame
df.columns

In [46]:
# Re-encoding phenotype column (currently disabled: PHENOTYPE keeps its "0"/"1" labels)

#df["PHENOTYPE"] = df["PHENOTYPE"].replace("0","case")
#df["PHENOTYPE"] = df["PHENOTYPE"].replace("1","control")

In [47]:
# Dropping columns and separating phenotype column

# Copy the labels out instead of using df.pop(): pop() removes the column from
# df in place, so running this cell a second time raised a KeyError.
# df itself is left untouched and still contains PHENOTYPE afterwards.
phenotype = df["PHENOTYPE"].copy()

# Feature matrix: every column except the ones listed here (label included).
dfTest = df.drop(columns=["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"])

In [48]:
# Replacing missing values ("NA") in df with the most common value of each column

imp = SimpleImputer(missing_values="NA", strategy="most_frequent")
# fit_transform returns a bare array, so the column labels are re-attached here.
idf = pd.DataFrame(imp.fit_transform(dfTest), columns=dfTest.columns)

In [49]:
# Converting columns to be numerical (pd.to_numeric is applied to each column)
# Note: this rebinds dfTest to the imputed, numeric frame.

dfTest = idf.apply(pd.to_numeric)

In [50]:
# Splitting data into training data and testing data
# (70% of the rows for training; random_state=1 keeps the split reproducible)
# NOTE(review): "xTtest" looks like a typo for "xTest", but the random-forest
# evaluation cell reads this exact name, so it is left unchanged here.

xTrain, xTtest, yTrain, yTest = train_test_split(dfTest, phenotype, train_size=0.7, random_state=1)

In [None]:
# Finding hyperparameters for RandomForest

bootstrap = [True, False]
maxDepth = list(range(10, 110, 10))
# "auto" was only an alias of "sqrt" for classifiers (it doubled the grid with
# identical models) and current scikit-learn releases reject it, so "log2" is
# searched as the second option instead.
maxFeatures = ["sqrt", "log2"]
minSamplesLeaf = [1, 2, 3, 4]
minSamplesSplit = [2, 3, 5]
nEstimators = list(range(100, 1100, 100))

hyperparameters = dict(bootstrap=bootstrap, max_depth=maxDepth, max_features=maxFeatures, min_samples_leaf=minSamplesLeaf,
                       min_samples_split=minSamplesSplit, n_estimators=nEstimators)

# Fixed seed so that repeated searches score the candidates identically.
rfd = RandomForestClassifier(random_state=1)

# Exhaustive search with 10-fold cross-validation; n_jobs=-1 only runs the
# fits in parallel and does not change the result.
clf = GridSearchCV(rfd, hyperparameters, cv=10, n_jobs=-1)

# Tune on the training split only, so the held-out rows never influence the
# choice of hyperparameters (the search previously saw the full data set).
bestModel = clf.fit(xTrain, yTrain)

# best_params_ holds the winning value for every key of the grid; iterating
# the grid keeps the original print order.
for name in hyperparameters:
    print('Best {}:'.format(name), bestModel.best_params_[name])

In [51]:
# Baseline random forest with default hyperparameters.
# random_state is fixed so the scores printed below are the same on every run.
rfc = RandomForestClassifier(random_state=1)
rfc.fit(xTrain, yTrain)

# Accuracy on the rows the model was fitted on.
yPredicted = rfc.predict(xTrain)
print("Training accuracy score: {}".format(accuracy_score(yTrain, yPredicted)))

# Accuracy, confusion matrix and per-class report on the held-out rows.
yPredicted = rfc.predict(xTtest)
print("Testing accuracy score: {}".format(accuracy_score(yTest, yPredicted)))
print(confusion_matrix(yTest, yPredicted))
print(classification_report(yTest, yPredicted))

Training accuracy score: 1.0
Testing accuracy score: 0.5454545454545454
[[10  3]
 [12  8]]
              precision    recall  f1-score   support

           0       0.45      0.77      0.57        13
           1       0.73      0.40      0.52        20

    accuracy                           0.55        33
   macro avg       0.59      0.58      0.54        33
weighted avg       0.62      0.55      0.54        33



In [None]:
# Converting df to the nested structure produced by sktime's from_2d_array_to_nested
# NOTE(review): presumably each row of dfTest becomes one series - confirm against the sktime docs

dfTestSeries = from_2d_array_to_nested(dfTest)

In [None]:
# Splitting time series data into training data and testing data
# (50% of the rows for training; random_state=1 keeps the split reproducible)
# Note: this rebinds xTrain, yTrain and yTest from the earlier random-forest split.

xTrain, xTest, yTrain, yTest = train_test_split(dfTestSeries, phenotype, train_size=0.5, random_state=1)

In [None]:
# Extracting label arrays and resetting the row index of both splits

# np.asarray leaves an existing array unchanged, so (unlike .values, which
# only exists on the pandas object) this cell can safely be run again.
yTrain = np.asarray(yTrain)
yTest = np.asarray(yTest)

# drop=True discards the shuffled index directly instead of first adding it as
# an "index" column and then dropping that column.
xTrain = xTrain.reset_index(drop=True)
xTest = xTest.reset_index(drop=True)

In [None]:
# Fitting training data to MiniRocket

# Fixed seed so the fitted transform (and every score derived from it) is
# reproducible across runs.
minirocket = MiniRocket(random_state=1)
minirocket.fit(xTrain)

# Features of the training rows, used to fit the classifier.
xTrainTransform = minirocket.transform(xTrain)

In [None]:
# Fitting the transformed data to the ridge classifier

# RidgeClassifierCV no longer accepts normalize=True (the argument was
# deprecated and then removed from scikit-learn); scaling the features in a
# preceding pipeline step is the documented replacement.
# The pipeline exposes the same fit/predict interface as the bare classifier.
classifier = make_pipeline(
    StandardScaler(with_mean=False),
    RidgeClassifierCV(alphas=np.logspace(-3, 3, 10)),
)
classifier.fit(xTrainTransform, yTrain)

In [None]:
# Generating test predictions 

# Reuse the transform fitted on the training rows; it is not re-fitted on the test rows.
xTestTransform = minirocket.transform(xTest)
# Predicted labels for the held-out rows.
yPredict = classifier.predict(xTestTransform)

In [None]:
# Printing results

# The features come from the MiniRocket transform, so the score is labelled accordingly.
print("Accuracy with MiniRocket: %2.3f" % accuracy_score(yTest, yPredict))
print(confusion_matrix(yTest, yPredict))