In [1]:
from xgboost import XGBClassifier
import pandas as pd
import sklearn.ensemble
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, precision_score, matthews_corrcoef, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import statistics
import numpy as np

In [2]:
df = pd.read_csv("data/4-featurized_introns_data.tsv", sep="\t")
df

Unnamed: 0,chr,start,end,strand,class,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score,...,repeat_features:Type II Transposons,repeat_features:Unknown,repeat_features:Tandem repeats,repeat_features:Satellite repeats,repeat_features:Type I Transposons/LINE,repeat_features:RNA repeats,repeat_features:Low complexity regions,repeat_features:Centromere,repeat_features:Dust,repeat_features:Type I Transposons/SINE
0,chr1,12227,12612,+,1,1122:3199,583:1460,9374:54492,GT:AG,59151,...,0,0,0,0,0,0,0,0,0,0
1,chr1,12721,13220,+,1,1791:3198,783:1104,14048:56719,GT:AG,61021,...,0,0,0,0,0,0,0,0,0,0
2,chr1,12057,12178,+,1,,,,,-1,...,0,0,0,0,0,0,0,0,0,0
3,chr1,12697,12974,+,1,1:1,1:1,15:19,GT:AG,21,...,0,0,0,0,0,0,0,0,0,0
4,chr1,13052,13220,+,1,22:24,17:18,433:484,GC:AG,526,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0,,,,,-1,...,0,0,1,0,0,0,0,0,1,1
517421,chrX,151409210,151456968,+,0,,,,,-1,...,1,1,1,0,1,0,1,0,1,1
517422,chrX,153906577,153906694,-,0,,,,,-1,...,0,0,0,0,0,0,0,0,0,0
517423,chrX,153906409,153906520,-,0,,,,,-1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
df = df.drop(columns=["GTExv2", "TCGAv2", "SRAv3h"])
df

Unnamed: 0,chr,start,end,strand,class,RC3-Splice_site,RC3-Score,repeat_features:Simple repeats,repeat_features:LTRs,repeat_features:Type II Transposons,repeat_features:Unknown,repeat_features:Tandem repeats,repeat_features:Satellite repeats,repeat_features:Type I Transposons/LINE,repeat_features:RNA repeats,repeat_features:Low complexity regions,repeat_features:Centromere,repeat_features:Dust,repeat_features:Type I Transposons/SINE
0,chr1,12227,12612,+,1,GT:AG,59151,0,0,0,0,0,0,0,0,0,0,0,0
1,chr1,12721,13220,+,1,GT:AG,61021,0,0,0,0,0,0,0,0,0,0,0,0
2,chr1,12057,12178,+,1,,-1,0,0,0,0,0,0,0,0,0,0,0,0
3,chr1,12697,12974,+,1,GT:AG,21,0,0,0,0,0,0,0,0,0,0,0,0
4,chr1,13052,13220,+,1,GC:AG,526,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0,,-1,0,0,0,0,1,0,0,0,0,0,1,1
517421,chrX,151409210,151456968,+,0,,-1,1,1,1,1,1,0,1,0,1,0,1,1
517422,chrX,153906577,153906694,-,0,,-1,0,0,0,0,0,0,0,0,0,0,0,0
517423,chrX,153906409,153906520,-,0,,-1,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:

from sklearn import preprocessing

le = preprocessing.LabelEncoder()
le.fit(df["RC3-Splice_site"])
df["RC3-Splice_site"] = le.transform(df["RC3-Splice_site"])

df

Unnamed: 0,chr,start,end,strand,class,RC3-Splice_site,RC3-Score,repeat_features:Simple repeats,repeat_features:LTRs,repeat_features:Type II Transposons,repeat_features:Unknown,repeat_features:Tandem repeats,repeat_features:Satellite repeats,repeat_features:Type I Transposons/LINE,repeat_features:RNA repeats,repeat_features:Low complexity regions,repeat_features:Centromere,repeat_features:Dust,repeat_features:Type I Transposons/SINE
0,chr1,12227,12612,+,1,4,59151,0,0,0,0,0,0,0,0,0,0,0,0
1,chr1,12721,13220,+,1,4,61021,0,0,0,0,0,0,0,0,0,0,0,0
2,chr1,12057,12178,+,1,6,-1,0,0,0,0,0,0,0,0,0,0,0,0
3,chr1,12697,12974,+,1,4,21,0,0,0,0,0,0,0,0,0,0,0,0
4,chr1,13052,13220,+,1,3,526,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517420,chrX,151403679,151404937,+,0,6,-1,0,0,0,0,1,0,0,0,0,0,1,1
517421,chrX,151409210,151456968,+,0,6,-1,1,1,1,1,1,0,1,0,1,0,1,1
517422,chrX,153906577,153906694,-,0,6,-1,0,0,0,0,0,0,0,0,0,0,0,0
517423,chrX,153906409,153906520,-,0,6,-1,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
df_annot_test_y = df.pop("class")

# Drop the columns that might not be available prior to manual gene annotation or irrelevant to ML
df_annot_test_X_1 = df.drop(["chr", "start", "end", "strand"], axis=1)
df_annot_test_y

0         1
1         1
2         1
3         1
4         1
         ..
517420    0
517421    0
517422    0
517423    0
517424    0
Name: class, Length: 517425, dtype: int64

In [6]:
df_annot_test_X_1

Unnamed: 0,RC3-Splice_site,RC3-Score,repeat_features:Simple repeats,repeat_features:LTRs,repeat_features:Type II Transposons,repeat_features:Unknown,repeat_features:Tandem repeats,repeat_features:Satellite repeats,repeat_features:Type I Transposons/LINE,repeat_features:RNA repeats,repeat_features:Low complexity regions,repeat_features:Centromere,repeat_features:Dust,repeat_features:Type I Transposons/SINE
0,4,59151,0,0,0,0,0,0,0,0,0,0,0,0
1,4,61021,0,0,0,0,0,0,0,0,0,0,0,0
2,6,-1,0,0,0,0,0,0,0,0,0,0,0,0
3,4,21,0,0,0,0,0,0,0,0,0,0,0,0
4,3,526,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
517420,6,-1,0,0,0,0,1,0,0,0,0,0,1,1
517421,6,-1,1,1,1,1,1,0,1,0,1,0,1,1
517422,6,-1,0,0,0,0,0,0,0,0,0,0,0,0
517423,6,-1,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# Split the dataset to 9:1 Train/Val and Test set
from imblearn.under_sampling import RandomUnderSampler

X_train_val, X_test, y_train_val, y_test = train_test_split(df_annot_test_X_1, df_annot_test_y, stratify=df_annot_test_y, test_size=0.1, shuffle=True)

cv_acc_list = []
cv_ba_acc_list = []
cv_rocauc_list = []
cv_precision_list = []
cv_mcc_list = []
cv_specificity_list = []
cv_sensitivity_list = []
model_dict = {}
model_index = 0

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X_train_val, y_train_val)

for train_index, val_index in skf.split(X_train_val, y_train_val):
	# print(train_index)
	# Perform random under-sampling on the majority class
	rus = RandomUnderSampler(random_state=0)
	X_train_resampled, y_train_resampled = rus.fit_resample(X_train_val.iloc[train_index], y_train_val.iloc[train_index])
	#
	# X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
	# y_train, y_val = y_train_val.iloc[train_index].to_numpy().flatten(), y_train_val.iloc[val_index].to_numpy().flatten()
	X_train, X_val = X_train_resampled, X_train_val.iloc[val_index]
	y_train, y_val = y_train_resampled, y_train_val.iloc[val_index].to_numpy().flatten()

	model_dict[model_index] = XGBClassifier(seed=0)
	model_dict[model_index].fit(X_train, y_train)

	y_predict = model_dict[model_index].predict_proba(X_val)
	y_predict = y_predict[:, 1]
	y_predict_class = list(map(round, y_predict))

	test_acc = accuracy_score(y_val, y_predict_class)
	test_rocauc = roc_auc_score(y_val, y_predict)
	test_bal_acc = balanced_accuracy_score(y_val, y_predict_class)
	test_precision = precision_score(y_val, y_predict_class)  # tp/(tp+fp)
	test_mcc = matthews_corrcoef(y_val, y_predict_class)
	tn, fp, fn, tp = confusion_matrix(y_val, y_predict_class).ravel()

	# Adding the metrics to their list
	cv_acc_list.append(test_acc)
	cv_ba_acc_list.append(test_bal_acc)
	cv_rocauc_list.append(test_rocauc)
	cv_precision_list.append(test_precision)
	cv_mcc_list.append(test_mcc)
	cv_specificity_list.append(tn / (tn + fp))
	cv_sensitivity_list.append(tp / (fn + tp))

In [10]:
print("------------------------------------------------------------------------------------------")
print("Stratified Cross-Validation Performance")
print("------------------------------------------------------------------------------------------")
print("Accuracy: %s \nAUCROC: %s \nMCC: %s \nSensitivity: %s \nSpecificity: %s \nBalanced Accuracy: %s" % (
        statistics.mean(cv_acc_list), statistics.mean(cv_rocauc_list), statistics.mean(cv_mcc_list),
        statistics.mean(cv_sensitivity_list), statistics.mean(cv_specificity_list), statistics.mean(cv_ba_acc_list)))

print("------------------------------------------------------------------------------------------")
print("Accuracy SD: %s \nAUCROC SD: %s \nMCC SD: %s \nSensitivity SD: %s \nSpecificity SD: %s \nBalanced Accuracy SD: %s" % (
        np.std(cv_acc_list), np.std(cv_rocauc_list), np.std(cv_mcc_list),
        np.std(cv_sensitivity_list), np.std(cv_specificity_list), np.std(cv_ba_acc_list)))

------------------------------------------------------------------------------------------
Stratified Cross-Validation Performance
------------------------------------------------------------------------------------------
Accuracy: 0.7724090691429937 
AUCROC: 0.8584908234954587 
MCC: 0.04093768449134987 
Sensitivity: 0.7723540110814827 
Specificity: 0.8429984779299847 
Balanced Accuracy: 0.8076762445057337
------------------------------------------------------------------------------------------
Accuracy SD: 0.0034690007224455564 
AUCROC SD: 0.01277613405010268 
MCC SD: 0.002185282899969737 
Sensitivity SD: 0.0034525482890308988 
Specificity SD: 0.02684884696748335 
Balanced Accuracy SD: 0.015031879472820985
