# Summary
This Notebook record the analysis of the gene annotation test dataset, we first perform some preprocessing of the dataset including doing label encoding and one-hot encoding of different features, we then splitted the dataset to a 9:1 train-val to test ratio.

We evaluated the performance of the XGBoost classifier model with two set of features:
(1) Original features with preprocessing
(2) Same as above, and we matched the coords to the sequence and added four features representing the sequence's A,C,G,T content proportion %

The result shows that this additional feature is not very effective, however since we have already matched the coordinates to the sequence, we can perform other feature engineering techniques on the data.


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, precision_score, matthews_corrcoef, confusion_matrix
from xgboost import XGBClassifier
import numpy as np
import statistics
from Bio.Seq import Seq
from tqdm import tqdm
from pyfaidx import Fasta

In [3]:
np.random.seed(0)

In [4]:
df_source_annot_test = pd.read_csv("gene_annot_test.tsv", sep="\t")

In [5]:
df_annot_test = df_source_annot_test.copy()

### We can notice that, apart from intron_sources, most of the features can be transformed to binary using Sklearn's LabelEncoder.
We can see from the unique labels of intron_sources that there are only 3 types

In [6]:
df_annot_test.intron_sources.unique()

array(['SLR,CLS', 'CLS', 'SLR', 'SLR,RAC', 'SLR,CLS,RAC', 'RAC',
       'CLS,RAC'], dtype=object)

In [7]:
df_annot_test["intron_sources_SLR"] = df_annot_test["intron_sources"].str.contains("SLR").astype(int)
df_annot_test["intron_sources_CLS"] = df_annot_test["intron_sources"].str.contains("CLS").astype(int)
df_annot_test["intron_sources_RAC"] = df_annot_test["intron_sources"].str.contains("RAC").astype(int)
df_annot_test

Unnamed: 0,coords,outcome,score,length,prev_annot,transcript_source,intron_sources,splice_site,repeat_overlap,ss_antisense,...,opp_strand,false_ret_int,transcript_id,gtype,bbiotype,rel_int_sup,rel_int_sup_k,intron_sources_SLR,intron_sources_CLS,intron_sources_RAC
0,chr1:261635-267302:-1,accepted,888,5668,yes,SLRseq,"SLR,CLS",GT..AG,Type I Transposons/LINE,no,...,no,no,OTTHUMT00000499557,transcribed_processed_pseudogene,non-coding,0.311999,0.311999,1,1,0
1,chr1:259026-261549:-1,accepted,650,2524,yes,SLRseq,"SLR,CLS",GT..AG,Type I Transposons/LINE,no,...,no,no,OTTHUMT00000499557,transcribed_processed_pseudogene,non-coding,-0.311999,-0.311999,1,1,0
2,chr1:732208-739802:-1,rejected,0,7595,no,PacBio Capture-seq,CLS,GT..AG,Type I Transposons/SINE,no,...,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,-2.525729,-2.708050,0,1,0
3,chr1:720201-732016:-1,accepted,0,11816,yes,PacBio Capture-seq,CLS,GT..AG,Type II Transposons,no,...,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,-2.525729,-2.931194,0,1,0
4,chr1:711923-720031:-1,accepted,27,8109,yes,PacBio Capture-seq,CLS,GT..AG,No overlap,no,...,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,1.216395,0.810930,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11005,chrY:20582694-20584473:1,accepted,278936,1780,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,...,no,no,OTTHUMT00000500440,protein_coding,coding,-0.141738,-0.364851,1,1,1
11006,chrY:20584525-20588023:1,accepted,286043,3499,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,...,no,no,OTTHUMT00000500440,protein_coding,coding,-0.112146,-0.335259,1,1,1
11007,chrY:20588106-20589483:1,accepted,444721,1378,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,...,no,no,OTTHUMT00000500440,protein_coding,coding,0.433606,0.210496,1,1,1
11008,chrY:20589576-20592340:1,accepted,468983,2765,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,...,no,no,OTTHUMT00000500440,protein_coding,coding,0.503702,0.280593,1,1,1


In [8]:
df_annot_test.splice_site.unique()

array(['GT..AG', 'GC..AG', 'AT..AC', 'GT..GG'], dtype=object)

In [9]:
for splice_site_value in df_annot_test.splice_site.unique():
    df_annot_test[splice_site_value] = df_annot_test["splice_site"].str.contains(str(splice_site_value)).astype(int)

df_annot_test.iloc[:, -4:]

Unnamed: 0,GT..AG,GC..AG,AT..AC,GT..GG
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,1,0,0,0
...,...,...,...,...
11005,1,0,0,0
11006,1,0,0,0
11007,1,0,0,0
11008,1,0,0,0


### We convert "accepted" and "rejected" to 1s and 0s, this facilitates the calculation of ROC curve

In [10]:
for col in df_annot_test.drop(["coords"], axis=1).columns:
    le = preprocessing.LabelEncoder()
    le.fit(df_annot_test[col])
    df_annot_test[col] = le.transform(df_annot_test[col])

df_annot_test

Unnamed: 0,coords,outcome,score,length,prev_annot,transcript_source,intron_sources,splice_site,repeat_overlap,ss_antisense,...,bbiotype,rel_int_sup,rel_int_sup_k,intron_sources_SLR,intron_sources_CLS,intron_sources_RAC,GT..AG,GC..AG,AT..AC,GT..GG
0,chr1:261635-267302:-1,0,696,3524,1,3,4,2,9,0,...,1,7185,7199,1,1,0,1,0,0,0
1,chr1:259026-261549:-1,0,558,2130,1,3,4,2,9,0,...,1,3479,3283,1,1,0,1,0,0,0
2,chr1:732208-739802:-1,1,0,4005,0,1,0,2,20,0,...,1,1629,1496,0,1,0,1,0,0,0
3,chr1:720201-732016:-1,0,0,4609,1,1,0,2,22,0,...,1,1629,1399,0,1,0,1,0,0,0
4,chr1:711923-720031:-1,0,26,4092,1,1,0,2,2,0,...,1,9330,8260,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11005,chrY:20582694-20584473:1,0,5548,1593,1,1,5,2,2,0,...,0,4092,3128,1,1,1,1,0,0,0
11006,chrY:20584525-20588023:1,0,5595,2703,1,1,5,2,2,0,...,0,4275,3207,1,1,1,1,0,0,0
11007,chrY:20588106-20589483:1,0,6348,1256,1,1,5,2,2,0,...,0,7867,6679,1,1,1,1,0,0,0
11008,chrY:20589576-20592340:1,0,6428,2279,1,1,5,2,2,0,...,0,8162,7043,1,1,1,1,0,0,0


### We then select all the relevant column(s) for the input features and the label

In [11]:
df_annot_test_y = df_annot_test["outcome"]

# Drop the columns that might not be available prior to manual gene annotation or irrelevant to ML
df_annot_test_X_1 = df_annot_test.drop(["coords", "splice_site", "outcome", "transcript_id", "gtype", "rej_reason"], axis=1)
df_annot_test_X_1

Unnamed: 0,score,length,prev_annot,transcript_source,intron_sources,repeat_overlap,ss_antisense,annot_match,incorrect_locus,opp_strand,...,bbiotype,rel_int_sup,rel_int_sup_k,intron_sources_SLR,intron_sources_CLS,intron_sources_RAC,GT..AG,GC..AG,AT..AC,GT..GG
0,696,3524,1,3,4,9,0,1,0,0,...,1,7185,7199,1,1,0,1,0,0,0
1,558,2130,1,3,4,9,0,1,0,0,...,1,3479,3283,1,1,0,1,0,0,0
2,0,4005,0,1,0,20,0,0,0,0,...,1,1629,1496,0,1,0,1,0,0,0
3,0,4609,1,1,0,22,0,0,0,0,...,1,1629,1399,0,1,0,1,0,0,0
4,26,4092,1,1,0,2,0,0,0,0,...,1,9330,8260,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11005,5548,1593,1,1,5,2,0,1,0,0,...,0,4092,3128,1,1,1,1,0,0,0
11006,5595,2703,1,1,5,2,0,1,0,0,...,0,4275,3207,1,1,1,1,0,0,0
11007,6348,1256,1,1,5,2,0,1,0,0,...,0,7867,6679,1,1,1,1,0,0,0
11008,6428,2279,1,1,5,2,0,1,0,0,...,0,8162,7043,1,1,1,1,0,0,0


# 1. We first test the cross-validation performance of this dataset without any feature engineering

In [12]:
# Split the dataset to 9:1 Train/Val and Test set
X_train_val, X_test, y_train_val, y_test = train_test_split(df_annot_test_X_1, df_annot_test_y, stratify=df_annot_test_y, test_size=0.1, shuffle=True)

cv_acc_list = []
cv_ba_acc_list = []
cv_rocauc_list = []
cv_precision_list = []
cv_mcc_list = []
cv_specificity_list = []
cv_sensitivity_list = []
model_dict = {}
model_index = 0

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X_train_val, y_train_val)

for train_index, val_index in skf.split(X_train_val, y_train_val):
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_train, y_val = y_train_val.iloc[train_index].to_numpy().flatten(), y_train_val.iloc[val_index].to_numpy().flatten()

    model_dict[model_index] = XGBClassifier(seed=0)
    model_dict[model_index].fit(X_train, y_train)

    y_predict = model_dict[model_index].predict_proba(X_val)
    y_predict = y_predict[:, 1]
    y_predict_class = list(map(round, y_predict))

    test_acc = accuracy_score(y_val, y_predict_class)
    test_rocauc = roc_auc_score(y_val, y_predict)
    test_bal_acc = balanced_accuracy_score(y_val, y_predict_class)
    test_precision = precision_score(y_val, y_predict_class)  # tp/(tp+fp)
    test_mcc = matthews_corrcoef(y_val, y_predict_class)
    tn, fp, fn, tp = confusion_matrix(y_val, y_predict_class).ravel()

    # Adding the metrics to their list
    cv_acc_list.append(test_acc)
    cv_ba_acc_list.append(test_bal_acc)
    cv_rocauc_list.append(test_rocauc)
    cv_precision_list.append(test_precision)
    cv_mcc_list.append(test_mcc)
    cv_specificity_list.append(tn / (tn + fp))
    cv_sensitivity_list.append(tp / (fn + tp))


In [13]:
print("------------------------------------------------------------------------------------------")
print("Stratified Cross-Validation Performance")
print("------------------------------------------------------------------------------------------")
print("Accuracy: %s \nAUCROC: %s \nMCC: %s \nSensitivity: %s \nSpecificity: %s \nBalanced Accuracy: %s" % (
        statistics.mean(cv_acc_list), statistics.mean(cv_rocauc_list), statistics.mean(cv_mcc_list),
        statistics.mean(cv_sensitivity_list), statistics.mean(cv_specificity_list), statistics.mean(cv_ba_acc_list)))

print("------------------------------------------------------------------------------------------")
print("Accuracy SD: %s \nAUCROC SD: %s \nMCC SD: %s \nSensitivity SD: %s \nSpecificity SD: %s \nBalanced Accuracy SD: %s" % (
        np.std(cv_acc_list), np.std(cv_rocauc_list), np.std(cv_mcc_list),
        np.std(cv_sensitivity_list), np.std(cv_specificity_list), np.std(cv_ba_acc_list)))

------------------------------------------------------------------------------------------
Stratified Cross-Validation Performance
------------------------------------------------------------------------------------------
Accuracy: 0.9788068385280753 
AUCROC: 0.9853281790486288 
MCC: 0.6954269179466457 
Sensitivity: 0.6764531654942614 
Specificity: 0.9904619846627417 
Balanced Accuracy: 0.8334575750785016
------------------------------------------------------------------------------------------
Accuracy SD: 0.0034830954740468906 
AUCROC SD: 0.0023925073885625965 
MCC SD: 0.04368363112454355 
Sensitivity SD: 0.03089937489620389 
Specificity SD: 0.003530345148726263 
Balanced Accuracy SD: 0.015419047530015247


In [14]:
# Next we will evaluate the feature importance of the XGBoost model.
def evaluate_feature_importance(df_evaluate_feature_importance, md):
    feature_importance_average = [np.mean(i) for i in zip(*[md.get(model).feature_importances_ for model in md])]
    df_feature_importance = pd.DataFrame()
    df_feature_importance["Property"] = df_evaluate_feature_importance.columns
    df_feature_importance["Average_importance"] = feature_importance_average
    print(df_feature_importance.sort_values(by="Average_importance", ascending=False).head(50))

evaluate_feature_importance(md=model_dict, df_evaluate_feature_importance=df_annot_test_X_1.copy())

              Property  Average_importance
2           prev_annot            0.333333
17              GT..AG            0.165053
9           opp_strand            0.118786
12         rel_int_sup            0.080619
16  intron_sources_RAC            0.048784
6         ss_antisense            0.043987
0                score            0.035042
5       repeat_overlap            0.034310
3    transcript_source            0.029294
15  intron_sources_CLS            0.024114
18              GC..AG            0.019744
13       rel_int_sup_k            0.017865
1               length            0.014713
11            bbiotype            0.012199
4       intron_sources            0.009443
8      incorrect_locus            0.009213
19              AT..AC            0.003502
10       false_ret_int            0.000000
14  intron_sources_SLR            0.000000
7          annot_match            0.000000
20              GT..GG            0.000000


As demonstrated above, the splice site sequence is one of the most important feature of the classification model

# 2. We then test the cross-validation performance of this dataset adding the % of A,C,G,T content

In [15]:
df_annot_test_y = df_annot_test["outcome"]

# Drop the columns that might not be available prior to manual gene annotation or irrelevant to ML
df_annot_test_X_2 = df_annot_test.drop(["outcome", "splice_site", "transcript_id", "gtype", "rej_reason"], axis=1)
df_annot_test_X_2

Unnamed: 0,coords,score,length,prev_annot,transcript_source,intron_sources,repeat_overlap,ss_antisense,annot_match,incorrect_locus,...,bbiotype,rel_int_sup,rel_int_sup_k,intron_sources_SLR,intron_sources_CLS,intron_sources_RAC,GT..AG,GC..AG,AT..AC,GT..GG
0,chr1:261635-267302:-1,696,3524,1,3,4,9,0,1,0,...,1,7185,7199,1,1,0,1,0,0,0
1,chr1:259026-261549:-1,558,2130,1,3,4,9,0,1,0,...,1,3479,3283,1,1,0,1,0,0,0
2,chr1:732208-739802:-1,0,4005,0,1,0,20,0,0,0,...,1,1629,1496,0,1,0,1,0,0,0
3,chr1:720201-732016:-1,0,4609,1,1,0,22,0,0,0,...,1,1629,1399,0,1,0,1,0,0,0
4,chr1:711923-720031:-1,26,4092,1,1,0,2,0,0,0,...,1,9330,8260,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11005,chrY:20582694-20584473:1,5548,1593,1,1,5,2,0,1,0,...,0,4092,3128,1,1,1,1,0,0,0
11006,chrY:20584525-20588023:1,5595,2703,1,1,5,2,0,1,0,...,0,4275,3207,1,1,1,1,0,0,0
11007,chrY:20588106-20589483:1,6348,1256,1,1,5,2,0,1,0,...,0,7867,6679,1,1,1,1,0,0,0
11008,chrY:20589576-20592340:1,6428,2279,1,1,5,2,0,1,0,...,0,8162,7043,1,1,1,1,0,0,0


In [16]:
# This is downloaded in https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/
genome = Fasta('./hg38.fa', sequence_always_upper=True)

# A simple lambda function for matching the chromosome, start and end
coords_to_dna = lambda start, end, chr: genome[chr][start-1:end]

In [17]:
seq_lst = []

for coords in tqdm(df_annot_test_X_2["coords"]):
    coords_split = coords.split(":")
    chromosome = coords_split[0]
    start = coords_split[1].split("-")[0]
    stop = coords_split[1].split("-")[1]
    strand = coords_split[2]


    # Send the API request and get the response
    seq = str(coords_to_dna(int(start), int(stop), chromosome))

    # If the response was successful (i.e. the status code is 200)

    if strand == "-1": #strand
        seq = Seq(seq)  # Encode the seqeunce into
        seq = seq.reverse_complement()


    seq_lst.append(str(seq))

df_annot_test_X_2["Sequence"] = seq_lst


100%|██████████| 11010/11010 [00:01<00:00, 7735.71it/s]


In [18]:
df_annot_test_X_3 = df_annot_test_X_2.copy()
df_annot_test_X_2.Sequence

0        GTGGGAACACAACATTAATCCAAGAGCAGATCCCTGATCCTATAAA...
1        GTAAGTCATTGTTTTACTCCAGATACAGACACTGTGGTTTTACAAT...
2        GTAAGTTCAGGTAGCTGGGACTGTAGGTATACATGACGATACTTGG...
3        GTGAGTAAGCATGGATTTTGGTATATGCAGAGATGGGGGGCTGGAA...
4        GTAAGAAGCAATAGTTTCTCTTACTATTCTGAGAGCCTTATCATTC...
                               ...                        
11005    GTAGGTGTGTAGGTTACTTTTCAATAAAAATTTGCCGCAAAAAATG...
11006    GTAAAAATAACATTTAAAGTTGTGGTATGTCTGTGTTTAAGCAGTT...
11007    GTAAGATCAAAATGATTTTATCTCCTCATTATTTGATATTAATGTT...
11008    GTAAGTAAATAAACCTAACAGGTATTTTGTTTTATCTATGCATTAT...
11009    GTGTGTGTCACCATGCCTTGCTCCCTCCCTTCCCCTCCCTTCCCCT...
Name: Sequence, Length: 11010, dtype: object

In [19]:
df_annot_test_X_2["A_content"] = (df_annot_test_X_2["Sequence"].str.count("A")) / len(df_annot_test_X_2["Sequence"])
df_annot_test_X_2["C_content"] = (df_annot_test_X_2["Sequence"].str.count("C")) / len(df_annot_test_X_2["Sequence"])
df_annot_test_X_2["G_content"] = (df_annot_test_X_2["Sequence"].str.count("G")) / len(df_annot_test_X_2["Sequence"])
df_annot_test_X_2["T_content"] = (df_annot_test_X_2["Sequence"].str.count("T")) / len(df_annot_test_X_2["Sequence"])


df_annot_test_X_2.G_content

0        0.107902
1        0.042688
2        0.147048
3        0.279201
4        0.130245
           ...   
11005    0.029064
11006    0.055858
11007    0.021163
11008    0.050045
11009    0.165486
Name: G_content, Length: 11010, dtype: float64

In [20]:
df_annot_test_X_2 = df_annot_test_X_2.drop(["coords", "Sequence"], axis=1)

df_annot_test_X_2

Unnamed: 0,score,length,prev_annot,transcript_source,intron_sources,repeat_overlap,ss_antisense,annot_match,incorrect_locus,opp_strand,...,intron_sources_CLS,intron_sources_RAC,GT..AG,GC..AG,AT..AC,GT..GG,A_content,C_content,G_content,T_content
0,696,3524,1,3,4,9,0,1,0,0,...,1,0,1,0,0,0,0.151135,0.098274,0.107902,0.157493
1,558,2130,1,3,4,9,0,1,0,0,...,1,0,1,0,0,0,0.065486,0.046412,0.042688,0.074659
2,0,4005,0,1,0,20,0,0,0,0,...,1,0,1,0,0,0,0.176839,0.156131,0.147048,0.209809
3,0,4609,1,1,0,22,0,0,0,0,...,1,0,1,0,0,0,0.216440,0.316440,0.279201,0.261126
4,26,4092,1,1,0,2,0,0,0,0,...,1,0,1,0,0,0,0.234423,0.120073,0.130245,0.251771
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11005,5548,1593,1,1,5,2,0,1,0,0,...,1,1,1,0,0,0,0.048411,0.027975,0.029064,0.056222
11006,5595,2703,1,1,5,2,0,1,0,0,...,1,1,1,0,0,0,0.088919,0.051680,0.055858,0.121344
11007,6348,1256,1,1,5,2,0,1,0,0,...,1,1,1,0,0,0,0.038783,0.019800,0.021163,0.045413
11008,6428,2279,1,1,5,2,0,1,0,0,...,1,1,1,0,0,0,0.079927,0.041780,0.050045,0.079382


In [21]:
# This part is the exact same (except using different train_df) as the code used before

# Split the dataset to 9:1 Train/Val and Test set
X_train_val, X_test, y_train_val, y_test = train_test_split(df_annot_test_X_2, df_annot_test_y, stratify=df_annot_test_y, test_size=0.1, shuffle=True)

cv_acc_list = []
cv_ba_acc_list = []
cv_rocauc_list = []
cv_precision_list = []
cv_mcc_list = []
cv_specificity_list = []
cv_sensitivity_list = []
model_dict = {}
model_index = 0

skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(X_train_val, y_train_val)

for train_index, val_index in skf.split(X_train_val, y_train_val):
    X_train, X_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_train, y_val = y_train_val.iloc[train_index].to_numpy().flatten(), y_train_val.iloc[val_index].to_numpy().flatten()

    model_dict[model_index] = XGBClassifier(seed=0)
    model_dict[model_index].fit(X_train, y_train)

    y_predict = model_dict[model_index].predict_proba(X_val)
    y_predict = y_predict[:, 1]
    y_predict_class = list(map(round, y_predict))

    test_acc = accuracy_score(y_val, y_predict_class)
    test_rocauc = roc_auc_score(y_val, y_predict)
    test_bal_acc = balanced_accuracy_score(y_val, y_predict_class)
    test_precision = precision_score(y_val, y_predict_class)  # tp/(tp+fp)
    test_mcc = matthews_corrcoef(y_val, y_predict_class)
    tn, fp, fn, tp = confusion_matrix(y_val, y_predict_class).ravel()

    # Adding the metrics to their list
    cv_acc_list.append(test_acc)
    cv_ba_acc_list.append(test_bal_acc)
    cv_rocauc_list.append(test_rocauc)
    cv_precision_list.append(test_precision)
    cv_mcc_list.append(test_mcc)
    cv_specificity_list.append(tn / (tn + fp))
    cv_sensitivity_list.append(tp / (fn + tp))


In [22]:
print("------------------------------------------------------------------------------------------")
print("Stratified Cross-Validation Performance")
print("------------------------------------------------------------------------------------------")
print("Accuracy: %s \nAUCROC: %s \nMCC: %s \nSensitivity: %s \nSpecificity: %s \nBalanced Accuracy: %s" % (
        statistics.mean(cv_acc_list), statistics.mean(cv_rocauc_list), statistics.mean(cv_mcc_list),
        statistics.mean(cv_sensitivity_list), statistics.mean(cv_specificity_list), statistics.mean(cv_ba_acc_list)))

print("------------------------------------------------------------------------------------------")
print("Accuracy SD: %s \nAUCROC SD: %s \nMCC SD: %s \nSensitivity SD: %s \nSpecificity SD: %s \nBalanced Accuracy SD: %s" % (
        np.std(cv_acc_list), np.std(cv_rocauc_list), np.std(cv_mcc_list),
        np.std(cv_sensitivity_list), np.std(cv_specificity_list), np.std(cv_ba_acc_list)))

------------------------------------------------------------------------------------------
Stratified Cross-Validation Performance
------------------------------------------------------------------------------------------
Accuracy: 0.9794127460114275 
AUCROC: 0.9861793416873176 
MCC: 0.6936564251291136 
Sensitivity: 0.6549796371714179 
Specificity: 0.9919297095409255 
Balanced Accuracy: 0.8234546733561717
------------------------------------------------------------------------------------------
Accuracy SD: 0.0012908643694495424 
AUCROC SD: 0.004219446474463532 
MCC SD: 0.030085841959887706 
Sensitivity SD: 0.06382722390444089 
Specificity SD: 0.0013495382790118756 
Balanced Accuracy SD: 0.031288343637778465


In [23]:
evaluate_feature_importance(md=model_dict, df_evaluate_feature_importance=df_annot_test_X_2.copy())

              Property  Average_importance
2           prev_annot            0.290045
17              GT..AG            0.217229
12         rel_int_sup            0.100396
9           opp_strand            0.059161
3    transcript_source            0.043975
6         ss_antisense            0.043145
16  intron_sources_RAC            0.036725
0                score            0.025843
5       repeat_overlap            0.025381
18              GC..AG            0.017314
13       rel_int_sup_k            0.016479
8      incorrect_locus            0.016199
11            bbiotype            0.015847
24           T_content            0.015350
1               length            0.014990
15  intron_sources_CLS            0.014969
22           C_content            0.014412
21           A_content            0.013081
23           G_content            0.010738
4       intron_sources            0.004777
19              AT..AC            0.003947
14  intron_sources_SLR            0.000000
7          

We can see that the introduction of this feature is not useful (the performance gone worse, although insignificant amount of decrease) for the performance for this prediction task, which is also indicative in feature importance analysis. <br/>
However, since we have already linked the coordinates to the original sequence, we can perform more analysis / feature engineering using the sequence.

# 3. Test the cross-validation performance of this dataset adding feature
# TODO: Generate features using repDNA, integrating it, and validating its performance

In [24]:
from repDNA.nac import RevcKmer
from repDNA.ac import DAC

In [25]:
ac = DAC(lag=6)

In [44]:
seq_lst = [x.replace("N", "") for x in list(df_annot_test_X_3.Sequence)]

In [33]:
#FIXME This has extremely long run time (>6 hs), Should find ways to parallel
pos_vec = ac.make_dac_vec(seq_lst, all_property=True)

In [45]:
column_names = ["pos_vec_" + str(i) for i in range(len(pos_vec[1]))]

# Create a separate dataframe that contains all info
df_pos_vec = pd.DataFrame(data=pos_vec, columns=column_names)
df_pos_vec.head(5)

Unnamed: 0,pos_vec_0,pos_vec_1,pos_vec_2,pos_vec_3,pos_vec_4,pos_vec_5,pos_vec_6,pos_vec_7,pos_vec_8,pos_vec_9,...,pos_vec_218,pos_vec_219,pos_vec_220,pos_vec_221,pos_vec_222,pos_vec_223,pos_vec_224,pos_vec_225,pos_vec_226,pos_vec_227
0,0.023,-0.197,-0.079,0.416,-0.116,0.279,0.27,0.106,0.024,0.351,...,0.045,0.02,0.009,0.072,0.023,0.032,0.04,0.036,0.04,0.015
1,0.036,-0.184,-0.088,0.46,-0.107,0.322,0.308,0.13,0.028,0.385,...,0.057,0.033,0.023,0.093,-0.025,-0.014,0.033,-0.003,-0.012,-0.017
2,0.001,-0.206,-0.112,0.429,-0.126,0.27,0.287,0.128,0.02,0.362,...,0.067,0.036,0.021,0.101,0.006,-0.006,0.062,0.003,0.022,0.01
3,-0.039,-0.18,-0.155,0.416,-0.13,0.278,0.314,0.234,0.038,0.381,...,0.075,0.055,0.044,0.108,0.011,0.011,0.067,0.013,0.03,0.03
4,0.05,-0.184,-0.138,0.441,-0.082,0.283,0.285,0.092,0.049,0.347,...,0.057,0.037,0.027,0.08,0.012,0.019,0.093,0.046,0.005,0.007


In [None]:
#TODO merge df_pos_vec to the main df (df with all the features)