In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, precision_score, matthews_corrcoef, confusion_matrix
from xgboost import XGBClassifier

In [6]:
# Load the featurized intron table from a CSV file located in the working directory.
# The columns shown in the cell output below include chrom, intron_start,
# intron_ends and strand, which later cells use to match introns.
df_featurized = pd.read_csv("Featurized_gencode_introns")
# Bare expression: rendered as this cell's output.
df_featurized

Unnamed: 0,chrom,intron_start,intron_ends,strand,Duplicate_Count,Repeat_overlaps,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score
0,chr1,12227,12612,+,2,[],1122:3199,583:1460,9374:54492,GT:AG,59151
1,chr1,12721,13220,+,1,[],1791:3198,783:1104,14048:56719,GT:AG,61021
2,chr1,12057,12178,+,1,[],,,,,-1
3,chr1,12697,12974,+,1,[],1:1,1:1,15:19,GT:AG,21
4,chr1,13052,13220,+,1,[],22:24,17:18,433:484,GC:AG,526
...,...,...,...,...,...,...,...,...,...,...,...
517017,chrY,15955,57212183,-,1,"['Centromere', 'Dust', 'Low complexity regions...",,,,,-1
517018,chrY,57213125,57213203,-,1,[],,,,,-1
517019,chrY,57213357,57213525,-,1,[],,,,,-1
517020,chrY,57213602,57213879,-,1,[],,,,,-1


In [7]:
# Load the tab-separated test annotation table. Each row carries a `coords`
# string of the form "chrom:start-stop:strand" and an `outcome` label
# (e.g. "accepted" / "rejected"), as visible in the cell output below.
df_original_test = pd.read_csv("ftp-data/gene_annot_test.tsv", sep="\t")
# Bare expression: rendered as this cell's output.
df_original_test

Unnamed: 0,coords,outcome,score,length,prev_annot,transcript_source,intron_sources,splice_site,repeat_overlap,ss_antisense,rej_reason,annot_match,incorrect_locus,opp_strand,false_ret_int,transcript_id,gtype,bbiotype,rel_int_sup,rel_int_sup_k
0,chr1:261635-267302:-1,accepted,888,5668,yes,SLRseq,"SLR,CLS",GT..AG,Type I Transposons/LINE,no,,yes,no,no,no,OTTHUMT00000499557,transcribed_processed_pseudogene,non-coding,0.311999,0.311999
1,chr1:259026-261549:-1,accepted,650,2524,yes,SLRseq,"SLR,CLS",GT..AG,Type I Transposons/LINE,no,,yes,no,no,no,OTTHUMT00000499557,transcribed_processed_pseudogene,non-coding,-0.311999,-0.311999
2,chr1:732208-739802:-1,rejected,0,7595,no,PacBio Capture-seq,CLS,GT..AG,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,-2.525729,-2.708050
3,chr1:720201-732016:-1,accepted,0,11816,yes,PacBio Capture-seq,CLS,GT..AG,Type II Transposons,no,,no,no,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,-2.525729,-2.931194
4,chr1:711923-720031:-1,accepted,27,8109,yes,PacBio Capture-seq,CLS,GT..AG,No overlap,no,,no,no,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,1.216395,0.810930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11005,chrY:20582694-20584473:1,accepted,278936,1780,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,seq quality,yes,no,no,no,OTTHUMT00000500440,protein_coding,coding,-0.141738,-0.364851
11006,chrY:20584525-20588023:1,accepted,286043,3499,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,seq quality,yes,no,no,no,OTTHUMT00000500440,protein_coding,coding,-0.112146,-0.335259
11007,chrY:20588106-20589483:1,accepted,444721,1378,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,seq quality,yes,no,no,no,OTTHUMT00000500440,protein_coding,coding,0.433606,0.210496
11008,chrY:20589576-20592340:1,accepted,468983,2765,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,seq quality,yes,no,no,no,OTTHUMT00000500440,protein_coding,coding,0.503702,0.280593


In [10]:
# Keep only the rows of the test table whose outcome label is "rejected".
df_original_test_rejected = df_original_test.loc[df_original_test["outcome"] == "rejected"]

In [None]:
def parse_intron_coords(coords):
    """Split a ``chrom:start-stop:strand`` coordinate string into its parts.

    Parameters
    ----------
    coords : str
        Coordinate string as stored in the ``coords`` column,
        e.g. ``"chr1:732208-739802:-1"``.

    Returns
    -------
    tuple
        ``(chromosome, start, stop, strand)``; ``start`` and ``stop`` are ints,
        ``strand`` is ``"+"`` when the trailing field is ``"1"`` and ``"-"``
        for any other value.

    Raises
    ------
    IndexError
        If the string has fewer than three ``:``-separated fields or the
        middle field has no ``-``.
    ValueError
        If start or stop is not an integer literal.
    """
    fields = coords.split(":")
    span = fields[1].split("-")
    # split() yields strings, so the strand flag must be compared with the
    # string "1"; comparing with the integer 1 is always False and would
    # label every intron as minus-strand.
    strand = "+" if fields[2] == "1" else "-"
    return fields[0], int(span[0]), int(span[1]), strand


drop_idx_lst = []
# Not used in this cell; kept defined in case a later cell references it — TODO confirm.
append_dict = {}

# Build the lookup keys once instead of re-filtering the whole featurized
# table for every rejected intron.
featurized_keys = set(
    zip(
        df_featurized.chrom.tolist(),
        df_featurized.intron_start.tolist(),
        df_featurized.intron_ends.tolist(),
        df_featurized.strand.tolist(),
    )
)

for index, coords in df_original_test_rejected.coords.items():
    chromosome, start, stop, strand = parse_intron_coords(coords)
    # The featurized start is matched against start - 1; presumably that table
    # uses 0-based starts while `coords` is 1-based — TODO confirm.
    if (chromosome, start - 1, stop, strand) in featurized_keys:
        drop_idx_lst.append(index)

In [None]:
# TODO: record which rows are dropped so they can be investigated further.

In [47]:
# Drop the rejected introns that were found in the featurized intron table
# (their index labels were collected in drop_idx_lst).
# errors="ignore" keeps this cell re-runnable: the result is assigned back to
# the same name, so on a second run the labels are already gone and a plain
# drop() would raise KeyError.
df_original_test_rejected = df_original_test_rejected.drop(index=drop_idx_lst, errors="ignore")
df_original_test_rejected

Unnamed: 0,coords,outcome,score,length,prev_annot,transcript_source,intron_sources,splice_site,repeat_overlap,ss_antisense,rej_reason,annot_match,incorrect_locus,opp_strand,false_ret_int,transcript_id,gtype,bbiotype,rel_int_sup,rel_int_sup_k
2,chr1:732208-739802:-1,rejected,0,7595,no,PacBio Capture-seq,CLS,GT..AG,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,-2.525729,-2.708050
58,chr1:6634785-6634990:-1,rejected,0,206,no,SLRseq,SLR,GT..AG,No overlap,no,,no,no,no,no,OTTHUMT00000499635,protein_coding,coding,-13.164612,-13.164612
71,chr1:9036721-9088663:1,rejected,0,51943,no,SLRseq,SLR,GT..AG,Type I Transposons/SINE,yes,,no,no,no,no,OTTHUMT00000498962,comp_pipe,novel,,
72,chr1:9088687-9100902:1,rejected,0,12216,no,SLRseq,SLR,GC..AG,Type I Transposons/SINE,yes,,no,no,no,no,OTTHUMT00000498962,comp_pipe,novel,,
93,chr1:11277019-11277484:-1,rejected,0,466,no,PacBio RACEseq,RAC,GC..AG,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000500638,processed_pseudogene,non-coding,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10972,chrX:151403680-151404937:1,rejected,0,1258,no,SLRseq,SLR,GC..AG,No overlap,no,repeat,no,no,no,no,OTTHUMT00000498842,protein_coding,coding,-11.391108,-12.084255
10973,chrX:151409211-151456968:1,rejected,0,47758,no,SLRseq,SLR,GC..AG,Type I Transposons/SINE,no,repeat,no,no,no,no,OTTHUMT00000498842,protein_coding,coding,-11.391108,-12.084255
10977,chrX:153906578-153906694:-1,rejected,0,117,no,PacBio RACEseq,RAC,GC..AG,No overlap,yes,"repeat, opposite strand",no,no,no,no,OTTHUMT00000500681,comp_pipe,novel,,
10978,chrX:153906410-153906520:-1,rejected,0,111,no,PacBio RACEseq,RAC,GT..AG,No overlap,yes,"repeat, opposite strand",no,no,no,no,OTTHUMT00000500681,comp_pipe,novel,,


In [48]:
# Display the rejected-intron table again (bare expression rendered as the cell output).
df_original_test_rejected

Unnamed: 0,coords,outcome,score,length,prev_annot,transcript_source,intron_sources,splice_site,repeat_overlap,ss_antisense,rej_reason,annot_match,incorrect_locus,opp_strand,false_ret_int,transcript_id,gtype,bbiotype,rel_int_sup,rel_int_sup_k
2,chr1:732208-739802:-1,rejected,0,7595,no,PacBio Capture-seq,CLS,GT..AG,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,-2.525729,-2.708050
58,chr1:6634785-6634990:-1,rejected,0,206,no,SLRseq,SLR,GT..AG,No overlap,no,,no,no,no,no,OTTHUMT00000499635,protein_coding,coding,-13.164612,-13.164612
71,chr1:9036721-9088663:1,rejected,0,51943,no,SLRseq,SLR,GT..AG,Type I Transposons/SINE,yes,,no,no,no,no,OTTHUMT00000498962,comp_pipe,novel,,
72,chr1:9088687-9100902:1,rejected,0,12216,no,SLRseq,SLR,GC..AG,Type I Transposons/SINE,yes,,no,no,no,no,OTTHUMT00000498962,comp_pipe,novel,,
93,chr1:11277019-11277484:-1,rejected,0,466,no,PacBio RACEseq,RAC,GC..AG,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000500638,processed_pseudogene,non-coding,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10972,chrX:151403680-151404937:1,rejected,0,1258,no,SLRseq,SLR,GC..AG,No overlap,no,repeat,no,no,no,no,OTTHUMT00000498842,protein_coding,coding,-11.391108,-12.084255
10973,chrX:151409211-151456968:1,rejected,0,47758,no,SLRseq,SLR,GC..AG,Type I Transposons/SINE,no,repeat,no,no,no,no,OTTHUMT00000498842,protein_coding,coding,-11.391108,-12.084255
10977,chrX:153906578-153906694:-1,rejected,0,117,no,PacBio RACEseq,RAC,GC..AG,No overlap,yes,"repeat, opposite strand",no,no,no,no,OTTHUMT00000500681,comp_pipe,novel,,
10978,chrX:153906410-153906520:-1,rejected,0,111,no,PacBio RACEseq,RAC,GT..AG,No overlap,yes,"repeat, opposite strand",no,no,no,no,OTTHUMT00000500681,comp_pipe,novel,,


In [49]:
# Display the featurized intron table again (bare expression rendered as the cell output).
df_featurized

Unnamed: 0,chrom,intron_start,intron_ends,strand,Duplicate_Count,Repeat_overlaps,GTExv2,TCGAv2,SRAv3h,RC3-Splice_site,RC3-Score
0,chr1,12227,12612,+,2,[],1122:3199,583:1460,9374:54492,GT:AG,59151
1,chr1,12721,13220,+,1,[],1791:3198,783:1104,14048:56719,GT:AG,61021
2,chr1,12057,12178,+,1,[],,,,,-1
3,chr1,12697,12974,+,1,[],1:1,1:1,15:19,GT:AG,21
4,chr1,13052,13220,+,1,[],22:24,17:18,433:484,GC:AG,526
...,...,...,...,...,...,...,...,...,...,...,...
517017,chrY,15955,57212183,-,1,"['Centromere', 'Dust', 'Low complexity regions...",,,,,-1
517018,chrY,57213125,57213203,-,1,[],,,,,-1
517019,chrY,57213357,57213525,-,1,[],,,,,-1
517020,chrY,57213602,57213879,-,1,[],,,,,-1
