In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, balanced_accuracy_score, precision_score, matthews_corrcoef, confusion_matrix
from xgboost import XGBClassifier

In [2]:
# Load the GENCODE v44 intron table (tab-separated; judging by the file name,
# duplicate introns were already removed upstream) and preview it.
df_featurized_gencode = pd.read_csv(
	"data/1-rmv_dup_introns_gencode_v44.tsv",
	delimiter="\t",
)
df_featurized_gencode

Unnamed: 0,chr,start,end,strand,Duplicate_Count
0,chr1,12227,12612,+,2
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
517017,chrY,15955,57212183,-,1
517018,chrY,57213125,57213203,-,1
517019,chrY,57213357,57213525,-,1
517020,chrY,57213602,57213879,-,1


In [3]:
# Load the original test-set TSV that was uploaded to the GSoC ML project repo;
# the rejected introns are extracted from this frame in the cells below.
df_original_test = pd.read_csv(
	"ftp-data/gene_annot_test.tsv",
	delimiter="\t",
)
df_original_test

Unnamed: 0,coords,outcome,score,length,prev_annot,transcript_source,intron_sources,splice_site,repeat_overlap,ss_antisense,rej_reason,annot_match,incorrect_locus,opp_strand,false_ret_int,transcript_id,gtype,bbiotype,rel_int_sup,rel_int_sup_k
0,chr1:261635-267302:-1,accepted,888,5668,yes,SLRseq,"SLR,CLS",GT..AG,Type I Transposons/LINE,no,,yes,no,no,no,OTTHUMT00000499557,transcribed_processed_pseudogene,non-coding,0.311999,0.311999
1,chr1:259026-261549:-1,accepted,650,2524,yes,SLRseq,"SLR,CLS",GT..AG,Type I Transposons/LINE,no,,yes,no,no,no,OTTHUMT00000499557,transcribed_processed_pseudogene,non-coding,-0.311999,-0.311999
2,chr1:732208-739802:-1,rejected,0,7595,no,PacBio Capture-seq,CLS,GT..AG,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,-2.525729,-2.708050
3,chr1:720201-732016:-1,accepted,0,11816,yes,PacBio Capture-seq,CLS,GT..AG,Type II Transposons,no,,no,no,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,-2.525729,-2.931194
4,chr1:711923-720031:-1,accepted,27,8109,yes,PacBio Capture-seq,CLS,GT..AG,No overlap,no,,no,no,no,no,OTTHUMT00000500170,transcribed_processed_pseudogene,non-coding,1.216395,0.810930
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11005,chrY:20582694-20584473:1,accepted,278936,1780,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,seq quality,yes,no,no,no,OTTHUMT00000500440,protein_coding,coding,-0.141738,-0.364851
11006,chrY:20584525-20588023:1,accepted,286043,3499,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,seq quality,yes,no,no,no,OTTHUMT00000500440,protein_coding,coding,-0.112146,-0.335259
11007,chrY:20588106-20589483:1,accepted,444721,1378,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,seq quality,yes,no,no,no,OTTHUMT00000500440,protein_coding,coding,0.433606,0.210496
11008,chrY:20589576-20592340:1,accepted,468983,2765,yes,PacBio Capture-seq,"SLR,CLS,RAC",GT..AG,No overlap,no,seq quality,yes,no,no,no,OTTHUMT00000500440,protein_coding,coding,0.503702,0.280593


In [4]:
# Keep only the rows whose `outcome` column equals "rejected".
df_original_test_rejected = df_original_test.loc[df_original_test["outcome"].eq("rejected")]

In [5]:
# Partition the rejected introns by whether their (chr, start, end, strand)
# key already exists in df_featurized_gencode.
drop_idx_lst = []  # index labels of rejected introns that ARE present in df_featurized_gencode
append_lst = []  # records for rejected introns that are NOT present in df_featurized_gencode

# Build the lookup keys once: a set-membership test replaces a full boolean
# scan of the (large) reference frame for every single rejected row.
gencode_intron_keys = set(
	df_featurized_gencode[["chr", "start", "end", "strand"]].itertuples(index=False, name=None)
)

for index, row in df_original_test_rejected.iterrows():
	# coords are formatted as "<chr>:<start>-<end>:<strand flag>",
	# e.g. "chr1:732208-739802:-1".
	coords_split = row.coords.split(":")
	chromosome = coords_split[0]
	bounds = coords_split[1].split("-")
	# The start is shifted down by one before matching; presumably this converts
	# a 1-based start to the start convention of the reference table -- TODO confirm.
	start = int(bounds[0]) - 1
	end = int(bounds[1])
	# A strand flag of "1" maps to "+"; any other value is treated as "-".
	strand = "+" if coords_split[2] == "1" else "-"

	if (chromosome, start, end, strand) in gencode_intron_keys:
		# Already present in the reference table: remember its index label.
		drop_idx_lst.append(index)
	else:
		# Absent from the reference table: record it in the reference table's layout.
		append_lst.append(
			{
				"chr": chromosome,
				"start": start,
				"end": end,
				"strand": strand,
				"Duplicate_Count": 1,
			}
		)


## These are the introns that were rejected in the original test set but are present (i.e. accepted) in the latest annotation (GENCODE v44)

In [6]:
# Show the rejected rows whose index labels were collected in drop_idx_lst.
df_original_test_rejected.loc[drop_idx_lst, :]

Unnamed: 0,coords,outcome,score,length,prev_annot,transcript_source,intron_sources,splice_site,repeat_overlap,ss_antisense,rej_reason,annot_match,incorrect_locus,opp_strand,false_ret_int,transcript_id,gtype,bbiotype,rel_int_sup,rel_int_sup_k
882,chr10:72275560-72275899:1,rejected,82,340,no,SLRseq,SLR,GC..AG,No overlap,yes,,no,no,no,no,OTTHUMT00000499204,protein_coding,coding,-10.298652,-10.298652
1285,chr11:26567140-26569473:-1,rejected,76,2334,no,PacBio Capture-seq,CLS,GT..AG,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000499991,protein_coding,coding,-5.898009,-5.898009
2623,chr15:61861626-61890833:1,rejected,0,29208,no,SLRseq,SLR,GT..AG,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000499064,comp_pipe,novel,,
2624,chr15:61890863-61894121:1,rejected,0,3259,no,SLRseq,SLR,AT..AC,Type I Transposons/SINE,no,,no,no,no,no,OTTHUMT00000499064,comp_pipe,novel,,
5178,chr7:129925550-129937707:-1,rejected,0,12158,no,SLRseq,SLR,GT..AG,LTRs,no,,no,no,no,no,OTTHUMT00000499005,protein_coding,coding,-12.778861,-13.115329
8513,chr5:180835003-180835421:1,rejected,350,419,no,PacBio Capture-seq,CLS,GC..AG,LTRs,no,repeat,no,no,no,no,OTTHUMT00000500103,processed_transcript,non-coding,-2.77944,-2.77944


## We build a new DataFrame from the "still rejected" introns and write it out as a tab-separated (.tsv) file

In [10]:
# Assemble the collected records into a frame and write it out tab-separated.
# The column list is passed explicitly so that the frame (and the header line of
# the written file) keeps the same schema even when append_lst is empty.
df_output = pd.DataFrame(
	append_lst,
	columns=["chr", "start", "end", "strand", "Duplicate_Count"],
)
# index=False: only the data columns are written, not the row index.
df_output.to_csv("data/og_rejected_introns", sep="\t", index=False)

In [11]:
# Display the frame built from append_lst (the records that were written to disk).
df_output

Unnamed: 0,chr,start,end,strand,Duplicate_Count
0,chr1,732208,739802,-,1
1,chr1,6634785,6634990,-,1
2,chr1,9036721,9088663,+,1
3,chr1,9088687,9100902,+,1
4,chr1,11277019,11277484,-,1
...,...,...,...,...,...
398,chrX,151403680,151404937,+,1
399,chrX,151409211,151456968,+,1
400,chrX,153906578,153906694,-,1
401,chrX,153906410,153906520,-,1
