In [1]:
import pandas as pd

In [2]:
# Load the accepted introns and label every row as a positive example (class = 1).
df_accepted = pd.read_csv(
    "data/1-rmv_dup_introns_gencode_v44.tsv",
    sep="\t",
).assign(**{"class": 1})
df_accepted

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
402449,chrY,57211569,57211760,+,1
402450,chrY,57213125,57213203,-,1
402451,chrY,57213357,57213525,-,1
402452,chrY,57213602,57213879,-,1


In [3]:
# Load the rejected introns and label every row as a negative example (class = 0).
df_rejected = pd.read_csv(
    "data/og_rejected_introns",
    sep="\t",
).assign(**{"class": 0})
df_rejected

Unnamed: 0,chr,start,end,strand,Duplicate_Count,class
0,chr1,732207,739802,-,1,0
1,chr1,6634784,6634990,-,1,0
2,chr1,9036720,9088663,+,1,0
3,chr1,9088686,9100902,+,1,0
4,chr1,11277018,11277484,-,1,0
...,...,...,...,...,...,...
398,chrX,151403679,151404937,+,1,0
399,chrX,151409210,151456968,+,1,0
400,chrX,153906577,153906694,-,1,0
401,chrX,153906409,153906520,-,1,0


In [4]:
# Stack the accepted (class 1) rows on top of the rejected (class 0) rows.
# Duplicate_Count exists only in the rejected frame, so it is NaN for the
# accepted rows. The original row labels of each input are kept as-is.
df_merged = pd.concat(objs=[df_accepted, df_rejected], axis=0)
df_merged

Unnamed: 0,chr,start,end,strand,class,Duplicate_Count
0,chr1,12227,12612,+,1,
1,chr1,12721,13220,+,1,
2,chr1,12057,12178,+,1,
3,chr1,12697,12974,+,1,
4,chr1,13052,13220,+,1,
...,...,...,...,...,...,...
398,chrX,151403679,151404937,+,0,1.0
399,chrX,151409210,151456968,+,0,1.0
400,chrX,153906577,153906694,-,0,1.0
401,chrX,153906409,153906520,-,0,1.0


In [5]:
# Remove the Duplicate_Count column, which only the rejected set carries and
# which is therefore NaN for every accepted row after the concat.
# errors="ignore" keeps this cell re-runnable: df_merged is rebound to its own
# transformation, so a second execution would otherwise raise KeyError because
# the column has already been dropped.
df_merged = df_merged.drop(columns="Duplicate_Count", errors="ignore")
df_merged

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
398,chrX,151403679,151404937,+,0
399,chrX,151409210,151456968,+,0
400,chrX,153906577,153906694,-,0
401,chrX,153906409,153906520,-,0


# Self-simulated false introns

The set of simulated false introns is generated using:
1. Simulate false reads using pbsim3
pbsim --strategy templ --method qshmm --qshmm /home/rabbit/Desktop/pbsim3-3.0.0/data/QSHMM-ONT.model --template /home/rabbit/Desktop/pbsim3-3.0.0/gencode.v43.pc_transcripts.fa --prefix pbsim_rna_long --difference-ratio 39:24:36

2. Read-alignment using Minimap2 (ONT setting)
minimap2 -ax splice -t 8 -uf -k14 /home/rabbit/Documents/Projects/ML_gene_annot/release-109-hg38/hg38.fa /home/rabbit/Documents/Projects/ML_gene_annot/pbsim3/pbsim_rna_long.fastq > aln.sam

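3. Sort and convert to BAM (NOTE: step 2 writes `aln.sam`, but the command below reads `alignment.sam` — confirm both refer to the same file)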
samtools sort /home/rabbit/Documents/Projects/ML_gene_annot/minimap2/alignment.sam > /home/rabbit/Documents/Projects/ML_gene_annot/minimap2/aln_sorted.bam

4. Indexing
samtools index aln_sorted.bam

5. Get introns.ipynb

In [6]:
# Read the simulated introns (headerless, six tab-separated columns), keep only
# the coordinate columns, label every row as a negative example (class = 0) and
# collapse rows that share the same chr/start/end/strand.
df_simulated_introns = (
    pd.read_csv(
        "introns.bed",
        sep="\t",
        names=["chr", "start", "end", "features", "score", "strand"],
    )
    .loc[:, ["chr", "start", "end", "strand"]]
    .assign(**{"class": 0})
    .drop_duplicates(subset=["chr", "start", "end", "strand"])
)
df_simulated_introns

Unnamed: 0,chr,start,end,strand,class
0,chr1,14829,14969,-,0
1,chr1,15038,15795,-,0
2,chr1,15947,16606,-,0
3,chr1,16765,16853,-,0
4,chr1,17055,17232,-,0
...,...,...,...,...,...
1016591,chrY,57209733,57209821,+,0
1016592,chrY,57209980,57210639,+,0
1016593,chrY,57210792,57211557,+,0
1016594,chrY,57211620,57211760,+,0


In [7]:
# Append the simulated introns to the accepted/rejected set, then de-duplicate
# on the coordinates. df_merged comes first in the concat, so keep="first"
# retains its row (and its class label) whenever a simulated intron has the
# same chr/start/end/strand.
merged_df = pd.concat(
    [df_merged, df_simulated_introns]
).drop_duplicates(
    subset=["chr", "start", "end", "strand"],
    keep="first",
)
merged_df

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
1016500,chrY,25986748,25987810,+,0
1016508,chrY,26335719,26337372,-,0
1016509,chrY,26337521,26354287,-,0
1016510,chrY,56954656,56960285,+,0


In [8]:
# Sanity check on one known coordinate: exactly one row with start == 15038
# survives the de-duplication, and that row carries the positive label.
assert (merged_df["start"] == 15038).sum() == 1
assert merged_df.loc[merged_df["start"] == 15038, "class"].iloc[0] == 1

Now we have 47,113 false introns (rows with `class == 0`) in the merged set, as the count below confirms.

In [9]:
# Count the negative examples (class == 0) with a vectorised reduction instead
# of the builtin sum(), which walks the Series element by element in Python.
# int(...) keeps the displayed result a plain integer.
int((merged_df["class"] == 0).sum())

47113

In [10]:
# Write the labelled training set as a tab-separated file; the row index left
# over from the concatenations is not written.
merged_df.to_csv(
    path_or_buf="data/2-merged_train_set",
    sep="\t",
    index=False,
)