In [1]:
import pandas as pd

In [2]:
# Load the accepted intron table (tab-separated) and tag every row with
# class 1 in a single expression; the trailing bare name renders the frame.
df_accepted = pd.read_csv(
    "data/1-rmv_dup_introns_gencode_v44.tsv",
    sep="\t",
).assign(**{"class": 1})
df_accepted

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
402449,chrY,57211569,57211760,+,1
402450,chrY,57213125,57213203,-,1
402451,chrY,57213357,57213525,-,1
402452,chrY,57213602,57213879,-,1


In [3]:
# Load the rejected intron table (tab-separated) and tag every row with
# class 0; the trailing bare name renders the frame.
df_rejected = pd.read_csv(
    "data/og_rejected_introns",
    sep="\t",
).assign(**{"class": 0})
df_rejected

Unnamed: 0,chr,start,end,strand,Duplicate_Count,class
0,chr1,732207,739802,-,1,0
1,chr1,6634784,6634990,-,1,0
2,chr1,9036720,9088663,+,1,0
3,chr1,9088686,9100902,+,1,0
4,chr1,11277018,11277484,-,1,0
...,...,...,...,...,...,...
398,chrX,151403679,151404937,+,1,0
399,chrX,151409210,151456968,+,1,0
400,chrX,153906577,153906694,-,1,0
401,chrX,153906409,153906520,-,1,0


In [4]:
# Stack accepted rows on top of rejected rows, then discard the
# "Duplicate_Count" column so the combined frame keeps only the columns
# shared with df_accepted. Row order matters later: accepted rows come first.
df_merged = pd.concat([df_accepted, df_rejected]).drop(columns=["Duplicate_Count"])
df_merged

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
398,chrX,151403679,151404937,+,0
399,chrX,151409210,151456968,+,0
400,chrX,153906577,153906694,-,0
401,chrX,153906409,153906520,-,0


# Self-simulated false introns

The set of simulated false introns is generated using:
1. Simulate false reads using pbsim3
a. ONT single pass `pbsim --strategy templ --method qshmm --qshmm ~/Desktop/pbsim3-3.0.0/data/QSHMM-ONT.model --template ~/Desktop/pbsim3-3.0.0/gencode.v43.pc_transcripts.fa --prefix pbsim_rna_long --difference-ratio 39:24:36`
b. ONT multi-pass `pbsim --strategy templ --method qshmm --qshmm ~/Desktop/pbsim3-3.0.0/data/QSHMM-ONT.model --template ~/Desktop/pbsim3-3.0.0/gencode.v43.pc_transcripts.fa --prefix pbsim_rna_long_mp_3 --difference-ratio 39:24:36 --pass-num 3`
c. PacBio - Iso-seq `pbsim --strategy templ --method qshmm --qshmm ~/Desktop/pbsim3-3.0.0/data/QSHMM-RSII.model --template ~/Desktop/pbsim3-3.0.0/gencode.v43.pc_transcripts.fa --prefix pbsim_rna_long_mp_3 --difference-ratio 22:45:33 --accuracy-mean 99.9 --pass-num 3`

2. Read-alignment using Minimap2
a. `minimap2 -ax splice -t 8 -uf -k14 ~/Documents/Projects/ML_gene_annot/release-109-hg38/hg38.fa ~/Documents/Projects/ML_gene_annot/pbsim3/pbsim_rna_long.fastq > aln.sam`
b. `samtools fastq ~/Documents/Projects/ML_gene_annot/pbsim3/ONT_real/pbsim_rna_long_mp_3.sam > simulated_ont_3pass.fq`  then  `minimap2 -ax splice -t 8 -u f -k 14 ~/Documents/Projects/ML_gene_annot/release-109-hg38/hg38.fa ~/Documents/Projects/ML_gene_annot/minimap2/ONT_real/simulated_ont_3pass.fq > aln2.sam`
c. `samtools fastq ~/Documents/Projects/ML_gene_annot/pbsim3/PacBio/pbsim_pacbio_rna_long_mp_3.sam > simulated_pacbio_3pass.fq` then `minimap2 -ax splice:hq -uf ~/Documents/Projects/ML_gene_annot/release-109-hg38/hg38.fa ~/Documents/Projects/ML_gene_annot/minimap2/PacBio/simulated_pacbio_3pass.fq > aln_pacbio.sam`

3. Sort and convert to BAM
`samtools sort ~/Documents/Projects/ML_gene_annot/minimap2/alignment.sam > aln_sorted.bam`

4. Index the sorted BAM
`samtools index aln_sorted.bam`

5. Get introns.ipynb

In [7]:
# Read the headerless 6-column BED file, keep only the coordinate columns,
# label every row as class 0, and collapse repeated coordinates.
# NOTE(review): this frame is not passed to the pd.concat call that builds
# merged_df later in the notebook — confirm whether that is intentional.
df_simulated_introns1 = (
    pd.read_csv(
        "./data/false_introns_simulated/introns.bed",
        sep="\t",
        names=["chr", "start", "end", "features", "score", "strand"],
    )
    .loc[:, ["chr", "start", "end", "strand"]]
    .assign(**{"class": 0})
    .drop_duplicates(subset=["chr", "start", "end", "strand"])
)
df_simulated_introns1

Unnamed: 0,chr,start,end,strand,class
0,chr1,14829,14969,-,0
1,chr1,15038,15795,-,0
2,chr1,15947,16606,-,0
3,chr1,16765,16853,-,0
4,chr1,17055,17232,-,0
...,...,...,...,...,...
1016591,chrY,57209733,57209821,+,0
1016592,chrY,57209980,57210639,+,0
1016593,chrY,57210792,57211557,+,0
1016594,chrY,57211620,57211760,+,0


In [8]:
# Read the headerless 6-column BED file, keep only the coordinate columns,
# label every row as class 0, and collapse repeated coordinates.
df_simulated_introns2 = (
    pd.read_csv(
        "./data/false_introns_simulated/introns2.bed",
        sep="\t",
        names=["chr", "start", "end", "features", "score", "strand"],
    )
    .loc[:, ["chr", "start", "end", "strand"]]
    .assign(**{"class": 0})
    .drop_duplicates(subset=["chr", "start", "end", "strand"])
)
df_simulated_introns2

Unnamed: 0,chr,start,end,strand,class
0,chr1,14829,14969,-,0
1,chr1,15038,15795,-,0
2,chr1,15947,16606,-,0
3,chr1,16765,16853,-,0
4,chr1,17055,17232,-,0
...,...,...,...,...,...
3050930,chrY,57209733,57209821,+,0
3050931,chrY,57209980,57210639,+,0
3050932,chrY,57210792,57211551,+,0
3050933,chrY,57211620,57211760,+,0


In [9]:
# Read the headerless 6-column BED file, keep only the coordinate columns,
# label every row as class 0, and collapse repeated coordinates.
df_simulated_introns3 = (
    pd.read_csv(
        "./data/false_introns_simulated/introns_pacbio_1.bed",
        sep="\t",
        names=["chr", "start", "end", "features", "score", "strand"],
    )
    .loc[:, ["chr", "start", "end", "strand"]]
    .assign(**{"class": 0})
    .drop_duplicates(subset=["chr", "start", "end", "strand"])
)
df_simulated_introns3

Unnamed: 0,chr,start,end,strand,class
0,chr1,14829,14969,-,0
1,chr1,15038,15795,-,0
2,chr1,15947,16606,-,0
3,chr1,16765,16853,-,0
4,chr1,17055,17232,-,0
...,...,...,...,...,...
3160469,chrY,57209354,57209531,+,0
3160470,chrY,57209733,57209821,+,0
3160471,chrY,57209980,57210639,+,0
3160472,chrY,57210792,57211551,+,0


In [10]:
# Read the headerless 6-column BED file, keep only the coordinate columns,
# label every row as class 0, and collapse repeated coordinates.
df_simulated_introns4 = (
    pd.read_csv(
        "./data/false_introns_simulated/introns_pacbio_2.bed",
        sep="\t",
        names=["chr", "start", "end", "features", "score", "strand"],
    )
    .loc[:, ["chr", "start", "end", "strand"]]
    .assign(**{"class": 0})
    .drop_duplicates(subset=["chr", "start", "end", "strand"])
)
df_simulated_introns4

Unnamed: 0,chr,start,end,strand,class
0,chr1,14829,14969,-,0
1,chr1,15038,15795,-,0
2,chr1,15947,16606,-,0
3,chr1,16765,16853,-,0
4,chr1,17055,17232,-,0
...,...,...,...,...,...
3138466,chrY,57209354,57209531,+,0
3138467,chrY,57209733,57209821,+,0
3138468,chrY,57209980,57210639,+,0
3138469,chrY,57210792,57211551,+,0


In [11]:
# Stack df_merged on top of the simulated intron sets, then keep only the
# first occurrence of every (chr, start, end, strand) key. Because df_merged
# is listed first and keep="first" is used, a coordinate that also appears in
# a simulated set keeps the row (and class label) coming from df_merged.
# NOTE(review): df_simulated_introns1 is loaded earlier in the notebook but is
# not part of this concat — confirm the omission is intentional.
merged_df = pd.concat(
    [df_merged, df_simulated_introns2, df_simulated_introns3, df_simulated_introns4]
).drop_duplicates(subset=["chr", "start", "end", "strand"], keep="first")

# Number of class-0 rows after de-duplication. Series.sum() is vectorised,
# whereas the built-in sum() walks the Series one element at a time in Python;
# int() keeps the displayed value a plain Python integer as before.
int((merged_df["class"] == 0).sum())

125464

In [12]:
# Sanity checks on the de-duplicated training table.

# Invariant: every (chr, start, end, strand) key appears exactly once.
assert not merged_df.duplicated(
    subset=["chr", "start", "end", "strand"]
).any(), "merged_df still contains duplicated intron coordinates"

# Spot check: chr1:15038-15795 (-) is listed in the simulated sets with
# class 0. Exactly one row with that key must remain and it must carry
# class 1, i.e. the row from df_merged won over the simulated duplicate.
# The full coordinate key is matched (not just `start`) so that an unrelated
# intron on another chromosome sharing the same start cannot affect the check.
spot_check = merged_df[
    (merged_df["chr"] == "chr1")
    & (merged_df["start"] == 15038)
    & (merged_df["end"] == 15795)
    & (merged_df["strand"] == "-")
]
assert len(spot_check) == 1, "expected exactly one row for chr1:15038-15795(-)"
assert spot_check.iloc[0]["class"] == 1, "chr1:15038-15795(-) should keep class 1"

 Now we have 125,464 false introns!

In [13]:
# Write the merged table as a tab-separated file without the index column
# (the index is not unique after the concat), then render the frame.
merged_df.to_csv(
    path_or_buf="data/2.1-merged_train_set.tsv",
    sep="\t",
    index=False,
)
merged_df

Unnamed: 0,chr,start,end,strand,class
0,chr1,12227,12612,+,1
1,chr1,12721,13220,+,1
2,chr1,12057,12178,+,1
3,chr1,12697,12974,+,1
4,chr1,13052,13220,+,1
...,...,...,...,...,...
3137656,chrY,24883840,24886132,+,0
3137841,chrY,24888605,24889352,+,0
3137842,chrY,24889386,24901111,+,0
3137884,chrY,24833970,24840730,+,0
