# Abstract / Conclusion

|                      | Accuracy  |
|----------------------|-----------|
| ION (Standard mode)  | **0.913** |
| ION (Strict   mode)  | 0.149     |
| SPLAM<sup>1</sup>    | 0.762     |

Overall, ION (Standard mode) performs best in every category except the False Positive Rate (FPR). The Strict mode of ION achieves the best FPR, but at the cost of a significant drop in the model's overall predictive performance.

In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
from pyfaidx import Fasta
from Bio.Seq import Seq

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Load the tab-separated intron table (columns: chr, start, end, strand)
df = pd.read_table("benchmarking/gencode_v46_new_introns.tsv")
df

Unnamed: 0,chr,start,end,strand
0,chr1,23367168,23369365,-
1,chr1,23441845,23442185,-
2,chr1,23452730,23455880,-
3,chr1,23441217,23441386,-
4,chr1,23456194,23484471,-
...,...,...,...,...
2269,chrY,57211569,57211760,+
2270,chrY,57213125,57213203,-
2271,chrY,57213357,57213525,-
2272,chrY,57213602,57213879,-


In [4]:
# Add a constant "u1" column (0) and a constant "class" label (1), and move
# "strand" to the last position. "strand" is popped from the frame while the
# keyword arguments are being built, so assign() re-appends it after the two
# new columns.
df = df.assign(**{"u1": 0, "class": 1, "strand": df.pop("strand")})
df

Unnamed: 0,chr,start,end,u1,class,strand
0,chr1,23367168,23369365,0,1,-
1,chr1,23441845,23442185,0,1,-
2,chr1,23452730,23455880,0,1,-
3,chr1,23441217,23441386,0,1,-
4,chr1,23456194,23484471,0,1,-
...,...,...,...,...,...,...
2269,chrY,57211569,57211760,0,1,+
2270,chrY,57213125,57213203,0,1,-
2271,chrY,57213357,57213525,0,1,-
2272,chrY,57213602,57213879,0,1,-


In [5]:
# Reference genome used to turn interval coordinates into sequences.
# Downloaded from https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/
genome = Fasta('data/human_ref_hg38_109/GRCh38.primary_assembly.genome.fa', sequence_always_upper=True)


def coords_to_dna(start_c, end_c, chr_c):
    """Return the slice ``genome[chr_c][start_c - 1:end_c]``."""
    return genome[chr_c][start_c - 1:end_c]


def find_sequence_maxentscan(row):
    """Return the strand-oriented sequence for one interval row.

    Reads ``chr``, ``start``, ``end`` and ``strand`` from ``row``, fetches the
    interval padded by three positions on each side through ``coords_to_dna``,
    reverse-complements it when the strand is ``"-"``, and finally strips the
    three padding characters from both ends and upper-cases the result.
    """
    chromosome, start, stop, strand = (row[key] for key in ("chr", "start", "end", "strand"))

    padded = str(coords_to_dna(int(start) + 1 - 3, int(stop) + 3, chromosome))
    if strand == "-":
        # Minus-strand features are reported 5'->3' on their own strand
        padded = str(Seq(padded).reverse_complement())

    return padded[3:-3].upper()


df["sequence"] = df.apply(find_sequence_maxentscan, axis=1)
df

Unnamed: 0,chr,start,end,u1,class,strand,sequence
0,chr1,23367168,23369365,0,1,-,GTAAGTCTGTTCGTCCCCCACTCGGGTTTTCGGCTTCCTACCTTCA...
1,chr1,23441845,23442185,0,1,-,GTAGGTATCTTCCCGCGTCCCTGCCACTAACCCTTCCAGTGTCACA...
2,chr1,23452730,23455880,0,1,-,GTGAGGCCCCTGTCTCTTCCTTGCCCAGACTCAGGGTAAATCTAGT...
3,chr1,23441217,23441386,0,1,-,GTCAGCCCCTGAACTATCCCAAAAGGAATGCCCTCCCCCAAGGCCC...
4,chr1,23456194,23484471,0,1,-,GTGGGGCATCGCGCCAGGGGGCGGGGTCTGAGGCGGTGAGGGGCTG...
...,...,...,...,...,...,...,...
2269,chrY,57211569,57211760,0,1,+,GTGGGCACTTGATGTCGGATCTCTTCAACAAGCTGGTCATGAGGCG...
2270,chrY,57213125,57213203,0,1,-,GTTCACTCCTGCCTTTTCCTTTCCCTAGAGCCTCCACCACCCCGAG...
2271,chrY,57213357,57213525,0,1,-,GCAAGCCTGGCTGCCTCCAGCTGGGTGGACAGACAGGGGCTGGAGA...
2272,chrY,57213602,57213879,0,1,-,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...


In [6]:
# Label each row as "<first two bases>:<last two bases>" of its sequence
df["splice-site"] = df["sequence"].str.slice(stop=2) + ":" + df["sequence"].str.slice(start=-2)
df

Unnamed: 0,chr,start,end,u1,class,strand,sequence,splice-site
0,chr1,23367168,23369365,0,1,-,GTAAGTCTGTTCGTCCCCCACTCGGGTTTTCGGCTTCCTACCTTCA...,GT:AG
1,chr1,23441845,23442185,0,1,-,GTAGGTATCTTCCCGCGTCCCTGCCACTAACCCTTCCAGTGTCACA...,GT:AG
2,chr1,23452730,23455880,0,1,-,GTGAGGCCCCTGTCTCTTCCTTGCCCAGACTCAGGGTAAATCTAGT...,GT:AG
3,chr1,23441217,23441386,0,1,-,GTCAGCCCCTGAACTATCCCAAAAGGAATGCCCTCCCCCAAGGCCC...,GT:AG
4,chr1,23456194,23484471,0,1,-,GTGGGGCATCGCGCCAGGGGGCGGGGTCTGAGGCGGTGAGGGGCTG...,GT:AG
...,...,...,...,...,...,...,...,...
2269,chrY,57211569,57211760,0,1,+,GTGGGCACTTGATGTCGGATCTCTTCAACAAGCTGGTCATGAGGCG...,GT:AG
2270,chrY,57213125,57213203,0,1,-,GTTCACTCCTGCCTTTTCCTTTCCCTAGAGCCTCCACCACCCCGAG...,GT:AG
2271,chrY,57213357,57213525,0,1,-,GCAAGCCTGGCTGCCTCCAGCTGGGTGGACAGACAGGGGCTGGAGA...,GC:AG
2272,chrY,57213602,57213879,0,1,-,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...,GT:AG


In [7]:
# Keep only rows whose label is one of the three accepted dinucleotide pairs
df = df[df["splice-site"].isin(["GT:AG", "GC:AG", "AT:AC"])]
df

Unnamed: 0,chr,start,end,u1,class,strand,sequence,splice-site
0,chr1,23367168,23369365,0,1,-,GTAAGTCTGTTCGTCCCCCACTCGGGTTTTCGGCTTCCTACCTTCA...,GT:AG
1,chr1,23441845,23442185,0,1,-,GTAGGTATCTTCCCGCGTCCCTGCCACTAACCCTTCCAGTGTCACA...,GT:AG
2,chr1,23452730,23455880,0,1,-,GTGAGGCCCCTGTCTCTTCCTTGCCCAGACTCAGGGTAAATCTAGT...,GT:AG
3,chr1,23441217,23441386,0,1,-,GTCAGCCCCTGAACTATCCCAAAAGGAATGCCCTCCCCCAAGGCCC...,GT:AG
4,chr1,23456194,23484471,0,1,-,GTGGGGCATCGCGCCAGGGGGCGGGGTCTGAGGCGGTGAGGGGCTG...,GT:AG
...,...,...,...,...,...,...,...,...
2269,chrY,57211569,57211760,0,1,+,GTGGGCACTTGATGTCGGATCTCTTCAACAAGCTGGTCATGAGGCG...,GT:AG
2270,chrY,57213125,57213203,0,1,-,GTTCACTCCTGCCTTTTCCTTTCCCTAGAGCCTCCACCACCCCGAG...,GT:AG
2271,chrY,57213357,57213525,0,1,-,GCAAGCCTGGCTGCCTCCAGCTGGGTGGACAGACAGGGGCTGGAGA...,GC:AG
2272,chrY,57213602,57213879,0,1,-,GTGAGTGTCCCCAGTGTTGCAGAGGTGAGAGGAGAGTAGACAGTGA...,GT:AG


In [8]:
# Remove the two helper columns, then write the remaining columns as a
# tab-separated file without header row or index column.
df = df.drop(["splice-site", "sequence"], axis="columns")
df.to_csv("benchmarking/gencode_v46_new_introns.bed", sep="\t", index=False, header=False)

# Benchmarking using Splam: a deep-learning-based splice site predictor that improves spliced alignments
bioRxiv. 2023 Jul 29;2023.07.27.550754. doi: 10.1101/2023.07.27.550754. Preprint

In [9]:
!splam score -G ./data/human_ref_hg38_109/GRCh38.primary_assembly.genome.fa -m ./data/splam/splam_script.pt -o splam_out benchmarking/gencode_v46_new_introns.bed

 An accurate spliced alignment pruner and splice junction predictor. 


  ███████╗██████╗ ██╗      █████╗ ███╗   ███╗
  ██╔════╝██╔══██╗██║     ██╔══██╗████╗ ████║
  ███████╗██████╔╝██║     ███████║██╔████╔██║
  ╚════██║██╔═══╝ ██║     ██╔══██║██║╚██╔╝██║
  ███████║██║     ███████╗██║  ██║██║ ╚═╝ ██║
  ╚══════╝╚═╝     ╚══════╝╚═╝  ╚═╝╚═╝     ╚═╝
    
[Info] Chromosomes in the annotation file is in 'chr*' style
[Info] Running model in "cuda" mode
[Info] Loading model ... (./data/splam/splam_script.pt)
model = torch.load(model_path)!!
[Info] Done loading model
[Info] Loading data ...
	 2208  junctions loaded.
[Info] Done loading data
[Info] SPLAM!  |################################| 221/221
[?25h

In [18]:
# Load the SPLAM scores, treating every line as data (no header row) and
# naming the nine columns explicitly. The last column is read as "u" and
# dropped straight away.
df_splam_pred = pd.read_table(
    "benchmarking/splam_out_gencode_v46/junction_score.bed",
    header=None,
    names=["chr", "start", "end", "rc3_score", "class", "strand", "donor_score", "acceptor_score", "u"],
).drop("u", axis=1)
df_splam_pred

Unnamed: 0,chr,start,end,rc3_score,class,strand,donor_score,acceptor_score
0,chr1,23367168,23369365,0,1,-,1.000000,1.000000
1,chr1,23441845,23442185,0,1,-,0.999741,0.999877
2,chr1,23452730,23455880,0,1,-,1.000000,1.000000
3,chr1,23441217,23441386,0,1,-,0.999990,0.999966
4,chr1,23456194,23484471,0,1,-,0.999905,0.999833
...,...,...,...,...,...,...,...,...
2203,chrY,57211569,57211760,0,1,+,0.998500,0.999796
2204,chrY,57213125,57213203,0,1,-,0.004676,0.004959
2205,chrY,57213357,57213525,0,1,-,0.958597,0.946810
2206,chrY,57213602,57213879,0,1,-,0.993605,0.986091


In [19]:
# A junction is called positive (1) only when both the donor and the acceptor
# score reach 0.5; otherwise it is 0.
df_splam_pred["splam_pred"] = (
    df_splam_pred[["donor_score", "acceptor_score"]].ge(0.5).all(axis=1).astype(int)
)
df_splam_pred

Unnamed: 0,chr,start,end,rc3_score,class,strand,donor_score,acceptor_score,splam_pred
0,chr1,23367168,23369365,0,1,-,1.000000,1.000000,1
1,chr1,23441845,23442185,0,1,-,0.999741,0.999877,1
2,chr1,23452730,23455880,0,1,-,1.000000,1.000000,1
3,chr1,23441217,23441386,0,1,-,0.999990,0.999966,1
4,chr1,23456194,23484471,0,1,-,0.999905,0.999833,1
...,...,...,...,...,...,...,...,...,...
2203,chrY,57211569,57211760,0,1,+,0.998500,0.999796,1
2204,chrY,57213125,57213203,0,1,-,0.004676,0.004959,0
2205,chrY,57213357,57213525,0,1,-,0.958597,0.946810,1
2206,chrY,57213602,57213879,0,1,-,0.993605,0.986091,1


In [20]:
# NOTE: despite the "_avg" suffix, this column holds the row-wise minimum
# (the lower of the donor and acceptor scores), not their mean.
df_splam_pred["splam_pred_avg"] = df_splam_pred.loc[:, ["donor_score", "acceptor_score"]].min(axis="columns")
df_splam_pred

Unnamed: 0,chr,start,end,rc3_score,class,strand,donor_score,acceptor_score,splam_pred,splam_pred_avg
0,chr1,23367168,23369365,0,1,-,1.000000,1.000000,1,1.000000
1,chr1,23441845,23442185,0,1,-,0.999741,0.999877,1,0.999741
2,chr1,23452730,23455880,0,1,-,1.000000,1.000000,1,1.000000
3,chr1,23441217,23441386,0,1,-,0.999990,0.999966,1,0.999966
4,chr1,23456194,23484471,0,1,-,0.999905,0.999833,1,0.999833
...,...,...,...,...,...,...,...,...,...,...
2203,chrY,57211569,57211760,0,1,+,0.998500,0.999796,1,0.998500
2204,chrY,57213125,57213203,0,1,-,0.004676,0.004959,0,0.004676
2205,chrY,57213357,57213525,0,1,-,0.958597,0.946810,1,0.946810
2206,chrY,57213602,57213879,0,1,-,0.993605,0.986091,1,0.986091


In [21]:
# Fraction of junctions whose thresholded SPLAM call equals the "class" label
acc = accuracy_score(y_true=df_splam_pred["class"], y_pred=df_splam_pred["splam_pred"])

print("Metrics for SPLAM")
print("Accuracy: ", acc)

Metrics for SPLAM
Accuracy:  0.7721920289855072


# ION (High-Recall Mode)

In [22]:
# Load the ION high-recall output and threshold the prediction score at 0.5
df_ion1 = pd.read_csv("benchmarking/ION_out_gencode_v46/output_high_recall_mode.bed", sep="\t")
df_ion1["prediction_class"] = (df_ion1["prediction"] >= 0.5).astype(int)
# The "u2" column of the ION output is used as the reference label.
# NOTE(review): presumably this carries the "class" value written to the input BED; confirm.
df_ion1["class"] = df_ion1["u2"]
acc = accuracy_score(df_ion1["class"], df_ion1["prediction_class"])

# Label corrected: this cell evaluates the high-recall output, not the standard mode
print("Metrics for ION (high-recall mode)")
print("Accuracy: ", acc)

Metrics for ION (standard mode)
Accuracy:  0.8442028985507246


# ION (Standard Mode)

In [23]:
# Load the ION standard-mode output (tab-separated, with a header row)
df_ion2 = pd.read_table("benchmarking/ION_out_gencode_v46/output_standard_mode.bed")
# Binary call: 1 when the prediction score reaches 0.5, else 0
df_ion2["prediction_class"] = df_ion2["prediction"].ge(0.5).astype(int)
# The "u2" column of the ION output is used as the reference label.
# NOTE(review): presumably this carries the "class" value written to the input BED; confirm.
df_ion2["class"] = df_ion2["u2"]
acc = accuracy_score(y_true=df_ion2["class"], y_pred=df_ion2["prediction_class"])

print("Metrics for ION (standard mode)")
print("Accuracy: ", acc)

Metrics for ION (standard mode)
Accuracy:  0.7078804347826086


# ION (High-Precision Mode)

In [25]:
# Load the ION high-precision output and threshold the prediction score at 0.5
df_ion3 = pd.read_csv("benchmarking/ION_out_gencode_v46/output_high_precision_mode.bed", sep="\t")
df_ion3["prediction_class"] = (df_ion3["prediction"] >= 0.5).astype(int)
# The "u2" column of the ION output is used as the reference label.
# NOTE(review): presumably this carries the "class" value written to the input BED; confirm.
df_ion3["class"] = df_ion3["u2"]
acc = accuracy_score(df_ion3["class"], df_ion3["prediction_class"])

# Label corrected: this cell evaluates the high-precision output, not the standard mode
print("Metrics for ION (high-precision mode)")
print("Accuracy: ", acc)

Metrics for ION (standard mode)
Accuracy:  0.6019021739130435
