# SPLAM

In [2]:
import pandas as pd

In [2]:
# Read the TSV file into a pandas DataFrame
df = pd.read_csv("data/04_train_set.tsv", sep='\t')

# Generate a new 'junction' column
df['junction'] = ['JUNC{:08d}'.format(i+1) for i in range(df.shape[0])]

# Reorder and rename columns to match the desired BED format
df = df[['chr', 'start', 'end', 'junction', 'class', 'strand']]

# Write the DataFrame to a BED file, without the header and index
df.to_csv("data/splam/splam_introns.bed", sep='\t', header=False, index=False)

In [4]:
df

Unnamed: 0,chr,start,end,junction,class,strand
0,chr1,12227,12612,JUNC00000001,1,+
1,chr1,12721,13220,JUNC00000002,1,+
2,chr1,12057,12178,JUNC00000003,1,+
3,chr1,12697,12974,JUNC00000004,1,+
4,chr1,13052,13220,JUNC00000005,1,+
...,...,...,...,...,...,...
519029,chrY,25464577,25465486,JUNC00519030,0,+
519030,chrY,25513173,25513588,JUNC00519031,0,-
519031,chrY,25513745,25516715,JUNC00519032,0,-
519032,chrY,25525288,25527646,JUNC00519033,0,-


## We then run the SPLAM score, see below

<pre>
Command:
!splam score -G data/human_ref_hg38_109/GRCh38.primary_assembly.genome.fa -m data/splam/splam_script.pt data/splam/splam_introns.bed

=====================================================================
 An accurate spliced alignment pruner and splice junction predictor.
=====================================================================


  ███████╗██████╗ ██╗      █████╗ ███╗   ███╗
  ██╔════╝██╔══██╗██║     ██╔══██╗████╗ ████║
  ███████╗██████╔╝██║     ███████║██╔████╔██║
  ╚════██║██╔═══╝ ██║     ██╔══██║██║╚██╔╝██║
  ███████║██║     ███████╗██║  ██║██║ ╚═╝ ██║
  ╚══════╝╚═╝     ╚══════╝╚═╝  ╚═╝╚═╝     ╚═╝

[Info] Chromosomes in the annotation file is in 'chr*' style
[Info] Running model in "cuda" mode
[Info] Loading model ... (data/splam/splam_script.pt)
model = torch.load(model_path)!!
[Info] Done loading model
[Info] Loading data ...
         50000  junctions loaded.
         100000  junctions loaded.
         150000  junctions loaded.
         200000  junctions loaded.
         250000  junctions loaded.
         300000  junctions loaded.
         350000  junctions loaded.
         400000  junctions loaded.
         450000  junctions loaded.
         500000  junctions loaded.
         519020  junctions loaded.
[Info] Done loading data
[Info] SPLAM!  |################################| 51902/51902
</pre>

## We then move the output junction_score.bed at root/tmp_out to data/splam and deleted the rest

In [3]:
df_splam_output = pd.read_csv("data/splam/junction_score.bed", sep='\t')
df_splam_output

Unnamed: 0,chr1,12227,12612,JUNC00000001,1,+,0.99992514,0.9998584,Unnamed: 8
0,chr1,12721,13220,JUNC00000002,1,+,0.999545,0.999214,
1,chr1,12057,12178,JUNC00000003,1,+,0.100124,0.533779,
2,chr1,12697,12974,JUNC00000004,1,+,0.996771,0.984649,
3,chr1,13052,13220,JUNC00000005,1,+,0.225408,0.238977,
4,chr1,13374,13452,JUNC00000006,1,+,0.004987,0.007289,
...,...,...,...,...,...,...,...,...,...
519014,chrY,25464577,25465486,JUNC00519030,0,+,0.981751,0.991057,
519015,chrY,25513173,25513588,JUNC00519031,0,-,0.999749,0.999595,
519016,chrY,25513745,25516715,JUNC00519032,0,-,0.072688,0.052010,
519017,chrY,25525288,25527646,JUNC00519033,0,-,0.998390,0.994151,


In [4]:
df_splam_output = df_splam_output.drop(columns="Unnamed: 8")
df_splam_output

Unnamed: 0,chr1,12227,12612,JUNC00000001,1,+,0.99992514,0.9998584
0,chr1,12721,13220,JUNC00000002,1,+,0.999545,0.999214
1,chr1,12057,12178,JUNC00000003,1,+,0.100124,0.533779
2,chr1,12697,12974,JUNC00000004,1,+,0.996771,0.984649
3,chr1,13052,13220,JUNC00000005,1,+,0.225408,0.238977
4,chr1,13374,13452,JUNC00000006,1,+,0.004987,0.007289
...,...,...,...,...,...,...,...,...
519014,chrY,25464577,25465486,JUNC00519030,0,+,0.981751,0.991057
519015,chrY,25513173,25513588,JUNC00519031,0,-,0.999749,0.999595
519016,chrY,25513745,25516715,JUNC00519032,0,-,0.072688,0.052010
519017,chrY,25525288,25527646,JUNC00519033,0,-,0.998390,0.994151


In [6]:
df_splam_output.to_csv("data/11_trainset_splam_feature.tsv", sep="\t")