# Generating training instances from genomic DNA, intron and mRNA positions.

> Preparing training instances.

## Goals
1. Get the "edge" of the intron, and:
    1. Position the edge at the center, or shifted forward or backward, in a training string
    2. Wherever the "edge" is, replace the gene sequence with the INTRON token on the target string
2. Get the "edge" of the mRNA, and:
    1. Position the edge at the center, or shifted forward or backward, in a training string
    2. Wherever the "edge" is, replace the gene sequence with the NON-mRNA token on the target string
3. Generate training instances that:
    1. Include the "edge" of the intron with various shifting strategies
    2. Exclude the "edge" of the intron and get transcribed mRNA
    3. Include the "edge" of the mRNA with various shifting strategies

In [1]:
#| default_exp training.transcription.generation

## 0. Setup

In [6]:
#| export
from pathlib import Path
import pandas as pd

from llm_mito_scanner.data.download import load_config, \
    get_latest_assembly_path, get_genomic_genbank_path
from llm_mito_scanner.training.transcription.index import get_intron_locations

In [3]:
#| hide
# Project configuration; its "data_path" entry is read in the next cell.
config = load_config()

In [4]:
#| hide
# Build the directory layout underneath the configured data root.
data_path = Path(config.get("data_path"))
data_raw_path = data_path / "raw"
assemblies_path = data_raw_path / "assemblies"
latest_assembly_path = get_latest_assembly_path(assemblies_path)
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
# Per-chromosome gene CSVs are read from here (one "<chromosome>.csv" per file).
genes_path = latest_assembly_path / "genes"
training_data_path = latest_assembly_path / "training"
transcription_data_path = training_data_path / "transcription"
# Input for get_intron_locations below.
intron_locations_path = transcription_data_path / "intron_positions"
# Fail fast: both inputs must already exist before this notebook can run.
for p in [genes_path, intron_locations_path]:
    if not p.exists():
        raise FileNotFoundError(f"This notebook requires the path {p.resolve()} to exist")

In [5]:
#| hide
# Load the intron/mRNA position table and preview its first rows.
intron_locations = get_intron_locations(intron_locations_path)
intron_locations.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [28]:
#| hide
# Use the first row of the table as the running example for the cells below.
example_intron = intron_locations.iloc[0, :]
example_chromosome = example_intron.chromosome
example_gene = example_intron.geneid
example_chromosome, example_gene

('NC_000001.11', 'GeneID:79501')

## 1. Intron Edge Sequence Generation

In [24]:
#| export
def get_gene_from_csv(chromosome_genes: Path, geneid: str) -> "pd.Series | None":
    """Return the row for `geneid` from a per-chromosome gene CSV.

    The file is streamed line by line and scanning stops at the first match,
    so the whole CSV is never loaded into memory.

    Args:
        chromosome_genes: Path to a CSV whose first line is the header and
            whose first column holds the gene id.
        geneid: Exact gene id to look up, e.g. ``"GeneID:79501"``.

    Returns:
        A `pd.Series` of the raw string fields indexed by the header names,
        or `None` when the file is empty or contains no row for `geneid`.

    Raises:
        ValueError: If the matching row has a different number of fields
            than the header (raised by pandas).
    """
    # Include the field separator so "GeneID:79501" cannot match the row of
    # "GeneID:795012"; a bare prefix test would return the wrong gene.
    row_prefix = f"{geneid},"
    with chromosome_genes.open("r") as f:
        header_line = next(f, None)
        if header_line is None:
            # Empty file: nothing to look up.
            return None
        header = header_line.rstrip("\n").split(",")
        for row in f:
            # Test the prefix before splitting so non-matching rows (which
            # carry a full gene sequence) are never split.
            if row.startswith(row_prefix):
                return pd.Series(row.rstrip("\n").split(","), index=header)
    return None

In [29]:
#| hide
# Look up the example gene's row in its chromosome's gene CSV.
example_gene_series = get_gene_from_csv(genes_path / f"{example_chromosome}.csv", example_gene)
example_gene_series

geneid                                                      GeneID:79501
sequence               CCCAGATCTCTTCAGGTACATCTAGTCCATTCATAAAGGGCTTTTA...
pos_strand_position                                                65418
neg_strand_position                                                71585
dtype: object

In [35]:
#| export
def excise_gene_sequence(
        chromosome: str,
        genes_path: Path,
        gene_id: str,
        start_position: int,
        length: int = 64
        ) -> str:
    """Cut a window of at most `length` characters out of a gene sequence.

    The requested window is ``[start_position, start_position + length)`` in
    gene coordinates. It is clipped to the bounds of the gene, so a window
    that overhangs either end comes back shorter than `length`, and a window
    that lies entirely outside the gene comes back as an empty string.

    Args:
        chromosome: Chromosome id; the gene is read from
            ``genes_path / f"{chromosome}.csv"``.
        genes_path: Directory holding the per-chromosome gene CSVs.
        gene_id: Id of the gene to read.
        start_position: Index of the first character of the window; may be
            negative when the window is shifted before the gene start.
        length: Requested window size.

    Returns:
        The part of the gene sequence that falls inside the window.

    Raises:
        ValueError: If `gene_id` has no row in the chromosome CSV.
    """
    gene_series = get_gene_from_csv(genes_path / f"{chromosome}.csv", gene_id)
    if gene_series is None:
        raise ValueError(
            f"Gene {gene_id} not found in {genes_path / f'{chromosome}.csv'}"
        )
    gene_sequence = gene_series.sequence
    # Clip the window to [0, len(gene_sequence)].
    seq_start = max(0, start_position)
    seq_end = min(len(gene_sequence), start_position + length)
    # A window ending at or before its clipped start is empty. Without this
    # guard a negative seq_end would be read by the slice as "count from the
    # end" and return almost the entire gene.
    if seq_end <= seq_start:
        return ""
    return gene_sequence[seq_start:seq_end]

In [38]:
#| hide
# Place the intron start at three different positions inside a 64-character window.
example_intron_edge_left = excise_gene_sequence(
    example_chromosome,
    genes_path,
    example_gene,
    example_intron.intron_start, # no offset: the window begins at the intron start
    64
)
example_intron_edge_middle = excise_gene_sequence(
    example_chromosome,
    genes_path,
    example_gene,
    example_intron.intron_start - 32, # offset by half the sequence size
    64
)
example_intron_edge_right = excise_gene_sequence(
    example_chromosome,
    genes_path,
    example_gene,
    example_intron.intron_start - 64, # offset by the full sequence size
    64
)

# Windows that would begin before position 0 are clipped at the gene start,
# so the shifted examples can come back shorter than 64 characters.
len(example_intron_edge_left), len(example_intron_edge_middle), len(example_intron_edge_right)

(64, 47, 15)

## 2. mRNA Edge Sequence Generation

## 3. Generate Training Instances

In [10]:
#| hide
# Run nbdev's export step for this notebook project.
import nbdev; nbdev.nbdev_export()