# Indexing extracted mRNA and intron positions

> Load the extracted per-chromosome intron and mRNA positions into a single table.

In [2]:
#| default_exp training.transcription.index

In [3]:
#| hide
from nbdev.showdoc import *

In [4]:
#| export
from pathlib import Path
import pandas as pd
from tqdm import tqdm

# Register tqdm's pandas integration (adds the `progress_apply` family of methods).
tqdm.pandas()

from llm_mito_scanner.data.download import load_config, \
    get_latest_assembly_path, get_genomic_genbank_path

# Fixed seed exported with the module; not referenced by the cells shown in this notebook.
random_state = 42

In [5]:
#| hide
# Load the project configuration; the `data_path` entry is read from it in the next cell.
config = load_config()

In [6]:
#| hide
# Walk down from the configured data root to the intron position tables
# of the latest assembly.
data_path = Path(config.get("data_path"))
data_raw_path = data_path.joinpath("raw")
assemblies_path = data_raw_path.joinpath("assemblies")
latest_assembly_path = get_latest_assembly_path(assemblies_path)
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
chromosomes_path = latest_assembly_path.joinpath("chromosomes")
training_data_path = latest_assembly_path.joinpath("training")
transcription_data_path = training_data_path.joinpath("transcription")
intron_locations_path = transcription_data_path.joinpath("intron_positions")

# Fail fast: everything below reads from this directory.
if not intron_locations_path.exists():
    raise FileNotFoundError(
        f"This notebook requires the path {intron_locations_path.resolve()} to exist"
    )

In [7]:
#| hide
# Sort the listing: glob order depends on the filesystem, and a later cell
# takes the first two files, so the order must be stable across runs.
intron_locations_files = sorted(intron_locations_path.glob("*.parquet"))
intron_locations_files[:3]

[Path('/mnt/e/Data/llm-mito-scanner-data/data/raw/assemblies/GCF_000001405.40_GRCh38.p14/training/transcription/intron_positions/chromosome-NC_000001.11.parquet'),
 Path('/mnt/e/Data/llm-mito-scanner-data/data/raw/assemblies/GCF_000001405.40_GRCh38.p14/training/transcription/intron_positions/chromosome-NC_000002.12.parquet'),
 Path('/mnt/e/Data/llm-mito-scanner-data/data/raw/assemblies/GCF_000001405.40_GRCh38.p14/training/transcription/intron_positions/chromosome-NC_000003.12.parquet')]

In [8]:
#| hide
# Preview: stack the first two chromosome files into one frame, tagging
# every row with the chromosome its file is named after.
intron_location_frames = []
for f in intron_locations_files[:2]:
    f_frame = pd.read_parquet(f)
    if len(f_frame) == 0:
        # Files without any rows contribute nothing to the preview.
        continue
    f_frame.loc[:, 'chromosome'] = f.stem.replace("chromosome-", "")
    intron_location_frames.append(f_frame)
intron_locations = pd.concat(intron_location_frames, ignore_index=True)
intron_locations.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [13]:
#| export
def get_intron_locations(directory: Path, chromosome: str = None) -> pd.DataFrame:
    """Load the per-chromosome intron position tables in `directory` into one frame.

    Every `*.parquet` file directly inside `directory` is read, tagged with a
    `chromosome` column derived from its file name (the stem with the
    `chromosome-` prefix removed) and concatenated in sorted file-name order.

    Parameters
    ----------
    directory : Path
        Directory holding the parquet files, one per chromosome.
    chromosome : str, optional
        When a string is given, only files whose stem ends with it are read.
        Any other value (including the default `None`) reads every file.

    Returns
    -------
    pd.DataFrame
        The rows of all non-empty files with a fresh 0..n-1 index and the
        added `chromosome` column. If files matched but none of them has any
        rows, an empty frame with the columns of the first file is returned.

    Raises
    ------
    ValueError
        If no parquet file in `directory` matches the request.
    """
    # Sort so the row order does not depend on filesystem listing order.
    chromosome_parquet_files = sorted(directory.glob("*.parquet"))
    if isinstance(chromosome, str):
        chromosome_parquet_files = [p for p in chromosome_parquet_files if p.stem.endswith(chromosome)]
    if not chromosome_parquet_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say what is actually missing.
        wanted = f" for chromosome {chromosome!r}" if isinstance(chromosome, str) else ""
        raise ValueError(f"No intron position parquet files found in {directory}{wanted}")
    frames = []
    empty_template = None
    for f in chromosome_parquet_files:
        f_frame = pd.read_parquet(f)
        f_frame["chromosome"] = f.stem.replace("chromosome-", "")
        if f_frame.shape[0] > 0:
            frames.append(f_frame)
        elif empty_template is None:
            # Remember the schema in case every matched file turns out to be empty.
            empty_template = f_frame
    if not frames:
        return empty_template.reset_index(drop=True)
    return pd.concat(frames).reset_index(drop=True)

In [15]:
#| hide
# Restrict the load to files whose stem ends with the given chromosome accession.
get_intron_locations(intron_locations_path, "NC_000001.11").head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [16]:
#| hide
# Without a chromosome argument every parquet file in the directory is loaded.
get_intron_locations(intron_locations_path).head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


With these locations we can do a few things:

1. Get the "edge" of the intron and:
    1. Position the edge at the center of a training string, or shift it forward or backward
    2. Wherever the "edge" is, replace the gene sequence with the INTRON token on the target string
2. Get the "edge" of the mRNA and:
    1. Position the edge at the center of a training string, or shift it forward or backward
    2. Wherever the "edge" is, replace the gene sequence with the NON-mRNA token on the target string
3. Generate training instances that:
    1. Include the "edge" of the intron with various shifting strategies
    2. Exclude the "edge" of the intron and get transcribed mRNA
    3. Include the "edge" of the mRNA with various shifting strategies

In [1]:
#| hide
# Write the cells marked `#| export` out to the module named by `#| default_exp`.
import nbdev; nbdev.nbdev_export()