# Generating training instances from genomic DNA, intron and mRNA positions.

> Preparing training instances.

## Goals
1. Construct the mRNA sequence
2. Generate training instances that:
    1. Include the "edge" of the intron with various shifting strategies
    2. Exclude the "edge" of the intron and get transcribed mRNA

In [1]:
#| default_exp training.transcription.generation

## 0. Setup

In [236]:
#| export
from pathlib import Path
import pandas as pd

from llm_mito_scanner.data.download import load_config, \
    get_latest_assembly_path, get_genomic_genbank_path
from llm_mito_scanner.training.transcription.index import get_intron_locations

BOS_TOK = "<bos>"
EOS_TOK = "<eos>"
INTRON_TOK = "<intron>"
NULL_TOK = "<null>"
MRNA_BOS_TOK = "<mrna-bos>"
MRNA_EOS_TOK = "<mrna-eos>"

In [3]:
#| hide
config = load_config()

In [4]:
#| hide
# Resolve every directory this notebook reads from, starting at the configured data root.
data_path = Path(config.get("data_path"))
data_raw_path = data_path / "raw"
assemblies_path = data_raw_path / "assemblies"
latest_assembly_path = get_latest_assembly_path(assemblies_path)
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
# Per-chromosome gene CSVs are read from here (see get_gene_from_csv below).
genes_path = latest_assembly_path / "genes"
training_data_path = latest_assembly_path / "training"
transcription_data_path = training_data_path / "transcription"
intron_locations_path = transcription_data_path / "intron_positions"
# Fail fast: the genes and intron-position directories must already exist on disk.
for p in [genes_path, intron_locations_path]:
    if not p.exists():
        raise FileNotFoundError(f"This notebook requires the path {p.resolve()} to exist")

In [5]:
#| hide
intron_locations = get_intron_locations(intron_locations_path)
intron_locations.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [250]:
#| hide
intron_locations[intron_locations.mrna_start != 0].head(10)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
12,GeneID:81399,XM_017002410.2,82,22713,8309,44026,NC_000001.11
13,GeneID:81399,XM_017002408.2,82,20998,8309,44026,NC_000001.11
14,GeneID:81399,XM_017002408.2,21216,22713,8309,44026,NC_000001.11
15,GeneID:81399,XM_047431162.1,19,9866,21156,44026,NC_000001.11
42,GeneID:148398,NM_152486.4,70,191,1808,20652,NC_000001.11
43,GeneID:148398,NM_152486.4,283,4424,1808,20652,NC_000001.11
44,GeneID:148398,NM_152486.4,4606,5308,1808,20652,NC_000001.11
45,GeneID:148398,NM_152486.4,5359,10041,1808,20652,NC_000001.11
46,GeneID:148398,NM_152486.4,10166,13309,1808,20652,NC_000001.11
47,GeneID:148398,NM_152486.4,13399,13544,1808,20652,NC_000001.11


In [181]:
#| hide
# For every intron, fetch the end of the previous intron row of the same
# (chromosome, gene, transcript) group; the first row of each group gets NaN.
prev_intron_end = intron_locations.groupby(['chromosome', 'geneid', 'transcriptid']).intron_end.shift(1)
# dropna() removes every row containing a NaN, which includes the first intron of each group.
intron_space = pd.concat([intron_locations, prev_intron_end], axis=1).dropna()
# The concatenated frame has two columns named intron_end; rename the shifted one.
intron_space.columns = intron_locations.columns.tolist() + ['prev_intron_end']
# Gap between the end of the previous intron and the start of this one.
intron_space.loc[:, 'intron_space'] = (intron_space.intron_start - intron_space.prev_intron_end).astype(int)
# Smallest gaps first, so the tightest intron spacing shows up in head().
intron_space.sort_values('intron_space', ascending=True, inplace=True)
intron_space.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,prev_intron_end,intron_space
1073192,GeneID:3845,XM_047428826.1,38752,41018,0,45684,NC_000012.12,38751.0,1
581729,GeneID:2870,NM_002082.4,14571,15061,1153,17100,NC_000005.10,14569.0,2
610529,GeneID:23506,XM_024446390.2,1127,19730,44237,122218,NC_000006.12,1125.0,2
1192146,GeneID:55012,XM_024449638.2,1472,5561,0,36827,NC_000014.9,1470.0,2
1699082,GeneID:55217,XM_047442236.1,3580,11733,76587,123942,NC_000023.11,3578.0,2


In [182]:
#| hide
intron_space.intron_space.describe()

count    1.574709e+06
mean     1.568139e+02
std      3.197923e+02
min      1.000000e+00
25%      8.800000e+01
50%      1.220000e+02
75%      1.660000e+02
max      8.761600e+04
Name: intron_space, dtype: float64

In [251]:
#| hide
# Use the intron row at position 42 as the running example for the cells below.
example_intron = intron_locations.iloc[42, :]
example_chromosome = example_intron.chromosome
example_gene = example_intron.geneid
# Display the selected chromosome / gene id pair.
example_chromosome, example_gene

('NC_000001.11', 'GeneID:148398')

## 1. mRNA Sequence Generation

In [252]:
#| export
def get_gene_from_csv(chromosome_genes: Path, geneid: str) -> pd.Series:
    """Return the row for `geneid` from a per-chromosome gene CSV.

    The file is streamed line by line instead of being loaded whole. The
    first line is taken as the header, and the gene id is expected to be the
    first comma-separated field of each row.

    Args:
        chromosome_genes: Path of the CSV file to scan.
        geneid: Exact gene identifier to look for, e.g. "GeneID:148398".

    Returns:
        A Series of the row's string fields indexed by the header names, or
        None when the file is empty or no row has that gene id.
    """
    # Match the complete first field: a bare startswith(geneid) would let
    # "GeneID:14" pick up the row of "GeneID:148398".
    row_prefix = f"{geneid},"
    with chromosome_genes.open("r") as f:
        header_line = next(f, None)
        if header_line is None:
            # Empty file: nothing to search.
            return None
        header = header_line.replace("\n", "").split(",")
        for row in f:
            if row.startswith(row_prefix):
                return pd.Series(row.replace("\n", "").split(","), index=header)
    return None

In [253]:
#| hide
example_gene_series = get_gene_from_csv(genes_path / f"{example_chromosome}.csv", example_gene)
example_gene_series

geneid                                                     GeneID:148398
sequence               GGCGGCGGAGTCTCCCAAGTCCCCGCCGGGCGGGCGCGCGCCAGTG...
pos_strand_position                                               923922
neg_strand_position                                               944574
dtype: object

In [254]:
#| hide
len(example_gene_series.sequence)

20652

In [255]:
#| export
def get_mrna_intron_locations(
        chromosome: str, gene_id: str, transcript_id: str, 
        intron_locations: pd.DataFrame = None, intron_locations_path: Path = None
        ) -> list[tuple[int, int]]:
    """List the (intron_start, intron_end) pairs recorded for one transcript.

    Args:
        chromosome: Value to match against the `chromosome` column.
        gene_id: Value to match against the `geneid` column.
        transcript_id: Value to match against the `transcriptid` column.
        intron_locations: Intron table to search. When None, the table is
            loaded with `get_intron_locations(intron_locations_path)`.
        intron_locations_path: Location handed to `get_intron_locations`
            when no table is supplied.

    Returns:
        One tuple per matching row, in table order; an empty list when no
        row matches.
    """
    if intron_locations is None:
        intron_locations = get_intron_locations(intron_locations_path)
    is_target = (
        (intron_locations.chromosome == chromosome)
        & (intron_locations.geneid == gene_id)
        & (intron_locations.transcriptid == transcript_id)
    )
    bounds = intron_locations.loc[is_target, ['intron_start', 'intron_end']]
    # An empty selection naturally yields an empty list here.
    return [tuple(pair) for pair in bounds.values.tolist()]

In [256]:
#| hide
example_mrna_intron_locations = get_mrna_intron_locations(
    example_chromosome, example_gene, example_intron.transcriptid,
    intron_locations=intron_locations
)
print(len(example_mrna_intron_locations))
example_mrna_intron_locations

13


[(70, 191),
 (283, 4424),
 (4606, 5308),
 (5359, 10041),
 (10166, 13309),
 (13399, 13544),
 (13730, 15413),
 (15576, 16405),
 (16521, 16679),
 (16758, 16828),
 (17328, 17522),
 (17647, 17967),
 (18078, 18177)]

In [269]:
#| export
def get_mrna_from_gene(
        gene_sequence: str,
        mrna_start: int, mrna_end: int,
        intron_locations: list[tuple[int, int]],
        bos_token: str = BOS_TOK,
        eos_token: str = EOS_TOK,
        intron_token: str = INTRON_TOK,
        mrna_bos_token: str = MRNA_BOS_TOK,
        mrna_eos_token: str = MRNA_EOS_TOK,
        pad_token: str = NULL_TOK,
        debug: bool = False) -> tuple[list[str], list[str]]:
    """Build position-aligned token lists for a gene and one of its mRNAs.

    The gene list is `bos_token`, every character of `gene_sequence`, then
    `eos_token`. The mRNA list has the same length: positions outside the
    transcript hold `pad_token`, the transcript is wrapped in
    `mrna_bos_token` / `mrna_eos_token`, intron spans hold `intron_token`,
    and every "T" is written as "U".

    Args:
        gene_sequence: Nucleotide string of the whole gene.
        mrna_start: Index in `gene_sequence` of the first mRNA nucleotide.
        mrna_end: Index one past the last mRNA nucleotide (end-exclusive, so
            `mrna_end == len(gene_sequence)` means "up to the gene's end").
        intron_locations: (start, end) spans replaced by `intron_token`;
            applied as half-open slices on the mRNA token list.
        bos_token: Marks the start of the gene-length sequence.
        eos_token: Marks the end of the gene-length sequence.
        intron_token: Written at every position inside an intron span.
        mrna_bos_token: Placed immediately before the first mRNA nucleotide.
        mrna_eos_token: Placed immediately after the last mRNA nucleotide.
        pad_token: Fills gene positions that lie outside the transcript.
        debug: When True, print the computed offsets.

    Returns:
        `(gene, mrna)`, two token lists of equal length.
    """
    gene_sequence_length = len(gene_sequence)
    # mrna_bos_token / mrna_eos_token each take the slot next to the
    # transcript, so one position on either side is not padding.
    start_pad_len = mrna_start - 1
    end_pad_len = gene_sequence_length - mrna_end - 1
    if debug:
        print("MRNA START:\t", mrna_start)
        print("MRNA END:\t", mrna_end)
        print("GENE SEQUENCE LENGTH:\t", gene_sequence_length)
        print("START PAD LENGTH:\t", start_pad_len)
        print("END PAD LENGTH:\t", end_pad_len)
    # Half-open slice: including gene_sequence[mrna_end] as well would make
    # the result one token longer than the gene whenever padding follows.
    mrna = [mrna_bos_token, *gene_sequence[mrna_start:mrna_end], mrna_eos_token]
    for intron_start, intron_end in intron_locations:
        # NOTE(review): these indices address the token list, whose element 0
        # is mrna_bos_token -- confirm the intron table is built with that
        # one-position shift in mind.
        mrna[intron_start:intron_end] = [intron_token] * (intron_end - intron_start)
    # `>= 0` rather than `> 0`: when exactly one gene position lies outside
    # the transcript the mRNA marker occupies it, no pad tokens are needed,
    # but the outer bos/eos token still has to be emitted to stay aligned.
    if start_pad_len >= 0:
        mrna = [bos_token] + [pad_token] * start_pad_len + mrna
    if end_pad_len >= 0:
        mrna = mrna + [pad_token] * end_pad_len + [eos_token]
    gene = [bos_token, *gene_sequence, eos_token]
    # Transcription: thymine is written as uracil in the mRNA.
    mrna = ["U" if n == "T" else n for n in mrna]
    return gene, mrna

In [270]:
#| hide
# Build the aligned gene / mRNA token lists for the example transcript,
# printing the computed offsets via debug=True.
example_annotated_gene, example_annotated_mrna = get_mrna_from_gene(
    example_gene_series.sequence,
    example_intron.mrna_start, example_intron.mrna_end,
    example_mrna_intron_locations,
    debug=True
)
# Span of the transcript within the gene, for comparison with the lengths below.
print(example_intron.mrna_end - example_intron.mrna_start)
# Both token lists should come out the same length.
len(example_annotated_gene), len(example_annotated_mrna)

MRNA START:	 1808
MRNA END:	 20652
GENE SEQUENCE LENGTH:	 20652
START PAD LENGTH:	 1807
END PAD LENGTH:	 -1
18844


(20654, 20654)

In [273]:
#| hide
type(example_annotated_gene), example_annotated_gene.count("T"), example_annotated_gene.count("U"), \
    example_annotated_gene.count("<intron>"), example_annotated_gene[0:5], example_annotated_gene[-5:]

(list,
 3539,
 0,
 0,
 ['<bos>', 'G', 'G', 'C', 'G'],
 ['C', 'C', 'T', 'G', '<eos>'])

In [271]:
#| hide
type(example_annotated_mrna), example_annotated_mrna.count("T"), example_annotated_mrna.count("U"), \
    example_annotated_mrna.count("<intron>"), example_annotated_mrna[0:5], example_annotated_mrna[-5:]

(list,
 0,
 385,
 16287,
 ['<bos>', '<null>', '<null>', '<null>', '<null>'],
 ['C', 'C', 'U', 'G', '<mrna-eos>'])

## 2. Generate Training Instances

When we train, we'll want to:
- load training instances easily
- ensure training instances are the same length
- ensure the training instances are a sample of the dataset
- ensure the sample is representative of the dataset


In [276]:
#| hide
intron_locations.transcriptid.unique().size

129045

In [277]:
#| hide
import nbdev; nbdev.nbdev_export()