# Generating training instances from genomic DNA, intron, and mRNA positions.

> Preparing training instances.

## Goals
1. Construct the mRNA sequence
2. Generate training instances that:
    1. Include the "edge" of the intron with various shifting strategies
    2. Exclude the "edge" of the intron and get transcribed mRNA

In [1]:
#| default_exp training.transcription.generation

In [2]:
#| hide
from nbdev.showdoc import *

## 0. Setup

In [3]:
#| export
from pathlib import Path
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import random
import warnings
import time
from datetime import timedelta
from tqdm import tqdm

from llm_mito_scanner.data.download import load_config, \
    get_latest_assembly_path, get_genomic_genbank_path
from llm_mito_scanner.data.transcription import get_chromosome_genes
from llm_mito_scanner.training.transcription.index import get_intron_locations

warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

BOS_TOK = "<bos>"
EOS_TOK = "<eos>"
INTRON_TOK = "<intron>"
UNK_TOK = "<unk>"
NULL_TOK = "<null>"
PAD_TOK = "<pad>"
MRNA_BOS_TOK = "<mrna-bos>"
MRNA_EOS_TOK = "<mrna-eos>"

In [4]:
#| hide
config = load_config()

In [5]:
#| hide
# Derive every directory used by this notebook from the configured data path:
# raw downloads -> assemblies -> latest assembly -> genes / training data.
data_path = Path(config.get("data_path"))
data_raw_path = data_path / "raw"
assemblies_path = data_raw_path / "assemblies"
latest_assembly_path = get_latest_assembly_path(assemblies_path)
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
genes_path = latest_assembly_path / "genes"
training_data_path = latest_assembly_path / "training"
transcription_data_path = training_data_path / "transcription"
intron_locations_path = transcription_data_path / "intron_positions"
# Both directories must already exist; they are not created here, so stop
# early with an explicit message instead of failing later on a read.
for p in [genes_path, intron_locations_path]:
    if not p.exists():
        raise FileNotFoundError(f"This notebook requires the path {p.resolve()} to exist")

In [6]:
#| hide
example_chromosome = "NC_000001.11"

In [7]:
#| hide
genes = get_chromosome_genes(latest_assembly_path, chromosome=example_chromosome)
genes.head()

Unnamed: 0,geneid,sequence,pos_strand_position,neg_strand_position
0,GeneID:100287102,CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTA...,11873,14409
1,GeneID:653635,TCCGGCAGAGCGGAAGCGGCGGCGGGAGCTTCCGGGAGGGCGGCTC...,14361,29370
2,GeneID:102466751,TGTGGGAGAGGAACATGGGCTCAGGACAGCGGGTGTCAGCTTGCCT...,17368,17436
3,GeneID:107985730,TGCCCTCCAGCCCTACGCCTTGACCCGCTTTCCTGCGTCTCTCAGC...,29773,35418
4,GeneID:100302278,GGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCG...,30365,30503


In [8]:
#| hide
intron_locations = get_intron_locations(intron_locations_path, chromosome=example_chromosome)
intron_locations.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [9]:
#| hide
intron_locations[intron_locations.mrna_start != 0].head(10)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
12,GeneID:81399,XM_017002410.2,82,22713,8309,44026,NC_000001.11
13,GeneID:81399,XM_017002408.2,82,20998,8309,44026,NC_000001.11
14,GeneID:81399,XM_017002408.2,21216,22713,8309,44026,NC_000001.11
15,GeneID:81399,XM_047431162.1,19,9866,21156,44026,NC_000001.11
42,GeneID:148398,NM_152486.4,70,191,1808,20652,NC_000001.11
43,GeneID:148398,NM_152486.4,283,4424,1808,20652,NC_000001.11
44,GeneID:148398,NM_152486.4,4606,5308,1808,20652,NC_000001.11
45,GeneID:148398,NM_152486.4,5359,10041,1808,20652,NC_000001.11
46,GeneID:148398,NM_152486.4,10166,13309,1808,20652,NC_000001.11
47,GeneID:148398,NM_152486.4,13399,13544,1808,20652,NC_000001.11


In [10]:
#| hide
# Measure the gap between consecutive introns of the same transcript.
# `shift(1)` within each (chromosome, gene, transcript) group lines every
# intron up with the end of the intron in the preceding row of that group.
prev_intron_end = intron_locations.groupby(['chromosome', 'geneid', 'transcriptid']).intron_end.shift(1)
# The first intron of each group has no predecessor (NaN) and is dropped here.
# NOTE(review): `dropna()` also drops rows with NaN in any other column.
intron_space = pd.concat([intron_locations, prev_intron_end], axis=1).dropna()
# The concatenated frame has two columns named `intron_end`; rename the second.
intron_space.columns = intron_locations.columns.tolist() + ['prev_intron_end']
# Gap = start of this intron minus end of the previous one.
intron_space.loc[:, 'intron_space'] = (intron_space.intron_start - intron_space.prev_intron_end).astype(int)
# Smallest gaps first.
intron_space.sort_values('intron_space', ascending=True, inplace=True)
intron_space.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,prev_intron_end,intron_space
166725,GeneID:10000,XM_047415643.1,305471,324663,836,325632,NC_000001.11,305469.0,2
129489,GeneID:9910,XM_047436058.1,680,17669,634643,835789,NC_000001.11,677.0,3
131438,GeneID:6646,XM_047428828.1,2441,8430,21210,64884,NC_000001.11,2437.0,4
141102,GeneID:7135,NM_003281.4,4615,6476,0,17947,NC_000001.11,4611.0,4
131540,GeneID:126859,XM_011509167.4,167005,167824,266,189031,NC_000001.11,166999.0,6


In [11]:
#| hide
intron_space.intron_space.describe()

count    155915.000000
mean        153.005388
std         198.402928
min           2.000000
25%          89.000000
50%         125.000000
75%         171.000000
max       14864.000000
Name: intron_space, dtype: float64

In [12]:
#| hide
example_intron = intron_locations.iloc[42, :]
example_chromosome = example_intron.chromosome
example_gene = example_intron.geneid
example_chromosome, example_gene

('NC_000001.11', 'GeneID:148398')

## 1. mRNA Sequence Generation

In [13]:
#| export
def get_gene(genes: pd.DataFrame, geneid: str) -> str:
    """Return the sequence stored for `geneid` in the `genes` table.

    Reads the `geneid` and `sequence` columns of `genes`. If several rows
    share the same `geneid`, the first one (in row order) is returned so the
    result is always a single value rather than a Series.

    Raises `KeyError` if `geneid` is not present.
    """
    # Boolean selection avoids re-indexing (and copying) the whole frame on
    # every lookup, which `set_index` would do.
    matches = genes.loc[genes['geneid'] == geneid, 'sequence']
    if matches.empty:
        raise KeyError(geneid)
    return matches.iloc[0]

In [14]:
#| hide
example_gene_sequence = get_gene(genes, example_gene)
len(example_gene_sequence)

20652

In [15]:
#| export
def get_mrna_intron_locations(
        chromosome: str, gene_id: str, transcript_id: str,
        intron_locations: pd.DataFrame = None, intron_locations_path: Path = None
        ) -> list[tuple[int, int]]:
    """Return the `(intron_start, intron_end)` pairs of a single transcript.

    Rows of `intron_locations` are selected by exact match on the
    `chromosome`, `geneid` and `transcriptid` columns. When no table is
    passed, it is loaded with `get_intron_locations(intron_locations_path)`.
    An empty list is returned when nothing matches.
    """
    if intron_locations is None:
        # No table supplied: load it from disk.
        intron_locations = get_intron_locations(intron_locations_path)
    same_chromosome = intron_locations.chromosome == chromosome
    same_gene = intron_locations.geneid == gene_id
    same_transcript = intron_locations.transcriptid == transcript_id
    transcript_rows = intron_locations[same_chromosome & same_gene & same_transcript]
    if transcript_rows.empty:
        return []
    bounds = transcript_rows[['intron_start', 'intron_end']].values.tolist()
    return [tuple(pair) for pair in bounds]

In [16]:
#| hide
example_mrna_intron_locations = get_mrna_intron_locations(
    example_chromosome, example_gene, example_intron.transcriptid,
    intron_locations=intron_locations
)
print(len(example_mrna_intron_locations))
example_mrna_intron_locations

13


[(70, 191),
 (283, 4424),
 (4606, 5308),
 (5359, 10041),
 (10166, 13309),
 (13399, 13544),
 (13730, 15413),
 (15576, 16405),
 (16521, 16679),
 (16758, 16828),
 (17328, 17522),
 (17647, 17967),
 (18078, 18177)]

In [17]:
#| export
def get_mrna_from_gene(
        gene_sequence: str,
        mrna_start: int, mrna_end: int,
        intron_locations: list[tuple[int, int]],
        intron_token: str = INTRON_TOK,
        untranscribed_token: str = NULL_TOK,
        debug: bool = False) -> tuple[list[str], list[str]]:
    """Get annotated input and target sequences for a given mRNA.

    Returns `(gene, mrna)`:

    * `gene` is `gene_sequence` split into a list of single characters.
    * `mrna` is a list aligned position-by-position with `gene`:
      positions before `mrna_start` and from `mrna_end` onwards hold
      `untranscribed_token`, positions inside an intron hold `intron_token`,
      and every remaining `"T"` is rewritten to `"U"`.

    `mrna_start` / `mrna_end` index into `gene_sequence`; `mrna_end` is
    exclusive. `intron_locations` holds `(start, end)` pairs, end exclusive,
    indexed relative to `mrna_start` (they are applied to the already
    sliced transcript).
    """
    gene_sequence_length = len(gene_sequence)
    start_pad_len = mrna_start
    end_pad_len = gene_sequence_length - mrna_end
    if debug:
        print("MRNA START:\t", mrna_start)
        print("MRNA END:\t", mrna_end)
        print("GENE SEQUENCE LENGTH:\t", gene_sequence_length)
        print("START PAD LENGTH:\t", start_pad_len)
        print("END PAD LENGTH:\t", end_pad_len)
    # The end padding above covers positions mrna_end..len-1, so the slice
    # must stop at mrna_end (exclusive). Slicing to `mrna_end + 1` produced
    # a target one element longer than the gene whenever
    # mrna_end < len(gene_sequence), shifting the trailing padding.
    mrna = list(gene_sequence[mrna_start:mrna_end])
    for intron_start, intron_end in intron_locations:
        # Overwrite the intron span in place; the replacement has the same
        # length as the span, so later intron coordinates stay valid.
        mrna[intron_start:intron_end] = [intron_token] * (intron_end - intron_start)
    if start_pad_len > 0:
        mrna = [untranscribed_token] * start_pad_len + mrna
    if end_pad_len > 0:
        mrna = mrna + [untranscribed_token] * end_pad_len
    gene = list(gene_sequence)
    # Transcription: thymine in the DNA template becomes uracil in the mRNA.
    # Multi-character tokens never equal "T", so they pass through unchanged.
    mrna = ["U" if n == "T" else n for n in mrna]
    return gene, mrna

In [18]:
#| hide
example_mrna = intron_locations[['chromosome', 'geneid', 'transcriptid', 'mrna_start', 'mrna_end']].drop_duplicates().head(10)
example_mrna

Unnamed: 0,chromosome,geneid,transcriptid,mrna_start,mrna_end
0,NC_000001.11,GeneID:79501,NM_001005484.2,0,6167
2,NC_000001.11,GeneID:112268260,XM_047436352.1,0,17102
6,NC_000001.11,GeneID:105378947,XM_011542538.1,0,25011
11,NC_000001.11,GeneID:81399,XM_024449992.2,0,44026
12,NC_000001.11,GeneID:81399,XM_017002410.2,8309,44026
13,NC_000001.11,GeneID:81399,XM_017002408.2,8309,44026
15,NC_000001.11,GeneID:81399,XM_047431162.1,21156,44026
16,NC_000001.11,GeneID:148398,NM_001385640.1,0,20652
29,NC_000001.11,GeneID:148398,NM_001385641.1,0,20652
42,NC_000001.11,GeneID:148398,NM_152486.4,1808,20652


In [19]:
#| hide
example_mrna_sequences = example_mrna.apply(lambda row: get_mrna_from_gene(
    genes[(genes.geneid == row.geneid)].iloc[0, :].sequence,
    row.mrna_start, row.mrna_end,
    get_mrna_intron_locations(
        row.chromosome, row.geneid, row.transcriptid, 
        intron_locations[intron_locations.chromosome==row.chromosome])), 
    axis=1)

In [20]:
#| hide
pd.DataFrame(example_mrna_sequences.values.tolist(), columns=['gene', 'mrna'])

Unnamed: 0,gene,mrna
0,"[C, C, C, A, G, A, T, C, T, C, T, T, C, A, G, ...","[C, C, C, A, G, A, U, C, U, C, U, U, C, A, G, ..."
1,"[A, T, G, C, C, T, A, G, A, C, A, C, A, C, A, ...","[A, U, G, C, C, U, A, G, A, C, A, C, A, C, A, ..."
2,"[A, T, G, C, G, T, A, G, A, C, A, C, A, C, A, ...","[A, U, G, C, G, U, A, G, A, C, A, C, A, C, A, ..."
3,"[T, A, T, A, A, A, A, T, G, A, A, A, G, C, T, ...","[U, A, U, A, A, A, A, U, G, A, A, A, G, C, U, ..."
4,"[T, A, T, A, A, A, A, T, G, A, A, A, G, C, T, ...","[<null>, <null>, <null>, <null>, <null>, <null..."
5,"[T, A, T, A, A, A, A, T, G, A, A, A, G, C, T, ...","[<null>, <null>, <null>, <null>, <null>, <null..."
6,"[T, A, T, A, A, A, A, T, G, A, A, A, G, C, T, ...","[<null>, <null>, <null>, <null>, <null>, <null..."
7,"[G, G, C, G, G, C, G, G, A, G, T, C, T, C, C, ...","[G, G, C, G, G, C, G, G, A, G, U, C, U, C, C, ..."
8,"[G, G, C, G, G, C, G, G, A, G, T, C, T, C, C, ...","[G, G, C, G, G, C, G, G, A, G, U, C, U, C, C, ..."
9,"[G, G, C, G, G, C, G, G, A, G, T, C, T, C, C, ...","[<null>, <null>, <null>, <null>, <null>, <null..."


In [21]:
#| hide
# Build the annotated (gene, mRNA) pair for the example transcript with the
# debug printout enabled.
example_annotated_gene, example_annotated_mrna = get_mrna_from_gene(
    example_gene_sequence,
    example_intron.mrna_start, example_intron.mrna_end,
    example_mrna_intron_locations,
    debug=True
)
# Transcript span, shown next to the two list lengths for comparison.
print(example_intron.mrna_end - example_intron.mrna_start)
len(example_annotated_gene), len(example_annotated_mrna)

MRNA START:	 1808
MRNA END:	 20652
GENE SEQUENCE LENGTH:	 20652
START PAD LENGTH:	 1808
END PAD LENGTH:	 0
18844


(20652, 20652)

In [22]:
#| hide
type(example_annotated_gene), example_annotated_gene.count("T"), example_annotated_gene.count("U"), \
    example_annotated_gene.count("<intron>"), example_annotated_gene[0:5], example_annotated_gene[-5:]

(list, 3539, 0, 0, ['G', 'G', 'C', 'G', 'G'], ['G', 'C', 'C', 'T', 'G'])

In [23]:
#| hide
type(example_annotated_mrna), example_annotated_mrna.count("T"), example_annotated_mrna.count("U"), \
    example_annotated_mrna.count("<intron>"), example_annotated_mrna[0:5], example_annotated_mrna[-5:]

(list,
 0,
 385,
 16287,
 ['<null>', '<null>', '<null>', '<null>', '<null>'],
 ['G', 'C', 'C', 'U', 'G'])

In [25]:
#| export
def get_mrna_locations(locations: pd.DataFrame) -> pd.DataFrame:
    """Get the mRNA segments that lie between (and after) introns.

    `locations` needs the columns `chromosome`, `geneid`, `transcriptid`,
    `intron_start`, `intron_end`, `mrna_start` and `mrna_end`. For every
    transcript (one group per chromosome/geneid/transcriptid) the result
    contains:

    * one row per intron, spanning from the end of the previous intron
      (0 for the first intron) to the start of this intron, and
    * one extra row spanning from the end of the last intron to the end of
      the transcript.

    `start` / `end` are expressed relative to the transcript start, like the
    intron coordinates they are derived from. The returned frame has the
    columns chromosome, geneid, transcriptid, mrna_start, mrna_end, start,
    end, sorted by transcript and `start`, with a fresh integer index. The
    input frame is not modified.
    """
    key_cols = ['chromosome', 'geneid', 'transcriptid']
    selected_cols = key_cols + ['mrna_start', 'mrna_end']
    # `shift(1)` and `tail(1)` below work on row order, so order the introns
    # of each transcript by position first. `sort_values` returns a new
    # frame, which also keeps the caller's data untouched.
    locations = locations.sort_values(key_cols + ['intron_start'])
    locations_grouped = locations.groupby(key_cols)
    # A segment starts where the previous intron ended; the first segment of
    # a transcript has no previous intron and starts at 0.
    segment_start = locations_grouped.intron_end.shift(1).fillna(0).astype(int)
    mrna_sequences = locations[selected_cols].assign(
        start=segment_start,
        end=locations.intron_start,
    )
    # Add the last stretch of mRNA, after the final intron of each transcript.
    last_intron = locations_grouped.tail(1)
    trailing_sequences = last_intron[selected_cols].assign(
        start=last_intron.intron_end,
        # `mrna_end` is in gene coordinates while every other start/end is
        # relative to the transcript start, so convert it before using it.
        end=last_intron.mrna_end - last_intron.mrna_start,
    )
    all_mrna_sequences = pd.concat(
        [mrna_sequences, trailing_sequences],
        axis=0, ignore_index=True,
    ).sort_values(key_cols + ['start']).reset_index(drop=True)
    return all_mrna_sequences

In [26]:
#| hide
mrna_locations = get_mrna_locations(intron_locations)
mrna_locations.head()

Unnamed: 0,chromosome,geneid,transcriptid,mrna_start,mrna_end,start,end
0,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,0,41
1,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,302,460
2,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,147868,147994
3,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,178701,178813
4,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,197547,197692


In [27]:
#| hide
mrna_locations.shape

(181355, 7)

In [28]:
#| hide
first_sample_mrna = mrna_locations.iloc[0, :]
intron_locations[
    (intron_locations.chromosome == first_sample_mrna.chromosome) &
    (intron_locations.geneid == first_sample_mrna.geneid) &
    (intron_locations.transcriptid == first_sample_mrna.transcriptid)
].head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
166694,GeneID:10000,NM_001206729.2,41,302,7495,351356,NC_000001.11
166695,GeneID:10000,NM_001206729.2,460,147868,7495,351356,NC_000001.11
166696,GeneID:10000,NM_001206729.2,147994,178701,7495,351356,NC_000001.11
166697,GeneID:10000,NM_001206729.2,178813,197547,7495,351356,NC_000001.11
166698,GeneID:10000,NM_001206729.2,197692,205842,7495,351356,NC_000001.11


In [1]:
#| hide
import nbdev; nbdev.nbdev_export()