# Generating training instances from genomic DNA, intron and mRNA positions.

> Preparing training instances.

## Goals
1. Construct the mRNA sequence
2. Generate training instances that:
    1. Include the "edge" of the intron with various shifting strategies
    2. Exclude the "edge" of the intron and get transcribed mRNA

In [1]:
#| default_exp training.transcription.generation

## 0. Setup

In [2]:
#| export
from pathlib import Path
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import random
import warnings
import time
from datetime import timedelta

from llm_mito_scanner.data.download import load_config, \
    get_latest_assembly_path, get_genomic_genbank_path
from llm_mito_scanner.data.transcription import read_all_chromosome_gene_info
from llm_mito_scanner.training.transcription.index import get_intron_locations

warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

BOS_TOK = "<bos>"
EOS_TOK = "<eos>"
INTRON_TOK = "<intron>"
UNK_TOK = "<unk>"
NULL_TOK = "<null>"
PAD_TOK = "<pad>"
MRNA_BOS_TOK = "<mrna-bos>"
MRNA_EOS_TOK = "<mrna-eos>"



In [3]:
#| hide
config = load_config()

In [4]:
#| hide
# Build every path used by this notebook from the configured data directory.
data_path = Path(config.get("data_path"))
data_raw_path = data_path / "raw"
assemblies_path = data_raw_path / "assemblies"
latest_assembly_path = get_latest_assembly_path(assemblies_path)
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
genes_path = latest_assembly_path / "genes"
training_data_path = latest_assembly_path / "training"
transcription_data_path = training_data_path / "transcription"
intron_locations_path = transcription_data_path / "intron_positions"
# Fail fast if the expected input directories are missing.
# NOTE(review): presumably these are produced by earlier notebooks - confirm.
for p in [genes_path, intron_locations_path]:
    if not p.exists():
        raise FileNotFoundError(f"This notebook requires the path {p.resolve()} to exist")

In [5]:
#| hide
genes = read_all_chromosome_gene_info(latest_assembly_path)
genes.head()

Unnamed: 0,geneid,sequence,pos_strand_position,neg_strand_position
0,GeneID:100287102,CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTA...,11873,14409
1,GeneID:653635,TCCGGCAGAGCGGAAGCGGCGGCGGGAGCTTCCGGGAGGGCGGCTC...,14361,29370
2,GeneID:102466751,TGTGGGAGAGGAACATGGGCTCAGGACAGCGGGTGTCAGCTTGCCT...,17368,17436
3,GeneID:107985730,TGCCCTCCAGCCCTACGCCTTGACCCGCTTTCCTGCGTCTCTCAGC...,29773,35418
4,GeneID:100302278,GGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCG...,30365,30503


In [6]:
#| hide
intron_locations = get_intron_locations(intron_locations_path)
intron_locations.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [7]:
#| hide
intron_locations[intron_locations.mrna_start != 0].head(10)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
12,GeneID:81399,XM_017002410.2,82,22713,8309,44026,NC_000001.11
13,GeneID:81399,XM_017002408.2,82,20998,8309,44026,NC_000001.11
14,GeneID:81399,XM_017002408.2,21216,22713,8309,44026,NC_000001.11
15,GeneID:81399,XM_047431162.1,19,9866,21156,44026,NC_000001.11
42,GeneID:148398,NM_152486.4,70,191,1808,20652,NC_000001.11
43,GeneID:148398,NM_152486.4,283,4424,1808,20652,NC_000001.11
44,GeneID:148398,NM_152486.4,4606,5308,1808,20652,NC_000001.11
45,GeneID:148398,NM_152486.4,5359,10041,1808,20652,NC_000001.11
46,GeneID:148398,NM_152486.4,10166,13309,1808,20652,NC_000001.11
47,GeneID:148398,NM_152486.4,13399,13544,1808,20652,NC_000001.11


In [8]:
#| hide
prev_intron_end = intron_locations.groupby(['chromosome', 'geneid', 'transcriptid']).intron_end.shift(1)
intron_space = pd.concat([intron_locations, prev_intron_end], axis=1).dropna()
intron_space.columns = intron_locations.columns.tolist() + ['prev_intron_end']
intron_space.loc[:, 'intron_space'] = (intron_space.intron_start - intron_space.prev_intron_end).astype(int)
intron_space.sort_values('intron_space', ascending=True, inplace=True)
intron_space.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,prev_intron_end,intron_space
1073192,GeneID:3845,XM_047428826.1,38752,41018,0,45684,NC_000012.12,38751.0,1
581729,GeneID:2870,NM_002082.4,14571,15061,1153,17100,NC_000005.10,14569.0,2
610529,GeneID:23506,XM_024446390.2,1127,19730,44237,122218,NC_000006.12,1125.0,2
1192146,GeneID:55012,XM_024449638.2,1472,5561,0,36827,NC_000014.9,1470.0,2
1699082,GeneID:55217,XM_047442236.1,3580,11733,76587,123942,NC_000023.11,3578.0,2


In [9]:
#| hide
intron_space.intron_space.describe()

count    1.574709e+06
mean     1.568139e+02
std      3.197923e+02
min      1.000000e+00
25%      8.800000e+01
50%      1.220000e+02
75%      1.660000e+02
max      8.761600e+04
Name: intron_space, dtype: float64

In [10]:
#| hide
example_intron = intron_locations.iloc[42, :]
example_chromosome = example_intron.chromosome
example_gene = example_intron.geneid
example_chromosome, example_gene

('NC_000001.11', 'GeneID:148398')

## 1. mRNA Sequence Generation

In [11]:
#| export
def get_gene(genes: pd.DataFrame, geneid: str) -> str:
    """Return the nucleotide sequence stored for `geneid`.

    Args:
        genes: Frame with at least a `geneid` and a `sequence` column.
        geneid: Identifier to look up, e.g. ``"GeneID:148398"``.

    Returns:
        The `sequence` value of the first row whose `geneid` matches.

    Raises:
        KeyError: If no row matches `geneid`.
    """
    # A boolean mask avoids re-indexing (copying) the whole frame on every
    # call, and always yields a single value even when an id is duplicated.
    matches = genes.loc[genes['geneid'] == geneid, 'sequence']
    if matches.empty:
        raise KeyError(geneid)
    return matches.iloc[0]

In [12]:
#| hide
example_gene_sequence = get_gene(genes, example_gene)
len(example_gene_sequence)

20652

In [13]:
#| export
def get_mrna_intron_locations(
        chromosome: str, gene_id: str, transcript_id: str, 
        intron_locations: pd.DataFrame = None, intron_locations_path: Path = None
        ) -> list[tuple[int, int]]:
    """List the `(intron_start, intron_end)` pairs recorded for one transcript.

    When `intron_locations` is not supplied it is loaded with
    `get_intron_locations(intron_locations_path)`. Rows are matched on
    chromosome, gene id and transcript id; an empty list is returned when
    nothing matches.
    """
    if intron_locations is None:
        intron_locations = get_intron_locations(intron_locations_path)
    is_target = (
        intron_locations.chromosome.eq(chromosome)
        & intron_locations.geneid.eq(gene_id)
        & intron_locations.transcriptid.eq(transcript_id)
    )
    bounds = intron_locations.loc[is_target, ['intron_start', 'intron_end']]
    if bounds.empty:
        return []
    # Keep the rows in frame order; each row becomes one (start, end) tuple.
    return [tuple(pair) for pair in bounds.values.tolist()]

In [14]:
#| hide
example_mrna_intron_locations = get_mrna_intron_locations(
    example_chromosome, example_gene, example_intron.transcriptid,
    intron_locations=intron_locations
)
print(len(example_mrna_intron_locations))
example_mrna_intron_locations

13


[(70, 191),
 (283, 4424),
 (4606, 5308),
 (5359, 10041),
 (10166, 13309),
 (13399, 13544),
 (13730, 15413),
 (15576, 16405),
 (16521, 16679),
 (16758, 16828),
 (17328, 17522),
 (17647, 17967),
 (18078, 18177)]

In [15]:
#| export
def get_mrna_from_gene(
        gene_sequence: str, 
        mrna_start: int, mrna_end: int, 
        intron_locations: list[tuple[int, int]],
        intron_token: str = INTRON_TOK,
        untranscribed_token: str = NULL_TOK,
        debug: bool = False) -> tuple[list[str], list[str]]:
    """Get annotated input and target sequences for a given mRNA.

    Args:
        gene_sequence: Nucleotides of the whole gene.
        mrna_start: Index in `gene_sequence` where the mRNA begins.
        mrna_end: Index in `gene_sequence` where the mRNA stops (exclusive,
            so that `mrna_end == len(gene_sequence)` means "to the end").
        intron_locations: `(start, end)` pairs that index into the mRNA
            slice, i.e. they are offsets from `mrna_start`.
        intron_token: Token written at every intron position.
        untranscribed_token: Token written at gene positions outside the mRNA.
        debug: Print the computed lengths when True.

    Returns:
        `(gene, mrna)`: two token lists of equal length. `gene` is the gene
        split into characters; `mrna` is the same span with untranscribed
        positions and introns replaced by tokens and every "T" turned into "U".
    """
    gene_sequence_length = len(gene_sequence)
    start_pad_len = int(mrna_start)
    end_pad_len = gene_sequence_length - int(mrna_end)
    if debug:
        print("MRNA START:\t", mrna_start)
        print("MRNA END:\t", mrna_end)
        print("GENE SEQUENCE LENGTH:\t", gene_sequence_length)
        print("START PAD LENGTH:\t", start_pad_len)
        print("END PAD LENGTH:\t", end_pad_len)
    # Slice with an exclusive end: together with the two pads this makes the
    # mRNA track exactly as long as the gene. (Slicing to `mrna_end + 1`
    # produced one extra element whenever the mRNA stopped before the gene.)
    mrna = list(gene_sequence[start_pad_len:int(mrna_end)])
    for intron_start, intron_end in intron_locations:
        intron_start, intron_end = int(intron_start), int(intron_end)
        # In-place slice replacement instead of rebuilding the list per intron.
        mrna[intron_start:intron_end] = [intron_token] * (intron_end - intron_start)
    if start_pad_len > 0:
        mrna = [untranscribed_token] * start_pad_len + mrna
    if end_pad_len > 0:
        mrna = mrna + [untranscribed_token] * end_pad_len
    gene = list(gene_sequence)
    # Transcription: thymine is written as uracil in the mRNA track.
    mrna = ["U" if n == "T" else n for n in mrna]
    return gene, mrna

In [16]:
#| hide
example_annotated_gene, example_annotated_mrna = get_mrna_from_gene(
    example_gene_sequence,
    example_intron.mrna_start, example_intron.mrna_end,
    example_mrna_intron_locations,
    debug=True
)
print(example_intron.mrna_end - example_intron.mrna_start)
len(example_annotated_gene), len(example_annotated_mrna)

MRNA START:	 1808
MRNA END:	 20652
GENE SEQUENCE LENGTH:	 20652
START PAD LENGTH:	 1808
END PAD LENGTH:	 0
18844


(20652, 20652)

In [17]:
#| hide
type(example_annotated_gene), example_annotated_gene.count("T"), example_annotated_gene.count("U"), \
    example_annotated_gene.count("<intron>"), example_annotated_gene[0:5], example_annotated_gene[-5:]

(list, 3539, 0, 0, ['G', 'G', 'C', 'G', 'G'], ['G', 'C', 'C', 'T', 'G'])

In [18]:
#| hide
type(example_annotated_mrna), example_annotated_mrna.count("T"), example_annotated_mrna.count("U"), \
    example_annotated_mrna.count("<intron>"), example_annotated_mrna[0:5], example_annotated_mrna[-5:]

(list,
 0,
 385,
 16287,
 ['<null>', '<null>', '<null>', '<null>', '<null>'],
 ['G', 'C', 'C', 'U', 'G'])

## 2. Generate Training Instances

When we train, we'll want to:
- load training instances easily
- ensure training instances are the same length
- ensure the training instances are a sample of the dataset
- ensure the sample is representative of the dataset


In [19]:
#| hide
intron_locations.transcriptid.unique().size

129045

In [20]:
#| hide
mrna_length_df = intron_locations.drop_duplicates(subset=['chromosome', 'geneid', 'transcriptid'])[['mrna_start', 'mrna_end']]
total_dataset_length = (mrna_length_df.mrna_end - mrna_length_df.mrna_start).sum()
training_sequence_length = 64
num_training_instances = total_dataset_length // training_sequence_length
'{:,}'.format(total_dataset_length), '{:,}'.format(num_training_instances)

('12,496,356,010', '195,255,562')

In [21]:
#| hide
training_hours = 4
training_minutes = training_hours * 60
training_seconds = training_minutes * 60
sequences_per_second = 2.5
training_instance_target = training_seconds * sequences_per_second
training_instance_target

36000.0

In [22]:
#| hide
proportion_intron_edge = 0.25
proportion_intron = 0.25
proportion_mrna = 0.25
proportion_mrna_edge = 0.25

random_state = 42

sum([proportion_intron_edge, proportion_intron, proportion_mrna, proportion_mrna_edge])

1.0

In [23]:
#| hide
# Each intron has a start edge and an end edge - let's make sure we sample both types
allowed_intron_edges = int(training_instance_target * proportion_intron_edge)
print(allowed_intron_edges)
unique_introns_edges = intron_locations.shape[0] * 2 # start and end edge
unique_introns_edges

9000


3407640

In [24]:
#| export
def sample_intron_edges(
        locations: pd.DataFrame, n: int, 
        random_state: int = 42, offset: int = -32, length: int = 64) -> pd.DataFrame:
    """Get training windows positioned around the start or the end of an intron.

    Half of the `n` windows (rounded down) are anchored on `intron_start`,
    the rest on `intron_end`. Each window starts `offset` positions from its
    anchor, is clipped at 0, spans `length` positions and is capped at the
    mRNA length (`mrna_end - mrna_start`).

    Args:
        locations: Intron table with `chromosome`, `geneid`, `transcriptid`,
            `intron_start`, `intron_end`, `mrna_start` and `mrna_end`.
        n: Total number of windows to draw.
        random_state: Seed for the row sampling.
        offset: Shift applied to the anchor to obtain the window start.
        length: Window size.

    Returns:
        Frame with `chromosome`, `geneid`, `transcriptid`, `start`, `end`
        and a constant `type` of ``"intron-edge"``; the sampled row index is
        preserved.
    """
    start_n = int(n / 2)
    end_n = n - start_n
    # Sample with replacement only when a half needs more rows than exist.
    # (The original condition was inverted and raised in exactly that case.)
    replace = max(start_n, end_n) > locations.shape[0]
    # Use a different seed for the second half, otherwise both halves select
    # the very same introns.
    end_seed = random_state + 1 if isinstance(random_state, int) else random_state
    halves = (
        ('intron_start', locations.sample(start_n, replace=replace, random_state=random_state)),
        ('intron_end', locations.sample(end_n, replace=replace, random_state=end_seed)),
    )
    frames = []
    for slice_origin, sampled in halves:
        # NOTE(review): get_mrna_from_gene indexes the mRNA slice with
        # intron_start/intron_end, which suggests they are already relative
        # to mrna_start; subtracting mrna_start again may double-shift.
        # Behaviour kept as-is - confirm the intended coordinate frame.
        window = sampled.assign(
            mrna_len=sampled.mrna_end - sampled.mrna_start,
            start=(sampled[slice_origin] - sampled.mrna_start + offset).clip(lower=0),
        )
        window['end'] = window['start'] + length
        # Never run past the end of the mRNA.
        window['end'] = window[['end', 'mrna_len']].min(axis=1)
        frames.append(window[['chromosome', 'geneid', 'transcriptid', 'start', 'end']])
    intron_edges = pd.concat(frames, axis=0)
    return intron_edges.assign(type='intron-edge')

In [25]:
#| hide
example_sample_intron_edges = sample_intron_edges(intron_locations, allowed_intron_edges, random_state=random_state)
(example_sample_intron_edges.end - example_sample_intron_edges.start).unique()

array([64, 48])

In [26]:
#| hide
intron_lengths = intron_locations.intron_end - intron_locations.intron_start
intron_lengths.describe()

count    1.703820e+06
mean     6.998604e+03
std      2.186718e+04
min      1.900000e+01
25%      6.170000e+02
50%      1.796000e+03
75%      4.935000e+03
max      1.160411e+06
dtype: float64

In [27]:
#| hide
intron_lengths.min(), intron_lengths.max()

(19, 1160411)

In [28]:
#| export
def sample_introns(
        locations: pd.DataFrame, n: int,
        random_state: int = 42, length: int = 64) -> pd.DataFrame:
    """Get training instances where most of the tokens are <intron>.

    Introns no longer than `length` yield a window that starts `length / 2`
    before the intron start (clipped at 0) and are labelled
    ``"intron-small"``. Longer introns yield a window of exactly `length`
    positions placed at random fully inside the intron, labelled
    ``"intron"``.

    Args:
        locations: Intron table with `chromosome`, `geneid`, `transcriptid`,
            `intron_start`, `intron_end` and `mrna_end`.
        n: Number of introns to sample.
        random_state: Seed for both the row sampling and the window placement.
        length: Window size.

    Returns:
        Frame with `chromosome`, `geneid`, `transcriptid`, `start`, `end`
        and `type`; small introns first, sampled row index preserved. Empty
        when nothing was sampled.
    """
    # A private generator keeps the draws reproducible without reseeding the
    # process-wide `random` state as a side effect.
    rng = random.Random(random_state)
    output_cols = ['chromosome', 'geneid', 'transcriptid', 'start', 'end']
    # Replacement is only needed when more rows are requested than exist.
    replace = n > locations.shape[0]
    intron_sample = locations.sample(n, replace=replace, random_state=random_state)
    fits_in_window = (intron_sample.intron_end - intron_sample.intron_start) <= length
    small_introns = intron_sample[fits_in_window]
    large_introns = intron_sample[~fits_in_window]
    sample_frames = []
    if not small_introns.empty:
        # Shift back by half a window so the short intron sits around the middle.
        small = small_introns.assign(
            start=(small_introns.intron_start - int(length / 2)).clip(lower=0)
        )
        small['end'] = small['start'] + length
        # NOTE(review): `start` derives from intron_start while the cap is
        # mrna_end; confirm both columns share one coordinate frame.
        small['end'] = small[['end', 'mrna_end']].min(axis=1)
        sample_frames.append(small[output_cols].assign(type='intron-small'))
    if not large_introns.empty:
        # Pick a start so that [start, start + length) stays inside the intron.
        starts = [
            rng.randrange(int(lo), int(hi) - length + 1)
            for lo, hi in zip(large_introns.intron_start, large_introns.intron_end)
        ]
        large = large_introns.assign(start=starts)
        large['end'] = large['start'] + length
        sample_frames.append(large[output_cols].assign(type='intron'))
    if not sample_frames:
        # Nothing sampled (e.g. n == 0): return an empty, well-formed frame.
        return pd.DataFrame(columns=output_cols + ['type'])
    return pd.concat(sample_frames, axis=0)

In [29]:
#| hide
example_sample_introns = sample_introns(intron_locations, int(training_instance_target * proportion_intron), random_state=random_state)
example_sample_introns.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
1558440,NC_000019.10,GeneID:946,XM_011527538.4,57,121,intron-small
612326,NC_000006.12,GeneID:124901227,XM_047419609.1,0,64,intron-small
209595,NC_000002.12,GeneID:1768,XM_011532649.3,251297,251361,intron
926450,NC_000010.11,GeneID:657,NM_001406576.1,86904,86968,intron
584454,NC_000005.10,GeneID:3187,NM_001257293.2,2153,2217,intron


In [30]:
#| hide
(example_sample_introns.end - example_sample_introns.start).unique()

array([64])

In [31]:
#| hide
intron_locations.head(5)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [32]:
#| hide
# Prototype of the mRNA-segment extraction on the first 50 rows (sorted by transcript).
intron_locations_sample = intron_locations.sort_values(['chromosome', 'geneid', 'transcriptid']).head(50)
intron_locations_sample_grouped = intron_locations_sample.groupby(
        ['chromosome', 'geneid', 'transcriptid']
)
# Segment start: end of the previous intron of the same transcript
# (0 for the first intron), shifted by mrna_start.
intron_locations_sample.loc[:, 'mrna_seq_start'] = (
    intron_locations_sample_grouped.intron_end.shift(1).fillna(0) + \
        intron_locations_sample.mrna_start
).astype(int)
# Segment end: start of the current intron, shifted by mrna_start.
intron_locations_sample.loc[:, 'mrna_seq_end'] = intron_locations_sample.intron_start + \
        intron_locations_sample.mrna_start
display(intron_locations_sample.head())
# Add last sequence of mRNA from last intron
# NOTE(review): unlike the rows above, mrna_seq_start here is intron_end
# without mrna_start added - confirm which coordinate frame is intended.
intron_locations_sample_last_intron = intron_locations_sample_grouped.tail(1)
intron_locations_sample_last_intron.loc[:, 'mrna_seq_start'] = intron_locations_sample_last_intron.intron_end
intron_locations_sample_last_intron.loc[:, 'mrna_seq_end'] = intron_locations_sample_last_intron.mrna_end
display(intron_locations_sample_last_intron.head())
# Stack the per-intron rows and the trailing rows, ordered by transcript.
intron_locations_sample_mrna_locations = pd.concat(
    [
        intron_locations_sample,
        intron_locations_sample_last_intron
    ], axis=0, ignore_index=True
    ).sort_values(
        ['chromosome', 'geneid', 'transcriptid']).reset_index(drop=True)
intron_locations_sample_mrna_locations.head(15)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
166694,GeneID:10000,NM_001206729.2,41,302,7495,351356,NC_000001.11,7495,7536
166695,GeneID:10000,NM_001206729.2,460,147868,7495,351356,NC_000001.11,7797,7955
166696,GeneID:10000,NM_001206729.2,147994,178701,7495,351356,NC_000001.11,155363,155489
166697,GeneID:10000,NM_001206729.2,178813,197547,7495,351356,NC_000001.11,186196,186308
166698,GeneID:10000,NM_001206729.2,197692,205842,7495,351356,NC_000001.11,205042,205187


Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
166706,GeneID:10000,NM_001206729.2,331261,343798,7495,351356,NC_000001.11,343798,351356
166602,GeneID:10000,NM_001370074.1,338756,345745,0,351356,NC_000001.11,345745,351356
166693,GeneID:10000,NM_005465.7,337920,344909,836,351356,NC_000001.11,344909,351356
166587,GeneID:10000,NM_181690.2,297773,330856,7797,362847,NC_000001.11,330856,362847


Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
0,GeneID:10000,NM_001206729.2,41,302,7495,351356,NC_000001.11,7495,7536
1,GeneID:10000,NM_001206729.2,460,147868,7495,351356,NC_000001.11,7797,7955
2,GeneID:10000,NM_001206729.2,147994,178701,7495,351356,NC_000001.11,155363,155489
3,GeneID:10000,NM_001206729.2,178813,197547,7495,351356,NC_000001.11,186196,186308
4,GeneID:10000,NM_001206729.2,197692,205842,7495,351356,NC_000001.11,205042,205187
5,GeneID:10000,NM_001206729.2,205974,228423,7495,351356,NC_000001.11,213337,213469
6,GeneID:10000,NM_001206729.2,228489,229845,7495,351356,NC_000001.11,235918,235984
7,GeneID:10000,NM_001206729.2,229914,270536,7495,351356,NC_000001.11,237340,237409
8,GeneID:10000,NM_001206729.2,270659,279736,7495,351356,NC_000001.11,278031,278154
9,GeneID:10000,NM_001206729.2,279865,290641,7495,351356,NC_000001.11,287231,287360


In [33]:
#| export
def get_mrna_locations(locations: pd.DataFrame) -> pd.DataFrame:
    """Get the mrna sequences between introns.

    For every transcript this yields one segment before each intron (from
    the previous intron's end, or 0 for the first intron, up to the intron's
    start) plus one trailing segment from the last intron's end to the end
    of the mRNA. `start`/`end` are expressed in the same frame as the input
    `intron_start`/`intron_end` columns.

    Args:
        locations: Intron table with `chromosome`, `geneid`, `transcriptid`,
            `intron_start`, `intron_end`, `mrna_start` and `mrna_end`.

    Returns:
        Frame with `chromosome`, `geneid`, `transcriptid`, `mrna_start`,
        `mrna_end`, `start` and `end`, sorted by transcript and `start`,
        with a fresh integer index.
    """
    key_cols = ['chromosome', 'geneid', 'transcriptid']
    selected_cols = key_cols + ['mrna_start', 'mrna_end']
    # shift()/tail() below depend on introns being in positional order inside
    # each transcript, so enforce it instead of trusting the caller.
    ordered = locations.sort_values(key_cols + ['intron_start'])
    grouped = ordered.groupby(key_cols)
    # Segments that precede an intron.
    between = ordered[selected_cols].assign(
        start=grouped.intron_end.shift(1).fillna(0).astype(int),
        end=ordered.intron_start,
    )
    # Add last sequence of mRNA from last intron. Its end is the mRNA length
    # (mrna_end - mrna_start): the other bounds start at 0 for the first
    # segment, so the absolute mrna_end would overshoot when mrna_start > 0.
    last_intron = grouped.tail(1)
    trailing = last_intron[selected_cols].assign(
        start=last_intron.intron_end,
        end=last_intron.mrna_end - last_intron.mrna_start,
    )
    all_mrna_sequences = pd.concat([between, trailing], axis=0, ignore_index=True)
    return all_mrna_sequences.sort_values(key_cols + ['start']).reset_index(drop=True)

In [34]:
#| hide
mrna_locations = get_mrna_locations(intron_locations)
mrna_locations.head()

Unnamed: 0,chromosome,geneid,transcriptid,mrna_start,mrna_end,start,end
0,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,0,41
1,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,302,460
2,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,147868,147994
3,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,178701,178813
4,NC_000001.11,GeneID:10000,NM_001206729.2,7495,351356,197547,197692


In [35]:
#| hide
mrna_locations.shape

(1832931, 7)

In [36]:
#| hide
first_sample_mrna = mrna_locations.iloc[0, :]
intron_locations[
    (intron_locations.chromosome == first_sample_mrna.chromosome) &
    (intron_locations.geneid == first_sample_mrna.geneid) &
    (intron_locations.transcriptid == first_sample_mrna.transcriptid)
].head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
166694,GeneID:10000,NM_001206729.2,41,302,7495,351356,NC_000001.11
166695,GeneID:10000,NM_001206729.2,460,147868,7495,351356,NC_000001.11
166696,GeneID:10000,NM_001206729.2,147994,178701,7495,351356,NC_000001.11
166697,GeneID:10000,NM_001206729.2,178813,197547,7495,351356,NC_000001.11
166698,GeneID:10000,NM_001206729.2,197692,205842,7495,351356,NC_000001.11


In [37]:
#| export
def sample_mrna(
        mrna_locations: pd.DataFrame, n: int, 
        random_state: int = 42, length: int = 64) -> pd.DataFrame:
    """Get a sample of mrna sequence locations.

    Segments no longer than `length` yield a window that starts `length / 2`
    before the segment start (clipped at 0) and are labelled
    ``"mrna-small"``. Longer segments yield a window of exactly `length`
    positions placed at random fully inside the segment, labelled ``"mrna"``.

    Args:
        mrna_locations: Segment table with `chromosome`, `geneid`,
            `transcriptid`, `mrna_end`, `start` and `end`.
        n: Number of segments to sample.
        random_state: Seed for both the row sampling and the window placement.
        length: Window size.

    Returns:
        Frame with `chromosome`, `geneid`, `transcriptid`, `start`, `end`
        and `type`, re-indexed from 0; small segments first. Empty when
        nothing was sampled.
    """
    # Seed a private generator from random_state; the window placement used
    # to depend on whatever state the global `random` module happened to be in.
    rng = random.Random(random_state)
    output_cols = ['chromosome', 'geneid', 'transcriptid', 'start', 'end']
    # Replacement is only needed when more rows are requested than exist.
    replace = n > mrna_locations.shape[0]
    sampled = mrna_locations.sample(n, replace=replace, random_state=random_state)
    fits_in_window = (sampled['end'] - sampled['start']) <= length
    small_sequences = sampled[fits_in_window]
    large_sequences = sampled[~fits_in_window]
    sample_frames = []
    if not small_sequences.empty:
        # Shift back by half a window so the short segment sits around the middle.
        small = small_sequences.assign(
            start=(small_sequences['start'] - int(length / 2)).clip(lower=0)
        )
        small['end'] = small['start'] + length
        # NOTE(review): confirm `start`/`end` share mrna_end's coordinate frame.
        small['end'] = small[['end', 'mrna_end']].min(axis=1)
        sample_frames.append(small[output_cols].assign(type='mrna-small'))
    if not large_sequences.empty:
        # Pick a start so that [start, start + length) stays inside the segment.
        starts = [
            rng.randrange(int(lo), int(hi) - length + 1)
            for lo, hi in zip(large_sequences['start'], large_sequences['end'])
        ]
        large = large_sequences.assign(start=starts)
        large['end'] = large['start'] + length
        sample_frames.append(large[output_cols].assign(type='mrna'))
    if not sample_frames:
        # Nothing sampled (e.g. n == 0): return an empty, well-formed frame.
        return pd.DataFrame(columns=output_cols + ['type'])
    return pd.concat(sample_frames, axis=0, ignore_index=True)

In [38]:
#| hide
example_sample_mrna = sample_mrna(mrna_locations, int(proportion_mrna * training_instance_target))
example_sample_mrna.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000012.12,GeneID:4033,NM_001366547.2,37025,37089,mrna-small
1,NC_000019.10,GeneID:57153,XM_047439113.1,28988,29052,mrna-small
2,NC_000011.10,GeneID:6833,NM_000352.6,59851,59915,mrna-small
3,NC_000018.10,GeneID:374860,XM_011525664.3,21139,21203,mrna-small
4,NC_000009.12,GeneID:56904,XM_005252098.3,16001,16065,mrna-small


In [50]:
#| export
def sample_mrna_edges(locations: pd.DataFrame, n: int, random_state: int = 42, length: int = 64) -> pd.DataFrame:
    """Get windows at the beginning and at the end of mRNAs.

    The intron table is reduced to one row per transcript. Half of the `n`
    windows (rounded down) cover `[mrna_start, mrna_start + length)`, the
    rest cover `[mrna_end - length, mrna_end)`; both are clipped so they
    never leave `[mrna_start, mrna_end]`.

    Args:
        locations: Intron table with `chromosome`, `geneid`, `transcriptid`,
            `intron_start`, `intron_end`, `mrna_start` and `mrna_end`.
        n: Total number of windows to draw.
        random_state: Seed for the row sampling.
        length: Window size.

    Returns:
        Frame with `chromosome`, `geneid`, `transcriptid`, `start`, `end`
        and a constant `type` of ``"mrna-edge"``, re-indexed from 0 with the
        start windows first.
    """
    output_cols = ['chromosome', 'geneid', 'transcriptid', 'start', 'end']
    transcripts = locations.drop_duplicates(
        ['chromosome', 'geneid', 'transcriptid', 'mrna_start', 'mrna_end']
    ).drop(['intron_start', 'intron_end'], axis=1).reset_index(drop=True)
    n_start = int(n / 2)
    n_end = n - n_start
    # Replacement is only needed when a half asks for more rows than exist.
    replace = max(n_start, n_end) > transcripts.shape[0]
    # Use a different seed for the second half, otherwise both halves select
    # the very same transcripts.
    end_seed = random_state + 1 if isinstance(random_state, int) else random_state
    mrna_starts = transcripts.sample(n_start, replace=replace, random_state=random_state)
    mrna_starts = mrna_starts.assign(
        start=mrna_starts.mrna_start,
        end=mrna_starts.mrna_start + length,
    )
    # Short mRNAs: do not run past the mRNA end.
    mrna_starts['end'] = mrna_starts[['mrna_end', 'end']].min(axis=1)
    mrna_ends = transcripts.sample(n_end, replace=replace, random_state=end_seed)
    mrna_ends = mrna_ends.assign(
        start=mrna_ends.mrna_end - length,
        end=mrna_ends.mrna_end,
    )
    # Short mRNAs: do not start before the mRNA start.
    mrna_ends['start'] = mrna_ends[['mrna_start', 'start']].max(axis=1)
    sample_edges = pd.concat(
        [mrna_starts[output_cols], mrna_ends[output_cols]], axis=0, ignore_index=True
    )
    return sample_edges.assign(type='mrna-edge')

In [51]:
#| hide
example_sample_mrna_edges = sample_mrna_edges(intron_locations, int(proportion_mrna_edge * training_instance_target))
example_sample_mrna_edges.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000022.11,GeneID:6525,XM_011530338.3,17200,17264,mrna-edge
1,NC_000006.12,GeneID:8676,XM_047419441.1,10554,10618,mrna-edge
2,NC_000008.11,GeneID:138046,NM_001354315.2,487,551,mrna-edge
3,NC_000005.10,GeneID:80006,NM_001365343.1,0,64,mrna-edge
4,NC_000004.12,GeneID:419,XM_047415699.1,0,64,mrna-edge


In [42]:
#| export
def sample_sequences_idx(
        n: int, 
        intron_locations: pd.DataFrame,
        mrna_locations: pd.DataFrame,
        intron_prop: float, intron_edge_prop: float, 
        mrna_prop: float, mrna_edge_prop: float,
        random_state: int = 42,
        length: int = 64) -> pd.DataFrame:
    """Build the training-window index from intron and mRNA locations.

    Draws four kinds of windows and stacks them in this order: intron
    interiors, intron edges, mRNA segments and mRNA edges. Each kind gets
    `int(n * <its proportion>)` windows, so the total can be slightly below
    `n` because of the truncation.

    Args:
        n: Target number of windows.
        intron_locations: Intron table passed to `sample_introns`,
            `sample_intron_edges` and `sample_mrna_edges`.
        mrna_locations: Segment table passed to `sample_mrna`.
        intron_prop: Share of windows inside introns.
        intron_edge_prop: Share of windows around intron edges.
        mrna_prop: Share of windows inside mRNA segments.
        mrna_edge_prop: Share of windows at mRNA starts/ends.
        random_state: Seed forwarded to every sampler.
        length: Window size forwarded to every sampler.

    Returns:
        The concatenated samples with a fresh integer index.
    """
    intron_sample = sample_introns(
        intron_locations, int(n * intron_prop), 
        random_state=random_state, length=length)
    # Tie the offset to the window size so the intron edge stays in the
    # middle of the window; the sampler's default of -32 only suits length=64.
    intron_edge_sample = sample_intron_edges(
        intron_locations, int(n * intron_edge_prop), 
        random_state=random_state, offset=-(length // 2), length=length)
    mrna_sample = sample_mrna(
        mrna_locations, int(n * mrna_prop), 
        random_state=random_state, length=length)
    mrna_edge_sample = sample_mrna_edges(
        intron_locations, int(n * mrna_edge_prop),
        random_state=random_state, length=length)
    sample = pd.concat([
        intron_sample,
        intron_edge_sample,
        mrna_sample,
        mrna_edge_sample
    ], axis=0, ignore_index=True)
    return sample

In [43]:
#| hide
example_sample_sequence_idx = sample_sequences_idx(
    500,
    intron_locations,
    mrna_locations,
    intron_prop=proportion_intron,
    intron_edge_prop=proportion_intron_edge,
    mrna_prop=proportion_mrna,
    mrna_edge_prop=proportion_mrna_edge,
    random_state=random_state,
)
example_sample_sequence_idx['type'].value_counts()

type
intron         125
intron-edge    125
mrna-edge      125
mrna           116
mrna-small       9
Name: count, dtype: int64

In [44]:
#| hide
example_sample_sequence_idx.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000002.12,GeneID:1768,XM_011532649.3,251297,251361,intron
1,NC_000010.11,GeneID:657,NM_001406576.1,86904,86968,intron
2,NC_000005.10,GeneID:3187,NM_001257293.2,2153,2217,intron
3,NC_000004.12,GeneID:84286,XM_017008701.2,4587,4651,intron
4,NC_000005.10,GeneID:2533,NM_018594.2,86359,86423,intron


In [45]:
#| export
def get_training_sequences_with_idx(
        chromosome: str, geneid: str, transcriptid: str, 
        genes: pd.DataFrame,
        intron_locations: pd.DataFrame,
        start: int, end: int,
        debug: bool = False
        ) -> tuple[list[str], list[str], int]:
    """Materialise one training window as aligned gene and mRNA token lists.

    Looks up the gene sequence, annotates the requested transcript with
    `get_mrna_from_gene` and slices both token lists with `[start:end]`.

    Args:
        chromosome: Chromosome accession of the transcript.
        geneid: Gene identifier used to fetch the sequence from `genes`.
        transcriptid: Transcript whose introns are annotated.
        genes: Gene table understood by `get_gene`.
        intron_locations: Intron table with `mrna_start`/`mrna_end` columns.
        start: Window start, an index into the annotated token lists.
        end: Window end (exclusive).
        debug: Print timing information when True.

    Returns:
        `(gene_tokens, mrna_tokens, start)`.

    Raises:
        IndexError: If `intron_locations` has no row for the transcript.
    """
    # NOTE(review): the slices below index gene-length lists; confirm the
    # samplers that produce start/end use that same coordinate frame.
    time_start = time.perf_counter()
    gene_sequence = get_gene(genes, geneid)
    read_gene_time = time.perf_counter()
    transcript_introns = intron_locations[
        (intron_locations.chromosome == chromosome) &
        (intron_locations.geneid == geneid) &
        (intron_locations.transcriptid == transcriptid)
    ]
    if transcript_introns.empty:
        # Same exception type as the former out-of-bounds iloc, but readable.
        raise IndexError(
            f"No intron rows found for transcript {transcriptid} of {geneid} on {chromosome}"
        )
    # mrna_start/mrna_end are read from the transcript's first intron row.
    first_row = transcript_introns.iloc[0, :]
    mrna_start = first_row.mrna_start
    mrna_end = first_row.mrna_end
    intron_list = get_mrna_intron_locations(
        chromosome, geneid, transcriptid, transcript_introns
    )
    mrna_intron_list_time = time.perf_counter()
    annotated_gene, annotated_mrna = get_mrna_from_gene(
        gene_sequence, 
        mrna_start, mrna_end, 
        intron_list)
    annotation_sequences_time = time.perf_counter()
    if debug:
        print("Time to read gene:", timedelta(seconds=read_gene_time - time_start))
        print("Time to get list of introns:", timedelta(seconds=mrna_intron_list_time - read_gene_time))
        print("Time to annotate sequences:", timedelta(seconds=annotation_sequences_time - mrna_intron_list_time))
        print("Total time:", timedelta(seconds=annotation_sequences_time - time_start))
    return annotated_gene[start: end], annotated_mrna[start: end], start

In [46]:
#| hide
example_idx_row = example_sample_sequence_idx.iloc[0, :]
print(example_idx_row)
example_training_sequence_gene, example_training_sequence_mrna, example_training_sequence_position = get_training_sequences_with_idx(
    example_idx_row.chromosome, example_idx_row.geneid, example_idx_row.transcriptid, 
    genes,
    intron_locations,
    example_idx_row.start, example_idx_row.end,
    debug=True
)
len(example_training_sequence_gene), len(example_training_sequence_mrna), example_training_sequence_position

chromosome        NC_000002.12
geneid             GeneID:1768
transcriptid    XM_011532649.3
start                   251297
end                     251361
type                    intron
Name: 0, dtype: object
Time to read gene: 0:00:00.007806
Time to get list of introns: 0:00:00.193654
Time to annotate sequences: 0:00:00.188481
Total time: 0:00:00.389941


(64, 64, 251297)

In [48]:
#| hide
example_training_data = example_sample_sequence_idx.head(10).progress_apply(
    lambda row: get_training_sequences_with_idx(
        row.chromosome, row.geneid, row.transcriptid, 
        genes,
        intron_locations,
        row.start, row.end
    ), axis=1)
example_training_data.head()

100%|██████████| 10/10 [00:02<00:00,  4.17it/s]


0    ([A, A, G, A, G, G, A, A, G, A, A, G, A, A, G,...
1    ([A, C, A, A, A, G, A, C, A, T, A, C, C, C, A,...
2    ([A, G, A, G, G, G, C, G, T, G, C, T, C, G, G,...
3    ([T, A, A, C, C, A, T, C, T, T, T, T, T, T, T,...
4    ([A, C, T, G, G, A, T, C, A, C, A, A, A, G, A,...
dtype: object

In [52]:
#| hide
import nbdev; nbdev.nbdev_export()