# Generating training instances from genomic DNA, intron and mRNA positions.

> Preparing training instances.

## Goals
1. Construct the mRNA sequence
2. Generate training instances that:
    1. Include the "edge" of the intron with various shifting strategies
    2. Exclude the "edge" of the intron and get transcribed mRNA

In [59]:
#| default_exp training.transcription.sampling

In [60]:
#| hide
from nbdev.showdoc import *

## 0. Setup

In [61]:
#| export
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import random
import warnings
from tqdm import tqdm
from multiprocessing import current_process
import sqlite3

warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

from llm_mito_scanner.data.download import load_config, \
    get_latest_assembly_path, get_genomic_genbank_path
from llm_mito_scanner.data.transcription import get_genes
from llm_mito_scanner.training.transcription.index import get_intron_locations
from llm_mito_scanner.training.transcription.generation import get_mrna_locations, get_mrna

In [62]:
#| hide
config = load_config()

In [63]:
#| hide
# Resolve the on-disk locations used by this notebook, all rooted at the
# `data_path` entry of the loaded config.
data_path = Path(config.get("data_path"))
data_raw_path = data_path / "raw"
assemblies_path = data_raw_path / "assemblies"
latest_assembly_path = get_latest_assembly_path(assemblies_path)
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
# SQLite databases and the intron position index live under the assembly folder.
gene_database_path = latest_assembly_path / "genes.db"
mrna_database_path = latest_assembly_path / "mrna.db"
training_data_path = latest_assembly_path / "training"
transcription_data_path = training_data_path / "transcription"
intron_locations_path = transcription_data_path / "intron_positions"
# Fail fast: both databases and the intron positions must already exist.
for p in [gene_database_path, mrna_database_path, intron_locations_path]:
    if not p.exists():
        raise FileNotFoundError(f"This notebook requires the path {p.resolve()} to exist")

## Sampling

In [64]:
#| hide
example_chromosome = "NC_000001.11"

In [65]:
#| hide
intron_locations = get_intron_locations(
    intron_locations_path,
    chromosome=example_chromosome
)

In [66]:
#| hide
intron_locations.transcriptid.unique().size

12720

In [67]:
#| hide
# One row per (chromosome, gene, transcript): keep only the mRNA bounds.
mrna_length_df = intron_locations.drop_duplicates(subset=['chromosome', 'geneid', 'transcriptid'])[['mrna_start', 'mrna_end']]
# Total mRNA span on the example chromosome, scaled by 24.
# NOTE(review): the factor 24 presumably extrapolates one chromosome to all of them - confirm.
total_dataset_length = (mrna_length_df.mrna_end - mrna_length_df.mrna_start).sum() * 24
training_sequence_length = 64
# Number of non-overlapping windows of `training_sequence_length` in that span.
num_training_instances = total_dataset_length // training_sequence_length
'{:,}'.format(total_dataset_length), '{:,}'.format(num_training_instances)

('24,893,090,448', '388,954,538')

In [68]:
#| hide
# Convert a wall-clock training budget into a target number of training instances.
training_hours = 6
training_minutes = training_hours * 60
training_seconds = training_minutes * 60
# NOTE(review): presumably 2.5 batches per second at batch size 32 - confirm.
sequences_per_second = 2.5 * 32
training_instance_target = training_seconds * sequences_per_second
training_instance_target

1728000.0

In [69]:
#| hide
proportion_intron_edge = 0.25
proportion_intron = 0.25
proportion_mrna = 0.25
proportion_mrna_edge = 0.25

random_state = 42

sum([proportion_intron_edge, proportion_intron, proportion_mrna, proportion_mrna_edge])

1.0

In [70]:
#| hide
# Each intron has a start edge and an end edge - let's make sure we sample both types.
# Number of intron-edge instances allowed by the overall target and its proportion.
allowed_intron_edges = int(training_instance_target * proportion_intron_edge)
print(allowed_intron_edges)
unique_introns_edges = intron_locations.shape[0] * 2 # start and end edge
unique_introns_edges

432000


337270

In [71]:
#| export
def sample_intron_edges(
        locations: pd.DataFrame, n: int,
        random_state: int = 42, offset: int = -32, length: int = 64) -> pd.DataFrame:
    """Sample windows positioned around the start or end of an intron.

    Half of the `n` windows (rounded down) are anchored on `intron_start`,
    the rest on `intron_end`. A window begins at
    `<edge> - mrna_start + offset` (clipped below at 0) and spans `length`
    positions. A window that would run past the transcript length
    (`mrna_end - mrna_start`) is shifted left so that it ends exactly there.
    Transcripts shorter than `length` yield the window `[0, transcript length)`
    instead of a negative start index.

    Args:
        locations: Frame with the columns `chromosome`, `geneid`,
            `transcriptid`, `intron_start`, `intron_end`, `mrna_start` and
            `mrna_end`.
        n: Total number of windows to return.
        random_state: Seed forwarded to `DataFrame.sample`.
        offset: Shift applied to the edge position to obtain the window start;
            the default of -32 puts the edge in the middle of a 64-wide window.
        length: Window width.

    Returns:
        Frame with the columns `chromosome`, `geneid`, `transcriptid`,
        `start`, `end` and `type` (always ``'intron-edge'``). The index labels
        of the sampled rows are kept.
    """
    start_n = int(n / 2)
    end_n = n - start_n
    population = locations.shape[0]
    # Only sample with replacement when the population is too small.
    replace = (start_n > population) or (end_n > population)
    # NOTE(review): both draws use the same seed, so for an even `n` the start
    # edges and end edges come from the same introns - confirm this is intended.
    starts = locations.sample(start_n, replace=replace, random_state=random_state)
    ends = locations.sample(end_n, replace=replace, random_state=random_state)
    frames = []
    for sampled, edge_column in ((starts, 'intron_start'), (ends, 'intron_end')):
        # Work on plain arrays: sampling with replacement duplicates index
        # labels, and label alignment is not wanted here.
        mrna_start = sampled.mrna_start.to_numpy()
        mrna_len = sampled.mrna_end.to_numpy() - mrna_start
        # NOTE(review): this assumes the intron columns share a coordinate
        # system with `mrna_start` - confirm against the index builder.
        window_start = np.maximum(sampled[edge_column].to_numpy() - mrna_start + offset, 0)
        window_end = np.minimum(window_start + length, mrna_len)
        # Windows cut off by the transcript end are moved left to keep their
        # width; the start is clamped so it can never become negative.
        cut_at_end = ((window_end - window_start) != length) & (window_end == mrna_len)
        window_start = np.where(
            cut_at_end, np.maximum(window_end - length, 0), window_start)
        frame = sampled[['chromosome', 'geneid', 'transcriptid']].copy()
        frame['start'] = window_start
        frame['end'] = window_end
        frames.append(frame)
    intron_edges = pd.concat(frames, axis=0)
    intron_edges['type'] = 'intron-edge'
    return intron_edges

In [72]:
#| hide
example_sample_intron_edges = sample_intron_edges(intron_locations, allowed_intron_edges, random_state=random_state)
example_sample_intron_edges.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
121958,NC_000001.11,GeneID:5202,XM_011509624.4,7442,7506,intron-edge
146867,NC_000001.11,GeneID:93273,XM_011510162.3,0,64,intron-edge
131932,NC_000001.11,GeneID:7827,XM_017002298.2,16238,16302,intron-edge
103694,NC_000001.11,GeneID:84072,XM_047431826.1,9319,9383,intron-edge
119879,NC_000001.11,GeneID:115350,NM_052938.5,22221,22285,intron-edge


In [73]:
#| hide
example_sample_intron_edges_lengths = example_sample_intron_edges.end - example_sample_intron_edges.start
display(example_sample_intron_edges_lengths.value_counts())
bad_length_mask = example_sample_intron_edges_lengths != 64
bad_length_mask.value_counts()

64    432000
Name: count, dtype: int64

False    432000
Name: count, dtype: int64

In [74]:
#| hide
example_sample_intron_edges[bad_length_mask]

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type


In [75]:
#| hide
intron_lengths = intron_locations.intron_end - intron_locations.intron_start
intron_lengths.describe()

count    168635.000000
mean       5827.990850
std       17118.083674
min          33.000000
25%         543.000000
50%        1534.000000
75%        4281.000000
max      451448.000000
dtype: float64

In [76]:
#| hide
intron_lengths.min(), intron_lengths.max()

(33, 451448)

In [77]:
#| export
def sample_introns(
        locations: pd.DataFrame, n: int,
        random_state: int = 42, length: int = 64) -> pd.DataFrame:
    """Sample windows that lie inside introns.

    Introns longer than `length` produce a window of exactly `length`
    positions whose start is drawn uniformly from
    `[intron_start, intron_end - length]` (type ``'intron'``). Introns of at
    most `length` positions produce a window that starts half a window before
    `intron_start` (clipped below at 0) and ends `length` positions later,
    clipped above at `mrna_end` (type ``'intron-small'``).

    Args:
        locations: Frame with the columns `chromosome`, `geneid`,
            `transcriptid`, `intron_start`, `intron_end` and `mrna_end`.
        n: Number of windows to return.
        random_state: Seed for both `DataFrame.sample` and the module-level
            `random` generator.
        length: Window width.

    Returns:
        Frame with the columns `chromosome`, `geneid`, `transcriptid`,
        `start`, `end` and `type`; small introns come first. The index labels
        of the sampled rows are kept. An empty frame with those columns is
        returned when nothing was sampled.
    """
    identity_columns = ['chromosome', 'geneid', 'transcriptid']
    # Seed the module-level generator on purpose: the window positions below
    # are drawn from it, which keeps repeated calls reproducible.
    random.seed(random_state)
    replace = n >= locations.shape[0]
    intron_sample = locations.sample(n, replace=replace, random_state=random_state)
    is_small = (intron_sample.intron_end - intron_sample.intron_start) <= length
    small_introns = intron_sample[is_small]
    large_introns = intron_sample[~is_small]
    sample_frames = []
    if not small_introns.empty:
        # The intron fits inside one window, so start half a window before it.
        small_start = np.maximum(
            small_introns.intron_start.to_numpy() - int(length / 2), 0)
        # NOTE(review): `mrna_end` is used as the upper bound without
        # subtracting `mrna_start` - confirm both share a coordinate system.
        small_end = np.minimum(small_start + length, small_introns.mrna_end.to_numpy())
        sample_frames.append(
            small_introns[identity_columns].assign(
                start=small_start, end=small_end, type='intron-small'))
    if not large_introns.empty:
        # Pick a start so the whole window stays inside the intron.
        # `randint(lo, hi)` is equivalent to `choice(range(lo, hi + 1))`
        # without building a range object per row.
        lows = large_introns.intron_start.to_numpy()
        highs = large_introns.intron_end.to_numpy() - length
        large_start = np.array(
            [random.randint(int(lo), int(hi)) for lo, hi in zip(lows, highs)],
            dtype=np.int64)
        sample_frames.append(
            large_introns[identity_columns].assign(
                start=large_start, end=large_start + length, type='intron'))
    if not sample_frames:
        # `pd.concat` rejects an empty list; return a well-formed empty result.
        return pd.DataFrame(columns=identity_columns + ['start', 'end', 'type'])
    return pd.concat(sample_frames, axis=0)

In [78]:
#| hide
example_sample_introns = sample_introns(intron_locations, int(training_instance_target * proportion_intron), random_state=random_state)
example_sample_introns.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
112692,NC_000001.11,GeneID:55974,XM_047425465.1,2215,2279,intron-small
4987,NC_000001.11,GeneID:124903828,XM_047436627.1,27191,27255,intron-small
112683,NC_000001.11,GeneID:55974,XM_006711453.3,2215,2279,intron-small
9316,NC_000001.11,GeneID:8863,NM_001289863.3,45629,45693,intron-small
9316,NC_000001.11,GeneID:8863,NM_001289863.3,45629,45693,intron-small


In [79]:
#| hide
(example_sample_introns.end - example_sample_introns.start).unique()

array([64])

In [80]:
#| hide
intron_locations.head(5)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [81]:
#| hide
# Work on the first 50 introns after ordering by transcript.
intron_locations_sample = intron_locations.sort_values(['chromosome', 'geneid', 'transcriptid']).head(50)
intron_locations_sample_grouped = intron_locations_sample.groupby(
        ['chromosome', 'geneid', 'transcriptid']
)
# The mRNA segment before each intron starts where the previous intron of the
# same transcript ended (0 for the first intron), offset by `mrna_start`...
intron_locations_sample.loc[:, 'mrna_seq_start'] = (
    intron_locations_sample_grouped.intron_end.shift(1).fillna(0) + \
        intron_locations_sample.mrna_start
).astype(int)
# ...and ends where this intron starts, offset by `mrna_start`.
intron_locations_sample.loc[:, 'mrna_seq_end'] = intron_locations_sample.intron_start + \
        intron_locations_sample.mrna_start
display(intron_locations_sample.head(3))
# Add the final mRNA segment: from the end of each transcript's last intron to `mrna_end`.
# NOTE(review): unlike the segments above, no `mrna_start` offset is added here - confirm.
intron_locations_sample_last_intron = intron_locations_sample_grouped.tail(1)
intron_locations_sample_last_intron.loc[:, 'mrna_seq_start'] = intron_locations_sample_last_intron.intron_end
intron_locations_sample_last_intron.loc[:, 'mrna_seq_end'] = intron_locations_sample_last_intron.mrna_end
display(intron_locations_sample_last_intron.head(3))
# Stack both kinds of segment and order them by transcript again.
intron_locations_sample_mrna_locations = pd.concat(
    [
        intron_locations_sample,
        intron_locations_sample_last_intron
    ], axis=0, ignore_index=True
    ).sort_values(
        ['chromosome', 'geneid', 'transcriptid']).reset_index(drop=True)
intron_locations_sample_mrna_locations.head(3)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
166694,GeneID:10000,NM_001206729.2,41,302,7495,351356,NC_000001.11,7495,7536
166695,GeneID:10000,NM_001206729.2,460,147868,7495,351356,NC_000001.11,7797,7955
166696,GeneID:10000,NM_001206729.2,147994,178701,7495,351356,NC_000001.11,155363,155489


Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
166706,GeneID:10000,NM_001206729.2,331261,343798,7495,351356,NC_000001.11,343798,351356
166602,GeneID:10000,NM_001370074.1,338756,345745,0,351356,NC_000001.11,345745,351356
166693,GeneID:10000,NM_005465.7,337920,344909,836,351356,NC_000001.11,344909,351356


Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
0,GeneID:10000,NM_001206729.2,41,302,7495,351356,NC_000001.11,7495,7536
1,GeneID:10000,NM_001206729.2,460,147868,7495,351356,NC_000001.11,7797,7955
2,GeneID:10000,NM_001206729.2,147994,178701,7495,351356,NC_000001.11,155363,155489


In [82]:
#| export
def sample_mrna(
        mrna_locations: pd.DataFrame, n: int,
        random_state: int = 42, length: int = 64) -> pd.DataFrame:
    """Sample windows that lie inside mRNA segments.

    Segments longer than `length` produce a window of exactly `length`
    positions whose start is drawn uniformly from `[start, end - length]`
    (type ``'mrna'``). Segments of at most `length` positions produce a window
    that starts half a window before the segment start (clipped below at 0)
    and ends `length` positions later, clipped above at `mrna_end`
    (type ``'mrna-small'``).

    Args:
        mrna_locations: Frame with the columns `chromosome`, `geneid`,
            `transcriptid`, `start`, `end` and `mrna_end`.
        n: Number of windows to return.
        random_state: Seed for `DataFrame.sample` and for the placement of
            windows inside large segments.
        length: Window width.

    Returns:
        Frame with a fresh 0..k-1 index and the columns `chromosome`,
        `geneid`, `transcriptid`, `start`, `end` and `type`; small segments
        come first. An empty frame with those columns is returned when
        nothing was sampled.
    """
    identity_columns = ['chromosome', 'geneid', 'transcriptid']
    # A private generator makes window placement depend on `random_state`
    # only, not on whatever last touched the module-level `random` state.
    rng = random.Random(random_state)
    replace = n >= mrna_locations.shape[0]
    sampled = mrna_locations.sample(n, replace=replace, random_state=random_state)
    is_small = (sampled.end - sampled.start) <= length
    small_sequences = sampled[is_small]
    large_sequences = sampled[~is_small]
    sample_frames = []
    if not small_sequences.empty:
        # The segment fits inside one window, so start half a window before it.
        small_start = np.maximum(
            small_sequences.start.to_numpy() - int(length / 2), 0)
        small_end = np.minimum(small_start + length, small_sequences.mrna_end.to_numpy())
        sample_frames.append(
            small_sequences[identity_columns].assign(
                start=small_start, end=small_end, type='mrna-small'))
    if not large_sequences.empty:
        # Pick a start so the whole window stays inside the segment.
        lows = large_sequences.start.to_numpy()
        highs = large_sequences.end.to_numpy() - length
        large_start = np.array(
            [rng.randint(int(lo), int(hi)) for lo, hi in zip(lows, highs)],
            dtype=np.int64)
        sample_frames.append(
            large_sequences[identity_columns].assign(
                start=large_start, end=large_start + length, type='mrna'))
    if not sample_frames:
        # `pd.concat` rejects an empty list; return a well-formed empty result.
        return pd.DataFrame(columns=identity_columns + ['start', 'end', 'type'])
    return pd.concat(sample_frames, axis=0, ignore_index=True)

In [83]:
#| hide
mrna_locations = get_mrna_locations(intron_locations)

In [84]:
#| hide
example_sample_mrna = sample_mrna(mrna_locations, int(proportion_mrna * training_instance_target))
example_sample_mrna.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000001.11,GeneID:55740,XM_024448311.2,154687,154751,mrna-small
1,NC_000001.11,GeneID:405,XM_005245153.2,30260,30324,mrna-small
2,NC_000001.11,GeneID:5909,NM_001388231.1,51353,51417,mrna-small
3,NC_000001.11,GeneID:5909,NM_001388295.1,38637,38701,mrna-small
4,NC_000001.11,GeneID:353134,NM_178352.3,0,64,mrna-small


In [85]:
#| export
def sample_mrna_edges(locations: pd.DataFrame, n: int, random_state: int = 42, length: int = 64) -> pd.DataFrame:
    """Sample windows at the very beginning and the very end of transcripts.

    `locations` is first reduced to one row per transcript. Half of the `n`
    windows (rounded down) start at `mrna_start`, the remainder end at
    `mrna_end`; each spans up to `length` positions and never leaves
    `[mrna_start, mrna_end]`. Every returned row has type ``'mrna-edge'``.
    """
    transcript_key = ['chromosome', 'geneid', 'transcriptid', 'mrna_start', 'mrna_end']
    transcripts = locations.drop_duplicates(transcript_key)
    transcripts = transcripts.drop(['intron_start', 'intron_end'], axis=1)
    transcripts = transcripts.reset_index(drop=True)

    n_start = int(n / 2)
    n_end = n - n_start
    population = transcripts.shape[0]
    # With replacement only when neither draw fits into the population.
    replace = (n_start >= population) and (n_end >= population)

    # Windows hugging the transcript start.
    leading = transcripts.sample(n_start, replace=replace, random_state=random_state)
    leading = leading.rename({'mrna_start': 'start'}, axis=1)
    leading.loc[:, 'end'] = leading.start + length
    leading.loc[:, 'end'] = leading[['mrna_end', 'end']].min(axis=1)
    leading.drop(['mrna_end'], axis=1, inplace=True)

    # Windows hugging the transcript end.
    trailing = transcripts.sample(n_end, replace=replace, random_state=random_state)
    trailing = trailing.rename({'mrna_end': 'end'}, axis=1)
    trailing.loc[:, 'start'] = trailing.end - length
    trailing.loc[:, 'start'] = trailing[['mrna_start', 'start']].max(axis=1)
    trailing.drop(['mrna_start'], axis=1, inplace=True)

    edges = pd.concat([leading, trailing], axis=0, ignore_index=True)
    edges = edges[['chromosome', 'geneid', 'transcriptid', 'start', 'end']]
    edges.loc[:, 'type'] = 'mrna-edge'
    return edges

In [86]:
#| hide
example_sample_mrna_edges = sample_mrna_edges(intron_locations, int(proportion_mrna_edge * training_instance_target))
example_sample_mrna_edges.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000001.11,GeneID:9659,NM_001395320.1,36075,36139,mrna-edge
1,NC_000001.11,GeneID:5293,XM_047422571.1,24473,24537,mrna-edge
2,NC_000001.11,GeneID:11080,NM_007034.5,25764,25828,mrna-edge
3,NC_000001.11,GeneID:81573,NM_030816.5,0,64,mrna-edge
4,NC_000001.11,GeneID:116841,XM_017000232.2,463,527,mrna-edge


In [87]:
#| export
def sample_sequences_idx(
        n: int, 
        intron_locations: pd.DataFrame,
        mrna_locations: pd.DataFrame,
        intron_prop: float, intron_edge_prop: float, 
        mrna_prop: float, mrna_edge_prop: float,
        random_state: int = 42,
        length: int = 64) -> pd.DataFrame:
    """Assemble a window index mixing the four sample types.

    Each proportion is multiplied by `n` and truncated to an integer to get
    the number of windows requested from the matching sampler. The four
    results are stacked in the order intron, intron-edge, mRNA, mRNA-edge and
    re-indexed from 0.
    """
    shared = {"random_state": random_state, "length": length}
    # The samplers are called in a fixed order on purpose; do not reorder.
    introns = sample_introns(intron_locations, int(n * intron_prop), **shared)
    intron_edges = sample_intron_edges(intron_locations, int(n * intron_edge_prop), **shared)
    mrna = sample_mrna(mrna_locations, int(n * mrna_prop), **shared)
    mrna_edges = sample_mrna_edges(intron_locations, int(n * mrna_edge_prop), **shared)
    parts = [introns, intron_edges, mrna, mrna_edges]
    return pd.concat(parts, axis=0, ignore_index=True)

In [88]:
#| hide
example_sample_sequence_idx = sample_sequences_idx(
    500,
    intron_locations,
    mrna_locations,
    intron_prop=proportion_intron,
    intron_edge_prop=proportion_intron_edge,
    mrna_prop=proportion_mrna,
    mrna_edge_prop=proportion_mrna_edge,
    random_state=random_state,
)
example_sample_sequence_idx['type'].value_counts()

type
intron         125
intron-edge    125
mrna-edge      125
mrna           114
mrna-small      11
Name: count, dtype: int64

In [89]:
#| hide
example_sample_sequence_idx.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000001.11,GeneID:83872,XM_011510038.4,402874,402938,intron
1,NC_000001.11,GeneID:84288,NM_001143943.1,7411,7475,intron
2,NC_000001.11,GeneID:64754,XM_047428021.1,52326,52390,intron
3,NC_000001.11,GeneID:149483,XM_011540820.2,1312,1376,intron
4,NC_000001.11,GeneID:57540,XM_011541832.2,40411,40475,intron


## Getting sequences from sample indices

In [90]:
#| export
def get_training_sequences_with_idx(
        gene: list[str], mrna: list[str],
        start: int, end: int,
        ) -> tuple[str, str]:
    """Cut the same `[start, end)` window out of both token lists.

    Returns the gene tokens and the mRNA tokens of that window, each joined
    into one comma-separated string.
    """
    window = slice(start, end)
    gene_text = ",".join(gene[window])
    mrna_text = ",".join(mrna[window])
    return gene_text, mrna_text

In [91]:
#| hide
example_idx_row = example_sample_sequence_idx.iloc[0, :]
example_idx_row

chromosome        NC_000001.11
geneid            GeneID:83872
transcriptid    XM_011510038.4
start                   402874
end                     402938
type                    intron
Name: 0, dtype: object

In [92]:
#| export
def tokenize_gene(gene: str) -> list[str]:
    "Split a gene sequence into one token per character."
    return [base for base in gene]


def tokenize_mrna(mrna: str) -> list[str]:
    "Split a comma-separated mRNA string into its tokens."
    delimiter = ','
    tokens = mrna.split(delimiter)
    return tokens

In [93]:
#| hide
len(tokenize_gene(get_genes(latest_assembly_path, 
        columns=['sequence'],
        chromosome=example_idx_row.chromosome, 
        gene_ids=[example_idx_row.geneid]).sequence.iloc[0]))

456559

In [94]:
#| hide
len(tokenize_mrna(get_mrna(
        latest_assembly_path,
        columns=['sequence'],
        chromosome=example_idx_row.chromosome,
        gene_ids=[example_idx_row.geneid], 
        transcript_ids=[example_idx_row.transcriptid]).sequence.iloc[0]))

456559

In [95]:
#| hide
example_training_sequence_gene, example_training_sequence_mrna = get_training_sequences_with_idx(
    tokenize_gene(
        get_genes(
            latest_assembly_path, 
            columns=['sequence'],
            chromosome=example_idx_row.chromosome, 
            gene_ids=[example_idx_row.geneid]).sequence.iloc[0]),
    tokenize_mrna(
        get_mrna(
            latest_assembly_path,
            columns=['sequence'],
            chromosome=example_idx_row.chromosome,
            gene_ids=[example_idx_row.geneid], 
            transcript_ids=[example_idx_row.transcriptid]).sequence.iloc[0]),
    example_idx_row.start, example_idx_row.end,
)
len(example_training_sequence_gene), len(example_training_sequence_mrna)

(127, 575)

In [96]:
#| hide
len(example_training_sequence_gene.split(',')), len(example_training_sequence_mrna.split(","))

(64, 64)

In [97]:
#| hide
type(example_training_sequence_gene), type(example_training_sequence_mrna)

(str, str)

In [98]:
#| hide
example_sample_sequence_idx.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000001.11,GeneID:83872,XM_011510038.4,402874,402938,intron
1,NC_000001.11,GeneID:84288,NM_001143943.1,7411,7475,intron
2,NC_000001.11,GeneID:64754,XM_047428021.1,52326,52390,intron
3,NC_000001.11,GeneID:149483,XM_011540820.2,1312,1376,intron
4,NC_000001.11,GeneID:57540,XM_011541832.2,40411,40475,intron


In [99]:
#| hide
example_sample_sequence_idx.shape

(500, 6)

In [100]:
#| export
def get_gene_transcript_samples(
    chromosome: str,
    geneid: str,
    transcript_id: str,
    sample_idx: list[list[int, int]],
    assembly_path: Path,
    sep: str = ",",
    gene_connection: sqlite3.Connection | None = None,
    mrna_connection: sqlite3.Connection | None = None) -> list[tuple[str, str]]:
    """Cut (input, target) training windows out of one gene/transcript pair.

    The gene sequence and the transcript's mRNA sequence are each fetched
    once, tokenized, and then sliced with every `[start, end)` pair in
    `sample_idx`.

    Args:
        chromosome: Chromosome identifier passed to `get_genes` / `get_mrna`.
        geneid: Gene identifier.
        transcript_id: Transcript identifier.
        sample_idx: `(start, end)` pairs used as slice bounds on both token lists.
        assembly_path: Assembly folder handed to `get_genes` / `get_mrna`.
        sep: Separator used to join the tokens of a window.
        gene_connection: Optional open connection forwarded as `con` to `get_genes`.
        mrna_connection: Optional open connection forwarded as `con` to `get_mrna`.

    Returns:
        One `(gene_window, mrna_window)` tuple of joined strings per entry of
        `sample_idx`, in the same order.

    Raises:
        IndexError: If either lookup returns no rows (from `.iloc[0]`).
    """
    # Only the first returned row is used for each lookup.
    gene_sequence = get_genes(
        assembly_path, chromosome=chromosome,
        gene_ids=[geneid],
        con=gene_connection
    ).sequence.iloc[0]
    mrna_sequence = get_mrna(
        assembly_path, chromosome=chromosome,
        gene_ids=[geneid], transcript_ids=[transcript_id],
        con=mrna_connection
    ).sequence.iloc[0]
    gene_tokens = tokenize_gene(gene_sequence)
    mrna_tokens = tokenize_mrna(mrna_sequence)
    # The same bounds are applied to both lists so input and target stay aligned.
    return [
        (sep.join(gene_tokens[start:end]), sep.join(mrna_tokens[start:end]))
        for start, end in sample_idx
    ]

In [101]:
#| hide
example_sample_sequence_idx.groupby(["chromosome", "geneid", "transcriptid"]).type.count().sort_values().tail()

chromosome    geneid         transcriptid  
NC_000001.11  GeneID:6262    XM_047427337.1    3
              GeneID:200159  XM_011544129.4    3
              GeneID:149483  XM_011540820.2    3
              GeneID:64175   XM_047427621.1    3
              GeneID:81627   NM_001202423.2    5
Name: type, dtype: int64

In [102]:
#| hide
example_idx_chromosome = "NC_000001.11"
example_idx_geneid = "GeneID:81627"
example_idx_transcriptid = "NM_001202423.2"

example_idx_df = example_sample_sequence_idx[
    (example_sample_sequence_idx.chromosome == example_idx_chromosome) &
    (example_sample_sequence_idx.geneid == example_idx_geneid) &
    (example_sample_sequence_idx.transcriptid == example_idx_transcriptid)]# [['start', 'end']].values.tolist()
example_idx_idx = example_idx_df[['start', 'end']].values.tolist()
example_types = example_idx_df['type'].tolist()
example_idx_idx

[[11302, 11366], [9446, 9510], [11059, 11123], [439, 503], [39373, 39437]]

In [103]:
#| hide
example_idx_sequences = get_gene_transcript_samples(
    example_idx_chromosome,
    example_idx_geneid,
    example_idx_transcriptid,
    example_idx_idx,
    latest_assembly_path
)
example_idx_sequences

[('C,A,A,G,C,T,G,G,T,T,G,T,T,T,A,G,T,A,T,G,G,C,C,T,C,C,A,T,T,A,A,A,A,A,T,T,A,A,G,T,T,A,T,A,T,A,A,A,C,C,T,A,A,A,A,A,T,G,A,A,T,G,A,G',
  '<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>'),
 ('T,G,G,A,G,C,A,C,T,T,C,A,G,C,A,T,A,T,A,C,A,A,G,T,A,A,G,A,A,A,T,G,A,T,C,G,A,A,T,C,T,A,A,G,G,T,A,T,A,A,G,A,A,A,A,A,T,G,C,A,T,T,A,G',
  '<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intron>,<intr

In [104]:
#| hide
example_idx_sequences_frame = pd.DataFrame(example_idx_sequences, columns=['input', 'target'])
example_idx_sequences_frame.head()

Unnamed: 0,input,target
0,"C,A,A,G,C,T,G,G,T,T,G,T,T,T,A,G,T,A,T,G,G,C,C,...","<intron>,<intron>,<intron>,<intron>,<intron>,<..."
1,"T,G,G,A,G,C,A,C,T,T,C,A,G,C,A,T,A,T,A,C,A,A,G,...","<intron>,<intron>,<intron>,<intron>,<intron>,<..."
2,"A,G,G,A,A,C,C,T,T,G,G,G,T,A,A,T,G,T,G,C,T,T,C,...","<intron>,<intron>,<intron>,<intron>,<intron>,<..."
3,"A,G,A,C,T,G,C,G,C,A,G,T,G,T,A,T,C,C,T,G,G,G,T,...","A,G,A,C,U,G,C,G,C,A,G,U,G,U,A,U,C,C,U,G,G,G,U,..."
4,"A,A,A,T,G,T,T,T,A,T,C,A,T,G,T,T,G,T,A,A,A,G,T,...","A,A,A,U,G,U,U,U,A,U,C,A,U,G,U,U,G,U,A,A,A,G,U,..."


In [105]:
#| hide
example_idx_sequences_frame.input.str.split(",").apply(len).value_counts(),\
example_idx_sequences_frame.target.str.split(",").apply(len).value_counts()

(input
 64    5
 Name: count, dtype: int64,
 target
 64    5
 Name: count, dtype: int64)

In [106]:
#| export
def get_gene_transcript_samples_wrapper(args) -> pd.DataFrame:
    """Dict-driven variant of `get_gene_transcript_samples` returning a frame.

    Reads the keys `chromosome`, `geneid`, `transcriptid`, `index`, `types`
    and `assembly_path` from `args` (missing keys become `None`). The result
    has one row per window with the columns `input`, `target`, `chromosome`,
    `geneid`, `transcriptid`, `type`, `start` and `end`.
    """
    chromosome, gene_id, transcript_id, sample_idx, index_types, assembly_path = (
        args.get(key)
        for key in ("chromosome", "geneid", "transcriptid", "index", "types", "assembly_path")
    )
    samples = get_gene_transcript_samples(
        chromosome, gene_id, transcript_id, sample_idx, assembly_path)
    sequences_frame = pd.DataFrame(samples, columns=['input', 'target'])
    # Attach the identifying metadata, then the per-window type labels.
    metadata = (
        ('chromosome', chromosome),
        ('geneid', gene_id),
        ('transcriptid', transcript_id),
        ('type', index_types),
    )
    for column, value in metadata:
        sequences_frame.loc[:, column] = value
    # The window bounds are appended as the last two columns.
    bounds = pd.DataFrame(sample_idx, columns=['start', 'end'])
    return pd.concat([sequences_frame, bounds], axis=1)

In [107]:
#| hide
get_gene_transcript_samples_wrapper(
    {
        "chromosome": example_idx_chromosome,
        "geneid": example_idx_geneid,
        "transcriptid": example_idx_transcriptid,
        "index": example_idx_idx,
        "types": example_types,
        "assembly_path": latest_assembly_path,
    }
)

Unnamed: 0,input,target,chromosome,geneid,transcriptid,type,start,end
0,"C,A,A,G,C,T,G,G,T,T,G,T,T,T,A,G,T,A,T,G,G,C,C,...","<intron>,<intron>,<intron>,<intron>,<intron>,<...",NC_000001.11,GeneID:81627,NM_001202423.2,intron,11302,11366
1,"T,G,G,A,G,C,A,C,T,T,C,A,G,C,A,T,A,T,A,C,A,A,G,...","<intron>,<intron>,<intron>,<intron>,<intron>,<...",NC_000001.11,GeneID:81627,NM_001202423.2,intron-edge,9446,9510
2,"A,G,G,A,A,C,C,T,T,G,G,G,T,A,A,T,G,T,G,C,T,T,C,...","<intron>,<intron>,<intron>,<intron>,<intron>,<...",NC_000001.11,GeneID:81627,NM_001202423.2,intron-edge,11059,11123
3,"A,G,A,C,T,G,C,G,C,A,G,T,G,T,A,T,C,C,T,G,G,G,T,...","A,G,A,C,U,G,C,G,C,A,G,U,G,U,A,U,C,C,U,G,G,G,U,...",NC_000001.11,GeneID:81627,NM_001202423.2,mrna-edge,439,503
4,"A,A,A,T,G,T,T,T,A,T,C,A,T,G,T,T,G,T,A,A,A,G,T,...","A,A,A,U,G,U,U,U,A,U,C,A,U,G,U,U,G,U,A,A,A,G,U,...",NC_000001.11,GeneID:81627,NM_001202423.2,mrna-edge,39373,39437


In [108]:
#| hide
example_sample_sequence_idx.loc[:, "idx"] = pd.Series(zip(example_sample_sequence_idx.start, example_sample_sequence_idx.end))
example_sample_sequence_idx.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type,idx
0,NC_000001.11,GeneID:83872,XM_011510038.4,402874,402938,intron,"(402874, 402938)"
1,NC_000001.11,GeneID:84288,NM_001143943.1,7411,7475,intron,"(7411, 7475)"
2,NC_000001.11,GeneID:64754,XM_047428021.1,52326,52390,intron,"(52326, 52390)"
3,NC_000001.11,GeneID:149483,XM_011540820.2,1312,1376,intron,"(1312, 1376)"
4,NC_000001.11,GeneID:57540,XM_011541832.2,40411,40475,intron,"(40411, 40475)"


In [109]:
#| hide
# Collapse the index to one row per transcript, collecting its windows and their types into lists.
example_sample_idx_grouped = example_sample_sequence_idx.groupby(["chromosome", "geneid", "transcriptid"]).agg({
    "idx": list,
    "type": list
})
# Count the windows per transcript and list the transcripts with the most windows first.
example_sample_idx_grouped.loc[:, 'samples'] = example_sample_idx_grouped.idx.apply(len)
example_sample_idx_grouped.sort_values('samples', ascending=False, inplace=True)
example_sample_idx_grouped.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,idx,type,samples
chromosome,geneid,transcriptid,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
NC_000001.11,GeneID:81627,NM_001202423.2,"[(11302, 11366), (9446, 9510), (11059, 11123),...","[intron, intron-edge, intron-edge, mrna-edge, ...",5
NC_000001.11,GeneID:54823,NM_017673.7,"[(119990, 120054), (119363, 119427), (133368, ...","[intron, intron-edge, intron-edge]",3
NC_000001.11,GeneID:64754,XM_047428021.1,"[(52326, 52390), (0, 64), (0, 64)]","[intron, intron-edge, intron-edge]",3
NC_000001.11,GeneID:22955,XM_047449565.1,"[(170050, 170114), (167056, 167120), (171553, ...","[intron, intron-edge, intron-edge]",3
NC_000001.11,GeneID:57540,XM_011541832.2,"[(40411, 40475), (40317, 40381), (40532, 40596)]","[intron, intron-edge, intron-edge]",3


In [52]:
#| export
def get_multiple_training_sequences_wrapper(args: dict):
    """Turn a grouped window index into training sequences written as parquet.

    `args` keys:
        index: Frame with the columns `chromosome`, `geneid`, `transcriptid`
            and `idx` (the list of `(start, end)` windows for that transcript).
        assembly_path: Assembly folder holding `genes.db` and `mrna.db`.
        epoch: Label used in the output folder name and the progress bar.
        chunk_size: Approximate number of index rows per chunk (default 50).
        batch_size: Minimum number of sequences per written file (default 10000).
        save: Whether to write the batches (default True).

    Batches are written to
    `<assembly_path>/training/transcription/sequences/epoch-<epoch>/batch-<k>.parquet`
    with the columns `input` and `target`. Nothing is returned.
    """
    from contextlib import closing

    process_idx = next(iter(current_process()._identity), 0)
    index = args.get("index")
    assembly_path = args.get("assembly_path")
    epoch = args.get("epoch")
    chunk_size = args.get("chunk_size", 50)
    batch_size = args.get("batch_size", 10000)
    save = args.get("save", True)
    write_path = assembly_path / "training/transcription/sequences" / f"epoch-{epoch}"
    if save:
        # Several workers may reach this at once and the parent folder may
        # not exist yet; neither case should fail.
        write_path.mkdir(parents=True, exist_ok=True)

    def write_batch(frames: list, counter: int) -> None:
        # NOTE(review): the file name does not include the worker id, so
        # workers sharing one epoch would overwrite each other - confirm
        # that every worker gets its own epoch.
        merged = pd.concat(frames, axis=0, ignore_index=True)
        merged.to_parquet(write_path / f"batch-{counter}.parquet", index=False)

    # Split row positions rather than the frame itself; the chunk boundaries
    # are the same as `np.array_split` on the frame.
    num_chunks = max(1, int(index.shape[0] / chunk_size))
    chunk_positions = np.array_split(np.arange(index.shape[0]), num_chunks)

    batch = []
    batch_row_counter = 0
    batch_counter = 1
    # The context managers close both connections and the progress bar, even
    # when a chunk fails.
    with closing(sqlite3.connect(assembly_path / "genes.db")) as genes_connection, \
            closing(sqlite3.connect(assembly_path / "mrna.db")) as mrna_connection, \
            tqdm(
                total=len(chunk_positions), ncols=80,
                desc=f"Epoch-{epoch}",
                position=process_idx) as pbar:
        for positions in chunk_positions:
            chunk = index.iloc[positions]
            # One frame of (input, target) rows per transcript. The open
            # connections are passed on to the per-transcript lookups.
            frames = [
                pd.DataFrame(
                    get_gene_transcript_samples(
                        chromosome, geneid, transcriptid, windows, assembly_path,
                        gene_connection=genes_connection,
                        mrna_connection=mrna_connection),
                    columns=['input', 'target'])
                for chromosome, geneid, transcriptid, windows in zip(
                    chunk.chromosome, chunk.geneid, chunk.transcriptid, chunk.idx)
            ]
            if frames:
                sequences = pd.concat(frames, axis=0)
                batch.append(sequences)
                batch_row_counter += sequences.shape[0]
            pbar.update(1)
            if save and batch_row_counter >= batch_size:
                write_batch(batch, batch_counter)
                batch = []
                batch_row_counter = 0
                batch_counter += 1
        # Write whatever is left over as the final batch.
        if save and batch:
            write_batch(batch, batch_counter)

In [53]:
#| hide
example_idx_grouped_batch = np.array_split(example_sample_idx_grouped, 2)[0].reset_index(drop=False)
example_idx_grouped_batch.head()

Unnamed: 0,chromosome,geneid,transcriptid,idx,type,samples
0,NC_000001.11,GeneID:81627,NM_001202423.2,"[(11302, 11366), (9446, 9510), (11059, 11123),...","[intron, intron-edge, intron-edge, mrna-edge, ...",5
1,NC_000001.11,GeneID:54823,NM_017673.7,"[(119990, 120054), (119363, 119427), (133368, ...","[intron, intron-edge, intron-edge]",3
2,NC_000001.11,GeneID:64754,XM_047428021.1,"[(52326, 52390), (0, 64), (0, 64)]","[intron, intron-edge, intron-edge]",3
3,NC_000001.11,GeneID:22955,XM_047449565.1,"[(170050, 170114), (167056, 167120), (171553, ...","[intron, intron-edge, intron-edge]",3
4,NC_000001.11,GeneID:57540,XM_011541832.2,"[(40411, 40475), (40317, 40381), (40532, 40596)]","[intron, intron-edge, intron-edge]",3


In [54]:
#| hide
example_batch_result = get_multiple_training_sequences_wrapper({
    "index": example_idx_grouped_batch,
    "assembly_path": latest_assembly_path,
    "epoch": 0 + 1,
    "save": False
})

Process-0-Generating:   0%|                               | 0/3 [00:00<?, ?it/s]

Process-0-Generating: 100%|███████████████████████| 3/3 [00:12<00:00,  4.08s/it]


In [55]:
#| export
def create_genes_index(assembly_path):
    """Create the unique `genes_index` on `genes(chromosome, geneid)` if missing.

    Opens `<assembly_path>/genes.db`, issues `CREATE UNIQUE INDEX IF NOT
    EXISTS`, commits and closes the connection. Calling it again is a no-op.
    Any `sqlite3` error raised by the statement propagates after the
    connection has been closed.
    """
    con = sqlite3.connect(assembly_path / "genes.db")
    try:
        con.execute("CREATE UNIQUE INDEX IF NOT EXISTS genes_index ON genes(chromosome, geneid);")
        con.commit()
    finally:
        # Always release the database handle, even when the statement fails.
        con.close()


def create_mrna_index(assembly_path):
    """Create the unique `mrna_index` on `mrna(chromosome, geneid, transcriptid)` if missing.

    Opens `<assembly_path>/mrna.db`, issues `CREATE UNIQUE INDEX IF NOT
    EXISTS`, commits and closes the connection. Calling it again is a no-op.
    Any `sqlite3` error raised by the statement propagates after the
    connection has been closed.
    """
    con = sqlite3.connect(assembly_path / "mrna.db")
    try:
        con.execute("CREATE UNIQUE INDEX IF NOT EXISTS mrna_index ON mrna(chromosome, geneid, transcriptid);")
        con.commit()
    finally:
        # Always release the database handle, even when the statement fails.
        con.close()

In [56]:
#| hide
create_genes_index(latest_assembly_path)

In [57]:
#| hide
create_mrna_index(latest_assembly_path)

In [110]:
#| hide
import nbdev; nbdev.nbdev_export()