# Generating training instances from genomic dna, intron and mRNA positions.

> Preparing training instances.

## Goals
1. Construct the mRNA sequence
3. Generate training instances that;
    1. Include the "edge" of the intron with various shifting strategies
    2. Exclude the "edge" of the intron and get transcribed mRNA

In [1]:
#| default_exp training.transcription.sampling

In [None]:
#| hide
from nbdev.showdoc import *

## 0. Setup

In [2]:
#| export
from pathlib import Path
import numpy as np
import pandas as pd
from pandas.errors import SettingWithCopyWarning
import random
import warnings
import time
from datetime import timedelta
from tqdm import tqdm

warnings.filterwarnings("ignore", category=SettingWithCopyWarning)

from llm_mito_scanner.data.download import load_config, \
    get_latest_assembly_path, get_genomic_genbank_path
from llm_mito_scanner.data.transcription import get_chromosome_genes
from llm_mito_scanner.training.transcription.index import get_intron_locations
from llm_mito_scanner.training.transcription.generation import get_mrna_locations, get_gene

In [3]:
#| hide
config = load_config()

In [4]:
#| hide
data_path = Path(config.get("data_path"))
data_raw_path = data_path / "raw"
assemblies_path = data_raw_path / "assemblies"
latest_assembly_path = get_latest_assembly_path(assemblies_path)
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
genes_path = latest_assembly_path / "genes"
training_data_path = latest_assembly_path / "training"
transcription_data_path = training_data_path / "transcription"
intron_locations_path = transcription_data_path / "intron_positions"
for p in [genes_path, intron_locations_path]:
    if not p.exists():
        raise FileNotFoundError(f"This notebook requires the path {p.resolve()} to exist")

## Sampling

In [5]:
#| hide
example_chromosome = "NC_000001.11"

In [6]:
#| hide
intron_locations = get_intron_locations(
    intron_locations_path,
    chromosome=example_chromosome
)

In [7]:
#| hide
intron_locations.transcriptid.unique().size

12720

In [8]:
#| hide
mrna_length_df = intron_locations.drop_duplicates(subset=['chromosome', 'geneid', 'transcriptid'])[['mrna_start', 'mrna_end']]
total_dataset_length = (mrna_length_df.mrna_end - mrna_length_df.mrna_start).sum() * 24
training_sequence_length = 64
num_training_instances = total_dataset_length // training_sequence_length
'{:,}'.format(total_dataset_length), '{:,}'.format(num_training_instances)

('24,893,090,448', '388,954,538')

In [9]:
#| hide
training_hours = 6
training_minutes = training_hours * 60
training_seconds = training_minutes * 60
sequences_per_second = 2.5 * 32
training_instance_target = training_seconds * sequences_per_second
training_instance_target

1728000.0

In [10]:
#| hide
proportion_intron_edge = 0.25
proportion_intron = 0.25
proportion_mrna = 0.25
proportion_mrna_edge = 0.25

random_state = 42

sum([proportion_intron_edge, proportion_intron, proportion_mrna, proportion_mrna_edge])

1.0

In [11]:
#| hide
# each intron has a start edge and an end edge - lets make sure we sample both types
allowed_intron_edges = int(training_instance_target * proportion_intron_edge)
print(allowed_intron_edges)
unique_introns_edges = intron_locations.shape[0] * 2 # start and end edge
unique_introns_edges

432000


337270

In [12]:
#| export
def sample_intron_edges(
        locations: pd.DataFrame, n: int, 
        random_state: int = 42, offset: int = -32, length: int = 64) -> pd.DataFrame:
    "Get training instances where either the start of end of an intron is in the center of the sequence."
    start_n = int(n / 2)
    end_n = n - start_n
    replace = True if (start_n > locations.shape[0]) | (end_n > locations.shape[0]) else False
    starts = locations.sample(start_n, replace=replace, random_state=random_state)
    ends = locations.sample(end_n, replace=replace, random_state=random_state)
    frames = []
    for f, slice_origin in zip([starts, ends], ['intron_start', 'intron_end']):
        f_slice_start = (f[slice_origin] - f.mrna_start + offset).apply(lambda val: max(0, val))
        f.loc[:, 'mrna_len'] = f.mrna_end - f.mrna_start
        f.loc[:, 'start'] = f_slice_start
        f.loc[:, 'end'] = (f_slice_start + length)
        f.loc[:, 'end'] = f[['end', 'mrna_len']].min(axis=1)
        # Detect short sequences
        short_mask = (f.end - f.start) != length
        short_end_mask = short_mask & (f.end == f.mrna_len)
        short_start_mask = short_mask & (f.start == 0)
        if short_end_mask.sum() > 0:
            # Fix the samples with short ends
            f_short_ends = f.loc[short_end_mask, :]
            f_short_lengths = f_short_ends.end - f_short_ends.start
            f_short_adjustments = f_short_lengths - length
            f_short_ends.loc[:, 'start'] = f_short_ends.start + f_short_adjustments
            f.loc[short_end_mask, :] = f_short_ends
        if short_start_mask.sum() > 0:
            # Fix the samples with short starts
            f_short_starts = f.loc[short_start_mask, :]
            f_short_lengths = f_short_starts.end - f_short_starts.start
            f_short_adjustments = (f_short_lengths - length).mul(-1)
            f_short_starts.loc[:, 'end'] = f_short_starts.end + f_short_adjustments
            f.loc[short_start_mask, :] = f_short_starts
        f = f[['chromosome', 'geneid', 'transcriptid', 'start', 'end']]
        frames.append(f)
    intron_edges = pd.concat(frames, axis=0)
    intron_edges.loc[:, 'type'] = 'intron-edge'
    return intron_edges

In [13]:
#| hide
example_sample_intron_edges = sample_intron_edges(intron_locations, allowed_intron_edges, random_state=random_state)
example_sample_intron_edges.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
121958,NC_000001.11,GeneID:5202,XM_011509624.4,7442,7506,intron-edge
146867,NC_000001.11,GeneID:93273,XM_011510162.3,0,64,intron-edge
131932,NC_000001.11,GeneID:7827,XM_017002298.2,16238,16302,intron-edge
103694,NC_000001.11,GeneID:84072,XM_047431826.1,9319,9383,intron-edge
119879,NC_000001.11,GeneID:115350,NM_052938.5,22221,22285,intron-edge


In [14]:
#| hide
example_sample_intron_edges_lengths = example_sample_intron_edges.end - example_sample_intron_edges.start
display(example_sample_intron_edges_lengths.value_counts())
bad_length_mask = example_sample_intron_edges_lengths != 64
bad_length_mask.value_counts()

64    432000
Name: count, dtype: int64

False    432000
Name: count, dtype: int64

In [15]:
#| hide
example_sample_intron_edges[bad_length_mask]

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type


In [16]:
#| hide
intron_lengths = intron_locations.intron_end - intron_locations.intron_start
intron_lengths.describe()

count    168635.000000
mean       5827.990850
std       17118.083674
min          33.000000
25%         543.000000
50%        1534.000000
75%        4281.000000
max      451448.000000
dtype: float64

In [17]:
#| hide
intron_lengths.min(), intron_lengths.max()

(33, 451448)

In [18]:
#| export
def sample_introns(
        locations: pd.DataFrame, n: int,
        random_state: int = 42, length: int = 64) -> pd.DataFrame:
    random.seed(random_state)
    "Get training instances where most of the tokens are <intron>."
    replace = False if n < locations.shape[0] else True
    intron_sample = locations.sample(n, replace=replace, random_state=random_state)
    # Handle sequences of varying sizes
    intron_sample.loc[:, 'intron_length'] = intron_sample.intron_end - intron_sample.intron_start
    intron_len_mask = intron_sample.intron_length <= length
    small_introns = intron_sample[intron_len_mask]
    large_introns = intron_sample[~intron_len_mask]
    sample_frames = []
    if small_introns.shape[0] > 0:
        # For those introns less than length, center, return the whole thing
        # Start at intron start
        small_intron_slice_center = small_introns.intron_start
        # Shift slice center half the distance of the target sequence
        small_intron_slice_center = small_intron_slice_center.subtract(int(length / 2)).apply(lambda val: max(0, val))
        small_introns.loc[:, 'start'] = small_intron_slice_center
        small_introns.loc[:, 'end'] = small_introns.start + length
        small_introns.loc[:, 'end'] = small_introns[['end', 'mrna_end']].min(axis=1)
        small_introns = small_introns[['chromosome', 'geneid', 'transcriptid', 'start', 'end']]
        small_introns.loc[:, 'type'] = 'intron-small'
        sample_frames.append(small_introns)
    if large_introns.shape[0] > 0:
        # For larger introns, identify the range we can slice to avoid edges
        large_introns.loc[:, 'slice_max'] = large_introns.intron_end - length
        large_introns.loc[:, 'slice_range'] = large_introns.apply(lambda row: range(row.intron_start, row.slice_max + 1, 1), axis=1)
        large_introns.loc[:, 'start'] = large_introns.slice_range.apply(lambda r: random.choice(r))
        large_introns.loc[:, 'end'] = large_introns.start + length
        large_introns = large_introns[['chromosome', 'geneid', 'transcriptid', 'start', 'end']]
        large_introns.loc[:, 'type'] = 'intron'
        sample_frames.append(large_introns)
    # Randomly select a slice point within the identified range
    introns = pd.concat(sample_frames, axis=0)
    return introns

In [19]:
#| hide
example_sample_introns = sample_introns(intron_locations, int(training_instance_target * proportion_intron), random_state=random_state)
example_sample_introns.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
112692,NC_000001.11,GeneID:55974,XM_047425465.1,2215,2279,intron-small
4987,NC_000001.11,GeneID:124903828,XM_047436627.1,27191,27255,intron-small
112683,NC_000001.11,GeneID:55974,XM_006711453.3,2215,2279,intron-small
9316,NC_000001.11,GeneID:8863,NM_001289863.3,45629,45693,intron-small
9316,NC_000001.11,GeneID:8863,NM_001289863.3,45629,45693,intron-small


In [20]:
#| hide
(example_sample_introns.end - example_sample_introns.start).unique()

array([64])

In [21]:
#| hide
intron_locations.head(5)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome
0,GeneID:79501,NM_001005484.2,15,101,0,6167,NC_000001.11
1,GeneID:79501,NM_001005484.2,155,3618,0,6167,NC_000001.11
2,GeneID:112268260,XM_047436352.1,186,547,0,17102,NC_000001.11
3,GeneID:112268260,XM_047436352.1,1339,2365,0,17102,NC_000001.11
4,GeneID:112268260,XM_047436352.1,2467,8912,0,17102,NC_000001.11


In [22]:
#| hide
intron_locations_sample = intron_locations.sort_values(['chromosome', 'geneid', 'transcriptid']).head(50)
intron_locations_sample_grouped = intron_locations_sample.groupby(
        ['chromosome', 'geneid', 'transcriptid']
)
intron_locations_sample.loc[:, 'mrna_seq_start'] = (
    intron_locations_sample_grouped.intron_end.shift(1).fillna(0) + \
        intron_locations_sample.mrna_start
).astype(int)
intron_locations_sample.loc[:, 'mrna_seq_end'] = intron_locations_sample.intron_start + \
        intron_locations_sample.mrna_start
display(intron_locations_sample.head(3))
# Add last sequence of mRNA from last intron
intron_locations_sample_last_intron = intron_locations_sample_grouped.tail(1)
intron_locations_sample_last_intron.loc[:, 'mrna_seq_start'] = intron_locations_sample_last_intron.intron_end
intron_locations_sample_last_intron.loc[:, 'mrna_seq_end'] = intron_locations_sample_last_intron.mrna_end
display(intron_locations_sample_last_intron.head(3))
intron_locations_sample_mrna_locations = pd.concat(
    [
        intron_locations_sample,
        intron_locations_sample_last_intron
    ], axis=0, ignore_index=True
    ).sort_values(
        ['chromosome', 'geneid', 'transcriptid']).reset_index(drop=True)
intron_locations_sample_mrna_locations.head(3)

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
166694,GeneID:10000,NM_001206729.2,41,302,7495,351356,NC_000001.11,7495,7536
166695,GeneID:10000,NM_001206729.2,460,147868,7495,351356,NC_000001.11,7797,7955
166696,GeneID:10000,NM_001206729.2,147994,178701,7495,351356,NC_000001.11,155363,155489


Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
166706,GeneID:10000,NM_001206729.2,331261,343798,7495,351356,NC_000001.11,343798,351356
166602,GeneID:10000,NM_001370074.1,338756,345745,0,351356,NC_000001.11,345745,351356
166693,GeneID:10000,NM_005465.7,337920,344909,836,351356,NC_000001.11,344909,351356


Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end,chromosome,mrna_seq_start,mrna_seq_end
0,GeneID:10000,NM_001206729.2,41,302,7495,351356,NC_000001.11,7495,7536
1,GeneID:10000,NM_001206729.2,460,147868,7495,351356,NC_000001.11,7797,7955
2,GeneID:10000,NM_001206729.2,147994,178701,7495,351356,NC_000001.11,155363,155489


In [23]:
#| export
def sample_mrna(
        mrna_locations: pd.DataFrame, n: int, 
        random_state: int = 42, length: int = 64) -> pd.DataFrame:
    "Get a sample or mrna sequence locations"
    replace = False if n < mrna_locations.shape[0] else True
    mrna_locations = mrna_locations.sample(n, replace=replace, random_state=random_state)
    # For small mrna sections, do the same thing we did with the introns
    # Handle sequences of varying sizes
    mrna_locations.loc[:, 'length'] = mrna_locations.end - mrna_locations.start
    mrna_len_mask = mrna_locations.length <= length
    small_sequences = mrna_locations[mrna_len_mask]
    large_sequences = mrna_locations[~mrna_len_mask]
    sample_frames = []
    if small_sequences.shape[0] > 0:
        # For those introns less than length, center, return the whole thing
        # Start at intron start
        small_sequences_slice_center = small_sequences.start
        # Shift slice center half the distance of the target sequence
        small_sequences_slice_center = small_sequences_slice_center.subtract(int(length / 2)).apply(lambda val: max(0, val))
        small_sequences.loc[:, 'start'] = small_sequences_slice_center
        small_sequences.loc[:, 'end'] = small_sequences.start + length
        small_sequences.loc[:, 'end'] = small_sequences[['end', 'mrna_end']].min(axis=1)
        small_sequences = small_sequences[['chromosome', 'geneid', 'transcriptid', 'start', 'end']]
        small_sequences.loc[:, 'type'] = 'mrna-small'
        sample_frames.append(small_sequences)
    if large_sequences.shape[0] > 0:
        # For larger introns, identify the range we can slice to avoid edges
        large_sequences.loc[:, 'slice_max'] = large_sequences.end - length
        large_sequences.loc[:, 'slice_range'] = large_sequences.apply(lambda row: range(row.start, row.slice_max + 1, 1), axis=1)
        large_sequences.loc[:, 'start'] = large_sequences.slice_range.apply(lambda r: random.choice(r))
        large_sequences.loc[:, 'end'] = large_sequences.start + length
        large_sequences = large_sequences[['chromosome', 'geneid', 'transcriptid', 'start', 'end']]
        large_sequences.loc[:, 'type'] = 'mrna'
        sample_frames.append(large_sequences)
    # Randomly select a slice point within the identified range
    return pd.concat(sample_frames, axis=0, ignore_index=True)

In [24]:
#| hide
mrna_locations = get_mrna_locations(intron_locations)

In [25]:
#| hide
example_sample_mrna = sample_mrna(mrna_locations, int(proportion_mrna * training_instance_target))
example_sample_mrna.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000001.11,GeneID:55740,XM_024448311.2,154687,154751,mrna-small
1,NC_000001.11,GeneID:405,XM_005245153.2,30260,30324,mrna-small
2,NC_000001.11,GeneID:5909,NM_001388231.1,51353,51417,mrna-small
3,NC_000001.11,GeneID:5909,NM_001388295.1,38637,38701,mrna-small
4,NC_000001.11,GeneID:353134,NM_178352.3,0,64,mrna-small


In [26]:
#| export
def sample_mrna_edges(locations: pd.DataFrame, n: int, random_state: int = 42, length: int = 64) -> pd.DataFrame:
    "Get the beginning and end of mrna"
    locations = locations.drop_duplicates(
        ['chromosome', 'geneid', 'transcriptid', 'mrna_start', 'mrna_end']
    ).drop(['intron_start', 'intron_end'], axis=1).reset_index(drop=True)
    n_start = int(n / 2)
    n_end = n - n_start
    replace = False if (n_start < locations.shape[0]) or (n_end < locations.shape[0]) else True
    mrna_starts = locations.sample(
        n_start, replace=replace, random_state=random_state
        ).rename({'mrna_start': 'start'}, axis=1)
    mrna_starts.loc[:, 'end'] = mrna_starts.start + length
    mrna_starts.loc[:, 'end'] = mrna_starts[['mrna_end', 'end']].min(axis=1)
    mrna_starts.drop(['mrna_end'], axis=1, inplace=True)
    mrna_ends = locations.sample(
        n_end, replace=replace, random_state=random_state).rename({'mrna_end': 'end'}, axis=1)
    mrna_ends.loc[:, 'start'] = mrna_ends.end - length
    mrna_ends.loc[:, 'start'] = mrna_ends[['mrna_start', 'start']].max(axis=1)
    mrna_ends.drop(['mrna_start'], axis=1, inplace=True)
    sample_edges = pd.concat([mrna_starts, mrna_ends], axis=0, ignore_index=True)
    sample_edges = sample_edges[['chromosome', 'geneid', 'transcriptid', 'start', 'end']]
    sample_edges.loc[:, 'type'] = 'mrna-edge'
    return sample_edges

In [27]:
#| hide
example_sample_mrna_edges = sample_mrna_edges(intron_locations, int(proportion_mrna_edge * training_instance_target))
example_sample_mrna_edges.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000001.11,GeneID:9659,NM_001395320.1,36075,36139,mrna-edge
1,NC_000001.11,GeneID:5293,XM_047422571.1,24473,24537,mrna-edge
2,NC_000001.11,GeneID:11080,NM_007034.5,25764,25828,mrna-edge
3,NC_000001.11,GeneID:81573,NM_030816.5,0,64,mrna-edge
4,NC_000001.11,GeneID:116841,XM_017000232.2,463,527,mrna-edge


In [28]:
#| export
def sample_sequences_idx(
        n: int, 
        intron_locations: pd.DataFrame,
        mrna_locations: pd.DataFrame,
        intron_prop: float, intron_edge_prop: float, 
        mrna_prop: float, mrna_edge_prop: float,
        random_state: int = 42,
        length: int = 64) -> pd.DataFrame:
    "Build training dataset from intron locations."
    intron_sample = sample_introns(
        intron_locations, int(n * intron_prop), 
        random_state=random_state, length=length)
    intron_edge_sample = sample_intron_edges(
        intron_locations, int(n * intron_edge_prop), 
        random_state=random_state, length=length)
    mrna_sample = sample_mrna(mrna_locations, int(n * mrna_prop), 
        random_state=random_state, length=length)
    mrna_edge_sample = sample_mrna_edges(intron_locations, int(n * mrna_edge_prop),
        random_state=random_state, length=length)
    sample = pd.concat([
        intron_sample,
        intron_edge_sample,
        mrna_sample,
        mrna_edge_sample
    ], axis=0, ignore_index=True)
    return sample

In [29]:
#| hide
example_sample_sequence_idx = sample_sequences_idx(
    500,
    intron_locations,
    mrna_locations,
    intron_prop=proportion_intron,
    intron_edge_prop=proportion_intron_edge,
    mrna_prop=proportion_mrna,
    mrna_edge_prop=proportion_mrna_edge,
    random_state=random_state,
)
example_sample_sequence_idx['type'].value_counts()

type
intron         125
intron-edge    125
mrna-edge      125
mrna           114
mrna-small      11
Name: count, dtype: int64

In [30]:
#| hide
example_sample_sequence_idx.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type
0,NC_000001.11,GeneID:83872,XM_011510038.4,402874,402938,intron
1,NC_000001.11,GeneID:84288,NM_001143943.1,7411,7475,intron
2,NC_000001.11,GeneID:64754,XM_047428021.1,52326,52390,intron
3,NC_000001.11,GeneID:149483,XM_011540820.2,1312,1376,intron
4,NC_000001.11,GeneID:57540,XM_011541832.2,40411,40475,intron


In [31]:
#| export
def make_mrna_file_index(mrna_path: Path) -> pd.DataFrame:
    ""
    parquet_files = list(mrna_path.glob("*/*.parquet"))
    index_frames = []
    for p in parquet_files:
        p_chromosome = p.parent.name
        p_frame = pd.read_parquet(p, columns=['geneid', 'transcriptid'])
        p_frame.loc[:, 'chromosome'] = p_chromosome
        p_frame.loc[:, 'path'] = p
        index_frames.append(p_frame)
    return pd.concat(index_frames, axis=0, ignore_index=True)


def get_mrna_file_index(transcription_path: Path) -> dict[tuple[str, str, str], Path]:
    "Get the file a particular mRNA resides in."
    mrna_index_path = transcription_path / "mrna_index.csv"
    if not mrna_index_path.exists():
        mrna_index = make_mrna_file_index(transcription_path / "mrna")
        mrna_index.to_csv(mrna_index_path)
    else:
        mrna_index = pd.read_csv(mrna_index_path)
    mrna_index.loc[:, 'path'] = mrna_index.path.apply(Path)
    return mrna_index.set_index(['chromosome', 'geneid', 'transcriptid']).path.to_dict()

In [32]:
#| hide
mrna_file_index = get_mrna_file_index(transcription_data_path)
len(mrna_file_index)

129111

In [33]:
#| hide
mrna_file_index.get((example_chromosome, "GeneID:79501", "NM_001005484.2"))

PosixPath('/mnt/e/Data/llm-mito-scanner-data/data/raw/assemblies/GCF_000001405.40_GRCh38.p14/training/transcription/mrna/NC_000001.11/partition-001.parquet')

In [34]:
list(mrna_file_index.keys())[:2]

[('NC_000001.11', 'GeneID:79501', 'NM_001005484.2'),
 ('NC_000001.11', 'GeneID:112268260', 'XM_047436352.1')]

In [35]:
#| hide
example_sample_sequence_idx.loc[:, 'mrna_partition'] = example_sample_sequence_idx.apply(
    lambda row: mrna_file_index.get((row.chromosome, row.geneid, row.transcriptid)),
    axis=1)
example_sample_sequence_idx.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type,mrna_partition
0,NC_000001.11,GeneID:83872,XM_011510038.4,402874,402938,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
1,NC_000001.11,GeneID:84288,NM_001143943.1,7411,7475,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
2,NC_000001.11,GeneID:64754,XM_047428021.1,52326,52390,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
3,NC_000001.11,GeneID:149483,XM_011540820.2,1312,1376,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
4,NC_000001.11,GeneID:57540,XM_011541832.2,40411,40475,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...


In [36]:
#| export
def make_gene_sequence_lookup(
        genes: pd.DataFrame) -> dict[tuple[str, str], str]:
    return genes.set_index(['chromosome', 'geneid']).sequence.to_dict()


def make_mrna_sequence_lookup(
        mrna: pd.DataFrame) -> dict[tuple[str, str, str], str]:
    return mrna.set_index(['chromosome', 'geneid', 'transcriptid']).mrna.to_dict()

In [37]:
#| hide
example_chromosome_genes = get_chromosome_genes(latest_assembly_path, example_chromosome)
example_chromosome_genes.loc[:, 'chromosome'] = example_chromosome
example_chromosome_genes = example_chromosome_genes[
    example_chromosome_genes.geneid.isin(
        example_sample_sequence_idx.geneid.unique())]
example_gene_lookup = make_gene_sequence_lookup(example_chromosome_genes)
len(example_gene_lookup)

229

In [38]:
#| hide
example_sample_sequence_idx.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type,mrna_partition
0,NC_000001.11,GeneID:83872,XM_011510038.4,402874,402938,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
1,NC_000001.11,GeneID:84288,NM_001143943.1,7411,7475,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
2,NC_000001.11,GeneID:64754,XM_047428021.1,52326,52390,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
3,NC_000001.11,GeneID:149483,XM_011540820.2,1312,1376,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
4,NC_000001.11,GeneID:57540,XM_011541832.2,40411,40475,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...


In [39]:
#| hide
example_sample_sequence_idx.mrna_partition.unique().size

26

In [40]:
#| export
def get_mrna_partitions(paths: list[Path], transcript_ids: list[str] = None) -> pd.DataFrame:
    frames = []
    for p in paths:
        p_frame = pd.read_parquet(p)
        if isinstance(transcript_ids, list):
            p_frame = p_frame[p_frame.transcriptid.isin(transcript_ids)]
        p_chromosome = p.parent.name
        p_frame.loc[:, 'chromosome'] = p_chromosome
        frames.append(p_frame)
    return pd.concat(frames, axis=0, ignore_index=True)

In [41]:
#| hide
example_sample_mrna_sequences = get_mrna_partitions(
    example_sample_sequence_idx.mrna_partition.unique().tolist(),
    example_sample_sequence_idx.transcriptid.unique().tolist()
)
example_sample_mrna_sequences.shape[0]

308

In [42]:
#| hide
example_sample_mrna_sequences.head()

Unnamed: 0,geneid,transcriptid,mrna_start,mrna_end,mrna,chromosome
0,GeneID:7827,XM_017002299.2,0,23346,"G,A,C,C,C,G,C,A,G,C,G,A,C,U,C,C,A,C,A,G,G,G,A,...",NC_000001.11
1,GeneID:163589,XM_047447753.1,219,99660,"<null>,<null>,<null>,<null>,<null>,<null>,<nul...",NC_000001.11
2,GeneID:3140,NM_001385162.1,596,28552,"<null>,<null>,<null>,<null>,<null>,<null>,<nul...",NC_000001.11
3,GeneID:777,XM_017002249.2,0,490386,"G,C,C,U,C,G,C,G,C,G,U,G,C,C,G,C,C,C,G,U,G,U,C,...",NC_000001.11
4,GeneID:777,XM_017002250.2,0,490386,"G,C,C,U,C,G,C,G,C,G,U,G,C,C,G,C,C,C,G,U,G,U,C,...",NC_000001.11


In [43]:
#| hide
example_mrna_lookup = make_mrna_sequence_lookup(example_sample_mrna_sequences)
len(example_mrna_lookup)

308

In [53]:
#| export
def get_training_sequences_with_idx(
        gene: str, mrna: str,
        start: int, end: int,
        ) -> tuple[str, str]:
    ""
    return list(gene[start: end]), mrna[start: end]

In [54]:
#| hide
example_idx_row = example_sample_sequence_idx.iloc[0, :]
example_idx_row

chromosome                                             NC_000001.11
geneid                                                 GeneID:83872
transcriptid                                         XM_011510038.4
start                                                        402874
end                                                          402938
type                                                         intron
mrna_partition    /mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
Name: 0, dtype: object

In [57]:
#| hide
example_training_sequence_gene, example_training_sequence_mrna = get_training_sequences_with_idx(
    example_gene_lookup.get((example_idx_row.chromosome, example_idx_row.geneid)),
    example_mrna_lookup.get((example_idx_row.chromosome, example_idx_row.geneid, example_idx_row.transcriptid)),
    example_idx_row.start, example_idx_row.end,
)
len(example_training_sequence_gene), len(example_training_sequence_mrna)

(64, 64)

In [58]:
#| hide
example_sample_sequence_idx.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type,mrna_partition
0,NC_000001.11,GeneID:83872,XM_011510038.4,402874,402938,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
1,NC_000001.11,GeneID:84288,NM_001143943.1,7411,7475,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
2,NC_000001.11,GeneID:64754,XM_047428021.1,52326,52390,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
3,NC_000001.11,GeneID:149483,XM_011540820.2,1312,1376,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...
4,NC_000001.11,GeneID:57540,XM_011541832.2,40411,40475,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...


In [59]:
#| hide
example_sample_sequence_idx.shape

(500, 7)

In [68]:
#| export
def get_training_sequences_with_idx_batches(
    index: pd.DataFrame,
    gene_lookup: dict[tuple[str,str], str],
    mrna_lookup: dict[tuple[str,str,str], str]) -> pd.DataFrame:
    sequences = index.apply(
        lambda row: get_training_sequences_with_idx(
            gene_lookup.get((row.chromosome, row.geneid)),
            mrna_lookup.get((row.chromosome, row.geneid, row.transcriptid)),
            row.start, row.end
        ), axis=1).values.tolist()
    sequences = pd.DataFrame(sequences, columns=['input', 'target'])
    return pd.concat(
        [
            index,
            sequences
        ], axis=1, ignore_index=False)


def get_index_sequences(
        index: pd.DataFrame,
        assembly_path: Path,
        chromosome: str = None) -> pd.DataFrame:
    # Make gene lookup
    gene_lookup = get_chromosome_genes(
        assembly_path, 
        chromosome=chromosome, 
        gene_ids=index.geneid.unique().tolist())
    gene_lookup = make_gene_sequence_lookup(gene_lookup)
    # Make mRNA lookup
    mrna_lookup = get_mrna_partitions(
        index.mrna_partition.unique().tolist(),
        index.transcriptid.unique().tolist())
    mrna_lookup = make_mrna_sequence_lookup(mrna_lookup)
    return get_training_sequences_with_idx_batches(
        index,
        gene_lookup,
        mrna_lookup)

In [64]:
#| hide
example_sample_sequence_idx.geneid.unique()[:5]

array(['GeneID:83872', 'GeneID:84288', 'GeneID:64754', 'GeneID:149483',
       'GeneID:57540'], dtype=object)

In [66]:
#| hide
example_sample_sequence_idx_sequences = get_index_sequences(
    example_sample_sequence_idx,
    latest_assembly_path,
    chromosome=example_chromosome
)
example_sample_sequence_idx_sequences.head()

Unnamed: 0,chromosome,geneid,transcriptid,start,end,type,mrna_partition,input,target
0,NC_000001.11,GeneID:83872,XM_011510038.4,402874,402938,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...,"[C, T, C, T, T, C, C, A, G, C, A, G, C, C, A, ...",">,<intron>,<intron>,<intron>,<intron>,<intron>..."
1,NC_000001.11,GeneID:84288,NM_001143943.1,7411,7475,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...,"[A, T, G, G, A, G, A, C, A, G, A, A, A, G, T, ...","n>,<intron>,<intron>,<intron>,<intron>,<intron..."
2,NC_000001.11,GeneID:64754,XM_047428021.1,52326,52390,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...,"[G, G, A, G, G, A, C, A, T, T, G, A, T, T, T, ...","null>,<null>,<null>,<null>,<null>,<null>,<null..."
3,NC_000001.11,GeneID:149483,XM_011540820.2,1312,1376,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...,"[C, G, G, G, G, C, C, G, G, A, G, A, T, G, G, ...",",<intron>,<intron>,<intron>,<intron>,<intron>,..."
4,NC_000001.11,GeneID:57540,XM_011541832.2,40411,40475,intron,/mnt/e/Data/llm-mito-scanner-data/data/raw/ass...,"[G, G, A, G, T, A, A, C, A, C, T, T, G, A, C, ...","n>,<intron>,<intron>,<intron>,<intron>,<intron..."


In [67]:
#| hide
example_sample_sequence_idx_sequences.shape[0] / 11

45.45454545454545

In [78]:
#| export
def get_chromosome_idx_sequences(args: dict):
    chromosome = args.get("chromosome")
    index = args.get('index')
    assembly_path = args.get("assembly")
    pbar_position = args.get("position", 1)
    chunk_size = args.get("chunk_size", 500)
    write_size = args.get("write_size", 1000)
    save = args.get("save", False)
    write_path = assembly_path / "training/transcription/sequences" / chromosome
    if not write_path.exists():
        write_path.mkdir()
    num_chunks = max(1, index.shape[0] / chunk_size)
    index_chunks = []
    curr_chunk = None
    for p in index.mrna_partition.unique():
        p_index = index[index.mrna_partition == p]
        if p_index.shape[0] > chunk_size:
            num_chunks = max(1, p_index.shape[0] / chunk_size)
            p_index_chunks = np.array_split(p_index, num_chunks)
            for p_chunk in p_index_chunks:
                if curr_chunk is None:
                    curr_chunk = p_chunk
                else:
                    curr_chunk = pd.concat([curr_chunk, p_chunk], axis=0, ignore_index=True)
                    if curr_chunk.shape[0] >= chunk_size:
                        index_chunks.append(curr_chunk)
                        curr_chunk = None
            index_chunks.extend(p_index_chunks)
        else:
            if curr_chunk is None:
                curr_chunk = p_index
            else:
                curr_chunk = pd.concat([curr_chunk, p_index], axis=0, ignore_index=True)
                if curr_chunk.shape[0] >= chunk_size:
                    index_chunks.append(curr_chunk)
                    curr_chunk = None
    if curr_chunk is not None:
        index_chunks.append(curr_chunk)
    pbar = tqdm(
        total=len(index_chunks), 
        position=pbar_position, leave=False, ncols=80, desc=f"{chromosome}")
    batch_counter = 1
    sequences = []
    for index_chunk in index_chunks:
        chromosome_index_chunk_sequences = get_index_sequences(
            index_chunk,
            assembly_path,
            chromosome=chromosome
        )
        chromosome_index_chunk_sequences = chromosome_index_chunk_sequences[[
            "chromosome", "geneid", "transcriptid",
            "start", "end",
            "type",
            "input", "target"
        ]]
        sequences.append(chromosome_index_chunk_sequences)
        if save and sum([f.shape[0] for f in sequences]) >= write_size:
            write_path_chunk = write_path / f"partition-{str(batch_counter).zfill(3)}.parquet"
            pd.concat(sequences, axis=0, ignore_index=True).to_parquet(write_path_chunk, index=False)
            sequences = []
        batch_counter += 1
        pbar.update(1)
    if save and len(sequences) > 0:
        write_path_chunk = write_path / f"partition-{str(batch_counter).zfill(3)}.parquet"
        pd.concat(sequences, axis=0, ignore_index=True).to_parquet(write_path_chunk, index=False)
    pbar.close()

In [76]:
#| hide
get_chromosome_idx_sequences(
    {
        "chromosome": example_chromosome,
        "index": example_sample_sequence_idx,
        "assembly": latest_assembly_path,
        "mrna_file_index": mrna_file_index,
        "position": 0,
        "chunk_size": 250,
        "save": False
    }
)

NC_000001.11:   0%|                                       | 0/2 [00:00<?, ?it/s]

                                                                                

In [1]:
#| hide
import nbdev; nbdev.nbdev_export()