# Making the Reference Genome more easily usable

> Preparing genomic DNA for pre-training.

## Notebook Goals

Extract input and output sequences to train a transformer to transcribe DNA to mRNA.

## Stuff I've learned

1. Calling `SeqFeature.extract` is really expensive when the parent sequence is very long.
2. Training on all of the transcript mRNA is very expensive.

### Plan

1. For each chromosome file:
    1. Extract all gene sequences to csv e.g. `assembly_path/genes/{chromosome}.csv`
        1. Columns
            1. geneid
            2. Sequence
            3. Positive strand 5' chromosome position
            4. Negative strand 5' chromosome position
    2. Get mRNA feature details
        1. parent geneid
        2. transcript_id
        3. location parts
    3. Identify mRNA to gene relationships, de-dupe, write e.g. `assembly_path/relationships/mrna_to_gene/{chromosome}.csv`
2. For all mRNA to gene relationships
    1. Normalize mRNA positions to written gene
    2. Write normalized mRNA positions to disk

## 0. Setup

In [1]:
#| default_exp data.transcription

In [2]:
#| hide
from nbdev.showdoc import *

In [3]:
#| export
import warnings
warnings.simplefilter("ignore")
from pathlib import Path
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from Bio.SeqFeature import SeqFeature, SimpleLocation, Seq
from tqdm.auto import tqdm
import pandas as pd
import typing
from tqdm import tqdm
from multiprocessing import current_process
from copy import deepcopy

tqdm.pandas()

from llm_mito_scanner.data.download import load_config, \
    get_latest_assembly_path, get_genomic_genbank_path

In [4]:
#| hide
config = load_config()

In [5]:
#| hide
data_path = Path(config.get("data_path"))
data_raw_path = data_path / "raw"
assemblies_path = data_raw_path / "assemblies"
latest_assembly_path = get_latest_assembly_path(assemblies_path)
genomic_genbank_path = get_genomic_genbank_path(latest_assembly_path)
chromosomes_path = latest_assembly_path / "chromosomes"
training_data_path = latest_assembly_path / "training"
transcription_data_path = training_data_path / "transcription"
if not transcription_data_path.exists():
    transcription_data_path.mkdir(parents=True)

In [6]:
#| hide
example_chromosome_path = next(chromosomes_path.glob("*.gb"))
example_chromosome_path

Path('/mnt/e/Data/llm-mito-scanner-data/data/raw/assemblies/GCF_000001405.40_GRCh38.p14/chromosomes/NC_000001.11.gb')

In [7]:
#| hide
with example_chromosome_path.open("rt") as f:
    example_chromosome = next(SeqIO.parse(f, "genbank"), None)
example_chromosome

SeqRecord(seq=Seq('NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN'), id='NC_000001.11', name='NC_000001', description='Homo sapiens chromosome 1, GRCh38.p14 Primary Assembly', dbxrefs=['BioProject:PRJNA168', 'Assembly:GCF_000001405.40'])

In [8]:
#| hide
len(example_chromosome.features)

51489

In [9]:
#| export
def filter_chromosome_features_by_type(
        chromosome_record: SeqRecord, 
        feature_type: str,
        ) -> list[tuple[int, SeqFeature]]:
    """Select the features of one type, each paired with its feature index.

    The index is the feature's position in `chromosome_record.features`, so
    a match can be located again in the full feature list.
    """
    indexed_features = enumerate(chromosome_record.features)
    return [pair for pair in indexed_features if pair[1].type == feature_type]

In [10]:
#| export
def get_feature_qualifiers(feature: SeqFeature) -> typing.Dict[str, typing.Any]:
    """Return the feature's `qualifiers` attribute, or None if it has none."""
    try:
        return feature.qualifiers
    except AttributeError:
        return None


def get_feature_dbxrefs(feature: SeqFeature) -> list[str] | None:
    """Return the feature's `db_xref` qualifier values, or None if absent.

    The qualifier is treated as a list of strings (e.g. "GeneID:79501");
    `get_feature_dbxref_xref` scans it entry by entry. None is returned both
    when the feature has no qualifiers and when it has no `db_xref` entry.
    """
    feature_qualifiers = get_feature_qualifiers(feature)
    if feature_qualifiers is None:
        return None
    return feature_qualifiers.get("db_xref")


def get_feature_dbxref_xref(feature: SeqFeature, prefix: str) -> str | None:
    """Return the first `db_xref` entry that starts with `prefix`.

    Returns None when the feature has no `db_xref` values or when none of
    them starts with `prefix`.
    """
    dbxrefs = get_feature_dbxrefs(feature)
    if dbxrefs is None:
        return None
    matching = [xref for xref in dbxrefs if xref.startswith(prefix)]
    return matching[0] if matching else None


def get_feature_geneid(feature: SeqFeature) -> str | None:
    """Return the feature's first `db_xref` entry starting with "GeneID", or None."""
    geneid_xref = get_feature_dbxref_xref(feature, "GeneID")
    return geneid_xref


def get_feature_transcript_id(feature: SeqFeature) -> str | None:
    """Return the first `transcript_id` qualifier value, or None if absent.

    Also returns None (rather than raising AttributeError) when the feature
    exposes no qualifiers at all, matching how `get_feature_dbxrefs` treats
    that case.
    """
    feature_qualifiers = get_feature_qualifiers(feature)
    if feature_qualifiers is None:
        return None
    transcript_ids = feature_qualifiers.get("transcript_id", [])
    return next(iter(transcript_ids), None)

## 1. Process chromosome files

### 1.1 Extract all gene sequences to csv e.g. `assembly_path/genes/{chromosome}.csv`
Columns
1. geneid
2. Sequence
3. Positive strand 5' chromosome position
4. Negative strand 5' chromosome position

In [11]:
#| hide
example_chromosome_genes = filter_chromosome_features_by_type(example_chromosome, "gene")
len(example_chromosome_genes)

5501

In [12]:
#| hide
example_chromosome_gene_positive = example_chromosome_genes[0][1]
example_chromosome_gene_negative = example_chromosome_genes[1][1]

In [13]:
#| hide
example_chromosome_gene_positive.location

SimpleLocation(ExactPosition(11873), ExactPosition(14409), strand=1)

In [14]:
#| hide
# Get pos and neg strand positions
int(example_chromosome_gene_positive.location.start)

11873

In [15]:
#| hide
example_chromosome_gene_negative.location.end

ExactPosition(29370)

In [16]:
#| hide
int(example_chromosome_gene_negative.location.start)

14361

In [17]:
#| export
def get_chromosome_gene_info(
        chromosome_record: SeqRecord,
        pbar_position: int = 0
        ) -> pd.DataFrame:
    """Build a table describing every "gene" feature of a chromosome record.

    Returns a frame with one row per gene feature and the columns
    `geneid`, `sequence` (the feature extracted from the chromosome, as a
    string), `pos_strand_position` (the feature location's start) and
    `neg_strand_position` (the feature location's end).

    `pbar_position` is forwarded to tqdm as the progress bar position and is
    also used in the bar's description.
    """
    chromosome_genes = [
        feature
        for _, feature in filter_chromosome_features_by_type(chromosome_record, "gene")
    ]
    # Extract from the bare sequence rather than from the SeqRecord: extracting
    # from the record makes Biopython slice the whole record (its features
    # included) once per gene, which is far slower and yields the same string.
    chromosome_sequence = chromosome_record.seq
    gene_progress = tqdm(
        chromosome_genes,
        leave=False,
        ncols=80,
        position=pbar_position,
        desc=f"Process-{pbar_position}",
    )
    chromosome_gene_sequences = [
        str(gene.extract(chromosome_sequence)) for gene in gene_progress
    ]
    return pd.DataFrame({
        "geneid": [get_feature_geneid(gene) for gene in chromosome_genes],
        "sequence": chromosome_gene_sequences,
        "pos_strand_position": [gene.location.start for gene in chromosome_genes],
        "neg_strand_position": [gene.location.end for gene in chromosome_genes],
    })
    

In [18]:
#| hide
example_chromosome_gene_info_path = latest_assembly_path / "genes" / f"{example_chromosome_path.stem}.csv"
if example_chromosome_gene_info_path.exists():
    example_chromosome_gene_info = pd.read_csv(example_chromosome_gene_info_path)
else:
    example_chromosome_gene_info = get_chromosome_gene_info(example_chromosome)


In [19]:
#| hide
example_chromosome_gene_info.head()

Unnamed: 0,geneid,sequence,pos_strand_position,neg_strand_position
0,GeneID:100287102,CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTA...,11873,14409
1,GeneID:653635,TCCGGCAGAGCGGAAGCGGCGGCGGGAGCTTCCGGGAGGGCGGCTC...,14361,29370
2,GeneID:102466751,TGTGGGAGAGGAACATGGGCTCAGGACAGCGGGTGTCAGCTTGCCT...,17368,17436
3,GeneID:107985730,TGCCCTCCAGCCCTACGCCTTGACCCGCTTTCCTGCGTCTCTCAGC...,29773,35418
4,GeneID:100302278,GGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCG...,30365,30503


In [20]:
#| export
def write_chromosome_gene_info(assembly_path: Path, chromosome_tag: str, frame: pd.DataFrame):
    """Write a chromosome's gene table to `{assembly_path}/genes/{chromosome_tag}.csv`.

    The `genes` directory, and any missing parent directories, are created
    on demand. The frame is written without its index.
    """
    genes_path = assembly_path / "genes"
    # exist_ok avoids a check-then-create race when several processes write
    # at once; parents=True matches write_mrna_gene_relationships.
    genes_path.mkdir(parents=True, exist_ok=True)
    frame.to_csv(genes_path / f"{chromosome_tag}.csv", index=False)

In [21]:
#| hide
if not example_chromosome_gene_info_path.exists():
    write_chromosome_gene_info(latest_assembly_path, example_chromosome_path.stem, example_chromosome_gene_info)

In [22]:
#| export
def read_all_chromosome_gene_info(assembly_path: Path, limit: int = None) -> pd.DataFrame:
    gene_info_files = list((assembly_path / "genes").glob("*.csv"))
    if limit is not None:
        gene_info_files = gene_info_files[:limit]
    return pd.concat([pd.read_csv(p) for p in gene_info_files], axis=0, ignore_index=True)

In [23]:
#| hide
example_chromosome_gene_info_frame = read_all_chromosome_gene_info(latest_assembly_path, 3)
example_chromosome_gene_info_frame.head()

Unnamed: 0,geneid,sequence,pos_strand_position,neg_strand_position
0,GeneID:100287102,CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTA...,11873,14409
1,GeneID:653635,TCCGGCAGAGCGGAAGCGGCGGCGGGAGCTTCCGGGAGGGCGGCTC...,14361,29370
2,GeneID:102466751,TGTGGGAGAGGAACATGGGCTCAGGACAGCGGGTGTCAGCTTGCCT...,17368,17436
3,GeneID:107985730,TGCCCTCCAGCCCTACGCCTTGACCCGCTTTCCTGCGTCTCTCAGC...,29773,35418
4,GeneID:100302278,GGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCG...,30365,30503


1. feature index
2. transcript_id
3. location parts

### 1.2 Identify mRNA parent gene relationships

In [24]:
#| export
def get_gene_and_mrna_relationships(
        chromosome: SeqRecord,
        ) -> pd.DataFrame:
    """Pair every mRNA feature with the gene feature that shares its GeneID.

    Returns one row per (geneid, transcript_id) pair with the columns
    `geneid`, `transcript_id`, `transcript_feature_idx` and
    `gene_feature_idx`; both index columns are positions in
    `chromosome.features`. mRNA features lacking a GeneID or a transcript id,
    and those whose GeneID matches no gene feature, are left out. Rows are
    ordered by gene feature index, then by transcript feature index.
    """
    # Collect rows in feature order (a list, not a set) so the de-duplication
    # below always keeps the earliest feature for a (geneid, transcript_id).
    relationships = []
    for idx, mrna in filter_chromosome_features_by_type(chromosome, "mRNA"):
        mrna_transcript_id = get_feature_transcript_id(mrna)
        mrna_gene_id = get_feature_geneid(mrna)
        if mrna_gene_id is not None and mrna_transcript_id is not None:
            relationships.append((mrna_gene_id, mrna_transcript_id, idx))

    chromosome_relationships_df = pd.DataFrame(
        relationships,
        columns=['geneid', 'transcript_id', 'transcript_feature_idx']
    ).drop_duplicates(subset=["geneid", "transcript_id"])

    gene_idx = pd.DataFrame(
        [(idx, get_feature_geneid(f)) for idx, f in filter_chromosome_features_by_type(chromosome, "gene")],
        columns=["gene_feature_idx", "geneid"]
    )

    # The inner merge drops transcripts without a matching gene feature. A
    # GeneID carried by several gene features would duplicate rows, hence the
    # second de-duplication.
    chromosome_relationships_df = chromosome_relationships_df.merge(
        gene_idx, on=['geneid']
    ).drop_duplicates(
        subset=['geneid', 'transcript_id']
    ).sort_values(["gene_feature_idx", "transcript_feature_idx"], ascending=True)
    return chromosome_relationships_df

In [25]:
#| hide
example_chromosome_relationships = get_gene_and_mrna_relationships(
    example_chromosome,
)

In [26]:
#| hide
print(example_chromosome_relationships.shape[0])
example_chromosome_relationships.head()

12841


Unnamed: 0,geneid,transcript_id,transcript_feature_idx,gene_feature_idx
0,GeneID:79501,NM_001005484.2,22,21
1,GeneID:112268260,XM_047436352.1,80,79
2,GeneID:729759,NM_001005221.2,85,84
3,GeneID:105378947,XM_011542538.1,96,95
4,GeneID:81399,XM_024449992.2,126,125


### 1.3 Write relationships

In [27]:
#| export
def write_mrna_gene_relationships(relationships: pd.DataFrame, chromosome: str, assembly_path: Path):
    """Write a relationship table to `{assembly_path}/relationships/mrna_to_gene/{chromosome}.csv`.

    The target directory, and any missing parents, are created on demand.
    The frame is written without its index.
    """
    mrna_to_gene_path = assembly_path / "relationships" / "mrna_to_gene"
    # exist_ok avoids a check-then-create race when several processes write at once.
    mrna_to_gene_path.mkdir(parents=True, exist_ok=True)
    relationships.to_csv(mrna_to_gene_path / f"{chromosome}.csv", index=False)

In [28]:
#| hide
example_chromosome_relationships_path = latest_assembly_path / "relationships"/ "mrna_to_gene" / f"{example_chromosome_path.stem}.csv"
if not example_chromosome_relationships_path.exists():
    write_mrna_gene_relationships(example_chromosome_relationships, example_chromosome_path.stem, latest_assembly_path)

## 2. Process mRNA to gene relationships

For all mRNA to gene relationships
1. Normalize mRNA positions to written gene
2. Write normalized mRNA positions to disk

In [29]:
#| hide
example_chromosome_genes_path = latest_assembly_path / "genes" / f"{example_chromosome_path.stem}.csv"
example_chromosome_genes = pd.read_csv(example_chromosome_genes_path)
example_chromosome_mrna = filter_chromosome_features_by_type(example_chromosome, "mRNA")

In [30]:
#| hide
example_chromosome_genes.head()

Unnamed: 0,geneid,sequence,pos_strand_position,neg_strand_position
0,GeneID:100287102,CTTGCCGTCAGCCTTTTCTTTGACCTCTTCTTTCTGTTCATGTGTA...,11873,14409
1,GeneID:653635,TCCGGCAGAGCGGAAGCGGCGGCGGGAGCTTCCGGGAGGGCGGCTC...,14361,29370
2,GeneID:102466751,TGTGGGAGAGGAACATGGGCTCAGGACAGCGGGTGTCAGCTTGCCT...,17368,17436
3,GeneID:107985730,TGCCCTCCAGCCCTACGCCTTGACCCGCTTTCCTGCGTCTCTCAGC...,29773,35418
4,GeneID:100302278,GGATGCCCAGCTAGTTTGAATTTTAGATAAACAACGAATAATTTCG...,30365,30503


In [31]:
#| hide
example_chromosome_mrna_sample = pd.Series(example_chromosome_mrna).sample(10, random_state=42).values.tolist()
example_chromosome_mrna_sample[:5]

[(34091,
  SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(155978066), ExactPosition(155978175), strand=-1), SimpleLocation(ExactPosition(155969155), ExactPosition(155969300), strand=-1), SimpleLocation(ExactPosition(155966819), ExactPosition(155966887), strand=-1), SimpleLocation(ExactPosition(155966415), ExactPosition(155966479), strand=-1), SimpleLocation(ExactPosition(155965630), ExactPosition(155965760), strand=-1), SimpleLocation(ExactPosition(155965302), ExactPosition(155965412), strand=-1), SimpleLocation(ExactPosition(155964987), ExactPosition(155965131), strand=-1), SimpleLocation(ExactPosition(155962932), ExactPosition(155963183), strand=-1), SimpleLocation(ExactPosition(155962592), ExactPosition(155962718), strand=-1), SimpleLocation(ExactPosition(155962104), ExactPosition(155962222), strand=-1), SimpleLocation(ExactPosition(155961660), ExactPosition(155961909), strand=-1), SimpleLocation(ExactPosition(155958319), ExactPosition(155958396), strand=-1), SimpleLocati

In [32]:
#| hide
example_chromosome_mrna_sample_transcriptids = list(map(
    lambda m: get_feature_transcript_id(
        m[1]
    ),
    example_chromosome_mrna_sample
))

In [33]:
#| export
def get_mrna_gene_id(mrna_tup: tuple[int, SeqFeature], relationships: pd.DataFrame):
    """Find the GeneID recorded for an mRNA feature's transcript.

    `mrna_tup` is an `(index, feature)` pair; only the feature is used.
    Returns the `geneid` of the first row of `relationships` whose
    `transcript_id` equals the feature's transcript id, or None when no row
    matches.
    """
    _, mrna_feature = mrna_tup
    transcript_id = get_feature_transcript_id(mrna_feature)
    is_match = relationships.transcript_id == transcript_id
    matching_rows = relationships[is_match]
    if matching_rows.empty:
        return None
    first_match = matching_rows.iloc[0, :]
    return first_match.geneid

In [34]:
#| hide
example_chromosome_mrna_sample_geneids = list(map(
    lambda m: get_mrna_gene_id(
        m, example_chromosome_relationships
    ), 
    example_chromosome_mrna_sample
))

In [35]:
#| export
def get_gene_seq_record(gene_id: str, genes: pd.DataFrame) -> tuple[tuple[int, int], SeqRecord]:
    ""
    gene_id_row = genes[genes.geneid == gene_id]
    if gene_id_row.shape[0] == 0:
        return None
    gene_id_row = gene_id_row.iloc[0, :]
    gene_id_seqrecord = SeqRecord(Seq(gene_id_row.sequence))
    return gene_id_row.pos_strand_position, gene_id_row.neg_strand_position, gene_id_seqrecord

In [36]:
#| hide
example_chromosome_mrna_sample_gene_records = list(map(
    lambda gene_id: get_gene_seq_record(gene_id, example_chromosome_genes),
    example_chromosome_mrna_sample_geneids
))

In [37]:
#| hide
example_chromosome_mrna_sample_gene_records[0]

(155946853,
 155979617,
 SeqRecord(seq=Seq('AGAATCAGGCAGGACCCTGCTAACCGAAGGCTCTGTCCCAGGCCTCTCTGTGCT...CCA'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]))

In [38]:
#| hide
example_mrna = example_chromosome_mrna_sample[0]
print(example_mrna)

(34091, SeqFeature(CompoundLocation([SimpleLocation(ExactPosition(155978066), ExactPosition(155978175), strand=-1), SimpleLocation(ExactPosition(155969155), ExactPosition(155969300), strand=-1), SimpleLocation(ExactPosition(155966819), ExactPosition(155966887), strand=-1), SimpleLocation(ExactPosition(155966415), ExactPosition(155966479), strand=-1), SimpleLocation(ExactPosition(155965630), ExactPosition(155965760), strand=-1), SimpleLocation(ExactPosition(155965302), ExactPosition(155965412), strand=-1), SimpleLocation(ExactPosition(155964987), ExactPosition(155965131), strand=-1), SimpleLocation(ExactPosition(155962932), ExactPosition(155963183), strand=-1), SimpleLocation(ExactPosition(155962592), ExactPosition(155962718), strand=-1), SimpleLocation(ExactPosition(155962104), ExactPosition(155962222), strand=-1), SimpleLocation(ExactPosition(155961660), ExactPosition(155961909), strand=-1), SimpleLocation(ExactPosition(155958319), ExactPosition(155958396), strand=-1), SimpleLocation(

In [39]:
#| hide
example_gene_record = example_chromosome_mrna_sample_gene_records[0]
print(example_gene_record)

(155946853, 155979617, SeqRecord(seq=Seq('AGAATCAGGCAGGACCCTGCTAACCGAAGGCTCTGTCCCAGGCCTCTCTGTGCT...CCA'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]))


In [40]:
#| hide
# Validate gene
example_gene_start_seq = "AGAATCAGGCAGGACCCTGCTAACCGAAGGCTCTGTCCCAGGCCTCTCTGTGCTTCTGGTTTCTCCAGAG"
example_gene_end_seq = "TCGAAGAATTTTCTTCTTGCCAATTTTGTTGTTTAAAAAAAAAATTCAGGGAAAATTAAAAACCTGGAACTCCA"
example_gene_record[-1].seq.startswith(example_gene_start_seq), example_gene_record[-1].seq.endswith(example_gene_end_seq)

(True, True)

In [41]:
#| export
def normalize_mrna_positions(
        mrna_tup: tuple[int, SeqFeature],
        gene_record_tup: tuple[int, int, SeqRecord],
        debug: bool = False
        ) -> list[tuple[int, int]]:
    """Convert an mRNA's chromosome coordinates into gene-relative ones.

    `mrna_tup` is an `(index, feature)` pair and `gene_record_tup` is the
    `(pos_strand_position, neg_strand_position, record)` tuple produced by
    `get_gene_seq_record`; only the feature and the two positions are used.
    `debug` is accepted for backward compatibility and has no effect.

    The strand is taken from the first location part. For a positive-strand
    mRNA each part is shifted by `pos_strand_position`. For a negative-strand
    mRNA each part is measured back from `neg_strand_position` with its start
    and end swapped (presumably because the stored gene sequence is the
    reverse-complemented extract).

    Returns one `(start, end)` pair of plain ints per location part, in the
    order of `mrna.location.parts`.
    """
    _, mrna = mrna_tup
    pos_strand_position, neg_strand_position, _ = gene_record_tup
    parts = mrna.location.parts
    if parts[0].strand != -1:
        offset = int(pos_strand_position)
        return [(int(part.start) - offset, int(part.end) - offset) for part in parts]
    offset = int(neg_strand_position)
    # Swap end/start so that each normalized pair is still (low, high).
    return [
        (abs(int(part.end) - offset), abs(int(part.start) - offset))
        for part in parts
    ]

In [42]:
#| hide
example_mrna_norm_positions = normalize_mrna_positions(example_mrna, example_gene_record, debug=True)
example_mrna_norm_positions[:3]

[(1442, 1551), (10317, 10462), (12730, 12798)]

In [43]:
#| export
def get_mrna_bookends(
        mrna_tup: tuple[int, SeqFeature], 
        gene_record_tup: tuple[tuple[int, int], SeqRecord]) -> tuple[int, int]:
    """Return where the mRNA begins and ends in gene-relative coordinates.

    The result is the start of the first normalized location part and the
    end of the last one, as computed by `normalize_mrna_positions`.
    """
    positions = normalize_mrna_positions(mrna_tup, gene_record_tup)
    first_part, last_part = positions[0], positions[-1]
    return first_part[0], last_part[-1]

In [44]:
#| hide
example_mrna_bookends = get_mrna_bookends(example_mrna, example_gene_record)
example_mrna_bookends

(1442, 32764)

In [45]:
#| hide
example_chromosome_mrna_sample_bookends = list(map(
    lambda tup: get_mrna_bookends(*tup),
    zip(example_chromosome_mrna_sample, example_chromosome_mrna_sample_gene_records)
))

In [46]:
#| hide
example_chromosome_mrna_sample_bookends[:2]

[(1442, 32764), (198029, 634457)]

In [47]:
#| export
def extract_sequence_with_positions(positions: list[tuple[int, int]], sequence: str):
    """Concatenate the `sequence[start:end]` slices named by `positions`, in order."""
    return "".join(sequence[start:end] for start, end in positions)

In [48]:
#| hide
example_mrna_norm_seq = extract_sequence_with_positions(example_mrna_norm_positions, str(example_gene_record[-1].seq))

In [49]:
#| hide
len(example_mrna_norm_seq)

4125

In [50]:
#| hide
mrna_start_seq = "AGACGCGACGGTGCTGGGATCCCGGGAGGGAGCGGAGCGGACCTGGGCTTGGTCGCCTCCAAGCCGGCGG"
mrna_end_seq = "TTTCTTCTTGCCAATTTTGTTGTTTAAAAAAAAAATTCAGGGAAAATTAAAAACCTGGAACTCCA"

In [51]:
#| hide
mrna_sequence_source = example_mrna[1].extract(example_chromosome)

In [52]:
#| hide
example_mrna[1].location.parts[:3]

[SimpleLocation(ExactPosition(155978066), ExactPosition(155978175), strand=-1),
 SimpleLocation(ExactPosition(155969155), ExactPosition(155969300), strand=-1),
 SimpleLocation(ExactPosition(155966819), ExactPosition(155966887), strand=-1)]

In [53]:
#| hide
mrna_sequence_source.seq == example_mrna_norm_seq

True

In [54]:
#| hide
mrna_sequence_source.seq.startswith(mrna_start_seq), example_mrna_norm_seq.startswith(mrna_start_seq)
if not example_mrna_norm_seq.startswith(mrna_start_seq):
    print("Normalized mRNA sequence start doesn't match")
    print(example_mrna_norm_seq[:len(mrna_start_seq)])

In [55]:
#| hide
mrna_end_seq = "TTTCTTCTTGCCAATTTTGTTGTTTAAAAAAAAAATTCAGGGAAAATTAAAAACCTGGAACTCCA"
mrna_sequence_source.seq.endswith(mrna_end_seq), example_mrna_norm_seq.endswith(mrna_end_seq)
if not example_mrna_norm_seq.endswith(mrna_end_seq):
    print("Normalized mRNA sequence end doesn't match")
    print(example_mrna_norm_seq[-len(mrna_end_seq):])

In [56]:
#| export
def get_mrna_intron_positions(
        mrna_tup: tuple[int, SeqFeature],
        gene_record_tup: tuple[int, int, SeqRecord],
) -> list[tuple[int, int]]:
    """Get intron positions to replace in the input sequence.

    An intron is the gap between the end of one normalized location part and
    the start of the next. Each returned `(start, end)` pair is relative to
    the start of the mRNA's first part, not to the gene. An mRNA with a
    single part yields an empty list.
    """
    # Normalize once; the mRNA start is simply the first part's start.
    mrna_norm_positions = normalize_mrna_positions(mrna_tup, gene_record_tup)
    mrna_start = mrna_norm_positions[0][0]
    consecutive_parts = zip(mrna_norm_positions, mrna_norm_positions[1:])
    return [
        (prev_end - mrna_start, next_start - mrna_start)
        for (_, prev_end), (next_start, _) in consecutive_parts
    ]

In [57]:
#| hide
example_mrna_intron_positions = get_mrna_intron_positions(example_mrna, example_gene_record)
example_mrna_intron_positions[:5], example_mrna_intron_positions[-1]

([(109, 8875), (9020, 11288), (11356, 11696), (11760, 12415), (12545, 12763)],
 (27907, 30160))

In [58]:
#| hide
example_intron_positions = list(
    map(
        lambda tup: get_mrna_intron_positions(tup[0], tup[1]),
        zip(example_chromosome_mrna_sample, example_chromosome_mrna_sample_gene_records)))

In [59]:
#| hide
example_chromosome_mrna_sample_gene_records[0]

(155946853,
 155979617,
 SeqRecord(seq=Seq('AGAATCAGGCAGGACCCTGCTAACCGAAGGCTCTGTCCCAGGCCTCTCTGTGCT...CCA'), id='<unknown id>', name='<unknown name>', description='<unknown description>', dbxrefs=[]))

In [60]:
#| hide
training_data_path = transcription_data_path / "introns"

In [61]:
#| export
def make_intron_position_dataframe(
        gene_ids: list[str], 
        transcript_ids: list[str], 
        mrna_bookends: list[tuple[int, int]],
        intron_positions: list[list[tuple[int, int]]]) -> pd.DataFrame | None:
    """Build a long-format table with one row per intron.

    The four arguments are parallel, one entry per transcript:
    `intron_positions[i]` holds the `(start, end)` pairs for transcript `i`
    and `mrna_bookends[i]` is that transcript's `(start, end)` pair.

    Returns a frame with the columns `geneid`, `transcriptid`,
    `intron_start`, `intron_end`, `mrna_start` and `mrna_end`, or None when
    no row is left after transcripts without introns are dropped.
    """
    frame = pd.DataFrame(
        gene_ids,
        columns=['geneid']
    )
    frame.loc[:, 'transcriptid'] = transcript_ids
    frame.loc[:, 'intron_position'] = intron_positions
    frame.loc[:, 'bookends'] = mrna_bookends
    # One row per intron; an empty intron list becomes a single NaN row.
    frame = frame.explode('intron_position').reset_index(drop=True)
    # Drop those NaN rows, i.e. the transcripts without any intron.
    frame.dropna(subset=['intron_position'], inplace=True)
    if frame.shape[0] > 0:
        # `.str[i]` is used here for element access on the tuples in each cell.
        frame.loc[:, 'intron_start'] = frame.intron_position.str[0]
        frame.loc[:, 'intron_end'] = frame.intron_position.str[1]
        frame.drop('intron_position', axis=1, inplace=True)
        frame.loc[:, 'mrna_start'] = frame.bookends.str[0]
        frame.loc[:, 'mrna_end'] = frame.bookends.str[1]
        frame.drop('bookends', axis=1, inplace=True)
    else:
        # No introns at all: callers receive None rather than an empty frame.
        frame = None
    return frame

In [62]:
#| hide
example_intron_info_df = make_intron_position_dataframe(
    example_chromosome_mrna_sample_geneids,
    example_chromosome_mrna_sample_transcriptids,
    example_chromosome_mrna_sample_bookends,
    example_intron_positions
)
example_intron_info_df.head()

Unnamed: 0,geneid,transcriptid,intron_start,intron_end,mrna_start,mrna_end
0,GeneID:9181,NM_001350111.2,109,8875,1442,32764
1,GeneID:9181,NM_001350111.2,9020,11288,1442,32764
2,GeneID:9181,NM_001350111.2,11356,11696,1442,32764
3,GeneID:9181,NM_001350111.2,11760,12415,1442,32764
4,GeneID:9181,NM_001350111.2,12545,12763,1442,32764


In [63]:
#| hide
example_intron_info_df.shape

(117, 6)

In [64]:
#| hide
example_intron_info_df[['geneid', 'transcriptid']].drop_duplicates().shape[0]

10

In [65]:
#| hide
import nbdev; nbdev.nbdev_export()