# Getting training data for translation

> "Build our training data using the annotations we broke down and the genome sequences we've stored."

In [None]:
#| default_exp features.annotations

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from Bio import SeqIO, Entrez
import re
import json
from multiprocessing import Pool
import os
import sys

pd.options.mode.chained_assignment = None



In [None]:
#| hide
from yaml import safe_load

# Register tqdm's progress-aware `progress_apply` on pandas objects.
tqdm.pandas(leave=False, ncols=80)

# Project settings are kept in a YAML file one directory above the notebook.
config = safe_load(Path("../config.yml").read_text())

In [None]:
#| hide
# Root of the local data store, taken from the project config.
data_path = Path(config.get("data_path"))
annotations_path = data_path / "annotations"  # annotation tables (*.csv)
genome_path = data_path / "genome"  # genome sequences (<annotation stem>.fasta)
sequences_path = data_path / "entrez_genes_sequences"  # output: one FASTA per gene
# Idempotent and race-free; also creates missing parent directories, which the
# previous `exists()` check followed by a bare `mkdir()` did not.
sequences_path.mkdir(parents=True, exist_ok=True)

# Credentials Biopython's Entrez module sends with its requests.
Entrez.email = config.get("email")
Entrez.api_key = config.get("nih_api_key")

## Get gene sequences

In [None]:
#| hide
# Annotation table used as the test fixture for the cells in this notebook.
test_annotation_file = annotations_path / "NC_000014.9.csv"

In [None]:
#| hide
# Load the test annotation table and look at a handful of random rows.
test_annotations = pd.read_csv(test_annotation_file)
test_annotations.sample(5)

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
55106,NC_000014.9,BestRefSeq,exon,61529075,61529213,,+,,ID=exon-NM_006255.5-11;Parent=rna-NM_006255.5;...
59581,NC_000014.9,Gnomon,exon,63961525,63961625,,+,,ID=exon-XM_005267458.2-9;Parent=rna-XM_0052674...
1766,NC_000014.9,Gnomon,exon,20653312,20653490,,+,,ID=exon-XR_007064059.1-1;Parent=rna-XR_0070640...
30406,NC_000014.9,BestRefSeq,exon,39308449,39308587,,+,,ID=exon-NM_001247990.2-13;Parent=rna-NM_001247...
113617,NC_000014.9,BestRefSeq,exon,99737722,99737860,,+,,ID=exon-NM_001375411.1-1;Parent=rna-NM_0013754...


In [None]:
#| hide
# Inspect every column of the first annotation row.
test_annotations.iloc[0, :]

seqid                                               NC_000014.9
source                                                   RefSeq
type                                                     region
start                                                         1
end                                                   107043718
score                                                       NaN
strand                                                        +
phase                                                       NaN
attributes    ID=NC_000014.9:1..107043718;Dbxref=taxon:9606;...
Name: 0, dtype: object

In [None]:
#| export
def parse_annotation_value(annotation_value: str):
    """
    Parse one item of a multi-valued attribute.

    An item of the form ``key:value`` becomes a single-entry dict; an item
    without a ``:`` is returned unchanged as a string.

    Only the first ``:`` separates key from value, so identifiers that
    contain a colon themselves are kept whole, e.g.
    ``"HGNC:HGNC:1234"`` -> ``{"HGNC": "HGNC:1234"}``.
    """
    key, separator, value = annotation_value.partition(":")
    if not separator:
        return annotation_value
    return {key: value}


def parse_annotation_attribute_values(attribute_values: str) -> object:
    """
    Detect if an attribute value is really a dict or list.

    - No ``,`` in the value: the string is returned untouched (even when it
      contains a ``:``).
    - Several items and the first one is ``key:value``: the items are merged
      into one dict.
    - Otherwise: a list of the parsed items is returned.

    If the first item is ``key:value`` but another (non-empty) item is not,
    the items cannot be merged into a dict; the list of parsed items is
    returned instead of raising.
    """
    value_list = attribute_values.split(",")
    if len(value_list) == 1:
        return attribute_values
    parsed_values = [parse_annotation_value(value) for value in value_list]
    # Just a list within a list
    if ":" not in value_list[0]:
        return parsed_values
    # Empty items (e.g. from a trailing ',') carry no information and are
    # ignored when merging; any other plain string means the input is mixed.
    has_plain_item = any(
        not isinstance(parsed_value, dict) and parsed_value != ""
        for parsed_value in parsed_values
    )
    if has_plain_item:
        return parsed_values
    # Dict within a list
    merged_values = {}
    for parsed_value in parsed_values:
        if isinstance(parsed_value, dict):
            merged_values.update(parsed_value)
    return merged_values


def parse_entrez_gff3_annotation_attributes(attributes_str: str):
    """
    Convert attribute string to parsed attribute dict.

    Split on ';' to get attributes, then on the first '=' to separate each
    key from its value. Values can also be nested as a dict or list.

    Empty segments (such as the one produced by a trailing ';') are skipped,
    and a '=' inside a value is kept as part of that value.

    Raises:
        ValueError: if a non-empty segment has no '='.
    """
    attr_dict = {}
    for attr in attributes_str.split(";"):
        if not attr.strip():
            continue
        key, separator, value = attr.partition("=")
        if not separator:
            raise ValueError(f"Malformed attribute, expected 'key=value': {attr!r}")
        attr_dict[key] = parse_annotation_attribute_values(value)
    return attr_dict

In [None]:
#| hide
# Parse every raw attribute string into a dict, stored in a new column.
raw_attributes = test_annotations["attributes"]
test_annotations["attributes_dict"] = raw_attributes.apply(parse_entrez_gff3_annotation_attributes)

In [None]:
#| hide
# Show one row: every column except the last two, then the last column
# (the parsed attribute dict) on its own.
test_row = 0
all_but_last_two = slice(None, -2)
display(test_annotations.iloc[test_row, all_but_last_two])
test_annotations.iloc[test_row, -1]

seqid     NC_000014.9
source         RefSeq
type           region
start               1
end         107043718
score             NaN
strand              +
phase             NaN
Name: 0, dtype: object

{'ID': 'NC_000014.9:1..107043718',
 'Dbxref': 'taxon:9606',
 'Name': '14',
 'chromosome': '14',
 'gbkey': 'Src',
 'genome': 'chromosome',
 'mol_type': 'genomic DNA'}

### Converting annotations to SeqRecords for training.

Genes are made up of multiple elements.

- Genes
    - Entire DNA sequence encoding the information for a protein.
- Introns
    - Sections of a gene that are transcribed but spliced out of the mature mRNA.
- Exons
    - Sections of a gene that are retained in the mature mRNA after splicing.
- UTRs (Untranslated Regions)
    - Sections of mRNA ignored during translation.
- CDS (Coding Sequence)
    - Sections of mRNA that are translated to protein.

    
The annotations we have include all this information. Let's use them to extract a SeqRecord object that carries all of it.

Transformation plan:

1. Filter gff features to those belonging to a gene
2. Create the initial SeqRecord from the feature with `featuretype='gene'`
3. For each SeqRecord from step 2, collect all features that share its GeneID
4. Annotate SeqRecord with features
5. Write SeqRecord with features

In [None]:
#| export
def get_gene_id_from_attribute_dict(attribute_dict: dict):
    """
    Return the Entrez GeneID referenced by a parsed attribute dict, or None.

    The ``Dbxref`` entry can arrive in three shapes:

    - a dict of ``{database: identifier}`` (several ``key:value`` items),
    - a raw string such as ``"GeneID:123"`` (a single cross-reference is
      left unparsed by ``parse_annotation_attribute_values``),
    - a list of parsed items.

    All three are searched for a ``GeneID`` entry. A missing ``Dbxref`` or a
    ``Dbxref`` without a ``GeneID`` yields None.
    """
    dbxref = attribute_dict.get("Dbxref", {})
    if isinstance(dbxref, dict):
        return dbxref.get("GeneID", None)
    if isinstance(dbxref, str):
        candidates = [dbxref]
    elif isinstance(dbxref, list):
        candidates = dbxref
    else:
        return None
    for candidate in candidates:
        if isinstance(candidate, dict):
            if "GeneID" in candidate:
                return candidate["GeneID"]
        elif isinstance(candidate, str):
            database, separator, identifier = candidate.partition(":")
            if separator and database == "GeneID":
                return identifier
    return None


def get_gff_gene_features(annotations: pd.DataFrame, attribute_dict_column: str) -> pd.DataFrame:
    """
    Keep only the annotation rows whose parsed attributes yield a gene id.

    `attribute_dict_column` names the column holding the parsed attribute
    dicts; a row is kept when `get_gene_id_from_attribute_dict` returns
    something other than None for it.
    """
    def _has_gene_id(attr_dict) -> bool:
        return get_gene_id_from_attribute_dict(attr_dict) is not None

    attribute_dicts = annotations.loc[:, attribute_dict_column]
    gene_mask = attribute_dicts.apply(_has_gene_id)
    return annotations[gene_mask.eq(True)]

In [None]:
#| hide
# Compare how many rows reference a gene against the size of the full table.
test_gene_annotations = get_gff_gene_features(test_annotations, "attributes_dict")
gene_row_count = len(test_gene_annotations)
total_row_count = len(test_annotations)
display(gene_row_count, total_row_count)
test_gene_annotations.head()

126374

134494

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes,attributes_dict
2,NC_000014.9,Curated Genomic,pseudogene,16024658,16025641,,+,,"ID=gene-DUX4L48;Dbxref=GeneID:107105251,HGNC:H...","{'ID': 'gene-DUX4L48', 'Dbxref': {'GeneID': '1..."
3,NC_000014.9,Curated Genomic,pseudogene,16028061,16028716,,+,,"ID=gene-PCMTD1P6;Dbxref=GeneID:107105255,HGNC:...","{'ID': 'gene-PCMTD1P6', 'Dbxref': {'GeneID': '..."
4,NC_000014.9,Curated Genomic,exon,16028061,16028716,,+,,ID=id-PCMTD1P6;Parent=gene-PCMTD1P6;Dbxref=Gen...,"{'ID': 'id-PCMTD1P6', 'Parent': 'gene-PCMTD1P6..."
5,NC_000014.9,Curated Genomic,pseudogene,16029944,16030886,,+,,"ID=gene-DUX4L49;Dbxref=GeneID:107105252,HGNC:H...","{'ID': 'gene-DUX4L49', 'Dbxref': {'GeneID': '1..."
6,NC_000014.9,Curated Genomic,exon,16029944,16030886,,+,,ID=id-DUX4L49;Parent=gene-DUX4L49;Dbxref=GeneI...,"{'ID': 'id-DUX4L49', 'Parent': 'gene-DUX4L49',..."


In [None]:
#| export
def write_gene_sequence(write_path: Path, gene_id: str, sequence_record: SeqIO.SeqRecord):
    """
    Write `sequence_record` in FASTA format to `<write_path>/<gene_id>.fasta`.

    The file is opened in "w+" mode, so an existing file is truncated.
    """
    records = [sequence_record]
    fasta_file = write_path / f"{gene_id}.fasta"
    with fasta_file.open("w+") as handle:
        SeqIO.write(records, handle, "fasta")

In [None]:
#| export
def get_sequence_from_file(
    start: int, end: int, strand: str = "+", 
    sequence_record: "SeqIO.SeqRecord" = None, fasta_path: Path = None) -> "SeqIO.SeqRecord":
    """
    Extract the upper-cased sequence of a feature from a single-record FASTA.

    `start` and `end` are 1-based, inclusive coordinates on the forward
    strand. For strand "-" the selected region is reverse-complemented.

    Args:
        start: first base of the feature (1-based, inclusive).
        end: last base of the feature (1-based, inclusive).
        strand: "+" or "-".
        sequence_record: already-loaded record to slice. When None, the
            first record of `fasta_path` is loaded instead.
        fasta_path: FASTA file to read when `sequence_record` is None.

    Raises:
        ValueError: if both `sequence_record` and `fasta_path` are None, or
            if `strand` is neither "+" nor "-".
    """
    if sequence_record is None and fasta_path is None:
        raise ValueError("Both sequence_record and fasta_path cannot be None")
    if strand not in ("+", "-"):
        raise ValueError(f"strand must be '+' or '-', got {strand!r}")
    if sequence_record is None:
        sequence_record = next(SeqIO.parse(fasta_path.resolve(), "fasta"))
    # 1-based inclusive [start, end] is the 0-based half-open slice
    # [start - 1, end), whichever strand the feature is on.
    selected_sequence = sequence_record[start - 1: end]
    if strand == "-":
        selected_sequence = selected_sequence.reverse_complement()
    return selected_sequence.upper()

In [None]:
#| export
# For every annotation, get genes, make fasta file for each
def write_annotation_sequences(args: dict) -> list:
    """
    Write one FASTA file per gene found in an annotation table.

    Takes a single dict argument so it can be mapped over a list of tasks
    (e.g. with `Pool.imap_unordered`).

    Args:
        args: dict with the keys
            - "annotation_file": CSV of annotations for one sequence,
            - "genome_path": directory holding `<annotation stem>.fasta`,
            - "sequences_path": directory receiving `<gene_id>.fasta` files.

    Returns:
        The unique gene ids annotated in the file, i.e. every gene whose
        sequence file exists once the call returns, whether it was written
        now or by an earlier run.

    Rows of type "gene" for which no gene id can be found are skipped, so no
    file is ever written under a missing id.
    """
    # Get input args
    annotation_file = Path(args.get("annotation_file"))
    genome_path = Path(args.get("genome_path"))
    sequences_path = Path(args.get("sequences_path"))

    # Load annotations, keep gene rows; copy so the new columns are added to
    # an independent frame rather than a slice of the full table.
    annotations = pd.read_csv(annotation_file)
    genes = annotations[annotations.type == "gene"].copy()
    genes["attr_dict"] = genes.attributes.apply(parse_entrez_gff3_annotation_attributes)
    genes["gene_id"] = genes.attr_dict.apply(get_gene_id_from_attribute_dict)
    genes = genes[genes.gene_id.notna()]
    gene_ids = genes.gene_id.unique().tolist()

    # Skip genes already extracted and written by an earlier run
    already_written = genes.gene_id.apply(
        lambda gene_id: (sequences_path / f"{gene_id}.fasta").exists()
    ).astype(bool)
    pending_genes = genes[~already_written]
    if pending_genes.empty:
        return gene_ids

    # Load the genome sequence once and slice every pending gene out of it
    fasta_path = genome_path / f"{annotation_file.stem}.fasta"
    fasta_sequence_record = next(SeqIO.parse(fasta_path.resolve(), "fasta"))
    pending_rows = zip(
        pending_genes.start, pending_genes.end, pending_genes.strand, pending_genes.gene_id
    )
    for start, end, strand, gene_id in pending_rows:
        gene_record = get_sequence_from_file(start, end, strand, fasta_sequence_record)
        write_gene_sequence(
            write_path=sequences_path,
            gene_id=gene_id,
            sequence_record=gene_record,
        )
    return gene_ids

In [None]:
#| hide
# One task per annotation CSV; every task carries the shared input/output
# directories so a worker needs nothing besides its argument dict.
annotation_file_list = [*annotations_path.glob("*.csv")]
tasks = [
    dict(
        annotation_file=csv_file,
        genome_path=genome_path,
        sequences_path=sequences_path,
    )
    for csv_file in annotation_file_list
]

In [None]:
#| hide
# Leave one core free, but always keep at least one worker: `os.cpu_count()`
# can return None, and on a single-core host `cpu_count() - 1` would be 0,
# which `Pool` rejects.
worker_count = max(1, (os.cpu_count() or 2) - 1)
written_genes = set()

# Both context managers clean up on success and on error: the pool's workers
# are shut down and the progress bar is closed. Exceptions raised by a worker
# propagate unchanged.
with Pool(worker_count) as pool, tqdm(total=len(annotation_file_list)) as progress_bar:
    for result in pool.imap_unordered(write_annotation_sequences, tasks):
        written_genes.update(result)
        progress_bar.update(1)

len(written_genes)

  0%|          | 0/705 [00:00<?, ?it/s]

42446

## Make gene to protein map

In [None]:
#| export
def get_protein_id_from_attributes(attributes: dict):
    """Return the ``protein_id`` entry of a parsed attribute dict, or None when absent."""
    return attributes.get("protein_id")


def get_gene_to_protein_map_from_annotations(annotation_file_path: Path) -> dict[str, str]:
    """
    Map gene ids to protein ids using the CDS rows of an annotation table.

    Reads the CSV at `annotation_file_path`, parses the attributes of every
    row of type "CDS" and pairs the gene id with the `protein_id` attribute.
    Rows missing either id are dropped.

    Returns:
        A dict of `{gene_id: protein_id}`. The result holds a single protein
        per gene: when a gene has several distinct protein ids, later rows
        overwrite earlier ones.
    """
    annotations = pd.read_csv(annotation_file_path)
    coding_sequences = annotations[annotations.type == "CDS"]
    # Parse each attribute string once and read both ids from the result
    cds_attributes = coding_sequences.attributes.apply(parse_entrez_gff3_annotation_attributes)
    gene_ids = cds_attributes.apply(get_gene_id_from_attribute_dict)
    gene_ids.name = "gene_id"
    protein_ids = cds_attributes.apply(get_protein_id_from_attributes)
    protein_ids.name = "protein_id"
    gene_map = pd.concat([gene_ids, protein_ids], axis=1).dropna().drop_duplicates()
    return gene_map.set_index("gene_id").protein_id.to_dict()

In [None]:
#| hide
# Build the gene -> protein map for the test annotation file only.
annotation_gene_to_protein_map = get_gene_to_protein_map_from_annotations(test_annotation_file)
test_gene_to_protein_map = dict(annotation_gene_to_protein_map)

In [None]:
# Spot check: look up the protein id recorded for gene id "440153".
test_gene_to_protein_map.get("440153")

'NP_001013372.1'

In [None]:
#| hide
# Protein FASTA files present on disk vs. the files the test map expects.
protein_dir = data_path / "protein"
protein_file_paths = set(protein_dir.glob("*.fasta"))
gene_protein_files = {
    protein_dir / (protein_id + ".fasta")
    for protein_id in test_gene_to_protein_map.values()
}

In [None]:
#| hide
# Expected protein files that are not on disk; an empty set means none are missing.
gene_protein_files - protein_file_paths

set()

In [None]:
#| hide
# Merge the gene -> protein maps of every annotation file into one dict.
# Files are merged in list order, so a later file wins on a repeated gene id.
gene_to_protein_map = {}

annotation_files_with_progress = tqdm(annotation_file_list)
for annotation_file in annotation_files_with_progress:
    annotation_gene_to_protein_map = get_gene_to_protein_map_from_annotations(annotation_file)
    gene_to_protein_map |= annotation_gene_to_protein_map

  0%|          | 0/705 [00:00<?, ?it/s]

In [None]:
#| hide
# Persist the combined gene -> protein map as JSON next to the other data.
gene_to_protein_map_file = data_path / "entrez_gene_to_protein_map.json"
with gene_to_protein_map_file.open("w+") as out:
    json.dump(gene_to_protein_map, out)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()