# Getting training data for translation

> "Build our training data using the annotations we broke down and the genome sequences we've stored."

In [None]:
#| default_exp features.annotations

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from Bio import SeqIO, Entrez
import re
import json
from multiprocessing import Pool
import os
import sys

pd.options.mode.chained_assignment = None



In [None]:
#| hide
from yaml import safe_load

# Configure the pandas integration of tqdm used by the notebook cells below
tqdm.pandas(ncols=80, leave=False)

# Settings are read from the config file one directory above this notebook
with open("../config.yml") as config_file:
    config = safe_load(config_file)

In [None]:
#| hide
# Directory layout under the configured data root
data_path = Path(config.get("data_path"))
annotations_path = data_path / "annotations"
genome_path = data_path / "genome"
sequences_path = data_path / "entrez_genes_sequences"
# exist_ok replaces the separate exists() check, so there is no window
# between checking for the directory and creating it
sequences_path.mkdir(exist_ok=True)

# Entrez credentials come from the same config file
Entrez.email = config.get("email")
Entrez.api_key = config.get("nih_api_key")

## Get gene sequences

In [None]:
#| hide
test_annotation_file = annotations_path / "NC_000014.9.csv"

In [None]:
#| hide
test_annotations_df = pd.read_csv(test_annotation_file)
test_annotations_df.head()

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,NC_000014.9,RefSeq,region,1,107043718,,+,,ID=NC_000014.9:1..107043718;Dbxref=taxon:9606;...
1,NC_000014.9,RefSeq,centromere,16000001,18173523,,+,,ID=id-NC_000014.9:16000001..18173523;Note=Line...
2,NC_000014.9,Curated Genomic,pseudogene,16024658,16025641,,+,,"ID=gene-DUX4L48;Dbxref=GeneID:107105251,HGNC:H..."
3,NC_000014.9,Curated Genomic,pseudogene,16028061,16028716,,+,,"ID=gene-PCMTD1P6;Dbxref=GeneID:107105255,HGNC:..."
4,NC_000014.9,Curated Genomic,exon,16028061,16028716,,+,,ID=id-PCMTD1P6;Parent=gene-PCMTD1P6;Dbxref=Gen...


In [None]:
#| hide
test_annotations_df.source.value_counts()

source
Gnomon                 70824
BestRefSeq             53068
RefSeqFE                4698
RefSeq                  2925
Curated Genomic         2520
BestRefSeq%2CGnomon      303
cmsearch                  94
tRNAscan-SE               62
Name: count, dtype: int64

In [None]:
#| hide
test_annotations_df[test_annotations_df.type == "gene"].iloc[0, :-1]

seqid     NC_000014.9
source     BestRefSeq
type             gene
start        18601117
end          18602097
score             NaN
strand              +
phase             NaN
Name: 22, dtype: object

In [None]:
#| hide
test_annotations_df[test_annotations_df.type == "gene"].strand.value_counts()

strand
+    850
-    730
Name: count, dtype: int64

In [None]:
#| hide
test_annotations_df[
    (test_annotations_df.type == "gene") &
    (test_annotations_df.strand == "-")
]

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
30,NC_000014.9,BestRefSeq,gene,18630536,18633634,,-,,"ID=gene-LINC02297;Dbxref=GeneID:642426,HGNC:HG..."
97,NC_000014.9,BestRefSeq,gene,18976748,18980793,,-,,ID=gene-LOC100508046;Dbxref=GeneID:100508046;N...
145,NC_000014.9,BestRefSeq,gene,19268803,19384288,,-,,ID=gene-LINC01297-DUXAP10-NBEAP6;Dbxref=GeneID...
187,NC_000014.9,BestRefSeq,gene,19344329,19384288,,-,,"ID=gene-LINC01297;Dbxref=GeneID:115694671,HGNC..."
194,NC_000014.9,BestRefSeq,gene,19402486,19434341,,-,,"ID=gene-POTEG;Dbxref=GeneID:404785,HGNC:HGNC:3..."
...,...,...,...,...,...,...,...,...,...
131489,NC_000014.9,Curated Genomic,gene,106790692,106791153,,-,,"ID=gene-IGHV3-72;Dbxref=GeneID:28410,HGNC:HGNC..."
131495,NC_000014.9,Curated Genomic,gene,106802692,106803153,,-,,"ID=gene-IGHV3-73;Dbxref=GeneID:28409,HGNC:HGNC..."
131501,NC_000014.9,Curated Genomic,gene,106810440,106810895,,-,,"ID=gene-IGHV3-74;Dbxref=GeneID:28408,HGNC:HGNC..."
131532,NC_000014.9,BestRefSeq,gene,106850885,106850999,,-,,"ID=gene-MIR5195;Dbxref=GeneID:100847062,HGNC:H..."


In [None]:
#| hide
test_annotations_df[test_annotations_df.type == "gene"].iloc[0, -1]

'ID=gene-OR11H12;Dbxref=GeneID:440153,HGNC:HGNC:30738;Name=OR11H12;description=olfactory receptor family 11 subfamily H member 12;gbkey=Gene;gene=OR11H12;gene_biotype=protein_coding'

In [None]:
#| hide
test_annotations_df[test_annotations_df.type == "CDS"].iloc[0, :-1]

seqid     NC_000014.9
source     BestRefSeq
type              CDS
start        18601117
end          18602097
score             NaN
strand              +
phase             0.0
Name: 25, dtype: object

In [None]:
#| hide
test_annotations_df[test_annotations_df.type == "CDS"].iloc[0, -1]

'ID=cds-NP_001013372.1;Parent=rna-NM_001013354.1;Dbxref=CCDS:CCDS32017.1,Ensembl:ENSP00000449002.1,GeneID:440153,GenBank:NP_001013372.1,HGNC:HGNC:30738;Name=NP_001013372.1;gbkey=CDS;gene=OR11H12;product=olfactory receptor 11H12;protein_id=NP_001013372.1;tag=MANE Select'

In [None]:
#| hide
test_annotations_df[test_annotations_df.type == "CDS"].strand.value_counts()

strand
+    31797
-    20995
Name: count, dtype: int64

In [None]:
#| export
def get_sequence_from_file(
    start: int, end: int, strand: str = "+",
    sequence_record: "SeqIO.SeqRecord" = None, fasta_path: Path = None) -> "SeqIO.SeqRecord":
    """
    Extract the feature spanning `start`..`end` from a single-record genome sequence.

    `start` and `end` are the 1-based, inclusive coordinates stored in the
    annotation files, so the matching 0-based Python slice is `[start - 1:end]`
    on both strands.
    If strand is "+", the slice is returned as is.
    If strand is "-", the reverse complement of the slice is returned.
    The result is upper-cased.

    `sequence_record` takes precedence; when it is None the first record of the
    fasta file at `fasta_path` is loaded instead.

    Raises ValueError when neither source is given or when `strand` is not
    "+" or "-".
    """
    if sequence_record is None and fasta_path is None:
        raise ValueError("Both sequence_record and fasta_path cannot be None")
    if strand not in ("+", "-"):
        # Fail with a clear message instead of an unbound-variable error below
        raise ValueError(f"strand must be '+' or '-', got {strand!r}")
    if sequence_record is None:
        sequence_record = next(SeqIO.parse(fasta_path.resolve(), "fasta"))
    # Same window for both strands: shift the 1-based start down by one and
    # keep `end` as the exclusive bound, which includes the last base
    selected_sequence = sequence_record[start - 1: end]
    if strand == "-":
        selected_sequence = selected_sequence.reverse_complement()
    return selected_sequence.upper()

In [None]:
#| export
def make_attribute_dict_from_string(attr_str: str):
    """
    Parse a `key=value;key=value` attribute string into a dict.

    Each `;`-separated segment is split on its first "=" only, so a value that
    itself contains "=" is kept whole. Empty segments (for example from a
    trailing ";") are skipped, and a segment without "=" maps to an empty string.
    """
    attr_dict = {}
    for attr in attr_str.split(";"):
        if not attr:
            continue
        key, _, value = attr.partition("=")
        attr_dict[key] = value
    return attr_dict


# Compiled once at import time. The greedy ".*" prefix selects the last
# "GeneID:<digits>" entry if a Dbxref ever lists more than one.
_GENE_ID_PATTERN = re.compile(r".*GeneID:(\d+)")


def get_gene_id_from_attributes(attributes: dict):
    """
    Return the numeric GeneID (as a string) found in the `Dbxref` attribute.

    Returns None when there is no `Dbxref` attribute or when it carries no
    `GeneID:<digits>` entry.
    """
    dbxref_string = attributes.get("Dbxref")
    if dbxref_string is None:
        return None
    match = _GENE_ID_PATTERN.search(dbxref_string)
    return match.group(1) if match else None


def write_gene_sequence(write_path: Path, gene_id: str, sequence_record: SeqIO.SeqRecord):
    """Write `sequence_record` in fasta format to `<write_path>/<gene_id>.fasta`, replacing any existing file."""
    with (write_path / f"{gene_id}.fasta").open("w+") as fasta_handle:
        SeqIO.write([sequence_record], fasta_handle, "fasta")

In [None]:
#| export
# For every annotation, get genes, make fasta file for each
def write_annotation_sequences(args: dict):
    """
    Write one fasta file per gene listed in a single annotation file.

    `args` is a single dict (so the function can be mapped over a process pool)
    with the keys:
    - "annotation_file": annotation csv; its stem names the genome fasta to read.
    - "genome_path": directory holding `<stem>.fasta`.
    - "sequences_path": directory receiving the `<gene_id>.fasta` files.

    Genes whose fasta file already exists are not rewritten, and gene rows
    without a gene id are ignored rather than written to `None.fasta`.

    Returns the unique gene ids of the annotation file, whether their sequence
    was written by this call or was already on disk.
    """
    # Get input args
    annotation_file = Path(args["annotation_file"])
    genome_path = Path(args["genome_path"])
    sequences_path = Path(args["sequences_path"])
    # Load annotations
    annotations = pd.read_csv(annotation_file)
    # Work on a copy so the new column is added to an independent frame
    genes = annotations[annotations.type == "gene"].copy()
    genes["gene_id"] = (
        genes.attributes
        .apply(make_attribute_dict_from_string)
        .apply(get_gene_id_from_attributes)
    )
    genes = genes[genes.gene_id.notna()]
    gene_ids = genes.gene_id.unique().tolist()
    # Filter those genes already extracted and written
    already_written = genes.gene_id.apply(
        lambda gene_id: (sequences_path / f"{gene_id}.fasta").exists()
    ).astype(bool)
    pending_genes = genes[~already_written]
    if pending_genes.empty:
        return gene_ids
    # Load the genome record once and reuse it for every pending gene
    fasta_path = genome_path / f"{annotation_file.stem}.fasta"
    genome_record = next(SeqIO.parse(fasta_path.resolve(), "fasta"))
    # Extract and write one gene at a time so records are not all held in memory
    for gene in pending_genes.itertuples(index=False):
        gene_record = get_sequence_from_file(
            gene.start,
            gene.end,
            gene.strand,
            genome_record
        )
        write_gene_sequence(
            write_path=sequences_path,
            gene_id=gene.gene_id,
            sequence_record=gene_record
        )
    return gene_ids

In [None]:
#| hide
# One task dict per annotation csv; every task shares the same genome and output directories
annotation_file_list = [*annotations_path.glob("*.csv")]
tasks = []
for annotation_path in annotation_file_list:
    tasks.append(
        {
            "annotation_file": annotation_path,
            "genome_path": genome_path,
            "sequences_path": sequences_path,
        }
    )

In [None]:
#| hide
# Leave one core free, but never ask for fewer than one worker
# (os.cpu_count() may return None or 1)
n_workers = max(1, (os.cpu_count() or 2) - 1)
written_genes = set()

# The context managers close the progress bar and shut the pool down on both
# success and failure; exceptions raised by a worker propagate unchanged
with Pool(n_workers) as pool, tqdm(total=len(annotation_file_list)) as progress_bar:
    for result in pool.imap_unordered(write_annotation_sequences, tasks):
        written_genes.update(result)
        progress_bar.update(1)

len(written_genes)

  0%|          | 0/705 [00:00<?, ?it/s]

42446

## Make gene to protein map

In [None]:
#| export
def get_protein_id_from_attributes(attributes: dict):
    """Look up the `protein_id` attribute; returns None when the key is absent."""
    return attributes.get("protein_id")


def get_gene_to_protein_map_from_annotations(annotation_file_path: Path) -> dict[str, str]:
    """
    Map gene ids to protein ids using the CDS rows of an annotation csv.

    Rows lacking either id are dropped and duplicate pairs are collapsed.
    The result has one entry per gene id: when a gene is paired with several
    distinct protein ids, a later row overwrites an earlier one.
    """
    annotations = pd.read_csv(annotation_file_path)
    # CDS rows are the ones carrying a protein_id next to the GeneID
    cds_rows = annotations[annotations.type == "CDS"]
    cds_attributes = cds_rows.attributes.apply(make_attribute_dict_from_string)
    gene_ids = cds_attributes.apply(get_gene_id_from_attributes)
    gene_ids.name = "gene_id"
    protein_ids = cds_attributes.apply(get_protein_id_from_attributes)
    protein_ids.name = "protein_id"
    gene_map = pd.concat([gene_ids, protein_ids], axis=1).dropna().drop_duplicates()
    return gene_map.set_index("gene_id").protein_id.to_dict()

In [None]:
#| hide
# Build the map for the single test annotation file and keep an independent copy of it
annotation_gene_to_protein_map = get_gene_to_protein_map_from_annotations(test_annotation_file)
test_gene_to_protein_map = dict(annotation_gene_to_protein_map)

In [None]:
test_gene_to_protein_map.get("440153")

'NP_001013372.1'

In [None]:
#| hide
# Protein fasta files present on disk
protein_file_paths = {*(data_path / "protein").glob("*.fasta")}
# Protein fasta files the test gene map expects to find
gene_protein_files = {
    data_path / "protein" / (protein_id + ".fasta")
    for protein_id in test_gene_to_protein_map.values()
}

In [None]:
#| hide
gene_protein_files - protein_file_paths

set()

In [None]:
#| hide
# Get all gene docs
# Merge the per-file maps; a gene id seen in a later file replaces the earlier entry
gene_to_protein_map = {}

for annotation_file in tqdm(annotation_file_list):
    gene_to_protein_map.update(
        get_gene_to_protein_map_from_annotations(annotation_file)
    )

  0%|          | 0/705 [00:00<?, ?it/s]

In [None]:
#| hide
# Persist the combined gene -> protein map as JSON under the data directory
gene_map_file = data_path / "entrez_gene_to_protein_map.json"
with gene_map_file.open("w+") as map_handle:
    map_handle.write(json.dumps(gene_to_protein_map))

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()