# Getting annotations from GFF3

> "Using chunked I/O to split the genome annotations into one file per sequence accession (seqid)."

In [None]:
#| default_exp features.annotations

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import math

In [None]:
#| hide
from yaml import safe_load

# Load the project configuration; the path is relative to the current working directory.
with open("../config.yml", "r") as f:
    config = safe_load(f)
    
# Root directory for data files, taken from the "data_path" configuration entry.
data_path = Path(config.get("data_path"))

In [None]:
#| hide
# Annotation file to split, located under the configured data directory.
annotations_file = data_path / "raw/GRCh38_latest_genomic.gff"

In [None]:
#| export
def get_comment_rows(annotation_file: Path):
    """Return the zero-based indices of every line that starts with ``#``.

    Args:
        annotation_file: Path of the annotation file to scan.

    Returns:
        A list of line indices, in file order, whose text begins with ``#``.
    """
    with annotation_file.open("r") as handle:
        return [
            line_number
            for line_number, text in enumerate(handle)
            if text.startswith("#")
        ]


def get_num_annotations(annotation_file: Path):
    """Count the lines of the file that do not start with ``#``.

    Args:
        annotation_file: Path of the annotation file to scan.

    Returns:
        The number of non-comment lines as an ``int``.
    """
    with annotation_file.open("r") as handle:
        return sum(1 for text in handle if not text.startswith("#"))


def get_annotations_iter(annotation_file: Path, comment_rows: list[int], chunk_size: int = 100000):
    """Return a chunked pandas reader over the data rows of an annotation file.

    The file is parsed as tab-separated text with the nine column names
    listed below; a lone ``.`` in any field is read as missing (NaN).

    Args:
        annotation_file: Path of the tab-separated annotation file.
        comment_rows: Zero-based line indices to skip (the ``#`` lines, as
            produced by ``get_comment_rows``).
        chunk_size: Maximum number of rows per yielded DataFrame.

    Returns:
        The iterator returned by ``pd.read_csv(..., chunksize=chunk_size)``;
        each item is a DataFrame with at most ``chunk_size`` rows.
    """
    column_names = [
        "seqid",
        "source",
        "type",
        "start",
        "end",
        "score",
        "strand",
        "phase",
        "attributes",
    ]
    return pd.read_csv(
        annotation_file,
        sep="\t",
        skiprows=comment_rows,
        # The file has no column-header line. With header=0 and explicit
        # names, pandas would consume the first data row as a header and
        # silently drop it, so the header must be declared absent.
        header=None,
        names=column_names,
        na_values=["."],
        chunksize=chunk_size,
    )

In [None]:
#| hide
# Two passes over the file: the indices of the "#" lines to skip when parsing,
# and the number of remaining lines (passed on below to size the progress bar).
comment_rows = get_comment_rows(annotations_file)
num_annotations = get_num_annotations(annotations_file)

In [None]:
#| export
def write_seqid_annotations(seq_id: str, seq_id_annotations: pd.DataFrame, write_path: Path):
    """Write one seqid's rows to ``<write_path>/<seq_id>.csv``, appending if it exists.

    The first call for a given ``seq_id`` creates the file and writes the
    column header; later calls append rows without repeating the header, so
    the same file can be built up across several chunks.

    Args:
        seq_id: Value of the ``seqid`` group; used as the output file stem.
        seq_id_annotations: Rows belonging to ``seq_id``.
        write_path: Existing directory that receives the CSV file.
    """
    seq_id_write_path = write_path / f"{seq_id}.csv"
    # Decide once whether this is the first write, so that mode and header
    # can never disagree with each other.
    already_written = seq_id_write_path.exists()
    seq_id_annotations.to_csv(
        seq_id_write_path,
        index=False,
        header=not already_written,
        mode="a" if already_written else "w",
    )
    

def breakdown_annotations_by_seqid(
    annotation_file: Path, 
    write_path: Path,
    num_annotations: int,
    comment_rows: list[int],
    chunk_size: int = 100000
):
    """Split an annotation file into one CSV per ``seqid`` under ``write_path``.

    The file is read in chunks of ``chunk_size`` rows; each chunk is grouped
    by its ``seqid`` column and every group is written (or appended) to
    ``<write_path>/<seqid>.csv``.

    Warning: every existing ``*.csv`` file directly inside ``write_path`` is
    deleted before writing starts.

    Args:
        annotation_file: Path of the annotation file to split.
        write_path: Output directory; created if it does not exist.
        num_annotations: Number of data rows in the file; only used to size
            the outer progress bar.
        comment_rows: Zero-based line indices to skip while parsing.
        chunk_size: Number of rows read per chunk.
    """
    # Create the output directory up front instead of failing at the first write.
    write_path.mkdir(parents=True, exist_ok=True)
    # Output files are appended to chunk by chunk, so files left over from a
    # previous run must be removed or their rows would be duplicated.
    for path in write_path.glob("*.csv"):
        path.unlink()
    annotations = get_annotations_iter(
        annotation_file,
        comment_rows=comment_rows,
        chunk_size=chunk_size
    )
    # The context manager closes the outer bar even if a write raises.
    with tqdm(total=math.ceil(num_annotations / chunk_size)) as progress_bar:
        for annotations_chunk in annotations:
            for seq_id, seq_id_annotations in tqdm(annotations_chunk.groupby("seqid"), position=1, leave=False):
                write_seqid_annotations(seq_id, seq_id_annotations, write_path)
            progress_bar.update(1)

In [None]:
#| hide
# Output directory for the per-seqid CSV files; created on first use.
annotations_path = data_path / "annotations"
if not annotations_path.exists():
    annotations_path.mkdir()

# Split the annotation file into one CSV per seqid. Note that this call first
# deletes any *.csv files already present in annotations_path.
breakdown_annotations_by_seqid(
    annotation_file=annotations_file,
    write_path=annotations_path,
    num_annotations=num_annotations,
    comment_rows=comment_rows
)

  0%|          | 0/48 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/132 [00:00<?, ?it/s]

  0%|          | 0/168 [00:00<?, ?it/s]

  0%|          | 0/218 [00:00<?, ?it/s]

  0%|          | 0/83 [00:00<?, ?it/s]

  0%|          | 0/17 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

In [None]:
#| hide
# Run nbdev's export step for this notebook.
import nbdev; nbdev.nbdev_export()