In [None]:
#default_exp annotation

# Annotation Workflow

> Loads and commits manual annotations.

Annotations are stored in a CSV file, indexed by file name, which follows the same format as `metadata.csv` and overrides the annotated rows.  
Some rows have to be removed; this special case is represented by keeping the `filename` column and replacing the other columns with `<delete>` (the last column, `version`, is set to `-1`).  

The annotations are applied to the final dataset by the `apply_annotations` function.  

The usual flow is:  
1. Load dataset metadata (fully or for a single source)  
1. Load annotations  
1. Apply annotations to metadata  

In general this is run automatically, with the sample dataset code, in `index`, but you can call it manually and save the result.  

In [None]:
#|export

import pandas as pd
from collections import namedtuple

In [None]:
#exporti

# Column layout shared by the metadata and annotation rows (filename first, version last).
COLUMNS = [
    "filename",
    "category",
    "label",
    "extra",
    "source",
    "version",
]
# Marker stored in the data columns of an annotation row that flags a file for removal.
DELETE_KEYWORD = "<delete>"

# Pair of frames: rows that override metadata (`replaces`) and rows that flag removals (`deletes`).
Annotation = namedtuple("Annotation", ["replaces", "deletes"])

In [None]:
#export

def load_annotations(annotation_path: str) -> Annotation:
    """Reads the annotation CSV and splits it into replacement and deletion rows.

    Rows whose `category` equals `DELETE_KEYWORD` end up in `deletes`;
    every other row ends up in `replaces`.
    """
    table = pd.read_csv(annotation_path)
    category = table["category"]
    return Annotation(
        replaces=table.loc[category.ne(DELETE_KEYWORD)],
        deletes=table.loc[category.eq(DELETE_KEYWORD)],
    )

def apply_annotations(annotations: Annotation, metadata: pd.DataFrame) -> pd.DataFrame:
    """Applies the annotations to the metadata (Assumes no duplicates!!).

    Steps:
    1. Drop every metadata row whose `filename` appears in `annotations.deletes`.
    2. Append `annotations.replaces` after the remaining rows.
    3. Keep only the last row per `filename`, so a replacement wins over the
       metadata row it overrides. Replacement rows whose `filename` is not in
       the metadata are kept as additional rows.

    Returns a new DataFrame with a fresh 0..n-1 index; `metadata` is not modified.
    """
    flagged = metadata["filename"].isin(annotations.deletes["filename"])
    kept = metadata.loc[~flagged]
    merged = pd.concat([kept, annotations.replaces])
    deduplicated = merged.drop_duplicates(subset=["filename"], keep="last")
    # Both inputs bring their own index labels, so the concatenated frame can
    # contain the same label twice; reset it so label-based lookups are unambiguous.
    return deduplicated.reset_index(drop=True)

In [None]:
# Check apply_annotations: replacement rows win over metadata rows and flagged files are dropped
rows = [
    # Deliberately listed in a different order than the metadata rows
    ["test2.wav", "Test", "Test", None, "test", -1],
    ["test.wav", "Replaced", "Test,Replaced", "Replaced", "test", -1],
]
deletes = [["delete.wav", DELETE_KEYWORD, DELETE_KEYWORD, DELETE_KEYWORD, DELETE_KEYWORD, -1]]
annotations = Annotation(
    replaces=pd.DataFrame(rows, columns=COLUMNS),
    deletes=pd.DataFrame(deletes, columns=COLUMNS),
)

rows = [
    ["test.wav", "Test", "Test", None, "test", 1],
    ["delete.wav", "Test", "Test,Delete", None, "test", 1],
    ["fine.wav", "Test", "Fine", "all good", "test", 1],
    ["test2.wav", "Wrong", "Test", None, "test", 1],
]
metadata = pd.DataFrame(rows, columns=COLUMNS)

expected = pd.DataFrame(
    [
        ["test.wav", "Replaced", "Test,Replaced", "Replaced", "test", -1],
        ["fine.wav", "Test", "Fine", "all good", "test", 1],
        ["test2.wav", "Test", "Test", None, "test", -1],
    ],
    columns=COLUMNS,
)
actual = apply_annotations(annotations, metadata)

# Row order is not part of the contract, so compare after sorting by filename
actual_sorted = actual.sort_values(by="filename", ignore_index=True)
expected_sorted = expected.sort_values(by="filename", ignore_index=True)
assert actual_sorted.equals(expected_sorted)

In [None]:
#export

def delete_row(filename: str) -> "list[str | int]":
    """Builds the annotation row that flags `filename` for deletion.

    The row has one entry per column in `COLUMNS`: the first entry is the
    filename, the last entry is the integer `-1`, and every entry in between
    is `DELETE_KEYWORD`. Nothing is removed here; the function only returns
    the marker row.
    """
    marker = [DELETE_KEYWORD for _ in COLUMNS]
    # First slot identifies the file; last slot holds -1 instead of the keyword.
    marker[0], marker[-1] = filename, -1
    return marker

In [None]:
# The marker row keeps the filename, fills the middle columns with '<delete>' and ends with -1
assert delete_row("test.wav") == ['test.wav', '<delete>', '<delete>', '<delete>', '<delete>', -1]