# Mapping of ontology terms to dataset labels
-- Ben De Meurichy

In [None]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer


## Load labels from the AudioSet dataset

In [None]:
# CSV file holding the AudioSet class labels (relative path).
labels_csv_path = "./class_labels_indices.csv"
audioset_labels = pd.read_csv(labels_csv_path)

# Show the first rows as a quick sanity check of the loaded table.
audioset_labels.head()

## Set up the embedding model and vector store

In [None]:
# Sentence-embedding model used to encode the label texts.
model = SentenceTransformer("google/embeddinggemma-300m")
# Alternative model, kept for quick comparison:
# model = SentenceTransformer("all-MiniLM-L6-v2")

# Embed every AudioSet display name; normalisation is done by hand below,
# so the encoder is asked not to normalise.
display_names = audioset_labels['display_name'].tolist()
vectors = model.encode(display_names, normalize_embeddings=False)

# Divide each row by its own L2 norm so every label vector has unit length.
row_norms = np.linalg.norm(vectors, axis=1, keepdims=True)
vectors = vectors / row_norms

In [None]:

# Cast the label embeddings to float32 before handing them to faiss.
label_vectors = vectors.astype('float32')
dim = label_vectors.shape[1]

# Flat inner-product index holding one vector per AudioSet label.
vector_store = faiss.IndexFlatIP(dim)
vector_store.add(label_vectors)

## Match ontology terms to dataset labels

### Get ontology terms from turtle file

In [None]:
import rdflib

# Relative path of the ontology file, serialised as Turtle.
ontology_path = "../../anthropogenic_ontology.ttl"

# Load the ontology into an rdflib graph.
g = rdflib.Graph()
g.parse(ontology_path, format="turtle")

Query only leaf nodes and their labels

In [None]:
from rdflib.namespace import RDF, RDFS, OWL

# Namespace of the sound ontology; passed to the query as the "base" prefix.
BASE = rdflib.Namespace(
    "http://www.semanticweb.org/dbotteld/ontologies/2025/6/sound_ontology#"
)

# Select the leaf classes: classes that no other class names as its
# superclass. isIRI() keeps only named classes -- anonymous (blank-node)
# class expressions would otherwise pass the leaf test and contribute their
# generated identifiers as "labels". The label is optional, so classes
# without an rdfs:label are still returned (with ?label unbound).
query = """
SELECT ?cls ?label
WHERE {
  ?cls a owl:Class .
  FILTER (isIRI(?cls))
  FILTER NOT EXISTS { ?sub rdfs:subClassOf ?cls . }
  OPTIONAL { ?cls rdfs:label ?label . }
}
"""

# Text used to represent each leaf class when matching against the dataset.
labels = []

for row in g.query(query, initNs={"owl": OWL, "rdfs": RDFS, "base": BASE}):
    if row.label:
        label = str(row.label)
    else:
        # No rdfs:label: use the local name of the IRI (the part after the
        # last '#' or '/') rather than the whole URL, which is meaningless
        # as text to embed. Fall back to the full IRI only if that is empty.
        iri = str(row.cls)
        local_name = iri.rstrip("#/").rsplit("#", 1)[-1].rsplit("/", 1)[-1]
        label = local_name or iri
    labels.append(label)
    print(label)

### Create the matching pandas DataFrame
Score is cosine similarity.

In [None]:
# One record per ontology label, holding its three closest AudioSet labels.
mapping_data = []

for ontology_label in labels:
    # Embed the term and scale it to unit length, mirroring how the label
    # vectors in the index were prepared.
    embedding = model.encode([ontology_label], normalize_embeddings=False)
    embedding = embedding.astype('float32')
    embedding = embedding / np.linalg.norm(embedding)

    # A single query was submitted, so only row 0 of each result is used.
    similarities, neighbours = vector_store.search(embedding, k=3)
    top_indices = neighbours[0].tolist()

    record = {
        'ontology_label': ontology_label,
        'audioset_label_indices': top_indices,
        'audioset_labels': [
            audioset_labels["display_name"].iloc[i] for i in top_indices
        ],
        # Plain Python floats, rounded to 4 decimal places.
        'scores': [round(float(s), 4) for s in similarities[0]],
    }
    mapping_data.append(record)

label_mapping_df = pd.DataFrame(mapping_data)
label_mapping_df

## Custom labels from the Brazilian dataset

Mapping the Portuguese labels directly doesn't seem to work that well, because they are structured as "antro_{simplified_label_name}". The direct mapping mostly works only if the Portuguese label doesn't originally contain accents and wasn't modified too much.

In [None]:
# (dataset label, English translation) pairs for the Brazilian dataset's
# "antro_*" labels, listed in the order they should appear in the mapping.
_label_translation_pairs = (
    ("antro_aviao", "airplane"),
    ("antro_carro", "car"),
    ("antro_veiculo", "vehicle"),
    ("antro_motor", "motor"),
    ("antro_ni", "unknown anthropogenic"),
    ("antro_serra", "chainsaw"),
    ("antro_caminhao", "truck"),
    ("antro_voz", "voice"),
    ("antro_martelada", "hammering"),
    ("antro_moto", "motorcycle"),
    ("antro_humano", "human"),
    ("antro_assobio_humano", "whistling"),
    ("antro_trator", "tractor"),
    ("antro_turbina", "turbine"),
    ("antro_musica", "music"),
    ("antro_passos", "footsteps"),
    ("antro_buzina", "horn"),
    ("antro_facao", "machete"),
    ("antro_sirene", "siren"),
)

# Lookup from dataset label to its English translation.
translations_dict = dict(_label_translation_pairs)

In [None]:
def _top_audioset_matches(text: str, k: int = 3) -> tuple:
    """Return the ``k`` AudioSet labels most similar to ``text``.

    The text is embedded with ``model``, scaled to unit length and searched
    against ``vector_store``.

    Args:
        text: Label text to embed and look up.
        k: Number of nearest labels to retrieve.

    Returns:
        A ``(labels, indices, scores)`` tuple of three lists of length ``k``:
        the matching ``display_name`` values, their row positions in
        ``audioset_labels``, and the similarity scores as Python floats
        rounded to 4 decimal places.
    """
    query_vector = model.encode([text], normalize_embeddings=False).astype('float32')
    query_vector = query_vector / np.linalg.norm(query_vector)

    # A single query is submitted, so only row 0 of each result is relevant.
    scores, idx = vector_store.search(query_vector, k=k)
    indices = idx[0].tolist()
    matched_labels = [audioset_labels["display_name"].iloc[i] for i in indices]
    rounded_scores = [round(float(score), 4) for score in scores[0]]
    return matched_labels, indices, rounded_scores


# One record per Brazilian label, comparing the matches found for the raw
# dataset label with the matches found for its English translation.
brazilion_mapping_data = []
for key, val in translations_dict.items():
    # Portuguese matches (raw dataset label, including its "antro_" prefix)
    port_labels, port_indices, port_scores = _top_audioset_matches(key)

    # English matches (manual translation)
    en_labels, en_indices, en_scores = _top_audioset_matches(val)

    brazilion_mapping_data.append(
        {
            "brazilian_label": key,
            "portuguese_audioset_labels": port_labels,
            "portuguese_audioset_index": port_indices,
            "portuguese_scores": port_scores,
            "english_label_translation": val,
            "english_audioset_labels": en_labels,
            "english_audioset_index": en_indices,
            "english_scores": en_scores,
        }
    )

brazilian_label_mapping_df = pd.DataFrame(brazilion_mapping_data)
brazilian_label_mapping_df