# Generate Human Triplets

This notebook simply samples from the FMA small dataset to create triplets of audio samples with a defined anchor.
Assigning the positive and negative samples will be left to human annotators in a later step.

In [1]:
import pandas as pd
import numpy as np
import os
from typing import Literal
import ast
import random

# Input locations, relative to this notebook: the fma_small audio folder and the
# folder holding the FMA metadata CSVs.
AUDIOS_DIR = os.path.join("..", "data", "fma_small")
METADATA_PATH = os.path.join("..", "data", "fma_metadata")

# Triplet CSVs: the synthetic set is read for reference, the human set is written
# by this notebook.
SYNTHETIC_TRIPLETS_FILE = os.path.join("..", "data", "synthetic_triplets.csv")
HUMAN_TRIPLETS_FILE = os.path.join("..", "data", "human_triplets.csv")

# Metadata tables registered for loading: logical name -> CSV file name, and
# logical name -> full path under METADATA_PATH.
MetadataFileName = Literal["tracks"]
MetadataFiles: dict[MetadataFileName, str] = {"tracks": "tracks.csv"}
MetadataPaths: dict[MetadataFileName, str] = {
    table: os.path.join(METADATA_PATH, MetadataFiles[table]) for table in MetadataFiles
}

In [2]:
def load_metadata(filename: MetadataFileName) -> pd.DataFrame:
    filepath = MetadataPaths[filename]
    match filename:
        case "raw_albums" | "raw_artists" | "raw_genres" | "raw_tracks":
            return pd.read_csv(filepath, index_col=0)
        case "genres":
            genres = pd.read_csv(filepath, index_col=0)
            genres["parent"] = genres["parent"].replace(np.nan, None)
            return genres
        case "features" | "echonest":
            return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])
        case "tracks":
            tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

            COLUMNS = [
                ("track", "tags"),
                ("album", "tags"),
                ("artist", "tags"),
                ("track", "genres"),
                ("track", "genres_all"),
            ]
            for column in COLUMNS:
                tracks[column] = tracks[column].map(ast.literal_eval)

            COLUMNS = [
                ("track", "date_created"),
                ("track", "date_recorded"),
                ("album", "date_created"),
                ("album", "date_released"),
                ("artist", "date_created"),
                ("artist", "active_year_begin"),
                ("artist", "active_year_end"),
            ]
            for column in COLUMNS:
                tracks[column] = pd.to_datetime(tracks[column])

            SUBSETS = ("small", "medium", "large")
            try:
                tracks["set", "subset"] = tracks["set", "subset"].astype(
                    "category", categories=SUBSETS, ordered=True
                )
            except (ValueError, TypeError):
                # the categories and ordered arguments were removed in pandas 0.25
                tracks["set", "subset"] = tracks["set", "subset"].astype(
                    pd.CategoricalDtype(categories=SUBSETS, ordered=True)
                )

            COLUMNS = [
                ("track", "genre_top"),
                ("track", "license"),
                ("album", "type"),
                ("album", "information"),
                ("artist", "bio"),
            ]
            for column in COLUMNS:
                tracks[column] = tracks[column].astype("category")

            return tracks

## Load the synthetic triplets generated in the previous notebook for reference

We do this to avoid mixing synthetic and human triplets, which is important if we later want to use the human triplets as a validation/test set.

In [3]:
# Synthetic triplets from the previous notebook; the CSV has a two-row column header.
synthetic_triplets_df = pd.read_csv(SYNTHETIC_TRIPLETS_FILE, header=[0, 1])

# Full track metadata, then only the rows whose subset is "small".
tracks_df = load_metadata("tracks")
small_tracks_df = tracks_df.loc[tracks_df[("set", "subset")] == "small"]

In [4]:
# Preview the first rows of the synthetic triplets to check how the two-level header was parsed.
synthetic_triplets_df.head()

Unnamed: 0_level_0,anchor,positive,negative,anchor,positive,negative,anchor,positive,negative,anchor,positive,negative,anchor,positive,negative
Unnamed: 0_level_1,track_id,track_id,track_id,genre_top,genre_top,genre_top,album_id,album_id,album_id,artist_id,artist_id,artist_id,genres_all,genres_all,genres_all
0,145742,91306,108867,Hip-Hop,Hip-Hop,International,21775,15117,16898,18872,11216,3936,[21],[21],"[2, 118]"
1,123979,89846,90590,Hip-Hop,Hip-Hop,Pop,18752,14962,15015,20707,16894,7444,"[539, 21]","[539, 21]","[10, 76]"
2,123969,75930,152570,Hip-Hop,Hip-Hop,Instrumental,18752,13303,22647,20707,15323,24088,"[539, 21]","[811, 21]","[267, 1235]"
3,123442,137900,71276,Hip-Hop,Hip-Hop,Rock,18671,20720,12642,20648,14456,1132,"[539, 100, 21]",[21],"[25, 45, 12, 53]"
4,141562,13767,59727,Hip-Hop,Hip-Hop,International,21277,3234,10964,22568,3773,6344,"[100, 21]",[21],[2]


In [5]:
# Preview the first rows of the full track metadata table.
tracks_df.head()

Unnamed: 0_level_0,album,album,album,album,album,album,album,album,album,album,...,track,track,track,track,track,track,track,track,track,track
Unnamed: 0_level_1,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,tags,...,information,interest,language_code,license,listens,lyricist,number,publisher,tags,title
track_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,4656,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1293,,3,,[],Food
3,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1470,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,514,,4,,[],Electric Ave
5,0,2008-11-26 01:44:45,2009-01-05,,4,1,<p></p>,6073,,[],...,,1933,en,Attribution-NonCommercial-ShareAlike 3.0 Inter...,1151,,6,,[],This World
10,0,2008-11-26 01:45:08,2008-02-06,,4,6,,47632,,[],...,,54881,en,Attribution-NonCommercial-NoDerivatives (aka M...,50135,,1,,[],Freeway
20,0,2008-11-26 01:45:05,2009-01-06,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,[],...,,978,en,Attribution-NonCommercial-NoDerivatives (aka M...,361,,3,,[],Spiritual Level


## Generate the requested number of new triplets

In [20]:
def triplet_exists(triplets_df: pd.DataFrame, anchor_id: int, positive_id: int, negative_id: int) -> bool:
    """Report whether ``triplets_df`` contains this exact, ordered triplet.

    A row matches only when its ("anchor", "track_id"), ("positive", "track_id")
    and ("negative", "track_id") values equal the three ids in that order;
    swapping the positive and negative ids is treated as a different triplet.
    """
    same_anchor = triplets_df[("anchor", "track_id")] == anchor_id
    same_positive = triplets_df[("positive", "track_id")] == positive_id
    same_negative = triplets_df[("negative", "track_id")] == negative_id
    return bool((same_anchor & same_positive & same_negative).any())

# Build NUM_TOTAL_TRIPLETS unlabeled triplets (song1, song2, song3) by rejection
# sampling from the small subset. song1 is never a track that already serves as an
# anchor in the synthetic set; the "anchor" and "positive" columns are left empty
# for the annotators to fill in.
NUM_TOTAL_TRIPLETS = 2000
# The stdlib RNG supplies the random_state of every pandas .sample call below,
# so this single seed makes the whole generation reproducible.
random.seed(400)

# Hoisted into a set so the per-iteration membership test does not rescan the column.
synthetic_anchor_ids = set(synthetic_triplets_df[("anchor", "track_id")].tolist())

# Fail fast instead of looping forever when the data cannot yield enough distinct
# triplets: each eligible anchor can be combined with C(n - 1, 2) unordered pairs.
num_tracks = len(small_tracks_df)
num_eligible_anchors = int((~small_tracks_df.index.isin(synthetic_anchor_ids)).sum())
max_distinct_triplets = num_eligible_anchors * (num_tracks - 1) * (num_tracks - 2) // 2
if max_distinct_triplets < NUM_TOTAL_TRIPLETS:
    raise ValueError(
        f"Cannot generate {NUM_TOTAL_TRIPLETS} distinct triplets: only "
        f"{max_distinct_triplets} are possible with {num_eligible_anchors} eligible "
        f"anchors out of {num_tracks} tracks."
    )

new_triplets = []
# Order-insensitive keys (song1, min(song2, song3), max(song2, song3)) of the
# triplets accepted so far, for O(1) duplicate detection.
seen_triplets = set()
while len(new_triplets) < NUM_TOTAL_TRIPLETS:
    anchor_id = int(
        small_tracks_df.sample(n=1, random_state=random.randint(0, 10000)).index[0]
    )
    if anchor_id in synthetic_anchor_ids:
        continue

    other_candidates = small_tracks_df[small_tracks_df.index != anchor_id]
    # Sampling stays "with replacement" so the seeded random stream, and therefore
    # every valid triplet it produced before, is unchanged; the rare draw that picks
    # the same track twice is rejected just below.
    others = other_candidates.sample(n=2, replace=True, random_state=random.randint(0, 10000))
    candidate_a_id = int(others.index[0])
    candidate_b_id = int(others.index[1])
    if candidate_a_id == candidate_b_id:
        # A triplet whose two non-anchor songs are identical cannot be annotated.
        continue

    # Currently implied by the anchor exclusion above (no synthetic row can have
    # this anchor), but kept as a safety net in case that exclusion is relaxed.
    if (
        triplet_exists(synthetic_triplets_df, anchor_id, candidate_a_id, candidate_b_id)
        or triplet_exists(synthetic_triplets_df, anchor_id, candidate_b_id, candidate_a_id)
    ):
        continue

    triplet_key = (anchor_id, *sorted((candidate_a_id, candidate_b_id)))
    if triplet_key in seen_triplets:
        continue
    seen_triplets.add(triplet_key)

    new_triplets.append({
        "song1": anchor_id,
        "song2": candidate_a_id,
        "song3": candidate_b_id,
        "anchor": None,
        "positive": None,
    })

    # print progress
    print(f"Generated {len(new_triplets)} / {NUM_TOTAL_TRIPLETS} triplets.", end="\r")

# Song ids are stored as integers; the label columns stay as empty object columns.
human_triplets = pd.DataFrame(
    new_triplets, columns=["song1", "song2", "song3", "anchor", "positive"], dtype=object
).astype({"song1": int, "song2": int, "song3": int})

Generated 2000 / 2000 triplets.

## Save the triplets to a CSV file

In [None]:
# Write the unlabeled triplets to disk (the row index is written too, pandas' default)
# and report how many rows were saved.
human_triplets.to_csv(path_or_buf=HUMAN_TRIPLETS_FILE)
print(f"\nSaved {human_triplets.shape[0]} human triplets to {HUMAN_TRIPLETS_FILE}.")


Saved 2000 human triplets to ../data/human_triplets.csv.
