# Generate synthetic triplets

This script generates synthetic triplet data for training the model.
Each triplet consists of an anchor song, a positive song (similar to the anchor), and a negative song (dissimilar to the anchor).

The synthetic data is generated to reinforce the following relationships:
- Songs in the same genre are similar.
  - Songs from different genres are dissimilar.
- Different segments of the same song are similar (songs are similar to themselves).
- Songs from the same artist are similar.
  - This relationship is weaker than genre similarity.
  - Songs from different artists are dissimilar.
- Songs from the same album are similar.
  - This relationship is stronger than artist similarity, but still weaker than genre similarity.
  - Songs from different albums are dissimilar.
- Songs are more similar to songs whose genres are closely related in the genre hierarchy.

In [1]:
import pandas as pd
import numpy as np
import os
from typing import Literal
import ast
import random

AUDIOS_DIR = os.path.join("..", "data", "fma_small")
METADATA_PATH = os.path.join("..", "data", "fma_metadata")
SYNTHETIC_TRIPLETS_FILE = os.path.join("..", "data", "synthetic_triplets.csv")

MetadataFileName = Literal["tracks", "genres", "features", "echonest", "raw_albums", "raw_artists", "raw_genres", "raw_tracks"]
MetadataFiles: dict[MetadataFileName, str] = {
    "tracks": "tracks.csv",
    "genres": "genres.csv",
    "features": "features.csv",
    "echonest": "echonest.csv",
    "raw_albums": "raw_albums.csv",
    "raw_artists": "raw_artists.csv",
    "raw_genres": "raw_genres.csv",
    "raw_tracks": "raw_tracks.csv",
}
MetadataPaths: dict[MetadataFileName, str] = {
    key: os.path.join(METADATA_PATH, filename)
    for key, filename in MetadataFiles.items()
}

In [2]:
def load_metadata(filename: MetadataFileName) -> pd.DataFrame:
    filepath = MetadataPaths[filename]
    match filename:
        case "raw_albums" | "raw_artists" | "raw_genres" | "raw_tracks":
            return pd.read_csv(filepath, index_col=0)
        case "genres":
            genres = pd.read_csv(filepath, index_col=0)
            genres["parent"] = genres["parent"].replace(np.nan, None)
            return genres
        case "features" | "echonest":
            return pd.read_csv(filepath, index_col=0, header=[0, 1, 2])
        case "tracks":
            tracks = pd.read_csv(filepath, index_col=0, header=[0, 1])

            COLUMNS = [
                ("track", "tags"),
                ("album", "tags"),
                ("artist", "tags"),
                ("track", "genres"),
                ("track", "genres_all"),
            ]
            for column in COLUMNS:
                tracks[column] = tracks[column].map(ast.literal_eval)

            COLUMNS = [
                ("track", "date_created"),
                ("track", "date_recorded"),
                ("album", "date_created"),
                ("album", "date_released"),
                ("artist", "date_created"),
                ("artist", "active_year_begin"),
                ("artist", "active_year_end"),
            ]
            for column in COLUMNS:
                tracks[column] = pd.to_datetime(tracks[column])

            SUBSETS = ("small", "medium", "large")
            try:
                tracks["set", "subset"] = tracks["set", "subset"].astype(
                    "category", categories=SUBSETS, ordered=True
                )
            except (ValueError, TypeError):
                # the categories and ordered arguments were removed in pandas 0.25
                tracks["set", "subset"] = tracks["set", "subset"].astype(
                    pd.CategoricalDtype(categories=SUBSETS, ordered=True)
                )

            COLUMNS = [
                ("track", "genre_top"),
                ("track", "license"),
                ("album", "type"),
                ("album", "information"),
                ("artist", "bio"),
            ]
            for column in COLUMNS:
                tracks[column] = tracks[column].astype("category")

            return tracks

## Exploring the dataset

In [3]:
tracks_df: pd.DataFrame = load_metadata("tracks")
genres_df: pd.DataFrame = load_metadata("genres")
raw_genres_df: pd.DataFrame = load_metadata("raw_genres")

In [4]:
def print_columns(df: pd.DataFrame, file: str) -> None:
    print(f"Columns in {file}:")
    for col in df.columns:
        print(f" - {col}")

print_columns(tracks_df, "tracks.csv")
print_columns(genres_df, "genres.csv")
print_columns(raw_genres_df, "raw_genres.csv")

Columns in tracks.csv:
 - ('album', 'comments')
 - ('album', 'date_created')
 - ('album', 'date_released')
 - ('album', 'engineer')
 - ('album', 'favorites')
 - ('album', 'id')
 - ('album', 'information')
 - ('album', 'listens')
 - ('album', 'producer')
 - ('album', 'tags')
 - ('album', 'title')
 - ('album', 'tracks')
 - ('album', 'type')
 - ('artist', 'active_year_begin')
 - ('artist', 'active_year_end')
 - ('artist', 'associated_labels')
 - ('artist', 'bio')
 - ('artist', 'comments')
 - ('artist', 'date_created')
 - ('artist', 'favorites')
 - ('artist', 'id')
 - ('artist', 'latitude')
 - ('artist', 'location')
 - ('artist', 'longitude')
 - ('artist', 'members')
 - ('artist', 'name')
 - ('artist', 'related_projects')
 - ('artist', 'tags')
 - ('artist', 'website')
 - ('artist', 'wikipedia_page')
 - ('set', 'split')
 - ('set', 'subset')
 - ('track', 'bit_rate')
 - ('track', 'comments')
 - ('track', 'composer')
 - ('track', 'date_created')
 - ('track', 'date_recorded')
 - ('track', 'dura

In [5]:
# Let's print all the available subsets in the tracks metadata
tracks_df["set", "subset"].value_counts()

(set, subset)
large     81574
medium    17000
small      8000
Name: count, dtype: int64

In [6]:
## let's filter to just the small dataset
small_tracks_df: pd.DataFrame = tracks_df[tracks_df["set", "subset"] == "small"]
small_tracks_df.shape

(8000, 52)

### Exploring Artists

In [7]:
small_tracks_df["artist", "name"].value_counts().head(30)

(artist, name)
Blue Dot Sessions                 208
Big Blood                          91
The Impossebulls                   68
Squire Tuck                        56
Kellee Maize                       55
Ergo Phizmiz                       54
TRG Banks                          47
Derek Clegg                        45
Jahzzar                            44
C-Doc                              42
Phemale                            41
David Szesztay                     39
Podington Bear                     38
Cory Gray                          36
Tha Silent Partner                 33
6th Sense                          33
SPCZ                               30
Turku, Nomads of the Silk Road     29
File Under Toner                   29
Garmisch                           29
Alaclair Ensemble                  29
Plusplus                           28
Fierbinteanu                       28
Sláinte                            27
Josh Woodward                      26
Lee Rosevere                       

### Exploring Albums

In [8]:
small_tracks_df["album", "title"].value_counts().head(30)

(album, title)
Live at the 2014 Golden Festival                                       50
Carcrashlander Instrumentals                                           36
It's A 6th Sense Beat Yo!!                                             33
This Is The End, Beautiful Friend                                      29
Massive Exposure [LCL19]                                               28
CC Affiliates Mixtape #1                                               25
Live at the 2013 Golden Festival                                       23
En attendant Noël...                                                   23
Audionautix: Tech, Urban, Dance                                        21
Atmospheric Electric Guitar                                            21
#CC-10 Europe Mixtape                                                  20
Best Bytes Volume 1                                                    19
Kevelin and Chestnut's Adventures                                      18
CC BY-SA 4.0           

### Exploring Genres

In [9]:
# let's look at the top genres
top_genres = small_tracks_df["track", "genre_top"].value_counts()
top_genres = top_genres[top_genres > 0]
top_genres

(track, genre_top)
Hip-Hop          1000
Folk             1000
Experimental     1000
Electronic       1000
Pop              1000
Rock             1000
International    1000
Instrumental     1000
Name: count, dtype: int64

In [10]:
# and their parent genres
for genre in top_genres.index:
    parent_id = genres_df[genres_df["title"] == genre]["parent"]
    print(f"{genre}: {parent_id}")

Hip-Hop: genre_id
21    0
Name: parent, dtype: int64
Folk: genre_id
17    0
Name: parent, dtype: int64
Experimental: genre_id
38    0
Name: parent, dtype: int64
Electronic: genre_id
15    0
Name: parent, dtype: int64
Pop: genre_id
10    0
Name: parent, dtype: int64
Rock: genre_id
12    0
Name: parent, dtype: int64
International: genre_id
2    0
Name: parent, dtype: int64
Instrumental: genre_id
1235    0
Name: parent, dtype: int64


In [11]:
# all genres
# ("track", "genres_all") contains a list of the genre ids, we want to flatten these lists and count occurrences
all_genres: pd.Series = small_tracks_df["track", "genres_all"].explode().value_counts()
all_genres

(track, genres_all)
21     1000
10     1000
17     1000
38     1000
12     1000
       ... 
619       5
538       4
171       4
185       3
174       2
Name: count, Length: 114, dtype: int64

So for this dataset, none of the top genres have common ancestors, but there are around 114 genres represented in the small dataset so hope isn't lost yet!

In [12]:
# create a datafrace from all_genres
all_genres_df = pd.DataFrame({
    "genre_id": all_genres.index,
    "count": all_genres.values
})
for genre in all_genres_df["genre_id"]:
    title = genres_df.loc[genre]["title"]
    all_genres_df.loc[all_genres_df["genre_id"] == genre, "title"] = title
    parent_id = genres_df.loc[genre]["parent"]
    parent_id = parent_id if not pd.isna(parent_id) and parent_id != 0 else None
    all_genres_df.loc[all_genres_df["genre_id"] == genre, "parent_id"] = parent_id
    parent_title = genres_df.loc[parent_id]["title"] if parent_id else None
    all_genres_df.loc[all_genres_df["genre_id"] == genre, "parent_title"] = parent_title
all_genres_df.head(10)

Unnamed: 0,genre_id,count,title,parent_id,parent_title
0,21,1000,Hip-Hop,,
1,10,1000,Pop,,
2,17,1000,Folk,,
3,38,1000,Experimental,,
4,12,1000,Rock,,
5,2,1000,International,,
6,1235,1000,Instrumental,,
7,15,1000,Electronic,,
8,18,685,Soundtrack,1235.0,Instrumental
9,76,403,Experimental Pop,10.0,Pop


In [13]:
# let's create a tree of genres so we can examine the hierarchy
# we already have a dataframe of all genres with their parent ids
class GenreNode:
    def __init__(self, genre_id: int, title: str):
        self.genre_id = genre_id
        self.title = title
        self.parent: GenreNode | None = None
        self.children: list[GenreNode] = []

    def add_child(self, child_node: "GenreNode") -> None:
        child_node.parent = self
        self.children.append(child_node)

    def __repr__(self) -> str:
        return f"GenreNode(genre_id={self.genre_id}, title='{self.title}')"
class GenreTree:
    def __init__(self):
        self.nodes: dict[int, GenreNode] = {}

    def add_genre(self, genre_id: int, title: str, parent_id: int | None) -> None:
        if genre_id not in self.nodes:
            self.nodes[genre_id] = GenreNode(genre_id, title)
        genre_node = self.nodes[genre_id]
        if parent_id is not None:
            if parent_id not in self.nodes:
                parent_title = genres_df.loc[parent_id]["title"]
                self.nodes[parent_id] = GenreNode(parent_id, parent_title)
            parent_node = self.nodes[parent_id]
            parent_node.add_child(genre_node)

    def get_root_nodes(self) -> list[GenreNode]:
        return [node for node in self.nodes.values() if node.parent is None]
    
    def load(df: pd.DataFrame) -> "GenreTree":
        tree = GenreTree()
        for _, row in df.iterrows():
            tree.add_genre(row["genre_id"], row["title"], row["parent_id"])
        return tree
genre_tree = GenreTree.load(all_genres_df)

In [14]:
# display the genre tree
def display_genre_tree(node: GenreNode, level: int = 0) -> None:
    indent = "  " * level
    print(f"{indent}- {node.title} (id: {node.genre_id})")
    for child in node.children:
        display_genre_tree(child, level + 1)

for root_node in genre_tree.get_root_nodes():
    display_genre_tree(root_node)

- Hip-Hop (id: 21)
  - Rap (id: 539)
  - Alternative Hip-Hop (id: 100)
  - Hip-Hop Beats (id: 811)
  - Breakbeat (id: 542)
  - Nerdcore (id: 83)
  - Abstract Hip-Hop (id: 580)
- Pop (id: 10)
  - Experimental Pop (id: 76)
  - Synth Pop (id: 362)
- Folk (id: 17)
  - Singer-Songwriter (id: 103)
  - Psych-Folk (id: 33)
  - Freak-Folk (id: 94)
  - Free-Folk (id: 49)
  - British Folk (id: 180)
- Experimental (id: 38)
  - Avant-Garde (id: 1)
  - Noise (id: 32)
  - Electroacoustic (id: 41)
  - Field Recordings (id: 30)
  - Sound Collage (id: 224)
  - Unclassifiable (id: 125)
  - Novelty (id: 6)
    - Sound Effects (id: 16)
      - Holiday (id: 763)
        - Christmas (id: 1193)
    - Kid-Friendly (id: 360)
  - Drone (id: 47)
  - Minimalism (id: 456)
  - Sound Art (id: 514)
  - Improv (id: 250)
  - Audio Collage (id: 22)
  - Musique Concrete (id: 247)
  - Sound Poetry (id: 186)
- Rock (id: 12)
  - Punk (id: 25)
    - Hardcore (id: 109)
      - Thrash (id: 361)
    - Power-Pop (id: 111)
    - P

## Start generating triplets

In [15]:
triplets = pd.DataFrame(
    columns=[
        ("anchor", "track_id"),
        ("positive", "track_id"),
        ("negative", "track_id"),
        ("anchor", "genre_top"),
        ("positive", "genre_top"),
        ("negative", "genre_top"),
        ("anchor", "album_id"),
        ("positive", "album_id"),
        ("negative", "album_id"),
        ("anchor", "artist_id"),
        ("positive", "artist_id"),
        ("negative", "artist_id"),
        ("anchor", "genres_all"),
        ("positive", "genres_all"),
        ("negative", "genres_all"),
    ]
)

def triplet_from_tracks(anchor_track: pd.Series, positive_track: pd.Series, negative_track: pd.Series) -> pd.Series:
    return pd.Series({
        ("anchor", "track_id"): anchor_track.name,
        ("anchor", "genre_top"): anchor_track["track", "genre_top"],
        ("anchor", "genres_all"): anchor_track["track", "genres_all"],
        ("anchor", "album_id"): anchor_track["album", "id"],
        ("anchor", "artist_id"): anchor_track["artist", "id"],
        ("positive", "track_id"): positive_track.name,
        ("positive", "genre_top"): positive_track["track", "genre_top"],
        ("positive", "genres_all"): positive_track["track", "genres_all"],
        ("positive", "album_id"): positive_track["album", "id"],
        ("positive", "artist_id"): positive_track["artist", "id"],
        ("negative", "track_id"): negative_track.name,
        ("negative", "genre_top"): negative_track["track", "genre_top"],
        ("negative", "genres_all"): negative_track["track", "genres_all"],
        ("negative", "album_id"): negative_track["album", "id"],
        ("negative", "artist_id"): negative_track["artist", "id"],
    })

In [16]:
# Generate triplets for each top genre
NUM_TRIPLETS_PER_GENRE = 500
random.seed(42)
def generate_triplets_for_genre(genre: str, num_triplets: int = NUM_TRIPLETS_PER_GENRE) -> pd.DataFrame:
    genre_tracks = small_tracks_df[small_tracks_df["track", "genre_top"] == genre]
    other_tracks = small_tracks_df[small_tracks_df["track", "genre_top"] != genre]

    triplets_list = []
    for _ in range(num_triplets):
        anchor = genre_tracks.sample(n=1).iloc[0]
        positive = genre_tracks[genre_tracks.index != anchor.name].sample(n=1).iloc[0]
        negative = other_tracks.sample(n=1).iloc[0]

        triplet = triplet_from_tracks(anchor, positive, negative)
        triplets_list.append(triplet)

    return pd.DataFrame(triplets_list)

generated = 0
for genre in top_genres.index:
    genre_triplets = generate_triplets_for_genre(genre)
    generated += len(genre_triplets)
    triplets = pd.concat([triplets, genre_triplets], ignore_index=True)

print(f"Generated {generated} triplets.")

Generated 4000 triplets.


In [17]:
# Generate triplets for artists with more than 10 songs
ARTIST_SONG_THRESHOLD = 10
NUM_TRIPLETS_PER_ARTIST = 5
random.seed(100)
def generate_triplets_for_artist(artist_id: int, num_triplets: int = NUM_TRIPLETS_PER_ARTIST) -> pd.DataFrame:
    artist_tracks = small_tracks_df[small_tracks_df["artist", "id"] == artist_id]
    other_tracks = small_tracks_df[small_tracks_df["artist", "id"] != artist_id]

    triplets_list = []
    for _ in range(num_triplets):
        anchor = artist_tracks.sample(n=1).iloc[0]
        positive = artist_tracks[artist_tracks.index != anchor.name].sample(n=1).iloc[0]
        negative = other_tracks.sample(n=1).iloc[0]

        triplet = triplet_from_tracks(anchor, positive, negative)
        triplets_list.append(triplet)

    return pd.DataFrame(triplets_list)

artist_track_counts = small_tracks_df["artist", "id"].value_counts()
artist_track_counts = artist_track_counts[artist_track_counts >= ARTIST_SONG_THRESHOLD]
print(f"there are {len(artist_track_counts)} artists with {ARTIST_SONG_THRESHOLD} or more songs")

generated = 0
for artist in artist_track_counts.index:
    artist_triplets = generate_triplets_for_artist(artist)
    generated += len(artist_triplets)
    triplets = pd.concat([triplets, artist_triplets], ignore_index=True)

print(f"Generated {generated} triplets.")

there are 160 artists with 10 or more songs
Generated 800 triplets.


In [18]:
# Generate triplets for albums with more than 10 songs
ALBUM_SONG_THRESHOLD = 10
NUM_TRIPLETS_PER_ALBUM = 5
random.seed(200)
def generate_triplets_for_album(album_id: int, num_triplets: int = NUM_TRIPLETS_PER_ALBUM) -> pd.DataFrame:
    album_tracks = small_tracks_df[small_tracks_df["album", "id"] == album_id]
    other_tracks = small_tracks_df[small_tracks_df["album", "id"] != album_id]

    triplets_list = []
    for _ in range(num_triplets):
        anchor = album_tracks.sample(n=1).iloc[0]
        positive = album_tracks[album_tracks.index != anchor.name].sample(n=1).iloc[0]
        negative = other_tracks.sample(n=1).iloc[0]

        triplet = triplet_from_tracks(anchor, positive, negative)
        triplets_list.append(triplet)

    return pd.DataFrame(triplets_list)

album_track_counts = small_tracks_df["album", "id"].value_counts()
album_track_counts = album_track_counts[album_track_counts >= ALBUM_SONG_THRESHOLD]
print(f"there are {len(album_track_counts)} albums with {ALBUM_SONG_THRESHOLD} or more songs")

generated = 0
for album in album_track_counts.index:
    album_triplets = generate_triplets_for_album(album)
    generated += len(album_triplets)
    triplets = pd.concat([triplets, album_triplets], ignore_index=True)

print(f"Generated {generated} triplets.")

there are 138 albums with 10 or more songs
Generated 690 triplets.


In [19]:
def heuristic_positive_negative(
    anchor: pd.Series, candidate1: pd.Series, candidate2: pd.Series
) -> tuple[pd.Series, pd.Series] | None:
    """
    Given an anchor track and two candidate tracks, determine which candidate is the positive and which is the negative.
    If it can't be determined, return None.
    """
    # positive is the track that shares the most genres with the anchor
    # negative is the track that shares the least genres with the anchor

    # songs in the same genre are very similar
    if (
        candidate1["track", "genre_top"] == anchor["track", "genre_top"]
        and candidate2["track", "genre_top"] != anchor["track", "genre_top"]
    ):
        return candidate1, candidate2
    if (
        candidate2["track", "genre_top"] == anchor["track", "genre_top"]
        and candidate1["track", "genre_top"] != anchor["track", "genre_top"]
    ):
        return candidate2, candidate1

    # songs who have common ancestor genres are more similar
    anchor_genres = set(anchor["track", "genres_all"])
    anchor_ancestors = set()
    for genre_id in anchor_genres:
        node = genre_tree.nodes.get(genre_id)
        while node:
            anchor_ancestors.add(node.genre_id)
            node = node.parent

    # candidate1 shares anchor genres, candidate2 does not
    if anchor_genres.intersection(
        candidate1["track", "genres_all"]
    ) and not anchor_genres.intersection(candidate2["track", "genres_all"]):
        return candidate1, candidate2
    # candidate2 shares anchor genres, candidate1 does not
    if anchor_genres.intersection(
        candidate2["track", "genres_all"]
    ) and not anchor_genres.intersection(candidate1["track", "genres_all"]):
        return candidate2, candidate1

    candidate1_genres = set(candidate1["track", "genres_all"])
    candidate2_genres = set(candidate2["track", "genres_all"])
    ancestors1 = set()
    ancestors2 = set()
    for genre_id in candidate1_genres:
        node = genre_tree.nodes.get(genre_id)
        while node:
            ancestors1.add(node.genre_id)
            node = node.parent
    for genre_id in candidate2_genres:
        node = genre_tree.nodes.get(genre_id)
        while node:
            ancestors2.add(node.genre_id)
            node = node.parent
    # both candidates share anchor genres, compare number of common ancestor genres
    if anchor_genres.intersection(
        candidate1["track", "genres_all"]
    ) and anchor_genres.intersection(candidate2["track", "genres_all"]):
        common_ancestors1 = len(anchor_genres.intersection(ancestors1))
        common_ancestors2 = len(anchor_genres.intersection(ancestors2))
        # candidate1 has more common ancestors
        if common_ancestors1 > common_ancestors2:
            return candidate1, candidate2
        # candidate2 has more common ancestors
        if common_ancestors2 > common_ancestors1:
            return candidate2, candidate1

    # no common genres, so let's check album and artist before looking at common ancestors
    if (
        candidate1["album", "id"] == anchor["album", "id"]
        and candidate2["album", "id"] != anchor["album", "id"]
    ):
        return candidate1, candidate2
    if (
        candidate2["album", "id"] == anchor["album", "id"]
        and candidate1["album", "id"] != anchor["album", "id"]
    ):
        return candidate2, candidate1
    if (
        candidate1["artist", "id"] == anchor["artist", "id"]
        and candidate2["artist", "id"] != anchor["artist", "id"]
    ):
        return candidate1, candidate2
    if (
        candidate2["artist", "id"] == anchor["artist", "id"]
        and candidate1["artist", "id"] != anchor["artist", "id"]
    ):
        return candidate2, candidate1

    candidate1_common_ancestors = len(anchor_ancestors.intersection(ancestors1))
    candidate2_common_ancestors = len(anchor_ancestors.intersection(ancestors2))
    if candidate1_common_ancestors == candidate2_common_ancestors:
        return None  # no basis to choose
    if candidate1_common_ancestors > candidate2_common_ancestors:
        return candidate1, candidate2
    if candidate2_common_ancestors > candidate1_common_ancestors:
        return candidate2, candidate1

    return None  # no basis to choose

In [20]:
# Generate a few more random triplets to reach the desired number of triplets.
# assign the positive and negative samples based on a heuristic.
# don't duplicate existing triplets
# don't include triplets where the anchor is in a triplet already
NUM_TOTAL_TRIPLETS = 8000
random.seed(300)
while len(triplets) < NUM_TOTAL_TRIPLETS:
    anchor = small_tracks_df.sample(n=1).iloc[0]
    others = small_tracks_df[small_tracks_df.index != anchor.name]
    candidate1 = others.sample(n=1).iloc[0]
    others = others[others.index != candidate1.name]
    candidate2 = others.sample(n=1).iloc[0]
    
    if (result := heuristic_positive_negative(anchor, candidate1, candidate2)) is None:
        continue

    positive, negative = result

    triplet = triplet_from_tracks(anchor, positive, negative)

    # check for duplicates
    duplicate = triplets[
        (triplets[("anchor", "track_id")] == triplet[("anchor", "track_id")]) &
        (triplets[("positive", "track_id")] == triplet[("positive", "track_id")]) &
        (triplets[("negative", "track_id")] == triplet[("negative", "track_id")])
    ]
    if not duplicate.empty:
        continue

    # check if anchor is already in a triplet
    if not triplets[
        triplets[("anchor", "track_id")] == triplet[("anchor", "track_id")]
    ].empty:
        continue

    triplets = pd.concat([triplets, pd.DataFrame([triplet])], ignore_index=True)
    
    # print progress
    print(f"Generated {len(triplets)} / {NUM_TOTAL_TRIPLETS} triplets.", end="\r")
    

Generated 8000 / 8000 triplets.

Okay, we've generated our triplets so now it's time to save them to a csv file for later use.

In [21]:
triplets.to_csv(SYNTHETIC_TRIPLETS_FILE, index=False)
print(f"Saved {len(triplets)} triplets to {SYNTHETIC_TRIPLETS_FILE}")

Saved 8000 triplets to ../data/synthetic_triplets.csv
