In [101]:
import sys

sys.path.append("..")

In [102]:
import ast
import json

from typing import Tuple

import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict, load_dataset
from sklearn.model_selection import train_test_split

from src.constants import METADATA_CSV_PATH, AUDIO_DATA_PATH

np.random.seed(42)

PUSH_TO_HUB = False  # Set to True to push datasets to the Hugging Face Hub

## Load dataset

- Load metadata 
- Clean and preprocess columns
- Push just captions version
- Create 3 versions of audio dataset, 10% subset, 25% subset and 100% subset

In [103]:
metadata_df = pd.read_csv(METADATA_CSV_PATH)
metadata_df

Unnamed: 0,id,aspect_list,prediction,filename
0,fc382d0ae2b46bd0538115871469af28,"['ambient noises', 'ambient sounds', 'digital ...",This energetic electronic track is marked by i...,fc382d0ae2b46bd0538115871469af28.wav
1,2bb4b92eba27ea6acac7473a6d6b24d0,"['cheerful', 'country music', 'fast tempo', 'n...",This upbeat country-rock tune kicks off with a...,2bb4b92eba27ea6acac7473a6d6b24d0.wav
2,0f698b0297962863c59b64ab2382760c,"['acoustic drums', 'acoustic guitar', 'aggress...",This epic heavy metal track begins with an omi...,0f698b0297962863c59b64ab2382760c.wav
3,5456c0ff1e48d414af58de19a21e2075,"['acoustic drums', 'acoustic guitar', 'bass gu...",This upbeat pop-rock track features a driving ...,5456c0ff1e48d414af58de19a21e2075.wav
4,bf27536a1f56feefc2b606c4305dbe21,"['male voice', 'movie music', 'passionate']",A sweeping orchestral score unfolds over a bac...,bf27536a1f56feefc2b606c4305dbe21.wav
...,...,...,...,...
5353,938aaac2ca856fb6b0cd268bb5633322,"['emotional', 'intense', 'medium tempo', 'mell...","This mid-tempo track unfolds at a steady pace,...",938aaac2ca856fb6b0cd268bb5633322.wav
5354,63dacd2d7e887f90d49dc7614b0dd3b8,"['chaotic', 'danceable', 'energetic', 'piano']",This high-energy track bursts forth from a fra...,63dacd2d7e887f90d49dc7614b0dd3b8.wav
5355,540b04f153f687abb8e6d4d0fd242b2d,"['energetic', 'fast tempo', 'piano', 'upbeat']",This high-energy track unfolds at a rapid-fire...,540b04f153f687abb8e6d4d0fd242b2d.wav
5356,484728dc21e8a87296df0f71db0be3a6,"['acoustic guitar', 'claps', 'dance', 'energet...",This upbeat track kicks off with bouncy acoust...,484728dc21e8a87296df0f71db0be3a6.wav


In [104]:
# Load concept-to-tags mapping
CONCEPTS = json.load(open("../data/concepts_to_tags.json", "r"))

print("Available concept categories:")
for cat, tags in CONCEPTS.items():
    print(f"  {cat}: {len(tags)} tags (e.g., {tags[:3]})")

Available concept categories:
  tempo: 50 tags (e.g., ['medium tempo', 'slow tempo', 'fast tempo'])
  genre: 50 tags (e.g., ['rock', 'pop', 'electronic music'])
  mood: 50 tags (e.g., ['emotional', 'passionate', 'energetic'])
  instrument: 50 tags (e.g., ['acoustic drums', 'electric guitar', 'bass guitar'])


In [105]:
# Create reverse mapping
TAG_TO_CATEGORY = {}
for cat, tags in CONCEPTS.items():
    for tag in tags:
        TAG_TO_CATEGORY[tag] = cat

In [106]:
# Categorize tags in aspect_list column
def categorize_tags(aspect_list):
    tags = ast.literal_eval(aspect_list)
    aspect_list_categorized = {}
    for tag in tags:
        category = TAG_TO_CATEGORY.get(tag)
        category_columns = f"{category}_aspects"
        if category_columns not in aspect_list_categorized:
            aspect_list_categorized[category_columns] = []
        aspect_list_categorized[category_columns].append(tag)
    for category in CONCEPTS.keys():
        category_columns = f"{category}_aspects"
        if category_columns not in aspect_list_categorized:
            aspect_list_categorized[category_columns] = []
    return aspect_list_categorized

In [107]:
metadata_df = pd.concat(
    [
        metadata_df,
        metadata_df["aspect_list"].apply(categorize_tags).apply(pd.Series),
    ],
    axis=1,
)
metadata_df.rename(columns={"prediction": "caption"}, inplace=True)
metadata_df.head()

Unnamed: 0,id,aspect_list,caption,filename,genre_aspects,instrument_aspects,mood_aspects,tempo_aspects
0,fc382d0ae2b46bd0538115871469af28,"['ambient noises', 'ambient sounds', 'digital ...",This energetic electronic track is marked by i...,fc382d0ae2b46bd0538115871469af28.wav,"[ambient noises, ambient sounds, electronic mu...","[digital drums, groovy bass line, keyboard, ke...","[emotional, energetic]",[medium tempo]
1,2bb4b92eba27ea6acac7473a6d6b24d0,"['cheerful', 'country music', 'fast tempo', 'n...",This upbeat country-rock tune kicks off with a...,2bb4b92eba27ea6acac7473a6d6b24d0.wav,"[country music, rock]","[no voice, trumpets]","[cheerful, playful]","[fast tempo, upbeat]"
2,0f698b0297962863c59b64ab2382760c,"['acoustic drums', 'acoustic guitar', 'aggress...",This epic heavy metal track begins with an omi...,0f698b0297962863c59b64ab2382760c.wav,"[movie music, rock music]","[acoustic drums, acoustic guitar, bass, bass g...","[aggressive, gentle, heavy metal, intense]",[medium tempo]
3,5456c0ff1e48d414af58de19a21e2075,"['acoustic drums', 'acoustic guitar', 'bass gu...",This upbeat pop-rock track features a driving ...,5456c0ff1e48d414af58de19a21e2075.wav,"[country music, pop, pop rock, rock]","[acoustic drums, acoustic guitar, bass guitar,...","[cheerful, energetic, fun, happy]","[fast tempo, groovy, medium tempo, upbeat]"
4,bf27536a1f56feefc2b606c4305dbe21,"['male voice', 'movie music', 'passionate']",A sweeping orchestral score unfolds over a bac...,bf27536a1f56feefc2b606c4305dbe21.wav,[movie music],[male voice],[passionate],[]


In [108]:
def create_balanced_splits(
    df: pd.DataFrame,
    train_size: float = 0.7,
    valid_size: float = 0.15,
    test_size: float = 0.15,
    random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    train_df, temp_df = train_test_split(
        df,
        train_size=train_size,
        random_state=random_state
    )
    
    valid_ratio = valid_size / (valid_size + test_size)
    valid_df, test_df = train_test_split(
        temp_df,
        train_size=valid_ratio,
        random_state=random_state
    )
    
    return train_df, valid_df, test_df


def create_percentage_subset(
    train_df: pd.DataFrame,
    percentage: float,
    random_state: int = 42
) -> pd.DataFrame:
    subset_df, _ = train_test_split(
        train_df,
        train_size=percentage,
        random_state=random_state
    )
    return subset_df

def prepare_dataset_dict(
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    test_df: pd.DataFrame,
    subset_name: str = "100"
) -> DatasetDict:
    columns = [
        "id", "caption", "aspect_list",
        "genre_aspects", "mood_aspects", 
        "instrument_aspects", "tempo_aspects"
    ]
    
    train_clean = train_df[columns].copy()
    valid_clean = valid_df[columns].copy()
    test_clean = test_df[columns].copy()
    
    dataset_dict = DatasetDict({
        "train": Dataset.from_pandas(train_clean, preserve_index=False),
        "validation": Dataset.from_pandas(valid_clean, preserve_index=False),
        "test": Dataset.from_pandas(test_clean, preserve_index=False),
    })
    
    print(f"\nDataset '{subset_name}' created:")
    print(f"  Train: {len(dataset_dict['train'])} samples")
    print(f"  Valid: {len(dataset_dict['validation'])} samples")
    print(f"  Test:  {len(dataset_dict['test'])} samples")
    
    return dataset_dict

## Create balanced subsets based on aspect_list

In [109]:
# Create main splits (70/15/15)
train_df, valid_df, test_df = create_balanced_splits(metadata_df)

print(f"\nSplit sizes:")
print(f"  Train: {len(train_df)} ({len(train_df)/len(metadata_df)*100:.1f}%)")
print(f"  Valid: {len(valid_df)} ({len(valid_df)/len(metadata_df)*100:.1f}%)")
print(f"  Test:  {len(test_df)} ({len(test_df)/len(metadata_df)*100:.1f}%)")


Split sizes:
  Train: 3750 (70.0%)
  Valid: 804 (15.0%)
  Test:  804 (15.0%)


In [110]:
train_10_df = create_percentage_subset(train_df, percentage=0.10)
valid_10_df = create_percentage_subset(valid_df, percentage=0.10)
test_10_df = create_percentage_subset(test_df, percentage=0.10)
train_25_df = create_percentage_subset(train_df, percentage=0.25)
valid_25_df = create_percentage_subset(valid_df, percentage=0.25)
test_25_df = create_percentage_subset(test_df, percentage=0.25)

In [111]:
dataset_10 = prepare_dataset_dict(train_10_df, valid_10_df, test_10_df, subset_name="10")
dataset_25 = prepare_dataset_dict(train_25_df, valid_25_df, test_25_df, subset_name="25")
dataset_100 = prepare_dataset_dict(train_df, valid_df, test_df, subset_name="100")


Dataset '10' created:
  Train: 375 samples
  Valid: 80 samples
  Test:  80 samples

Dataset '25' created:
  Train: 937 samples
  Valid: 201 samples
  Test:  201 samples

Dataset '100' created:
  Train: 3750 samples
  Valid: 804 samples
  Test:  804 samples


In [112]:
if PUSH_TO_HUB:
    dataset_10.push_to_hub(f"bsienkiewicz/ConceptCaps-10pct", private=True)
    dataset_25.push_to_hub(f"bsienkiewicz/ConceptCaps-25pct", private=True)
    dataset_100.push_to_hub(f"bsienkiewicz/ConceptCaps", private=True)

## Prepare Audio version

In [113]:
metadata_df['file_name'] = metadata_df.apply(lambda row: AUDIO_DATA_PATH / row['filename'], axis=1)
metadata_df.drop(columns=['filename'], inplace=True)

In [114]:
train_10_df = metadata_df.loc[metadata_df['id'].isin(dataset_10['train']['id'])]
valid_10_df = metadata_df.loc[metadata_df['id'].isin(dataset_10['validation']['id'])]
test_10_df = metadata_df.loc[metadata_df['id'].isin(dataset_10['test']['id'])]
train_25_df = metadata_df.loc[metadata_df['id'].isin(dataset_25['train']['id'])]
valid_25_df = metadata_df.loc[metadata_df['id'].isin(dataset_25['validation']['id'])]
test_25_df = metadata_df.loc[metadata_df['id'].isin(dataset_25['test']['id'])]
train_df = metadata_df.loc[metadata_df['id'].isin(dataset_100['train']['id'])]
valid_df = metadata_df.loc[metadata_df['id'].isin(dataset_100['validation']['id'])]
test_df = metadata_df.loc[metadata_df['id'].isin(dataset_100['test']['id'])]

In [115]:
dataset_audio_10 = prepare_dataset_dict(train_10_df, valid_10_df, test_10_df, subset_name="10_audio_paths")
dataset_audio_25 = prepare_dataset_dict(train_25_df, valid_25_df, test_25_df, subset_name="25_audio_paths")
dataset_audio_100 = prepare_dataset_dict(train_df, valid_df, test_df, subset_name="100_audio_paths")


Dataset '10_audio_paths' created:
  Train: 603 samples
  Valid: 137 samples
  Test:  134 samples

Dataset '25_audio_paths' created:
  Train: 1448 samples
  Valid: 326 samples
  Test:  351 samples

Dataset '100_audio_paths' created:
  Train: 4248 samples
  Valid: 1232 samples
  Test:  1247 samples


In [116]:
if PUSH_TO_HUB:
    dataset_audio_10.push_to_hub(f"bsienkiewicz/ConceptCaps-10pct-audio_paths", private=True)
    dataset_audio_25.push_to_hub(f"bsienkiewicz/ConceptCaps-25pct-audio_paths", private=True)
    dataset_audio_100.push_to_hub(f"bsienkiewicz/ConceptCaps-audio_paths", private=True)