In [None]:
import sys

# Add the parent directory to the import search path so that the `src`
# package imported further down (`src.constants`) can be resolved.
sys.path.append("..")

In [None]:
from dotenv import load_dotenv

# Load variables from a local `.env` file into the process environment.
# NOTE(review): presumably this supplies the Hugging Face credentials used by
# the Hub upload cells at the end of the notebook — confirm.
load_dotenv()

In [None]:
import ast
import json

from typing import Tuple

import pandas as pd
import pytorch_lightning as pl
from datasets import Dataset, DatasetDict, Audio
from sklearn.model_selection import train_test_split
from huggingface_hub import HfApi

from src.constants import METADATA_CSV_PATH, AUDIO_DATA_PATH, REPO_ID

# Seed the global random number generators through Lightning. The split
# helpers defined below additionally pass their own explicit `random_state`.
pl.seed_everything(42)

# Guards every Hub upload cell at the end of the notebook; with False the
# datasets are only built locally.
PUSH_TO_HUB = False  # Set to True to push datasets to the Hugging Face Hub

## Load dataset

- Load metadata
- Clean and preprocess columns
- Push a captions-only version (no audio)
- Create three versions of the audio dataset: a 10% subset, a 25% subset, and the full (100%) set

In [None]:
# Read the raw metadata table from the CSV path defined in `src.constants`.
# NOTE(review): later cells read the columns `aspect_list`, `prediction`, `id`
# and `filename` from this frame — confirm they are all present in the CSV.
metadata_df = pd.read_csv(METADATA_CSV_PATH)
metadata_df

In [None]:
# Load concept-to-tags mapping
# A context manager closes the file handle deterministically (the bare
# `open()` call leaked it), and the explicit UTF-8 encoding avoids depending
# on the platform's locale when the tags contain non-ASCII characters.
with open("../data/concepts_to_tags.json", "r", encoding="utf-8") as concepts_file:
    CONCEPTS = json.load(concepts_file)

# Sanity check: one line per category with its tag count and a few examples.
print("Available concept categories:")
for cat, tags in CONCEPTS.items():
    print(f"  {cat}: {len(tags)} tags (e.g., {tags[:3]})")

In [None]:
# Create reverse mapping
# Invert CONCEPTS (category -> tags) into a flat tag -> category lookup.
# If a tag is listed under several categories, the last category wins.
TAG_TO_CATEGORY = {
    concept_tag: concept_name
    for concept_name, concept_tags in CONCEPTS.items()
    for concept_tag in concept_tags
}

In [None]:
# Categorize tags in aspect_list column
def categorize_tags(aspect_list):
    """Group the tags of one row by concept category.

    Args:
        aspect_list: Either the string representation of a Python list of
            tags (parsed with ``ast.literal_eval``) or an already-parsed
            sequence of tags.

    Returns:
        A dict with one ``"<category>_aspects"`` key per category in
        ``CONCEPTS`` (always present, in ``CONCEPTS`` order), each mapping to
        the list of tags from ``aspect_list`` that belong to that category,
        in their original order. Tags that are not found in
        ``TAG_TO_CATEGORY`` are left out.
    """
    # Only parse when given the serialized form; an already-parsed list is
    # used as is, so the function is safe to call on pre-processed data.
    tags = ast.literal_eval(aspect_list) if isinstance(aspect_list, str) else aspect_list

    # Pre-seed every category so each row yields the same keys in the same
    # order, which keeps the resulting DataFrame columns stable.
    aspect_list_categorized = {f"{category}_aspects": [] for category in CONCEPTS}

    for tag in tags:
        category = TAG_TO_CATEGORY.get(tag)
        if category is None:
            # Unknown tag: skip it instead of filing it under a bogus
            # "None_aspects" key, which used to leak into the frame as a column.
            continue
        aspect_list_categorized.setdefault(f"{category}_aspects", []).append(tag)

    return aspect_list_categorized

In [None]:
# Expand each row's tag list into one list-valued column per concept category.
# Building the frame from the list of dicts avoids the slow per-row
# `.apply(pd.Series)` while producing the same columns.
aspect_columns = pd.DataFrame(
    metadata_df["aspect_list"].map(categorize_tags).tolist(),
    index=metadata_df.index,
)

# Drop aspect columns left over from a previous run of this cell so that
# re-executing it does not append duplicates, attach the fresh ones, and
# rename `prediction` to `caption` (the name the dataset builder selects).
# `rename` without `inplace` keeps the cell free of in-place mutation.
metadata_df = pd.concat(
    [
        metadata_df.drop(columns=aspect_columns.columns, errors="ignore"),
        aspect_columns,
    ],
    axis=1,
).rename(columns={"prediction": "caption"})
metadata_df.head()

In [None]:
def create_balanced_splits(
    df: pd.DataFrame,
    train_size: float = 0.7,
    valid_size: float = 0.15,
    test_size: float = 0.15,
    random_state: int = 42
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Randomly split ``df`` into train, validation and test frames.

    The split is a plain shuffled split: no ``stratify`` argument is passed to
    ``train_test_split``, so despite the function name the class/tag
    distribution is not explicitly balanced across the splits.

    Args:
        df: Frame to split.
        train_size: Fraction of rows for the training split.
        valid_size: Fraction of rows for the validation split.
        test_size: Fraction of rows for the test split.
        random_state: Seed passed to both underlying splits.

    Returns:
        ``(train_df, valid_df, test_df)``.

    Raises:
        ValueError: If the three fractions do not sum to 1. Previously such
            input was accepted and silently produced validation/test splits
            of a different size than requested.
    """
    total = train_size + valid_size + test_size
    if abs(total - 1.0) > 1e-8:
        raise ValueError(
            "train_size, valid_size and test_size must sum to 1.0, "
            f"got {train_size} + {valid_size} + {test_size} = {total}"
        )

    # First carve off the training rows; the remainder is shared between
    # validation and test.
    train_df, temp_df = train_test_split(
        df,
        train_size=train_size,
        random_state=random_state
    )

    # Re-express the validation fraction relative to the remainder.
    valid_ratio = valid_size / (valid_size + test_size)
    valid_df, test_df = train_test_split(
        temp_df,
        train_size=valid_ratio,
        random_state=random_state
    )

    return train_df, valid_df, test_df


def create_percentage_subset(
    train_df: pd.DataFrame,
    percentage: float,
    random_state: int = 42
) -> pd.DataFrame:
    """Return a random subset containing ``percentage`` of the rows.

    Args:
        train_df: Frame to sample from (any split, not only the train one).
        percentage: Fraction of rows to keep, e.g. ``0.10`` for 10%. A float
            value of ``1.0`` returns a copy of the whole frame in its
            original row order.
        random_state: Seed passed to ``train_test_split``.

    Returns:
        The sampled rows as a new frame.
    """
    # `train_test_split` rejects a float fraction of 1.0 because nothing would
    # be left for the discarded part, so the "100%" case is handled here.
    # The float check keeps an integer 1 meaning "one row", as scikit-learn
    # interprets integer sizes as absolute counts.
    if isinstance(percentage, float) and percentage == 1.0:
        return train_df.copy()

    subset_df, _ = train_test_split(
        train_df,
        train_size=percentage,
        random_state=random_state
    )
    return subset_df

def prepare_dataset_dict_with_audio(
    train_df: pd.DataFrame,
    valid_df: pd.DataFrame,
    test_df: pd.DataFrame,
    include_audio: bool = False,
    subset_name: str = "100"
) -> DatasetDict:
    """Build a ``DatasetDict`` with train/validation/test splits.

    Each split keeps the id, caption, raw aspect list and the four
    per-category aspect columns; ``file_name`` is kept as well when
    ``include_audio`` is true. ``subset_name`` is only used in the printed
    summary.
    """
    selected = [
        "id",
        "caption",
        "aspect_list",
        "genre_aspects",
        "mood_aspects",
        "instrument_aspects",
        "tempo_aspects",
    ]
    if include_audio:
        selected = selected + ["file_name"]

    # Restrict every split to the selected columns before any conversion.
    split_frames = {
        split_name: frame[selected].copy()
        for split_name, frame in (
            ("train", train_df),
            ("validation", valid_df),
            ("test", test_df),
        )
    }

    dataset_dict = DatasetDict({
        split_name: Dataset.from_pandas(frame, preserve_index=False)
        for split_name, frame in split_frames.items()
    })

    # Short summary of what was built.
    print(f"\nDataset config '{subset_name}' created:")
    print(f"  Train: {len(dataset_dict['train'])} samples")
    print(f"  Valid: {len(dataset_dict['validation'])} samples")
    print(f"  Test:  {len(dataset_dict['test'])} samples")
    print(f"  Audio: {include_audio}")

    return dataset_dict

## Create subsets

In [None]:
# Caption-only view of the metadata: `file_name` is removed when present
# (it only exists if the audio cell below has already been executed).
metadata_no_audio = metadata_df.drop(columns=["file_name"], errors="ignore")

train_df_no_audio, valid_df_no_audio, test_df_no_audio = create_balanced_splits(metadata_no_audio)

# 10% of each split.
train_10_no_audio, valid_10_no_audio, test_10_no_audio = (
    create_percentage_subset(split_df, percentage=0.10)
    for split_df in (train_df_no_audio, valid_df_no_audio, test_df_no_audio)
)

# 25% of each split.
train_25_no_audio, valid_25_no_audio, test_25_no_audio = (
    create_percentage_subset(split_df, percentage=0.25)
    for split_df in (train_df_no_audio, valid_df_no_audio, test_df_no_audio)
)

In [None]:
# Build the `file_name` column (audio path as a string) used by the audio
# configurations. The column check is done once on the frame and the paths
# are built with a column-wise `map` instead of a row-wise `apply`, which
# constructed a Series per row just to read one field.
if 'filename' in metadata_df.columns:
    metadata_df['file_name'] = metadata_df['filename'].map(
        lambda name: str(AUDIO_DATA_PATH / name)
    )
else:
    # NOTE(review): without a `filename` column every path is None, so the
    # audio configurations would carry no audio — confirm this silent
    # fallback is intended rather than an error.
    metadata_df['file_name'] = None

train_df_audio, valid_df_audio, test_df_audio = create_balanced_splits(metadata_df)

# 10% of each split.
train_10_audio = create_percentage_subset(train_df_audio, percentage=0.10)
valid_10_audio = create_percentage_subset(valid_df_audio, percentage=0.10)
test_10_audio = create_percentage_subset(test_df_audio, percentage=0.10)

# 25% of each split.
train_25_audio = create_percentage_subset(train_df_audio, percentage=0.25)
valid_25_audio = create_percentage_subset(valid_df_audio, percentage=0.25)
test_25_audio = create_percentage_subset(test_df_audio, percentage=0.25)

In [None]:
# Caption-only configurations (no `file_name` column).
dataset_full = prepare_dataset_dict_with_audio(
    train_df=train_df_no_audio,
    valid_df=valid_df_no_audio,
    test_df=test_df_no_audio,
    include_audio=False,
    subset_name="default",
)
dataset_25pct = prepare_dataset_dict_with_audio(
    train_df=train_25_no_audio,
    valid_df=valid_25_no_audio,
    test_df=test_25_no_audio,
    include_audio=False,
    subset_name="25pct",
)
dataset_10pct = prepare_dataset_dict_with_audio(
    train_df=train_10_no_audio,
    valid_df=valid_10_no_audio,
    test_df=test_10_no_audio,
    include_audio=False,
    subset_name="10pct",
)

# Audio configurations: same columns plus `file_name`, which is cast to the
# `Audio` feature right after each DatasetDict is built.
dataset_full_audio = prepare_dataset_dict_with_audio(
    train_df=train_df_audio,
    valid_df=valid_df_audio,
    test_df=test_df_audio,
    include_audio=True,
    subset_name="audio",
).cast_column("file_name", Audio())
dataset_25pct_audio = prepare_dataset_dict_with_audio(
    train_df=train_25_audio,
    valid_df=valid_25_audio,
    test_df=test_25_audio,
    include_audio=True,
    subset_name="25pct-audio",
).cast_column("file_name", Audio())
dataset_10pct_audio = prepare_dataset_dict_with_audio(
    train_df=train_10_audio,
    valid_df=valid_10_audio,
    test_df=test_10_audio,
    include_audio=True,
    subset_name="10pct-audio",
).cast_column("file_name", Audio())

In [None]:
if PUSH_TO_HUB:
    api = HfApi()
    # `upload_file` requires the target repository to exist. On a first run
    # nothing has been pushed yet, so create the dataset repo here;
    # `exist_ok=True` makes this a no-op on later runs. `private=False`
    # matches the visibility used by the `push_to_hub` calls.
    api.create_repo(
        repo_id=REPO_ID,
        repo_type="dataset",
        private=False,
        exist_ok=True,
    )
    # Publish the dataset card as the repository README before the
    # configurations are pushed.
    api.upload_file(
        path_or_fileobj="../DATASET_CARD.md",
        path_in_repo="README.md",
        repo_id=REPO_ID,
        repo_type="dataset",
    )

In [None]:
if PUSH_TO_HUB:
    print(f"\nPushing all configurations to {REPO_ID}...")

    # (progress message, Hub config name, dataset) in upload order:
    # caption-only configurations first, then the audio ones.
    hub_pushes = [
        ("Pushing 'default' configuration...", "default", dataset_full),
        ("Pushing '25pct' configuration...", "25pct", dataset_25pct),
        ("Pushing '10pct' configuration...", "10pct", dataset_10pct),
        ("Pushing 'audio' configuration (full with audio)...", "audio", dataset_full_audio),
        ("Pushing '25pct-audio' configuration...", "25pct-audio", dataset_25pct_audio),
        ("Pushing '10pct-audio' configuration...", "10pct-audio", dataset_10pct_audio),
    ]

    for hub_message, hub_config, hub_dataset in hub_pushes:
        print(hub_message)
        hub_dataset.push_to_hub(REPO_ID, config_name=hub_config, private=False)