# Generate Tags Dataset for Caption Creation

This notebook generates a synthetic dataset of music tags for caption creation. It selects random tags from predefined categories (instruments, mood, tempo, genre) with probabilities based on co-occurrence patterns in the MusicCaps dataset.

In [32]:
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter, defaultdict
import itertools
import random
from datasets import load_dataset

## Load Existing MusicCaps Data

Load the existing tags dataset to analyze tag co-occurrence patterns.

In [33]:
ds = load_dataset("google/MusicCaps")
df = ds['train'].to_pandas()
df['aspect_list_transformed'] = df['aspect_list'].apply(lambda x: x.strip("[]").replace("'", ""))
df['aspect_list_transformed'] = df['aspect_list_transformed'].apply(lambda x: x.split(', '))

## Define Tag Categories

Define all possible tags for each category based on the dataset.

In [34]:
tempo_tags = [
    "fast tempo", "medium tempo", "slow tempo", "moderate tempo", "uptempo",
    "medium fast tempo", "slower tempo", "medium to uptempo", "mid-tempo",
    "quick tempo", "accelerated tempo", "steady tempo", "rapid tempo",
    "slow music", "very fast tempo", "slow to medium tempo", "medium-to-high pitch singing",
    "steady drumming rhythm", "dance rhythm", "various tempos", "tempo changes",
    "fast paced", "slow song", "mid tempo", "steady beat", "pulsating beats",
    "groovy rhythm", "4 on the floor kick pattern", "normal tempo", "fast beat"
]

genre_tags = [
    "rock", "pop", "jazz", "classical", "folk", "blues", "hip hop", "reggae",
    "metal", "country", "r&b", "edm", "trance", "techno", "dance music",
    "electronic dance music", "gospel", "ambient", "soul", "funk",
    "alternative rock", "ballad", "hip-hop", "techno pop", "world music",
    "disco", "trap", "punk rock", "latin pop", "house", "bluegrass",
    "indie rock", "new age", "grunge", "industrial", "dubstep",
    "carnatic music", "bossa nova", "baroque music", "surf rock",
    "ska", "lo-fi", "symphonic", "orchestral", "fusion music", "raga",
    "bollywood music", "afrobeat", "folk song", "christian rock", "soundtrack"
]

mood_tags = [
    "emotional", "passionate", "happy", "melancholic", "relaxing", "calming",
    "upbeat", "exciting", "mellow", "sentimental", "soothing", "joyful",
    "intense", "peaceful", "dreamy", "romantic mood", "ominous", "suspenseful",
    "haunting", "energetic", "chill", "cheerful", "nostalgic", "fun",
    "cool", "ethereal", "sad", "spooky", "hopeful", "playful",
    "mystical", "dark", "solemn", "festive", "inspirational", "sentimental",
    "powerful", "serene", "mysterious", "emphatic", "tranquil", "passionate singing",
    "ominous music", "romantic", "meditative", "joyous", "heartfelt", "uplifting",
    "enthusiastic", "melancholy", "emotional voice", "soothing melody", "heavenly", 
    "fearful", "vibrant", "soulful", "excited", "energetic drums", "charming"
]

instrument_tags = [
    "piano", "drums", "guitar", "bass guitar", "electric guitar", "acoustic guitar",
    "flute", "violin", "cello", "trumpet", "saxophone", "tambourine",
    "synth", "harmonica", "organ", "harp", "clarinet", "string section",
    "percussion", "banjo", "trombone", "didgeridoo", "mandolin", "tabla",
    "ukulele", "accordion", "xylophone", "viola", "timpani", "congas",
    "bongo", "triangle", "oboe", "bagpipes", "steel drums", "marimba",
    "dj mixer", "drum machine", "brass section", "horn", "sitar",
    "strings", "keyboard", "double bass", "synth bass", "guitar solo",
    "electric piano", "acoustic piano", "woodwind", "cymbals", "bells",
    "vibraphone", "hand claps", "snare", "hi-hat", "kick drum", 
    "conga", "tabla percussion", "theremin", "church organ", "trumpets",
    "bass drum", "djembe", "steel guitar", "harpsichord", "choir"
]

In [35]:
def extract_tags(song_tags, concept_tags):
    res = []
    for c_tag in concept_tags:
        for s_tag in song_tags:
            if c_tag in s_tag:
                res.append(s_tag)
    return list(set(res))

In [36]:
concepts = {
    "tempo": tempo_tags,
    "genre": genre_tags,
    "mood": mood_tags,
    "instrument": instrument_tags
}

for concept, tags in concepts.items():
    df[concept + '_tags'] = df['aspect_list_transformed'].apply(
        lambda x: extract_tags(x, tags)
    )

## Analyze Tag Statistics

Calculate individual tag frequencies and co-occurrence patterns.

In [37]:
# Calculate tag frequencies
instrument_counts = Counter()
mood_counts = Counter()
genre_counts = Counter()
tempo_counts = Counter()

for _, row in df.iterrows():
        for tag in row['instrument_tags']:
            instrument_counts[tag.strip()] += 1
        for tag in row['mood_tags']:
            mood_counts[tag.strip()] += 1
        for tag in row['genre_tags']:
            genre_counts[tag.strip()] += 1
        for tag in row['tempo_tags']:
            tempo_counts[tag.strip()] += 1

print("Top instruments:", instrument_counts.most_common(5))
print("Top moods:", mood_counts.most_common(5))
print("Top genres:", genre_counts.most_common(5))
print("Top tempos:", tempo_counts.most_common(5))

Top instruments: [('acoustic drums', 393), ('bass guitar', 297), ('electric guitar', 297), ('acoustic guitar', 290), ('piano', 239)]
Top moods: [('emotional', 620), ('energetic', 593), ('passionate', 592), ('happy', 225), ('upbeat', 181)]
Top genres: [('rock', 207), ('pop', 197), ('dance music', 117), ('classical', 111), ('soulful', 101)]
Top tempos: [('medium tempo', 595), ('slow tempo', 473), ('fast tempo', 381), ('uptempo', 247), ('moderate tempo', 224)]


In [38]:
# Calculate number of instruments per sample
instrument_num_counts = Counter()
for _, row in df.iterrows():
    num_instruments = len([tag.strip() for tag in str(row['instrument_tags']).split(',')])
    instrument_num_counts[num_instruments] += 1

print("Distribution of number of instruments per sample:")
for num, count in sorted(instrument_num_counts.items()):
    print(f"  {num} instruments: {count} samples ({count/len(df)*100:.1f}%)")

Distribution of number of instruments per sample:
  1 instruments: 1964 samples (35.6%)
  2 instruments: 1557 samples (28.2%)
  3 instruments: 1119 samples (20.3%)
  4 instruments: 608 samples (11.0%)
  5 instruments: 185 samples (3.4%)
  6 instruments: 64 samples (1.2%)
  7 instruments: 15 samples (0.3%)
  8 instruments: 7 samples (0.1%)
  9 instruments: 1 samples (0.0%)
  10 instruments: 1 samples (0.0%)


In [39]:
# Calculate number of moods per sample
mood_num_counts = Counter()
for _, row in df.iterrows():
    num_moods = len([tag.strip() for tag in str(row['mood_tags']).split(',')])
    mood_num_counts[num_moods] += 1

print("Distribution of number of moods per sample:")
for num, count in sorted(mood_num_counts.items()):
    print(f"  {num} moods: {count} samples ({count/len(df)*100:.1f}%)")

Distribution of number of moods per sample:
  1 moods: 3548 samples (64.3%)
  2 moods: 918 samples (16.6%)
  3 moods: 606 samples (11.0%)
  4 moods: 281 samples (5.1%)
  5 moods: 102 samples (1.8%)
  6 moods: 44 samples (0.8%)
  7 moods: 17 samples (0.3%)
  8 moods: 3 samples (0.1%)
  9 moods: 2 samples (0.0%)


In [40]:
# Calculate number of genres per sample
genre_num_counts = Counter()
for _, row in df.iterrows():
    num_genres = len([tag.strip() for tag in str(row['genre_tags']).split(',')])
    genre_num_counts[num_genres] += 1

print("Distribution of number of genres per sample:")
for num, count in sorted(genre_num_counts.items()):
    print(f"  {num} genres: {count} samples ({count/len(df)*100:.1f}%)")

Distribution of number of genres per sample:
  1 genres: 4405 samples (79.8%)
  2 genres: 585 samples (10.6%)
  3 genres: 246 samples (4.5%)
  4 genres: 142 samples (2.6%)
  5 genres: 72 samples (1.3%)
  6 genres: 39 samples (0.7%)
  7 genres: 19 samples (0.3%)
  8 genres: 5 samples (0.1%)
  9 genres: 5 samples (0.1%)
  10 genres: 2 samples (0.0%)
  12 genres: 1 samples (0.0%)


In [41]:
# Calculate number of tempos per sample
tempo_num_counts = Counter()
for _, row in df.iterrows():
    num_tempos = len([tag.strip() for tag in str(row['tempo_tags']).split(',')])
    tempo_num_counts[num_tempos] += 1

print("Distribution of number of tempos per sample:")
for num, count in sorted(tempo_num_counts.items()):
    print(f"  {num} tempos: {count} samples ({count/len(df)*100:.1f}%)")

Distribution of number of tempos per sample:
  1 tempos: 5340 samples (96.7%)
  2 tempos: 147 samples (2.7%)
  3 tempos: 32 samples (0.6%)
  4 tempos: 1 samples (0.0%)
  5 tempos: 1 samples (0.0%)


## Build Probability Distributions

Create weighted probability distributions based on tag frequencies.

In [26]:
# Convert counts to probability distributions
def counts_to_probs(counts, tags):
    """Convert counts to normalized probability distribution."""
    total = sum(counts.values())
    return {tag: counts.get(tag, 1) / total for tag in tags}  # Add 1 for unseen tags

instrument_probs = counts_to_probs(instrument_counts, instrument_tags)
mood_probs = counts_to_probs(mood_counts, mood_tags)
genre_probs = counts_to_probs(genre_counts, genre_tags)
tempo_probs = counts_to_probs(tempo_counts, tempo_tags)

# Distribution for number of instruments
total_samples = len(df)
num_instruments_probs = {num: count/total_samples for num, count in instrument_num_counts.items()}
num_moods_probs = {num: count/total_samples for num, count in mood_num_counts.items()}
num_genres_probs = {num: count/total_samples for num, count in genre_num_counts.items()}
num_tempos_probs = {num: count/total_samples for num, count in tempo_num_counts.items()}

print("Probability distributions created")

Probability distributions created


## Generate New Tags Dataset

Generate new samples with realistic tag combinations.

In [27]:
def sample_tags(tags, probs, num_samples=1, temperature=1.0):
    """Sample tags with temperature-controlled randomness.
    
    Args:
        tags: List of available tags
        probs: Dictionary of tag probabilities
        num_samples: Number of tags to sample
        temperature: Controls randomness (lower = more deterministic, higher = more random)
    """
    # Apply temperature to probabilities
    prob_values = np.array([probs[tag] for tag in tags])
    prob_values = prob_values ** (1 / temperature)
    prob_values = prob_values / prob_values.sum()
    
    selected = np.random.choice(tags, size=num_samples, replace=False, p=prob_values)
    return selected.tolist()

def sample_num_from_distribution(num_probs, temperature=1.0):
    """Sample a number from a discrete distribution with temperature control.
    
    Args:
        num_probs: Dictionary mapping numbers to their probabilities
        temperature: Controls randomness (lower = more deterministic, higher = more random)
    """
    nums = list(num_probs.keys())
    prob_values = np.array(list(num_probs.values()))
    prob_values = prob_values ** (1 / temperature)
    prob_values = prob_values / prob_values.sum()
    
    selected_num = np.random.choice(nums, p=prob_values)
    return selected_num

def generate_sample(variety_factor=0.5):
    """Generate a single sample with tags.
    
    Args:
        variety_factor: Controls randomness (0 = deterministic, 1 = very random)
    """
    # Temperature increases with variety_factor
    temp = 1.0 + variety_factor * 2.0
    
    num_instruments = sample_num_from_distribution(num_instruments_probs, temp)
    num_moods = sample_num_from_distribution(num_moods_probs, temp)
    num_genres = sample_num_from_distribution(num_genres_probs, temp)
    num_tempos = sample_num_from_distribution(num_tempos_probs, temp)
    
    # Sample tags from each category
    selected_instruments = sample_tags(instrument_tags, instrument_probs, num_instruments, temp)
    selected_mood = sample_tags(mood_tags, mood_probs, num_moods, temp)
    selected_genre = sample_tags(genre_tags, genre_probs, num_genres, temp)
    selected_tempo = sample_tags(tempo_tags, tempo_probs, num_tempos, temp)
    
    # Combine all tags
    all_tags = selected_instruments + [selected_mood, selected_genre, selected_tempo]
    
    return {
        'instrument_tags': ', '.join(selected_instruments),
        'mood_tags': ', '.join(selected_mood),
        'genre_tags': ', '.join(selected_genre),
        'tempo_tags': ', '.join(selected_tempo),
        'all_tags': ', '.join(all_tags)
    }

# Test generation
print("Sample with low variety:")
print(generate_sample(variety_factor=0.2))
print("\nSample with high variety:")
print(generate_sample(variety_factor=0.8))

Sample with low variety:
{'instrument_tags': 'harmonica', 'mood_tags': 'upbeat', 'genre_tags': 'rock', 'tempo_tags': 'medium-to-high pitch singing', 'all_tags': 'harmonica, upbeat, rock, medium-to-high pitch singing'}

Sample with high variety:
{'instrument_tags': 'vibraphone, guitar solo, electric guitar', 'mood_tags': 'uplifting', 'genre_tags': 'hip hop', 'tempo_tags': 'slow tempo', 'all_tags': 'vibraphone, guitar solo, electric guitar, uplifting, hip hop, slow tempo'}


In [28]:
# Generate dataset with varying variety
def generate_dataset(num_samples=1000, seed=42):
    """Generate a complete tags dataset."""
    np.random.seed(seed)
    random.seed(seed)
    
    samples = []
    for i in range(num_samples):
        # Vary the variety factor across samples
        variety = random.uniform(0.2, 0.8)
        sample = generate_sample(variety_factor=variety)
        sample['sample_id'] = f"generated_sample_{i:04d}"
        samples.append(sample)
    
    return pd.DataFrame(samples)

# Generate datasets
train_size = 800
val_size = 100
test_size = 100

print(f"Generating {train_size} training samples...")
train_df = generate_dataset(train_size, seed=42)

print(f"Generating {val_size} validation samples...")
val_df = generate_dataset(val_size, seed=43)

print(f"Generating {test_size} test samples...")
test_df = generate_dataset(test_size, seed=44)

print("\nDataset generation complete!")
print(f"Train: {len(train_df)} samples")
print(f"Validation: {len(val_df)} samples")
print(f"Test: {len(test_df)} samples")

Generating 800 training samples...
Generating 100 validation samples...
Generating 100 test samples...
Generating 100 validation samples...
Generating 100 test samples...

Dataset generation complete!
Train: 800 samples
Validation: 100 samples
Test: 100 samples

Dataset generation complete!
Train: 800 samples
Validation: 100 samples
Test: 100 samples


## Preview Generated Data

In [29]:
print("First 10 training samples:")
train_df.head(10)

First 10 training samples:


Unnamed: 0,instrument_tags,mood_tags,genre_tags,tempo_tags,all_tags,sample_id
0,"bass drum, keyboard",ethereal,classical,medium tempo,"bass drum, keyboard, ethereal, classical, medi...",generated_sample_0000
1,"bells, timpani",dark,rock,groovy rhythm,"bells, timpani, dark, rock, groovy rhythm",generated_sample_0001
2,acoustic guitar,melancholic,classical,slow tempo,"acoustic guitar, melancholic, classical, slow ...",generated_sample_0002
3,percussion,sentimental,alternative rock,medium tempo,"percussion, sentimental, alternative rock, med...",generated_sample_0003
4,"tambourine, string section, keyboard",melancholic,dance music,uptempo,"tambourine, string section, keyboard, melancho...",generated_sample_0004
5,"congas, electric guitar",passionate,bollywood music,groovy rhythm,"congas, electric guitar, passionate, bollywood...",generated_sample_0005
6,tambourine,passionate,world music,uptempo,"tambourine, passionate, world music, uptempo",generated_sample_0006
7,"tabla, drums",melancholy,reggae,steady tempo,"tabla, drums, melancholy, reggae, steady tempo",generated_sample_0007
8,"tabla, accordion, acoustic guitar",soulful,house,groovy rhythm,"tabla, accordion, acoustic guitar, soulful, ho...",generated_sample_0008
9,"bagpipes, church organ, bass guitar, violin",passionate,r&b,uptempo,"bagpipes, church organ, bass guitar, violin, p...",generated_sample_0009


In [30]:
# Analyze generated dataset statistics
print("Generated dataset statistics:")
print("\nInstrument distribution:")
instrument_gen_counts = Counter()
for _, row in train_df.iterrows():
    for tag in row['instrument_tags'].split(', '):
        instrument_gen_counts[tag.strip()] += 1
print(instrument_gen_counts.most_common(10))

print("\nMood distribution:")
print(train_df['mood_tags'].value_counts().head(10))

print("\nGenre distribution:")
print(train_df['genre_tags'].value_counts().head(10))

print("\nTempo distribution:")
print(train_df['tempo_tags'].value_counts())

print("\nNumber of instruments per sample:")
instrument_counts_gen = train_df['instrument_tags'].apply(lambda x: len(x.split(', '))).value_counts().sort_index()
for num, count in instrument_counts_gen.items():
    print(f"  {num} instruments: {count} samples ({count/len(train_df)*100:.1f}%)")

Generated dataset statistics:

Instrument distribution:
[('bass guitar', 90), ('electric guitar', 77), ('acoustic guitar', 67), ('percussion', 65), ('piano', 63), ('keyboard', 52), ('flute', 43), ('acoustic piano', 40), ('strings', 40), ('drums', 38)]

Mood distribution:
mood_tags
emotional      47
energetic      46
passionate     40
sentimental    32
upbeat         28
romantic       28
exciting       28
fun            27
happy          26
mellow         26
Name: count, dtype: int64

Genre distribution:
genre_tags
pop            41
rock           40
dance music    36
classical      35
edm            30
hip hop        28
jazz           25
reggae         25
hip-hop        25
metal          23
Name: count, dtype: int64

Tempo distribution:
tempo_tags
medium tempo                    106
fast tempo                      103
slow tempo                       88
uptempo                          72
moderate tempo                   57
medium fast tempo                57
medium to uptempo         

## Save Generated Dataset

In [None]:
# Create output directory
output_dir = Path("../data/generated_tags")
output_dir.mkdir(parents=True, exist_ok=True)

# Save datasets
train_df.to_csv(output_dir / "train.csv", index=False)
val_df.to_csv(output_dir / "validation.csv", index=False)
test_df.to_csv(output_dir / "test.csv", index=False)

all_df = pd.concat([train_df, val_df, test_df])
all_df.to_csv(output_dir / "all.csv", index=False)

print(f"Datasets saved to {output_dir}")
print(f"  - train.csv: {len(train_df)} samples")
print(f"  - validation.csv: {len(val_df)} samples")
print(f"  - test.csv: {len(test_df)} samples")

Datasets saved to ..\data\generated_tags
  - train.csv: 800 samples
  - validation.csv: 100 samples
  - test.csv: 100 samples


## Upload to huggingface

In [None]:
data_files = {
    "train": output_dir / "train.csv",
    "validation": output_dir / "validation.csv",
    "test": output_dir / "test.csv",
    "all": output_dir / "all.csv"
}
dataset = load_dataset("csv", data_files=data_files)
dataset.push_to_hub("bsienkiewicz/random-tags-dataset", private=True)

## Summary

This notebook:
1. Analyzed tag co-occurrence patterns in the MusicCaps dataset
2. Built probability distributions for each tag category
3. Generated synthetic tag combinations with realistic distributions
4. Introduced variety through temperature-controlled sampling
5. Saved train/validation/test splits for caption generation