In [1]:
import ast
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter, defaultdict
from datasets import load_dataset

seed = 42
np.random.seed(seed)

## Define Tag Categories

In [2]:
KNOWN_TAGS = json.load(open("../data/concepts_to_tags.json", "r"))

# Reverse map for easy lookup (tag -> category)
TAG_TO_CATEGORY = {}
for cat, tags in KNOWN_TAGS.items():
    for tag in tags:
        TAG_TO_CATEGORY[tag] = cat


## MTG Jaemdo

In [26]:
out_buffer = []

with open("../data/mtg_jamendo/autotagging_top50tags.tsv", "r") as f:
    for line in f.readlines():
        strings = line.strip().split('\t')
        track_id = strings[0]
        tags = strings[5:]  # Assuming tags start from the 6th column
        out_buffer.append({
            "id": track_id,
            "tags": tags
        })
with open("../data/mtg_jamendo/autotagging_top50tags_processed.csv", "w") as f:
    f.write("id,tags\n")
    for item in out_buffer:
        f.write(f"{item['id']},\"{';'.join(item['tags'])}\"\n")

In [27]:
mtg_df = pd.read_csv("../data/mtg_jamendo/autotagging_top50tags_processed.csv", converters={
    'tags': lambda x: x.split(';')
})
mtg_df

Unnamed: 0,id,tags
0,TRACK_ID,[TAGS]
1,track_0000215,[genre---metal]
2,track_0000216,[genre---metal]
3,track_0000219,[genre---metal]
4,track_0000223,[genre---metal]
...,...,...
54376,track_1422056,"[genre---soundtrack, instrument---computer]"
54377,track_1422057,"[genre---soundtrack, instrument---computer]"
54378,track_1422058,"[genre---soundtrack, instrument---computer]"
54379,track_1422059,"[genre---soundtrack, instrument---computer]"


In [28]:
# Parse tags into categories
def parse_tags(tags):
    return pd.Series({
        'genre_tags': [t for t in tags if TAG_TO_CATEGORY[t] == 'genre'],
        'mood_tags': [t for t in tags if TAG_TO_CATEGORY[t] == 'mood'],
        'instrument_tags': [t for t in tags if TAG_TO_CATEGORY[t] == 'instrument']
    })

def clean_tags(tags):
    _tags = ast.literal_eval(str(tags))
    _tags = [t.split('---')[-1].strip() for t in _tags]
    return [t.lower() for t in _tags if t.lower() in TAG_TO_CATEGORY]

mtg_df[['genre_tags', 'mood_tags', 'instrument_tags']] = mtg_df['tags'].apply(clean_tags).apply(parse_tags)
mtg_df['aspect_list'] = mtg_df.apply(lambda row: list(set(
    row['genre_tags'] + row['mood_tags'] + row['instrument_tags']
)), axis=1)
mtg_df

Unnamed: 0,id,tags,genre_tags,mood_tags,instrument_tags,aspect_list
0,TRACK_ID,[TAGS],[],[],[],[]
1,track_0000215,[genre---metal],[metal],[],[],[metal]
2,track_0000216,[genre---metal],[metal],[],[],[metal]
3,track_0000219,[genre---metal],[metal],[],[],[metal]
4,track_0000223,[genre---metal],[metal],[],[],[metal]
...,...,...,...,...,...,...
54376,track_1422056,"[genre---soundtrack, instrument---computer]",[],[],[],[]
54377,track_1422057,"[genre---soundtrack, instrument---computer]",[],[],[],[]
54378,track_1422058,"[genre---soundtrack, instrument---computer]",[],[],[],[]
54379,track_1422059,"[genre---soundtrack, instrument---computer]",[],[],[],[]


In [None]:
mtg_df = mtg_df.where((mtg_df['genre_tags'].map(len) > 0) & (mtg_df['instrument_tags'].map(len) > 0)).dropna()
mtg_df

Unnamed: 0,id,tags,genre_tags,mood_tags,instrument_tags,aspect_list
607,track_0007391,"[genre---electronic, genre---pop, instrument--...","[electronic, pop]",[emotional],"[bass, drums, guitar, keyboard]","[drums, bass, guitar, electronic, emotional, p..."
1015,track_0015161,"[genre---instrumentalpop, genre---pop, genre--...","[pop, rock]",[emotional],"[bass, drums]","[drums, bass, rock, emotional, pop]"
1020,track_0015166,"[genre---dance, genre---electronic, genre---po...","[dance, electronic, pop, techno]",[emotional],[bass],"[bass, electronic, dance, techno, emotional, pop]"
1021,track_0015167,"[genre---chillout, genre---easylistening, genr...","[electronic, pop]",[emotional],"[bass, violin]","[bass, electronic, emotional, pop, violin]"
1023,track_0015169,"[genre---electronic, genre---instrumentalpop, ...","[electronic, pop]",[emotional],"[bass, drums]","[drums, bass, electronic, emotional, pop]"
...,...,...,...,...,...,...
54313,track_1420702,"[genre---dance, genre---easylistening, genre--...",[dance],"[funk, happy]","[bass, drums, keyboard]","[drums, bass, dance, funk, keyboard, happy]"
54314,track_1420704,"[genre---dance, genre---easylistening, instrum...",[dance],[happy],"[bass, drums, keyboard]","[drums, bass, dance, keyboard, happy]"
54315,track_1420705,"[genre---dance, genre---easylistening, instrum...",[dance],[happy],"[bass, drums, keyboard]","[drums, bass, dance, keyboard, happy]"
54316,track_1420706,"[genre---dance, genre---easylistening, instrum...",[dance],[happy],"[bass, drums, keyboard]","[drums, bass, dance, keyboard, happy]"


In [30]:
mtg_df.to_csv("../data/mtg_jamendo/autotagging_top50tags_processed_cleaned.csv", index=False)

In [None]:
from sklearn.model_selection import train_test_split

df_train, df_valid = train_test_split(mtg_df, test_size=0.1, random_state=42)
df_valid, df_test = train_test_split(df_valid, test_size=0.5, random_state=42)

In [None]:
from pathlib import Path

# Create output directory
output_dir = Path("../data/mtg_jaemdo_tags_dataset/")
output_dir.mkdir(parents=True, exist_ok=True)

df_train.to_csv(output_dir / "train.csv", index=False)
df_valid.to_csv(output_dir / "validation.csv", index=False)
df_test.to_csv(output_dir / "test.csv", index=False)
all_df = pd.concat([df_train, df_valid, df_test])
all_df.to_csv(output_dir / "all.csv", index=False)

In [None]:
data_files = {
    "train": str(output_dir / "train.csv"),
    "validation": str(output_dir / "validation.csv"),
    "test": str(output_dir / "test.csv")
}
dataset = load_dataset("csv", data_files=data_files)
dataset.push_to_hub("bsienkiewicz/mtg-jaemdo-tags-dataset", private=True)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0): |          |  0.00B /  0.00B            

New Data Upload: |          |  0.00B /  0.00B            

README.md:   0%|          | 0.00/814 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/bsienkiewicz/vae-tags-dataset/commit/c0136e1c1c9d98d142055fe328c7f214ded83b28', commit_message='Upload dataset', commit_description='', oid='c0136e1c1c9d98d142055fe328c7f214ded83b28', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/bsienkiewicz/vae-tags-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='bsienkiewicz/vae-tags-dataset'), pr_revision=None, pr_num=None)

## MSD

In [7]:
msd_df = load_dataset("seungheondoh/LP-MusicCaps-MSD")["train"].to_pandas()
msd_df

Unnamed: 0,track_id,title,artist_name,release,year,tag,caption_writing,caption_summary,caption_paraphrase,caption_attribute_prediction,path
0,TRSAGNY128F425391E,What Are You Fighting For?,Youth Brigade,Sink With Kalifornija,1984,"[cathartic, earnest, punk, urgent, confident, ...","This aggressive, confrontational, and energeti...",This song is an explosive and cathartic anthem...,This alternative indie rock anthem brings a ro...,This alternative indie rock anthem is aggressi...,2/7/2751652.clip.mp3
1,TRPYJDT128F931E7BB,Dope Nose,Weezer,The Lion And The Witch,2002,"[hanging out, giddy, alternative pop rock, roc...",This alternative indie rock song combines gidd...,The song is a playful and fun alternative indi...,Get ready to experience a roller-coaster of em...,This alternative indie rock song has a crunchy...,5/5/5529351.clip.mp3
2,TRJIJTW128F4289A7E,500 Miles Away From Home,Bobby Bare,Super Hits,1988,"[laid back mellow, reflective, dramatic, organ...",This song's Nashville Sound Countrypolitan ble...,"A melancholic, reflective, and bittersweet cou...",This Nashville sound countrypolitan ballad is ...,"This country song has a weary, poignant feel t...",3/3/3320516.clip.mp3
3,TRUZCQS12903CDCFD1,San Antonio Foam Party,Half Man Half Biscuit,Cammell Laird Social Club,2002,"[alternative indie rock, pop rock, indie]",Get lost in the captivating sound of alternati...,An upbeat and energetic indie-pop rock song wi...,This song is a captivating blend of alternativ...,"This indie rock/pop rock song is a unique, exp...",8/5/8539522.clip.mp3
4,TRDYVGM128F146186E,They Rage On,DAN SEALS,Certified Hits,1988,"[reflective, delicate, soft rock, day driving,...",Take a nostalgic journey down memory lane with...,A sentimental and reflective country pop song ...,This song is a sentimental journey through a S...,This pop rock song takes you on a Sunday after...,2/7/279739.clip.mp3
...,...,...,...,...,...,...,...,...,...,...,...
444860,TRSRTSZ128F426826F,Let's Make A Night To Remember,Studio 99,A Tribute To Bryan Adams,0,[pop rock],Get ready to experience the perfect fusion of ...,The song has a catchy beat and upbeat rock ins...,This dynamic pop rock anthem is the perfect fu...,This upbeat pop rock anthem will have you danc...,2/2/2251408.clip.mp3
444861,TRMTMCJ128F92F8A19,Civilization Machine,The Plastic Cloud,The Plastic Cloud,1968,"[psychedelic pop, folk rock, pop rock, psyched...",This mind-bending tune combines the raw energy...,A genre-bending song that blends psychedelic g...,Embodying the vintage sounds of psychedelic ga...,This song is a trippy blend of psychedelic gar...,6/8/6814085.clip.mp3
444862,TRJIIXA128F1490FD8,How Do I Live,James Last,Country Roads,1998,"[big band, laid back mellow, easy listening, j...",This easy listening instrumental pop track is ...,This instrumental pop-jazz song is a laid-back...,This instrumental pop track boasts a laid back...,This instrumental pop song has a laid back mel...,5/3/530277.clip.mp3
444863,TRQPESG128F9300304,Inter-Lergen-Ten-Ko,The Sabres Of Paradise,Sabresonic,1993,"[club dance, electronica, techno, electronic, ...",This electronic techno track incorporates elem...,This electronic song is a combination of techn...,Get lost in the hypnotic rhythms of this elect...,This experimental electronic track combines el...,5/8/5889307.clip.mp3


## MTT

In [11]:
mtt_df = load_dataset("seungheondoh/LP-MusicCaps-MTT")["train"].to_pandas()
mtt_df

README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-28dbf9154d6d52(…):   0%|          | 0.00/5.81M [00:00<?, ?B/s]

data/test-00000-of-00001-94781ef88fa7ed8(…):   0%|          | 0.00/1.65M [00:00<?, ?B/s]

data/valid-00000-of-00001-bf9893b31ca2d5(…):   0%|          | 0.00/558k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/18706 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5329 [00:00<?, ? examples/s]

Generating valid split:   0%|          | 0/1825 [00:00<?, ? examples/s]

Unnamed: 0,track_id,title,artist_name,release,tag_top50,tag_top188,caption_writing,caption_summary,caption_paraphrase,caption_attribute_prediction,pseudo_attribute,path
0,25,Gleichwie der Regen und Schnee vom Himmel fall...,American Bach Soloists,J.S. Bach - Cantatas Volume V,[],[],,,,,[],0/american_bach_soloists-j_s__bach__cantatas_v...
1,29,Gleichwie der Regen und Schnee vom Himmel fall...,American Bach Soloists,J.S. Bach - Cantatas Volume V,"[classical, violin]","[classical, violin]",Immerse yourself in the ethereal world of clas...,A hauntingly beautiful composition showcasing ...,Immerse yourself in the enchanting realm of cl...,This classical piece featuring the enchanting ...,"[orchestra, piano, cello, opera]",0/american_bach_soloists-j_s__bach__cantatas_v...
2,39,Musicalische Exequien SWV 279 Teil I_ Concert ...,American Bach Soloists,Heinrich Schutz - Musicalische Exequien,[slow],[slow],This heartfelt ballad takes the listener on a ...,A melancholic ballad that mesmerizes with its ...,This tender and leisurely track sets a tranqui...,A slow and relaxing song that helps you unwind...,[relaxing],9/american_bach_soloists-heinrich_schutz__musi...
3,40,Musicalische Exequien SWV 279 Teil I_ Concert ...,American Bach Soloists,Heinrich Schutz - Musicalische Exequien,"[opera, male]","[opera, male]",Experience the grandeur and power of a timeles...,A powerful male opera performance that showcas...,"In this captivating opera piece, a male vocali...",This opera is a powerful and dramatic performa...,"[dramatic, aria, tenor, soprano]",9/american_bach_soloists-heinrich_schutz__musi...
4,44,Musicalische Exequien SWV 279 Teil I_ Concert ...,American Bach Soloists,Heinrich Schutz - Musicalische Exequien,[opera],[opera],This song is a powerful and grandiose opera ma...,A powerful operatic ballad that showcases the ...,This song showcases the grandiose and dramatic...,A soaring soprano voice takes center stage in ...,[soprano],9/american_bach_soloists-heinrich_schutz__musi...
...,...,...,...,...,...,...,...,...,...,...,...,...
18701,58899,La Bressanina,Jacob Heringman,Blame Not My Lute,[guitar],"[guitar, classical guitar]",A beautifully crafted song featuring the gentl...,The song features delicate classical guitar me...,The melodic strums of a classical guitar weave...,This song is filled with the entrancing strums...,"[acoustic guitar, electric guitar]",8/jacob_heringman-blame_not_my_lute-56-la_bres...
18702,58906,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,"[guitar, solo, strings, harp, slow, soft]","[guitar, solo, strings, medieval, harp, slow, ...",This hauntingly beautiful ballad features a sl...,"A slow, soft medieval melody featuring a guita...",This ethereal piece features a gentle guitar s...,This slow and soft medieval ballad features a ...,"[acoustic, melancholic, folk, ballad]",8/jacob_heringman-blame_not_my_lute-57-lost_is...
18703,58907,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,"[classical, guitar, strings, slow]","[classical, guitar, strings, slow]",This beautiful classical piece features a gent...,A slow and melancholic piece featuring classic...,This exquisite composition gracefully combines...,This soul-stirring piece of music features slo...,"[melancholic, piano, violin]",8/jacob_heringman-blame_not_my_lute-57-lost_is...
18704,58908,Lost is my Lyberty,Jacob Heringman,Blame Not My Lute,"[classical, guitar, quiet, solo, classic, slow...","[classical, guitar, quiet, solo, classic, mell...",This classical guitar solo is a classic mellow...,A beautiful and melodic solo classical guitar ...,Embrace the timeless beauty of this classic an...,This beautiful classical guitar song is a quie...,"[melancholic, introspective, acoustic, serene]",8/jacob_heringman-blame_not_my_lute-57-lost_is...
