In [None]:
import os
from pathlib import Path

import essentia.standard as es
import librosa
import pandas as pd
import soundfile as sf
from mutagen.easyid3 import EasyID3
from mutagen.mp3 import MP3
from pyspark.sql import SparkSession

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


In [None]:
# Root folder of the MP3 collection; every *.mp3 below it is gathered recursively.
root = Path("/mnt/d/Songs Dataset")
files = list(root.rglob("*.mp3"))

In [27]:
def _first_tag(tags, key, default):
    """Return the first value stored under `key` in `tags`, or `default` if the tag is absent or empty."""
    values = tags.get(key)
    return values[0] if values else default


# Build one metadata record per MP3 found under `root`.
records = []
for p in root.rglob("*.mp3"):
    # Start from filename-based fallbacks so a file with unreadable tags still gets a row.
    artist, title, album, duration = "Unknown", p.stem, "Unknown", None
    try:
        audio = MP3(p, ID3=EasyID3)
        duration = audio.info.length  # full duration in seconds

        # Each tag falls back independently: a missing title must not discard
        # an artist/album/duration that were read successfully.
        artist = _first_tag(audio, "artist", "Unknown")
        title = _first_tag(audio, "title", p.stem)
        album = _first_tag(audio, "album", "Unknown")
    except Exception as e:  # unreadable/corrupt file: keep whatever fallbacks remain
        print("metadata fallback", p, e)

    records.append({
        "track_id": p.stem,  # file name, not the id from the meta data
        "path": str(p),
        "artist": artist,
        "title": title,
        "album": album,
        "duration": duration,
    })

In [28]:
# Turn the per-file metadata records into a table and persist it as tracks.csv
# (the row index is not written).
df = pd.DataFrame(records)
df.to_csv("tracks.csv", index = False)

In [40]:
# Display the current contents of `df` as the cell output.
df

Unnamed: 0,track_id,path,artist,title,album,duration
0,36530,/mnt/d/Songs Dataset/001/036530.mp3,The Tudor Consort,Stabat Mater,Stabat Mater - Domenico Scarlatti,160.287375
1,26947,/mnt/d/Songs Dataset/006/026947.mp3,Thiaz Itch,Froggy Swamp,Binjoum [PRT005],214.047347
2,10045,/mnt/d/Songs Dataset/000/010045.mp3,Skidmore College Orchestra,IX. La Cabane sur des pattes de poule Allegro ...,Mussorgsky's Pictures at an Exhibition,212.809792
3,86081,/mnt/d/Songs Dataset/009/086081.mp3,Barmus Dramfell,One Thousand And Two,Don't Make,106.840825
4,50735,/mnt/d/Songs Dataset/000/050735.mp3,Emerald Park,Obscured By Lies,For Tomorrow (2010 Edition),252.000000
...,...,...,...,...,...,...
2995,48368,/mnt/d/Songs Dataset/003/048368.mp3,Cloudkicker,A Hymn to the Projectile,A New Heavenly Body,254.746667
2996,98515,/mnt/d/Songs Dataset/003/098515.mp3,2Kutup,Grave Is Cold,Split,383.722200
2997,4874,/mnt/d/Songs Dataset/002/004874.mp3,The Conet Project,ready ready,The Conet Project,122.450000
2998,91296,/mnt/d/Songs Dataset/007/091296.mp3,BADLUCK,Love Cool (Cooled Out Remix),Love Cool,223.448277


In [41]:
# Replace each row's path with a WAV path derived from its track_id.
# NOTE(review): this is an absolute /mnt/d location, while the standardization
# cell writes to a relative "standard_wav" folder -- confirm both resolve to the same directory.
df["path"] = "/mnt/d/standard_wav/" + df["track_id"].astype(str) + ".wav"


In [42]:
# Write a reproducible (seeded) development subset of at most 3000 tracks.
# The size is clamped to len(df) because DataFrame.sample raises ValueError
# when asked for more rows than exist (sampling without replacement).
df.sample(min(3000, len(df)), random_state=42).to_csv("dev_subset.csv", index=False)

In [64]:
# standardize audio
# Goal: mono, 22,050 Hz, WAV (32-bit float), middle 30s of every track in tracks.csv
df = pd.read_csv("tracks.csv")

out_dir = Path("standard_wav")
out_dir.mkdir(exist_ok=True)

for r in df.itertuples(index=False):
    try:
        # y ~ array of samples, sr ~ sample rate
        y, sr = librosa.load(r.path, sr=22050, mono=True)
        mid = len(y) // 2  # mid point of the temporal array
        # Clamp the window start at 0: for clips shorter than 30 s, `mid - 15*sr`
        # is negative and would be interpreted as an index from the END of the
        # array, yielding a wrong (or truncated) crop instead of the whole clip.
        start = max(0, mid - 15 * sr)
        crop = y[start: mid + 15 * sr]  # middle 30 seconds (or the whole clip if shorter)
        sf.write(out_dir / f"{r.track_id}.wav", crop, sr, subtype="FLOAT")
    except Exception as e:
        # Best effort: report the file that could not be converted and continue.
        print("skip", r.path, e)
        
    

  y, sr = librosa.load(r.path, sr = 22050, mono = True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


skip D:\Songs Dataset\001\133166.mp3 
skip D:\Songs Dataset\002\111234.mp3 
skip D:\Songs Dataset\002\119436.mp3 
skip D:\Songs Dataset\003\006798.mp3 
skip D:\Songs Dataset\003\006799.mp3 
skip D:\Songs Dataset\003\098569.mp3 
skip D:\Songs Dataset\006\044147.mp3 
skip D:\Songs Dataset\006\098565.mp3 
skip D:\Songs Dataset\006\128794.mp3 
skip D:\Songs Dataset\007\018924.mp3 
skip D:\Songs Dataset\007\026743.mp3 
skip D:\Songs Dataset\009\012491.mp3 
skip D:\Songs Dataset\009\044151.mp3 
skip D:\Songs Dataset\010\012496.mp3 


In [2]:
# Reload the development subset written earlier to dev_subset.csv.
df = pd.read_csv("dev_subset.csv") 

In [None]:
def extract_features(row):
    """Compute a flat dict of audio features for one track.

    Imports are done inside the function so it can be shipped to Spark
    workers via ``rdd.map``.

    Parameters
    ----------
    row : dict
        Must contain ``"path"`` (audio file to load) and ``"track_id"``.

    Returns
    -------
    dict or None
        ``track_id``, ``bpm``, ``tempogram_<i>`` (20 histogram bins of beat
        intervals), ``chroma_mean_<i>`` / ``chroma_std_<i>``,
        ``mfcc_mean_<i>`` / ``mfcc_std_<i>``, ``spectral_centroid``,
        ``spectral_rolloff``, ``spectral_flux``, ``zero_crossing`` and the
        ``rms_*`` statistics. ``None`` is returned (after printing an
        ``[ERROR]`` line) if any step raises.
    """
    import numpy as np
    import essentia.standard as es

    sample_rate = 22050
    # 2048-sample frames give 1025-bin magnitude spectra, which is the
    # default input size expected by es.MFCC.
    frame_size, hop_size = 2048, 512

    try:
        y = es.MonoLoader(filename=row["path"], sampleRate=sample_rate)()

        # --- Rhythm ---
        # RhythmExtractor2013 returns (bpm, ticks, confidence, estimates, bpmIntervals).
        # Beat positions are the ticks at index 1; index 3 holds BPM estimates,
        # so differencing it does not give beat intervals.
        # NOTE(review): Essentia documents this algorithm as expecting 44.1 kHz
        # input while the audio is loaded at 22.05 kHz -- confirm the BPM values.
        rhythm = es.RhythmExtractor2013()(y)
        bpm = rhythm[0]
        beats = rhythm[1]
        # Fixed bin edges (0-2 s between beats) keep the 20 bins comparable
        # across tracks; data-dependent edges would differ per track.
        tempogram, _ = np.histogram(np.diff(beats), bins=20, range=(0.0, 2.0))

        # --- Chroma ---
        # One Chromagram instance reused for all frames (construction is not free).
        chromagram = es.Chromagram()
        chroma_frames = es.FrameGenerator(y, frameSize=32768, hopSize=8192, startFromZero=True)
        chromas = np.vstack([chromagram(f) for f in chroma_frames])
        chroma_mean = chromas.mean(axis=0)
        chroma_std = chromas.std(axis=0)

        # --- Frame-wise spectral descriptors (MFCC, centroid, roll-off, flux, RMS) ---
        # These algorithms operate on a magnitude spectrum, so every frame is
        # windowed and transformed first instead of being passed in raw.
        window = es.Windowing(type="hann")
        spectrum = es.Spectrum()
        mfcc = es.MFCC(sampleRate=sample_rate)
        centroid = es.Centroid(range=sample_rate / 2)  # centroid expressed in Hz
        rolloff = es.RollOff(sampleRate=sample_rate)
        flux = es.Flux()  # keeps the previous spectrum internally, so reuse one instance
        rms_algo = es.RMS()

        mfcc_frames, centroids, rolloffs, fluxes, rms = [], [], [], [], []
        for frame in es.FrameGenerator(y, frameSize=frame_size, hopSize=hop_size, startFromZero=True):
            spec = spectrum(window(frame))
            mfcc_frames.append(mfcc(spec)[1])  # [0] = mel band energies, [1] = coefficients
            centroids.append(centroid(spec))
            rolloffs.append(rolloff(spec))
            fluxes.append(flux(spec))
            rms.append(rms_algo(frame))  # per-frame energy so std/percentiles are meaningful

        mfcc_all = np.vstack(mfcc_frames)
        mfcc_mean = mfcc_all.mean(axis=0)
        mfcc_std = mfcc_all.std(axis=0)

        spectral_centroid = float(np.mean(centroids))
        spectral_rolloff = float(np.mean(rolloffs))
        spectral_flux = float(np.mean(fluxes))
        zc = es.ZeroCrossingRate()(y)  # time-domain descriptor: computed on the signal itself

        # --- RMS statistics over frames ---
        rms_mean, rms_std = np.mean(rms), np.std(rms)
        rms_25, rms_50, rms_75 = np.percentile(rms, [25, 50, 75])

        # --- Combine ---
        feature_row = {"track_id": row["track_id"], "bpm": bpm}
        feature_row.update({f"tempogram_{i}": v for i, v in enumerate(tempogram)})
        feature_row.update({f"chroma_mean_{i}": v for i, v in enumerate(chroma_mean)})
        feature_row.update({f"chroma_std_{i}": v for i, v in enumerate(chroma_std)})
        feature_row.update({f"mfcc_mean_{i}": v for i, v in enumerate(mfcc_mean)})
        feature_row.update({f"mfcc_std_{i}": v for i, v in enumerate(mfcc_std)})
        feature_row.update({
            "spectral_centroid": spectral_centroid, "spectral_rolloff": spectral_rolloff,
            "spectral_flux": spectral_flux, "zero_crossing": zc,
            "rms_mean": rms_mean, "rms_std": rms_std,
            "rms_25": rms_25, "rms_50": rms_50, "rms_75": rms_75
        })
        return feature_row

    except Exception as e:
        # Best effort: one bad track must not abort the whole Spark job.
        print(f"[ERROR] {row.get('track_id', '?')} -> {e}", flush=True)
        return None


In [4]:
# Start (or reuse) a local Spark session; "local[*]" runs on all available local cores.
spark = (
    SparkSession.builder
    .appName("MusicFeatures")
    .master("local[*]")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/11/10 03:49:37 WARN Utils: Your hostname, DELL-SM, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/11/10 03:49:37 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/10 03:49:38 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Reload the development subset and distribute it to Spark as an RDD holding
# one dict per CSV row.
df = pd.read_csv("dev_subset.csv") 
pdf = df  # second name for the same DataFrame object (no copy is made)
rdd = spark.sparkContext.parallelize(pdf.to_dict("records"))


In [6]:
# Apply extract_features to every row and drop the rows for which it returned None.
features_rdd = (
    rdd.map(extract_features)
       .filter(lambda feats: feats is not None)
)

In [7]:
# Collect the per-track feature dicts on the driver, build a DataFrame from
# them and write it to features2.parquet (row index not written).
# NOTE(review): a later cell reads "features.parquet", not "features2.parquet" --
# confirm which file is the intended input there.
features_df = pd.DataFrame(features_rdd.collect())
features_df.to_parquet("features2.parquet", index=False)

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default]
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[   INFO   ] MusicExtractorSVM: no classifier models were configured by default
[ERROR] 12496 -> Error while configurin

In [3]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# Load the feature table from features.parquet.
# NOTE(review): the extraction cell in this notebook writes "features2.parquet";
# confirm that "features.parquet" is the file intended here.
df = pd.read_parquet("features.parquet")

In [4]:
import numpy as np

# Feature matrix: every column except the identifier; missing values become 0.
X = df.drop(columns=["track_id"]).fillna(0)

# Standardize each feature before PCA.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# PCA cannot keep more components than min(n_samples, n_features), so the
# target of 50 is clamped to the data instead of raising on small inputs.
pca = PCA(n_components=min(50, *X_scaled.shape), random_state=42).fit(X_scaled)
embeddings = pca.transform(X_scaled)

# L2-normalize each row; rows with zero norm are left as zeros rather than
# being divided by zero (which would fill them with NaN).
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
embeddings = embeddings / np.where(norms == 0, 1.0, norms)

# Persist the embeddings and the fitted transformers.
np.save("embeddings.npy", embeddings)
joblib.dump(scaler, "scaler.pkl")
joblib.dump(pca, "pca.pkl")


['pca.pkl']

In [5]:
def query_similar(track_id, top_k=5):
    """Return the `top_k` tracks most similar to `track_id` by cosine similarity.

    Uses the module-level `df` (one row per track) and `embeddings` (one row
    per `df` row, in the same order).

    Parameters
    ----------
    track_id : value compared against ``df["track_id"]``
    top_k : int, number of neighbours to return (the query track is excluded).

    Returns
    -------
    DataFrame with columns ``track_id`` and ``similarity``, most similar first.

    Raises
    ------
    IndexError
        If `track_id` is not present in ``df["track_id"]``.
    """
    # Positional lookup: `embeddings` is indexed by row position, which only
    # coincides with df's index labels when df has a default RangeIndex.
    matches = np.flatnonzero((df["track_id"] == track_id).to_numpy())
    if matches.size == 0:
        raise IndexError(f"track_id {track_id!r} not found in df")
    pos = int(matches[0])

    query_vec = embeddings[pos].reshape(1, -1)
    sims = cosine_similarity(query_vec, embeddings)[0]

    # Exclude the query row explicitly; relying on it sorting first breaks
    # when another row has an identical embedding (tie at similarity 1.0).
    order = np.argsort(sims)[::-1]
    top_idx = order[order != pos][:top_k]

    results = df.iloc[top_idx][["track_id"]].copy()
    results["similarity"] = sims[top_idx]
    return results

In [9]:
# Example query: the 5 tracks most similar to track 112858.
query_similar(112858, 5)

Unnamed: 0,track_id,similarity
1572,143068,0.810835
1266,108039,0.799614
2579,140293,0.781048
2761,51835,0.749846
1068,33238,0.743753
