In [0]:
pip install librosa

Python interpreter will be restarted.
Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl (260 kB)
Collecting numpy>=1.22.3
  Downloading numpy-2.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.5 MB)
Collecting numba>=0.51.0
  Downloading numba-0.60.0-cp39-cp39-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.7 MB)
Collecting pooch>=1.1
  Downloading pooch-1.8.2-py3-none-any.whl (64 kB)
Collecting msgpack>=1.0
  Downloading msgpack-1.1.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (377 kB)
Collecting audioread>=2.1.9
  Downloading audioread-3.0.1-py3-none-any.whl (23 kB)
Collecting soundfile>=0.12.1
  Downloading soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl (1.3 MB)
Collecting soxr>=0.3.2
  Downloading soxr-0.5.0.post1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (253 kB)
Collecting lazy_loader>=0.1
  Downloading lazy_loader-0.4-py3-none-any.whl (12 kB)
Collecting scikit-learn>=1.1.0
  Downloading scikit_learn-1.6.1-cp3

In [0]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import StructType, StructField, ArrayType, DoubleType, FloatType, StringType

import os
import librosa
import numpy as np
import pandas as pd

spark = SparkSession.builder.appName("FingerprintExtraction").getOrCreate()

In [0]:
from pyspark.sql.functions import input_file_name
import os

# Read list of MP3 files in Bronze layer
mp3_path = "dbfs:/FileStore/bronze/mp3/"

# Create DataFrame from file paths
df_mp3_files = spark.read.format("binaryFile").load(mp3_path)
df_mp3_files = df_mp3_files.withColumn("file_name", input_file_name())
df_mp3_files.select("file_name").show(truncate=False)

+----------------------------------------------------------------------------------------------+
|file_name                                                                                     |
+----------------------------------------------------------------------------------------------+
|dbfs:/FileStore/bronze/mp3/Kendrick_Lamar___Not_Like_Us.mp3                                   |
|dbfs:/FileStore/bronze/mp3/Chappell_Roan___Pink_Pony_Club__Official_Music_Video_.mp3          |
|dbfs:/FileStore/bronze/mp3/Lady_Gaga__Bruno_Mars___Die_With_A_Smile__Official_Music_Video_.mp3|
|dbfs:/FileStore/bronze/mp3/PARTYNEXTDOOR___DRAKE___NOKIA__Official_Lyric_Video_.mp3           |
|dbfs:/FileStore/bronze/mp3/Billie_Eilish___BIRDS_OF_A_FEATHER__Official_Music_Video_.mp3      |
|dbfs:/FileStore/bronze/mp3/Kendrick_Lamar___tv_off__Official_Audio_.mp3                       |
|dbfs:/FileStore/bronze/mp3/Teddy_Swims___Lose_Control__The_Village_Sessions_.mp3              |
|dbfs:/FileStore/bronze/mp3/Ke

Define Fingerprint Extraction Function

In [0]:
import librosa
import numpy as np

def extract_fingerprint_from_path(local_path, top_percentile=95):
    try:
        y, sr = librosa.load(local_path, sr=44100)
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        threshold = np.percentile(D, top_percentile)
        peaks = np.argwhere(D > threshold)  # (freq_bin, time_bin)

        # Convert to flattened string format
        flattened = [f"{int(f)},{int(t)}" for f, t in peaks]
        return flattened
    except Exception as e:
        return [f"error: {str(e)}"]

Register Spark UDF

In [0]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

extract_fingerprint_udf = udf(extract_fingerprint_from_path, ArrayType(StringType()))

Apply Fingerprint Extraction

In [0]:
from pyspark.sql.functions import expr

# Convert Spark DBFS path to real file path for librosa
df_with_local_path = df_mp3_files.withColumn("local_path", expr("translate(file_name, 'dbfs:/', '/dbfs/')"))

# Apply fingerprint UDF
df_fingerprinted = df_with_local_path.withColumn("fingerprints", extract_fingerprint_udf("local_path"))

df_fingerprinted.select("file_name", "fingerprints").show(truncate=False)

+----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
|file_name                                                                                     |fingerprints                                                                                                                                  |
+----------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------+
|dbfs:/FileStore/bronze/mp3/Kendrick_Lamar___Not_Like_Us.mp3                                   |[error: [Errno 2] No such file or directory: '/dbfs/FileStore/dronze/mp3/Ken/rick_Lamar___Not_Like_Uf.mp3']                                   |
|dbfs:/FileStore/bronze/mp3/Chappell_Roa

Explode Fingerprints & Column Cleaning

In [0]:
from pyspark.sql.functions import explode, split

df_flat = df_fingerprinted.withColumn("fingerprint", explode("fingerprints"))
df_flat = df_flat.withColumn("freq_bin", split("fingerprint", ",")[0].cast("int"))
df_flat = df_flat.withColumn("time_bin", split("fingerprint", ",")[1].cast("int"))

df_flat = df_flat.select("file_name", "freq_bin", "time_bin")
df_flat.show(truncate=False)

+----------------------------------------------------------------------------------------------+--------+--------+
|file_name                                                                                     |freq_bin|time_bin|
+----------------------------------------------------------------------------------------------+--------+--------+
|dbfs:/FileStore/bronze/mp3/Kendrick_Lamar___Not_Like_Us.mp3                                   |null    |null    |
|dbfs:/FileStore/bronze/mp3/Chappell_Roan___Pink_Pony_Club__Official_Music_Video_.mp3          |null    |null    |
|dbfs:/FileStore/bronze/mp3/Lady_Gaga__Bruno_Mars___Die_With_A_Smile__Official_Music_Video_.mp3|null    |null    |
|dbfs:/FileStore/bronze/mp3/PARTYNEXTDOOR___DRAKE___NOKIA__Official_Lyric_Video_.mp3           |null    |null    |
|dbfs:/FileStore/bronze/mp3/Billie_Eilish___BIRDS_OF_A_FEATHER__Official_Music_Video_.mp3      |null    |null    |
|dbfs:/FileStore/bronze/mp3/Kendrick_Lamar___tv_off__Official_Audio_.mp3        

Write Flattened Fingerprints

In [0]:
df_flat.write.mode("overwrite").format("delta").save("dbfs:/FileStore/silver/flattened_fingerprints/")

In [0]:
df_flat.write.mode("overwrite").format("delta").save("dbfs:/FileStore/silver/flattened_fingerprints/")
df_flat.write.mode("overwrite").option("header", True).csv("dbfs:/FileStore/silver/flattened_fingerprints_csv/")

In [0]:
spark.stop()