In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import sha2, concat_ws

# Obtain the Spark session. Databricks already has one active, so
# getOrCreate() hands that one back; creating it explicitly is kept as good practice.
spark = (
    SparkSession.builder
    .appName("HashFingerprints")
    .getOrCreate()
)

In [0]:
# Location of the flattened fingerprint CSV files
csv_path = "dbfs:/FileStore/silver/flattened_fingerprints_csv/"

# The first row of each file is a header; column types are inferred from the data
csv_reader = spark.read.option("header", True).option("inferSchema", True)
df_fingerprints = csv_reader.csv(csv_path)

# Preview five rows without truncating long cell values
df_fingerprints.limit(5).show(truncate=False)

+-------------------------------------------------------------+-----+-----+----------+-----------+
|filename                                                     |freq1|freq2|delta_time|anchor_time|
+-------------------------------------------------------------+-----+-----+----------+-----------+
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|3    |3    |21        |17784      |
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|3    |3    |20        |18210      |
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|4    |4    |20        |15066      |
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|4    |4    |37        |15903      |
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|4    |4    |39        |17654      |
+-------------------------------------------------------------+-----+-----+----------+-----------+



In [0]:
# Columns that together make up one fingerprint. Their order is significant:
# it determines the string that is hashed, so it must stay stable.
fingerprint_cols = ["freq1", "freq2", "delta_time"]

# concat_ws silently skips NULL inputs, so a row with a missing component
# (e.g. freq1=NULL, freq2=3, delta_time=21 -> "3-21") would be hashed from a
# string that no longer encodes all three fields and would collide with other
# incomplete rows. Discard incomplete rows before hashing.
df_complete = df_fingerprints.na.drop(subset=fingerprint_cols)

# Join the components with "-" and apply SHA-256 to the resulting string
df_hashed = (
    df_complete
    .withColumn("fingerprint_string", concat_ws("-", *fingerprint_cols))
    .withColumn("fingerprint_hash", sha2("fingerprint_string", 256))
)

# Keep the filename, the fingerprint components and the hash;
# anchor_time and the helper string column are left out.
df_final = df_hashed.select("filename", *fingerprint_cols, "fingerprint_hash")

# Show result
df_final.limit(5).show(truncate=False)

+-------------------------------------------------------------+-----+-----+----------+----------------------------------------------------------------+
|filename                                                     |freq1|freq2|delta_time|fingerprint_hash                                                |
+-------------------------------------------------------------+-----+-----+----------+----------------------------------------------------------------+
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|3    |3    |21        |ad4de2b3fc8d1f2eff326adfad54f125974baeb8cb8371675c25657bb9d11df0|
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|3    |3    |20        |a7d3bfa0105bd7417c6fd6fdf3112fcabff51053116f52c20be19d8199974970|
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|4    |4    |20        |4feff89528864f9953e2ea0ece0425e7c88164d1d707f69256c5eb8aedad5c8a|
|Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3|4    |4    |37        |77

Write to Gold Layer

In [0]:
# Persist the hashed fingerprints as Parquet; "overwrite" replaces whatever
# already exists at the target path.
gold_path = "dbfs:/FileStore/gold/fingerprint_hashes_parq/"
(
    df_final.write
    .mode("overwrite")
    .parquet(gold_path)
)