# ZASHAM APP  
**Zasham** is a music recognition system inspired by the popular app Shazam, which can identify a song just by "listening" to a short audio clip. The goal of our project is to replicate the core idea behind Shazam and build a simplified version of this system using Python, Librosa, and Apache Spark on Databricks.

### Install Libraries

In [0]:
!pip install librosa
!pip install yt-dlp

Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl (260 kB)
[?25l[K     |█▎                              | 10 kB 28.3 MB/s eta 0:00:01[K     |██▌                             | 20 kB 4.2 MB/s eta 0:00:01[K     |███▊                            | 30 kB 6.1 MB/s eta 0:00:01[K     |█████                           | 40 kB 5.9 MB/s eta 0:00:01[K     |██████▎                         | 51 kB 6.0 MB/s eta 0:00:01[K     |███████▌                        | 61 kB 7.1 MB/s eta 0:00:01[K     |████████▉                       | 71 kB 6.7 MB/s eta 0:00:01[K     |██████████                      | 81 kB 7.3 MB/s eta 0:00:01[K     |███████████▎                    | 92 kB 7.6 MB/s eta 0:00:01[K     |████████████▋                   | 102 kB 7.8 MB/s eta 0:00:01[K     |█████████████▉                  | 112 kB 7.8 MB/s eta 0:00:01[K     |███████████████                 | 122 kB 7.8 MB/s eta 0:00:01[K     |████████████████▍               | 133 kB 7.8 MB/s eta 0:00:01[

### Create Spark Session

In [0]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("ClipFingerprinting").getOrCreate()

### UDF Functions

In [0]:
import os
import librosa
import numpy as np
import soundfile as sf
import hashlib
import pandas as pd
import yt_dlp

# Paths
download_path = "/dbfs/FileStore/staging/clips/"
fingerprint_csv_path = "/dbfs/FileStore/staging/fingerprint_clips/"

# Ensure folders exist
os.makedirs(download_path, exist_ok=True)
os.makedirs(fingerprint_csv_path, exist_ok=True)



Download Song Function

In [0]:
def download_mp3_clip(song_name, download_folder=download_path):
    try:
        ydl_opts = {
            'format': 'bestaudio/best',
            'default_search': 'ytsearch1',
            'noplaylist': True,
            'outtmpl': f'{download_folder}/%(title)s.%(ext)s',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192',
            }],
            'quiet': False
        }

        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(song_name, download=True)
            filename = ydl.prepare_filename(info).replace(".webm", ".mp3")
            print(f"Downloaded: {filename}")
            return filename
    except Exception as e:
        print(f"Download error: {e}")
        return None

Extract Clip Function

In [0]:
import soundfile as sf

def extract_clip(input_mp3_path, output_clip_path, start_time=10, duration=10):
    try:
        y, sr = librosa.load(input_mp3_path, sr=44100, offset=start_time, duration=duration)
        sf.write(output_clip_path, y, sr)
        print(f"Clip saved: {output_clip_path}")
        return output_clip_path
    except Exception as e:
        print(f"Clip extraction error: {e}")
        return None

Generate Flattened Fingerprint Function

In [0]:
def generate_flattened_fingerprint(clip_path):
    try:
        y, sr = librosa.load(clip_path, sr=44100)
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)
        peaks = np.argwhere(D > np.percentile(D, 95))  # freq_bin, time_bin

        flattened = [{"filename": os.path.basename(clip_path), "freq_bin": int(f), "time_bin": int(t)} for f, t in peaks]
        return flattened
    except Exception as e:
        print(f"Fingerprint error: {e}")
        return []

Hashing Function

In [0]:
def hash_fingerprints(flattened_fingerprint):
    hashed = []
    for row in flattened_fingerprint:
        hash_input = f"{row['freq_bin']}_{row['time_bin']}"
        row["fingerprint_hash"] = hashlib.sha256(hash_input.encode()).hexdigest()
        hashed.append(row)
    return hashed

Generate Hashed Fingerprint for Clip

In [0]:
# Song to test
test_song = "Luther Kendrick Lamar & SZA"

# 1. Download MP3
downloaded_mp3 = download_mp3_clip(test_song)

# 2. Extract 10–20s clip
if downloaded_mp3:
    clip_path = os.path.join(download_path, f"clip_{os.path.basename(downloaded_mp3).replace('.mp3', '.wav')}")
    clip_file = extract_clip(downloaded_mp3, clip_path)

    # 3. Fingerprint + hash
    if clip_file:
        flattened = generate_flattened_fingerprint(clip_file)
        hashed = hash_fingerprints(flattened)

        # 4. Save to CSV
        if hashed:
            df = pd.DataFrame(hashed)
            output_csv = os.path.join(fingerprint_csv_path, f"{os.path.basename(clip_file).replace('.wav', '_hashed.csv')}")
            df.to_csv(output_csv, index=False)
            print(f"Hashed fingerprint CSV saved: {output_csv}")


### Clip Audio Processing

In [0]:
clip_csv_path = "dbfs:/FileStore/staging/fingerprint_clips/"

df_fingerprints = spark.read.option("header", True).option("inferSchema", True).csv(clip_csv_path)

df_fingerprints.limit(5).show(truncate=False)

+--------------------------------------------------------------+-----+-----+----------+-----------+
|filename                                                      |freq1|freq2|delta_time|anchor_time|
+--------------------------------------------------------------+-----+-----+----------+-----------+
|clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3|9    |14   |28        |56         |
|clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3|20   |20   |25        |683        |
|clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3|20   |21   |45        |708        |
|clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3|20   |21   |14        |783        |
|clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3|21   |21   |44        |753        |
+--------------------------------------------------------------+-----+-----+----------+-----------+



In [0]:
from pyspark.sql.functions import sha2, concat_ws

# Concatenate fingerprint columns into a string
df_hashed = df_fingerprints.withColumn("fingerprint_string", concat_ws("-", "freq1", "freq2", "delta_time"))

# Apply SHA256 hashing
df_hashed = df_hashed.withColumn("fingerprint_hash", sha2("fingerprint_string", 256))

# Drop unnecessary columns if needed
df_final = df_hashed.select("filename", "freq1", "freq2", "delta_time", "fingerprint_hash")

# Show result
df_final.limit(5).toPandas()

Unnamed: 0,filename,freq1,freq2,delta_time,fingerprint_hash
0,clip_Chappell Roan - Pink Pony Club (Official ...,9,14,28,f7246d3b826462c93db1860fd706f75f066a21316fce56...
1,clip_Chappell Roan - Pink Pony Club (Official ...,20,20,25,00cdcdb6744f4f6e3db399368a10aceb51c3fba7b595e1...
2,clip_Chappell Roan - Pink Pony Club (Official ...,20,21,45,c0ebfa75580f31462de7e26b394189e2b5fc82eafce0dd...
3,clip_Chappell Roan - Pink Pony Club (Official ...,20,21,14,3e3186a31146bf6eed2e85c4fb00108e84bbed13682226...
4,clip_Chappell Roan - Pink Pony Club (Official ...,21,21,44,af4f991bff4d9467898191363ee456a5781003ad990edb...


In [0]:
df_final.write.mode("overwrite").parquet("dbfs:/FileStore/staging/fingerprint_hashes_flattened/")

### Shazam Song Matching

In [0]:
clips = spark.read.parquet("dbfs:/FileStore/staging/fingerprint_hashes_flattened/")

# column_mapping = {
#     "filename": "song_name",
#     "fingerprint_hash": "audio_fingerprint",
#     "freq1": "anchor_freq",
#     "freq2": "target_freq",
#     "delta_time": "time_diff",
#     "anchor_time": "time_position"
# }

# for old_name, new_name in column_mapping.items():
#     clips = df_spark.withColumnRenamed(old_name, new_name)

# df_spark.show()


clips.limit(5).toPandas()

Unnamed: 0,filename,freq1,freq2,delta_time,fingerprint_hash
0,clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3,9,14,28,f7246d3b826462c93db1860fd706f75f066a21316fce56404556cec7323f937c
1,clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3,20,20,25,00cdcdb6744f4f6e3db399368a10aceb51c3fba7b595e100c1199a29ba46fed0
2,clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3,20,21,45,c0ebfa75580f31462de7e26b394189e2b5fc82eafce0dd957c0b0f17648dad87
3,clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3,20,21,14,3e3186a31146bf6eed2e85c4fb00108e84bbed136822261bf4853eb7c3961cf4
4,clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3,21,21,44,af4f991bff4d9467898191363ee456a5781003ad990edba99d516082f1d37b6b


In [0]:
pd.set_option('display.max_colwidth', None)
clips.groupBy("filename").count().orderBy("count", ascending=False).toPandas()

Unnamed: 0,filename,count
0,clip_ROSÉ & Bruno Mars - APT. (Official Music Video).mp3,266
1,clip_Kendrick Lamar - tv off (Official Audio).mp3,183
2,"clip_Lady Gaga, Bruno Mars - Die With A Smile (Official Music Video).mp3",151
3,clip_PARTYNEXTDOOR & DRAKE - NOKIA (Official Lyric Video).mp3,149
4,clip_Kendrick Lamar - Not Like Us.mp3,128
5,clip_Teddy Swims - Lose Control (The Village Sessions).mp3,118
6,clip_Kendrick Lamar - luther (Official Audio).mp3,105
7,clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3,76
8,clip_Shaboozey - A Bar Song (Tipsy) [Official Visualizer].mp3,73


In [0]:
songs = spark.read.parquet("dbfs:/FileStore/gold/fingerprint_hashes_parq")

# column_mapping = {
#     "filename": "song_name",
#     "fingerprint_hash": "audio_fingerprint",
#     "freq1": "anchor_freq",
#     "freq2": "target_freq",
#     "delta_time": "time_diff",
#     "anchor_time": "time_position"
# }

# for old_name, new_name in column_mapping.items():
#     clips = df_spark.withColumnRenamed(old_name, new_name)

# df_spark.show()


songs.limit(5).toPandas()

Unnamed: 0,filename,freq1,freq2,delta_time,fingerprint_hash
0,Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3,3,3,21,ad4de2b3fc8d1f2eff326adfad54f125974baeb8cb8371675c25657bb9d11df0
1,Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3,3,3,20,a7d3bfa0105bd7417c6fd6fdf3112fcabff51053116f52c20be19d8199974970
2,Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3,4,4,20,4feff89528864f9953e2ea0ece0425e7c88164d1d707f69256c5eb8aedad5c8a
3,Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3,4,4,37,77d2fa02ee47fa9a14f7d03cf87bcc328acddd7138a87609ebd763a56b02b36d
4,Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3,4,4,39,89174d5be323c9c0a36bc2146a24741506a4714e404fd6dff611bc444a3b7268


Match Clip Hashed Fingerprints with Gold Hashes

In [0]:
clip = clips.filter(clips["filename"] == "clip_Kendrick Lamar - tv off (Official Audio).mp3")
clip.limit(5).toPandas()

Unnamed: 0,filename,freq1,freq2,delta_time,fingerprint_hash
0,clip_Kendrick Lamar - tv off (Official Audio).mp3,3,3,22,d55369e8a9f4ac2c15597de5b8a1263732ccd6ad985cfb29dfecabb372ad025b
1,clip_Kendrick Lamar - tv off (Official Audio).mp3,3,3,20,a7d3bfa0105bd7417c6fd6fdf3112fcabff51053116f52c20be19d8199974970
2,clip_Kendrick Lamar - tv off (Official Audio).mp3,3,3,23,9e0b3749e6ef438e9cff4632dd87beca505c8ff4db41ca4c37c85b895c3e653b
3,clip_Kendrick Lamar - tv off (Official Audio).mp3,3,3,13,eaeab7f54d086ac137e10d183adf80915ce4265e1adc72ef120c506314885a12
4,clip_Kendrick Lamar - tv off (Official Audio).mp3,3,3,39,b5130b548bb6f4d0ad2872311a052a48a30cab6b45e96d5005960315c4854087


In [0]:
joined = clip.alias("clip") \
    .join(songs.alias("gold"), col("clip.fingerprint_hash") == col("gold.fingerprint_hash")) \
    .select("clip.fingerprint_hash", "gold.filename")

In [0]:
from pyspark.sql.functions import col, count

match_counts = joined.groupBy("filename").agg(count("fingerprint_hash").alias("match_count"))
match_counts.toPandas()

Unnamed: 0,filename,match_count
0,Kendrick Lamar - tv off (Official Audio).mp3,149
1,"Lady Gaga, Bruno Mars - Die With A Smile (Official Music Video).mp3",12
2,Teddy Swims - Lose Control (The Village Sessions).mp3,5
3,Kendrick Lamar - Not Like Us.mp3,197
4,PARTYNEXTDOOR & DRAKE - NOKIA (Official Lyric Video).mp3,5
5,ROSÉ & Bruno Mars - APT. (Official Music Video).mp3,19
6,Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3,2
7,Kendrick Lamar - luther (Official Audio).mp3,9
8,Shaboozey - A Bar Song (Tipsy) [Official Visualizer].mp3,9
9,Chappell Roan - Pink Pony Club (Official Music Video).mp3,21


Count Matches per Song (Confidence)

In [0]:
from pyspark.sql.functions import col, count, desc, expr

clip_total_hashes = clip.count()
match_counts = match_counts.withColumn("confidence", expr(f"match_count / {clip_total_hashes}"))
match_counts.toPandas()

Unnamed: 0,filename,match_count,confidence
0,Kendrick Lamar - tv off (Official Audio).mp3,149,0.814208
1,"Lady Gaga, Bruno Mars - Die With A Smile (Official Music Video).mp3",12,0.065574
2,Teddy Swims - Lose Control (The Village Sessions).mp3,5,0.027322
3,Kendrick Lamar - Not Like Us.mp3,197,1.076503
4,PARTYNEXTDOOR & DRAKE - NOKIA (Official Lyric Video).mp3,5,0.027322
5,ROSÉ & Bruno Mars - APT. (Official Music Video).mp3,19,0.103825
6,Billie Eilish - BIRDS OF A FEATHER (Official Music Video).mp3,2,0.010929
7,Kendrick Lamar - luther (Official Audio).mp3,9,0.04918
8,Shaboozey - A Bar Song (Tipsy) [Official Visualizer].mp3,9,0.04918
9,Chappell Roan - Pink Pony Club (Official Music Video).mp3,21,0.114754


In [0]:
threshold = 0.7  # 70% of clip hashes must match to be considered a confident match
result = match_counts.filter(col("confidence") >= threshold).orderBy(desc("confidence"))
result.toPandas()

Unnamed: 0,filename,match_count,confidence
0,Kendrick Lamar - Not Like Us.mp3,197,1.076503
1,Kendrick Lamar - tv off (Official Audio).mp3,149,0.814208


App Results

In [0]:
from pyspark.sql.functions import col

# Rename clip DataFrame columns
clips = clips.select(
    col("filename").alias("clip_filename"),
    col("freq1").alias("clip_freq1"),
    col("freq2").alias("clip_freq2"),
    col("delta_time").alias("clip_delta_time"),
    col("delta_time").alias("clip_anchor_time"),
    col("fingerprint_hash")
)

# Rename song DataFrame columns
songs = songs.select(
    col("filename").alias("song_filename"),
    col("freq1").alias("song_freq1"),
    col("freq2").alias("song_freq2"),
    col("delta_time").alias("song_delta_time"),
    col("delta_time").alias("song_anchor_time"),
    col("fingerprint_hash")
)

In [0]:
matches = clips.join(
    songs,
    on="fingerprint_hash",  # Match based on shared hash
    how="inner"
)

In [0]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
from pyspark.sql.functions import count

match_counts = matches.groupBy("clip_filename", "song_filename").agg(count("*").alias("num_matches"))

w = Window.partitionBy("clip_filename").orderBy(match_counts["num_matches"].desc())

best_matches = match_counts.withColumn("rank", row_number().over(w)).filter("rank = 1")

results = best_matches.select("clip_filename", "song_filename", "num_matches")
results.toPandas()

Unnamed: 0,clip_filename,song_filename,num_matches
0,clip_Chappell Roan - Pink Pony Club (Official Music Video).mp3,Chappell Roan - Pink Pony Club (Official Music Video).mp3,83
1,clip_Kendrick Lamar - Not Like Us.mp3,Kendrick Lamar - Not Like Us.mp3,1641
2,clip_Kendrick Lamar - luther (Official Audio).mp3,PARTYNEXTDOOR & DRAKE - NOKIA (Official Lyric Video).mp3,200
3,clip_Kendrick Lamar - tv off (Official Audio).mp3,Kendrick Lamar - Not Like Us.mp3,197
4,"clip_Lady Gaga, Bruno Mars - Die With A Smile (Official Music Video).mp3","Lady Gaga, Bruno Mars - Die With A Smile (Official Music Video).mp3",184
5,clip_PARTYNEXTDOOR & DRAKE - NOKIA (Official Lyric Video).mp3,PARTYNEXTDOOR & DRAKE - NOKIA (Official Lyric Video).mp3,1231
6,clip_ROSÉ & Bruno Mars - APT. (Official Music Video).mp3,ROSÉ & Bruno Mars - APT. (Official Music Video).mp3,200
7,clip_Shaboozey - A Bar Song (Tipsy) [Official Visualizer].mp3,"Lady Gaga, Bruno Mars - Die With A Smile (Official Music Video).mp3",46
8,clip_Teddy Swims - Lose Control (The Village Sessions).mp3,Teddy Swims - Lose Control (The Village Sessions).mp3,446


In [0]:
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import expr

results = results.withColumn("true_song_name", regexp_replace("clip_filename", "^clip_", ""))
results = results.withColumn("is_correct", expr("true_song_name = song_filename"))

accuracy = results.filter("is_correct = true").count() / results.count()
print(f"Matching accuracy: {accuracy * 100:.2f}%")

Matching accuracy: 66.67%


In [0]:
results.groupBy("is_correct").count().toPandas()

Unnamed: 0,is_correct,count
0,True,6
1,False,3


Save Results

In [0]:
df_score.write.mode("overwrite").parquet("dbfs:/FileStore/staging/match_results/")

In [0]:
spark.stop()