# Creating Fingerprints

In [None]:
pip install librosa

In [None]:
# Base HDFS URIs used by this notebook.
# NOTE(review): the lakehouse URI ends with "/" while the warehouse URI does
# not — check how sub-paths are joined onto each before normalizing either.
hdfs_lakehouse_base_path = "hdfs://localhost:9000/lakehouse/"
hdfs_warehouse_base_path = "hdfs://localhost:9000/warehouse"

In [1]:
import librosa
import librosa.display
import numpy as np
import os
import matplotlib.pyplot as plt

# Where the MP3 inputs are read from, and where the .npy fingerprints go
input_folder = "songs_mp3"
output_folder = "fingerprints"

# Make sure the destination exists; exist_ok keeps re-runs of the cell harmless
os.makedirs(output_folder, exist_ok=True)

def generate_fingerprint(file_path, percentile=95):
    """Extract a spectrogram-peak fingerprint from an audio file and save it.

    The audio is loaded at 44.1 kHz and turned into a dB-scaled magnitude
    spectrogram. Every spectrogram cell whose level exceeds the requested
    percentile of the whole spectrogram is recorded as an index pair into
    the spectrogram array. Two files are written:

    * ``<output_folder>/<name>.npy`` - the array of index pairs.
    * ``<name>.png`` next to the input file - a plot of the spectrogram.

    Args:
        file_path: Path to the audio file to fingerprint.
        percentile: Percentile of the spectrogram values used as the
            threshold. The default of 95 keeps roughly the top 5% of cells.

    Returns:
        None. Results are written to disk. Any failure is reported with a
        printed message instead of being raised, so a caller looping over
        many files continues with the next one.
    """
    try:
        file_name = os.path.basename(file_path)

        # splitext swaps only the final extension. str.replace(".mp3", ...)
        # would also rewrite ".mp3" inside a directory or earlier in the
        # name, and would leave other extensions (or ".MP3") untouched so
        # that the image path ended up pointing at the audio file itself.
        stem = os.path.splitext(file_name)[0]
        fingerprint_file = os.path.join(output_folder, stem + ".npy")
        image_file = os.path.splitext(file_path)[0] + ".png"

        # Load audio file
        y, sr = librosa.load(file_path, sr=44100)

        # Compute the spectrogram in dB relative to its loudest cell
        D = librosa.amplitude_to_db(np.abs(librosa.stft(y)), ref=np.max)

        # Shazam-like fingerprinting. Note that this keeps every cell above
        # the threshold, not only local maxima.
        peaks = np.argwhere(D > np.percentile(D, percentile))

        # Save the index array directly: going through a Python list first
        # is wasted work and loses the (0, 2) integer shape when no cell
        # passes the threshold.
        np.save(fingerprint_file, peaks)

        print(f"Fingerprint created for {file_name}")

        fig = plt.figure(figsize=(10, 6))
        try:
            librosa.display.specshow(D, sr=sr, x_axis="time", y_axis="log")
            plt.colorbar(format="%+2.0f dB")
            plt.title(f"Spectrogram - {file_name}")
            plt.savefig(image_file)
        finally:
            # Always release the figure, even when plotting or saving fails,
            # so a long batch does not accumulate open figures.
            plt.close(fig)

    except Exception as e:
        # Deliberately broad: this is the per-file boundary of a best-effort
        # batch, so the error is reported and the caller moves on.
        print(f"Error processing {file_path}: {e}")

# Fingerprint every entry of the input folder whose name ends in ".mp3"
for file in os.listdir(input_folder):
    if not file.endswith(".mp3"):
        # Not an MP3 by name: leave it alone
        continue
    file_path = os.path.join(input_folder, file)
    generate_fingerprint(file_path)

Fingerprint created for Michael Jackson - Billie Jean (Official Video).mp3
