# BDA Project: Create Your Own Spotify Experience

### Feature Extraction and Transformation


#### Group members:

- Aaqib Ahmed Nazir (i22-1920),
- Arhum Khan (i22-1967),
- Ammar Khasif (i22-1968)

##### Section: DS-D


#### Libraries Used:


In [1]:
import os
import pymongo
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed, Memory
from sklearn.preprocessing import MinMaxScaler

### Loading the file paths

In [7]:
tracks = pd.read_csv('fma_metadata\\tracks.csv')
genres = pd.read_csv('fma_metadata\\genres.csv')
features = pd.read_csv('fma_metadata\\features.csv')
echonest = pd.read_csv('fma_metadata\\echonest.csv')
raw_albums = pd.read_csv('fma_metadata\\raw_albums.csv')
raw_artists = pd.read_csv('fma_metadata\\raw_artists.csv')
raw_genres = pd.read_csv('fma_metadata\\raw_genres.csv')
raw_tracks = pd.read_csv('fma_metadata\\raw_tracks.csv')
raw_echonest = pd.read_csv('fma_metadata\\raw_echonest.csv')

### Function to get artist name 
for testing purposes

In [8]:
def find_artist(artist_name):
    artist_name = artist_name.lower()
    raw_artists["artist_name"] = raw_artists["artist_name"].str.lower()

    # Checking if artist is in the dataset
    if artist_name.lower() in raw_artists["artist_name"].values:
        print("Artist found")
    else:
        print("Artist not found")


artist_name = "lucky dragons"
find_artist(artist_name)

Artist found


### Function to extract features from the audio files and Normalize them
loads an audio file using Librosa library, then extracts three features: MFCC, spectral centroid, and zero-crossing rate. It caches the results for faster access later. If an error occurs, it prints an error message and returns empty arrays.

In [9]:
def extract_features(file):
    try:
        y, sr = librosa.load(file, sr=None)
        mfcc = librosa.feature.mfcc(y=y, sr=sr)
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
        
        # Normalizing features
        scaler = MinMaxScaler()
        mfcc_normalized = scaler.fit_transform(mfcc.T).T.tolist()
        spectral_centroid_normalized = scaler.fit_transform(spectral_centroid.T).T.tolist()
        zero_crossing_rate_normalized = scaler.fit_transform(zero_crossing_rate.T).T.tolist()
        
        return {"file_name": os.path.basename(file), 
                "mfcc": mfcc_normalized, 
                "spectral_centroid": spectral_centroid_normalized, 
                "zero_crossing_rate": zero_crossing_rate_normalized}
    except Exception as e:
        print(f"Error loading file {file}: {e}")
        return None

### Parallel Audio Feature Extraction and Adding the Features to MongoDB 
utilizes parallel processing to extract audio features from each file concurrently, enhancing computational efficiency. It employs the os.walk function for directory traversal, Parallel from joblib for parallelism, and tqdm for a progress bar display.

In [10]:
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["audio_features"]
collection = db["features"]

AUDIO_DIR = r"fma_small1"

# Getting all audio files
audio_files = []
for root, dirs, files in os.walk(AUDIO_DIR):
    for file in files:
        if file.endswith(".mp3"):
            audio_files.append(os.path.join(root, file))
            
# Using joblib to extract features in parallel
features = Parallel(n_jobs=-1)(
    delayed(extract_features)(file) for file in tqdm(audio_files, total=len(audio_files))
)
            
features = [f for f in features if f is not None]

# Insert features into MongoDB
collection.insert_many(features)

100%|██████████| 386/386 [00:21<00:00, 17.92it/s]


InsertManyResult([ObjectId('6636756086a01b5829404226'), ObjectId('6636756086a01b5829404227'), ObjectId('6636756086a01b5829404228'), ObjectId('6636756086a01b5829404229'), ObjectId('6636756086a01b582940422a'), ObjectId('6636756086a01b582940422b'), ObjectId('6636756086a01b582940422c'), ObjectId('6636756086a01b582940422d'), ObjectId('6636756086a01b582940422e'), ObjectId('6636756086a01b582940422f'), ObjectId('6636756086a01b5829404230'), ObjectId('6636756086a01b5829404231'), ObjectId('6636756086a01b5829404232'), ObjectId('6636756086a01b5829404233'), ObjectId('6636756086a01b5829404234'), ObjectId('6636756086a01b5829404235'), ObjectId('6636756086a01b5829404236'), ObjectId('6636756086a01b5829404237'), ObjectId('6636756086a01b5829404238'), ObjectId('6636756086a01b5829404239'), ObjectId('6636756086a01b582940423a'), ObjectId('6636756086a01b582940423b'), ObjectId('6636756086a01b582940423c'), ObjectId('6636756086a01b582940423d'), ObjectId('6636756086a01b582940423e'), ObjectId('6636756086a01b58294042

### Printing the extracted features 
for testing purposes

In [11]:
# priting the first 5 records
for feature in collection.find().limit(5):
    print(feature)

{'_id': ObjectId('6636756086a01b5829404226'), 'file_name': '000002.mp3', 'mfcc': [[0.0, 0.25709760189056396, 0.49164944887161255, 0.5496727228164673, 0.5251551866531372, 0.5527492761611938, 0.6327673196792603, 0.720703125, 0.7585131525993347, 0.7581847906112671, 0.7601475715637207, 0.7712259888648987, 0.7785813808441162, 0.7519489526748657, 0.7171134948730469, 0.7065713405609131, 0.6955935955047607, 0.6808292865753174, 0.6817994117736816, 0.665859043598175, 0.6214193105697632, 0.5902711153030396, 0.5844055414199829, 0.5809338688850403, 0.5873035192489624, 0.5885547399520874, 0.6227949261665344, 0.6733428239822388, 0.6781337261199951, 0.6892073154449463, 0.7021180391311646, 0.689793586730957, 0.6639174818992615, 0.6033399105072021, 0.5271540284156799, 0.5011852979660034, 0.4932843744754791, 0.5013319253921509, 0.6703356504440308, 0.8470132350921631, 0.8791532516479492, 0.8666107058525085, 0.8565496206283569, 0.8445783257484436, 0.8433839082717896, 0.8366425633430481, 0.8314650058746338,