# BDA Project: Create Your Own Spotify Experience

### Feature Extraction and Transformation


#### Group members:

- Aaqib Ahmed Nazir (i22-1920),
- Arhum Khan (i22-1967),
- Ammar Khasif (i22-1968)

##### Section: DS-D


#### Libraries Used:


In [1]:
import os
import pymongo
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed, Memory
from sklearn.preprocessing import MinMaxScaler

### Loading the metadata files

In [3]:
# Directory holding the FMA metadata CSV files. Paths are built with
# os.path.join so the loading step also works on non-Windows systems (a
# hard-coded "\\" separator only resolves on Windows).
# NOTE(review): some FMA CSVs (e.g. tracks.csv, features.csv) are believed to
# ship with multi-row headers -- confirm whether header=/index_col= are needed.
METADATA_DIR = 'fma_metadata'

tracks = pd.read_csv(os.path.join(METADATA_DIR, 'tracks.csv'))
genres = pd.read_csv(os.path.join(METADATA_DIR, 'genres.csv'))
# NOTE: the name `features` is rebound later in this file to the list of
# extracted audio features, which replaces this DataFrame.
features = pd.read_csv(os.path.join(METADATA_DIR, 'features.csv'))
echonest = pd.read_csv(os.path.join(METADATA_DIR, 'echonest.csv'))
raw_albums = pd.read_csv(os.path.join(METADATA_DIR, 'raw_albums.csv'))
raw_artists = pd.read_csv(os.path.join(METADATA_DIR, 'raw_artists.csv'))
raw_genres = pd.read_csv(os.path.join(METADATA_DIR, 'raw_genres.csv'))
raw_tracks = pd.read_csv(os.path.join(METADATA_DIR, 'raw_tracks.csv'))
raw_echonest = pd.read_csv(os.path.join(METADATA_DIR, 'raw_echonest.csv'))

### Function to check whether an artist exists in the dataset
(for testing purposes)

In [4]:
def find_artist(artist_name, artists=None):
    """Print whether *artist_name* is present in the artist table.

    The comparison is case-insensitive. The table itself is left untouched:
    a lower-cased temporary copy of the column is used for the lookup instead
    of overwriting ``artists["artist_name"]``.

    Args:
        artist_name: Name of the artist to look for.
        artists: DataFrame with an ``artist_name`` column. Defaults to the
            module-level ``raw_artists`` table.

    Returns:
        None. Prints "Artist found" or "Artist not found".
    """
    if artists is None:
        artists = raw_artists

    wanted = artist_name.lower()
    # Temporary lower-cased view; missing names stay NaN and never match.
    known_names = artists["artist_name"].str.lower()

    if known_names.eq(wanted).any():
        print("Artist found")
    else:
        print("Artist not found")


# Quick manual check: look up a sample artist name with find_artist.
artist_name = "lucky dragons"
find_artist(artist_name)

Artist found


### Function to extract features from the audio files
This function loads an audio file using the Librosa library, then extracts three features: MFCCs, the spectral centroid, and the zero-crossing rate. Results are cached for faster access later. If an error occurs, it prints an error message and returns empty arrays.

In [5]:
memory = Memory("cache", verbose=0)


@memory.cache
def _compute_features(file):
    """Load *file* and compute its MFCC, spectral centroid and zero-crossing rate.

    Any exception propagates to the caller. Because this function is the one
    wrapped by the on-disk cache, a failed load is never stored as a result.
    """
    # sr=None: load at the file's own sampling rate (per librosa.load docs).
    y, sr = librosa.load(file, sr=None)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    zero_crossing_rate = librosa.feature.zero_crossing_rate(y)
    return mfcc, spectral_centroid, zero_crossing_rate


def extract_features(file):
    """Return ``(mfcc, spectral_centroid, zero_crossing_rate)`` for an audio file.

    Successful results are cached on disk through ``memory``. On any error a
    message is printed and three empty arrays are returned. The error handling
    lives outside the cached function on purpose: otherwise the empty fallback
    would itself be cached and a transient failure would stick across runs.

    Args:
        file: Path of the audio file to analyse.

    Returns:
        Tuple of three NumPy arrays; all three are empty if loading or feature
        extraction failed.
    """
    try:
        return _compute_features(file)
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the batch.
        print(f"Error loading file {file}: {e}")
        return np.array([]), np.array([]), np.array([])


### Parallel Audio Feature Extraction
This cell uses parallel processing to extract audio features from multiple files concurrently, which speeds up extraction. It uses `os.walk` for directory traversal, `Parallel` from joblib for parallelism, and `tqdm` to display a progress bar.

In [6]:
AUDIO_DIR = r"fma_small1"

# Collect every .mp3 file below AUDIO_DIR (recursively). os.walk yields
# entries in an arbitrary, filesystem-dependent order, so the paths are sorted
# to keep the order of the extracted feature rows reproducible across runs.
audio_files = sorted(
    os.path.join(root, file)
    for root, _dirs, files in os.walk(AUDIO_DIR)
    for file in files
    if file.endswith(".mp3")
)

# Extract features in parallel on all available cores (n_jobs=-1); tqdm shows
# progress as files are handed to the workers.
# NOTE: this rebinds `features`, replacing the features.csv DataFrame that was
# loaded earlier under the same name.
features = Parallel(n_jobs=-1)(
    delayed(extract_features)(file) for file in tqdm(audio_files, total=len(audio_files))
)

100%|██████████| 386/386 [00:01<00:00, 223.41it/s]


### Printing the extracted features 
for testing purposes

In [7]:
# Regroup the per-file (mfcc, spectral centroid, zero-crossing rate) triples
# into three tuples, one per feature kind.
mfcc_features, spectral_centroid_features, zero_crossing_rate_features = zip(*features)

# Sanity check: show the container type of each feature group.
for feature_group in (
    mfcc_features,
    spectral_centroid_features,
    zero_crossing_rate_features,
):
    print(type(feature_group))

<class 'tuple'>
<class 'tuple'>
<class 'tuple'>


In [1]:
#print(spectral_centroid_features)

In [None]:
def _stack_frames(per_file_arrays):
    """Transpose every non-empty array and concatenate the results along axis 0."""
    return np.concatenate(
        [arr.T for arr in per_file_arrays if arr.size > 0], axis=0
    )


# Merge the per-file arrays of each feature into one NumPy array; the empty
# placeholders returned for files that failed to load are skipped.
mfcc_features_list = _stack_frames(mfcc_features)
spectral_centroid_features_list = _stack_frames(spectral_centroid_features)
zero_crossing_rate_features_list = _stack_frames(zero_crossing_rate_features)


# Min-max scaling with a single MinMaxScaler that is re-fitted for each
# feature in turn, so afterwards `normalizer` only holds the fit of the last
# one (zero-crossing rate). Despite the "robust_standardized_" prefix in the
# MFCC variable name, no RobustScaler is involved.
normalizer = MinMaxScaler()
robust_standardized_mfcc_features = normalizer.fit_transform(mfcc_features_list)
normalized_spectral_centroid = normalizer.fit_transform(spectral_centroid_features_list)
normalized_zero_crossing_rate = normalizer.fit_transform(zero_crossing_rate_features_list)

# Uncomment to inspect the first 5 rows of each scaled feature:
# print("Standardized MFCC Features: ",robust_standardized_mfcc_features[:5])
# print("________________________")
# print("Normalized Spectral Centrioid: ",normalized_spectral_centroid[:5])
# print("________________________")
# print("Normalized Zero Crossing Rate: ",normalized_zero_crossing_rate[:5])

### PCA Dimensionality Reduction
This cell applies PCA to reduce the dimensionality of the scaled MFCC features to two components, using the `PCA` class from the scikit-learn library.

In [10]:
# Reduce the scaled MFCC features to a small number of principal components.
from sklearn.decomposition import PCA

# Number of principal components to keep.
PCA_COMPONENTS = 2

pca = PCA(n_components=PCA_COMPONENTS)

# Fit on the scaled MFCC features and project them in the same step.
pca_mfcc_features = pca.fit_transform(robust_standardized_mfcc_features)

# Print the projected values themselves (not just their shape).
print(pca_mfcc_features)

[[-0.16465878  0.24043767]
 [-0.28042835 -0.04276589]
 [-0.09782554 -0.11769146]
 ...
 [ 0.18282329  0.03536527]
 [ 0.15205988  0.00951085]
 [ 0.10313283 -0.00604354]]


In [11]:
# # save the normalized features to a npy
# np.save('normalized_mfcc_features.npy', robust_standardized_mfcc_features)
# np.save('normalized_spectral_centroid.npy', normalized_spectral_centroid)
# np.save('normalized_zero_crossing_rate.npy', normalized_zero_crossing_rate)


### Saving Audio Features to MongoDB

In [None]:
# Connect to the local MongoDB server; every feature kind is stored in its
# own collection of the "music_features" database.
client = pymongo.MongoClient("mongodb://localhost:27017/")
db = client["music_features"]
mfcc_collection = db["mfcc_features"]
spectral_centroid_collection = db["spectral_centroid"]
zero_crossing_rate_collection = db["zero_crossing_rate"]


def _to_documents(rows):
    """Wrap each row of *rows* in a ``{"value": [...]}`` document."""
    return [{"value": row.tolist()} for row in rows]


# One document per row of each scaled feature array (despite the "_dict"
# suffix, each of these names is a list of dictionaries).
robust_standardized_mfcc_features_dict = _to_documents(robust_standardized_mfcc_features)
normalized_spectral_centroid_dict = _to_documents(normalized_spectral_centroid)
normalized_zero_crossing_rate_dict = _to_documents(normalized_zero_crossing_rate)

# Bulk-insert the documents into their respective collections.
mfcc_collection.insert_many(robust_standardized_mfcc_features_dict)
spectral_centroid_collection.insert_many(normalized_spectral_centroid_dict)
zero_crossing_rate_collection.insert_many(normalized_zero_crossing_rate_dict)


### Displaying the extracted features 
for testing purposes

In [2]:
# Display the first document in the collection
# print(mfcc_collection.find_one())
# print(spectral_centroid_collection.find_one())
# print(zero_crossing_rate_collection.find_one())