# Download the libraries
- h5py:
    - Allows us to read or write to .h5 which helps us access data of many songs/artists to give the use info
- tqdm:
    - Creates progress bars which is useful because it helps us track the progress of our neural networks and loops we will have

In [None]:
%pip install h5py
%pip install tqdm

---
# Importing libraries

- **OS**: allows us to access files on our system
- **h5py**: allows us to read or write to .h5 files
- **numpy**: allows us to manage our data making it easier to understand it
- **pandas**: allows us to look at our data in a table
- **librosa**: allows us to get information from our song file
- **json**: allows us to access json files
- **StandardScaler**: allows us to ensure our data is normalized
- **cosine_similarity**: allows us to determine the similarities within our data

In [None]:
import os
import h5py
import numpy as np
import pandas as pd
import librosa
import json
from sklearn.exceptions import NotFittedError
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity

---
## Pre-Processing the song
### In order to have a model that creates a playlist for the user based on the '.wav' files they have in the 'Test_Folder' we need to extract the information from those songs swell as the song information from the '.h5' files

We extract the information from the file path of the song information by retrieving the:
- **MFCC**: short term spectrums of sound
- **Chroma**: different pitch classes within an octave
- **Timber**: quality of a sound that distuingishes it from other sounds
- **Pitch**: the frequency of a sound

In [184]:
def preprocess_song_for_recommendation(file_path):
    """
    Extracts MFCC and Chroma features from a user's song.

    Returns:
        np.ndarray: Combined feature vector (12 MFCC mean + 12 Chroma mean).
    """
    SAMPLE_RATE = 22050
    n_mfcc = 12  # Match MSD timbre features
    n_fft = 2048
    hop_length = 512

    try:
        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE)

        # Extract MFCC features
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc,
                                    n_fft=n_fft, hop_length=hop_length)
        mfcc_mean = np.mean(mfcc, axis=1)  # Shape: (12,)

        # Extract Chroma features
        chroma = librosa.feature.chroma_stft(y=signal, sr=sr,
                                             n_fft=n_fft, hop_length=hop_length)
        chroma_mean = np.mean(chroma, axis=1)  # Shape: (12,)

        # Combine features
        features_vector = np.concatenate((mfcc_mean, chroma_mean))  # Shape: (24,)
        return features_vector
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [185]:
def extract_song_features_for_recommendation(h5_file_path):
    """
    Extracts timbre and pitch features from an MSD song.

    Returns:
        dict: Dictionary containing track_id, artist_name, song_title, and combined feature vector.
    """
    try:
        with h5py.File(h5_file_path, 'r') as h5_file:
            # Extract metadata
            track_id = h5_file['/analysis/songs'][:]['track_id'][0].decode('utf-8')
            artist_name = h5_file['/metadata/songs'][:]['artist_name'][0].decode('utf-8')
            song_title = h5_file['/metadata/songs'][:]['title'][0].decode('utf-8')

            # Extract timbre features
            segments_timbre = h5_file['/analysis/segments_timbre'][:]
            timbre_mean = segments_timbre.mean(axis=0)  # Shape: (12,)

            # Extract pitch features
            segments_pitches = h5_file['/analysis/segments_pitches'][:]
            pitches_mean = segments_pitches.mean(axis=0)  # Shape: (12,)

            # Combine features
            features_vector = np.concatenate((timbre_mean, pitches_mean))  # Shape: (24,)

            features = {
                'track_id': track_id,
                'artist_name': artist_name,
                'song_title': song_title,
                'features_vector': features_vector
            }
            return features
    except Exception as e:
        print(f"Error processing {h5_file_path}: {e}")
        return None

---
# Building dataframes

From the information we just extracted we will now build a dataframe to represent both of the features that have been extracted

In [186]:
def build_msd_features_dataframe(msd_dir):
    """
    Builds a DataFrame containing features and metadata from MSD songs.

    Returns:
        pd.DataFrame: DataFrame with track_id, artist_name, song_title, and features_vector.
    """
    h5_files = []
    for root, dirs, files in os.walk(msd_dir):
        for file in files:
            if file.lower().endswith('.h5'):
                h5_files.append(os.path.join(root, file))

    msd_metadata_list = []
    msd_features_list = []

    print(f"Total MSD files to process: {len(h5_files)}")
    for h5_file in tqdm(h5_files, desc="Processing MSD .h5 files"):
        features = extract_song_features_for_recommendation(h5_file)
        if features:
            msd_metadata_list.append({
                'track_id': features['track_id'],
                'artist_name': features['artist_name'],
                'song_title': features['song_title']
            })
            msd_features_list.append(features['features_vector'])
    if not msd_metadata_list:
        print("No MSD features extracted. Exiting.")
        return pd.DataFrame()

    msd_metadata_df = pd.DataFrame(msd_metadata_list)
    msd_features_df = pd.DataFrame(msd_features_list)
    msd_combined_df = pd.concat([msd_metadata_df.reset_index(drop=True),
                                 msd_features_df.reset_index(drop=True)], axis=1)
    return msd_combined_df

In [187]:
def build_user_features_dataframe(test_songs_dir):
    """
    Builds a DataFrame containing features from user's songs.

    Returns:
        pd.DataFrame: DataFrame with features_vector and song names as index.
    """
    user_song_features = []
    user_song_names = []

    user_wav_files = [f for f in os.listdir(test_songs_dir) if f.lower().endswith('.wav')]

    print(f"Total user songs to process: {len(user_wav_files)}")
    for file_name in user_wav_files:
        file_path = os.path.join(test_songs_dir, file_name)
        features_vector = preprocess_song_for_recommendation(file_path)
        if features_vector is not None:
            user_song_features.append(features_vector)
            user_song_names.append(file_name)
    if not user_song_features:
        print("No user song features extracted. Exiting.")
        return pd.DataFrame()

    user_features_df = pd.DataFrame(user_song_features, index=user_song_names)
    # Rename feature columns to strings
    user_features_df.columns = [f'feature_{i}' for i in range(user_features_df.shape[1])]
    return user_features_df

---
# Extracting Genre's from Json DF

from the `lastfm_subset` folder we extract the genres of the songs so we can now compare the similarites of the songs utilizing genres aswell

In [188]:
def extract_genre_from_json(json_file_path, valid_genres):
    """
    Extracts the genre tags from a JSON file.

    Returns:
        tuple: (track_id, genres) where genres is a list of genres associated with the track.
    """
    try:
        with open(json_file_path, 'r') as f:
            data = json.load(f)

        # Extract track_id from file name (assuming the file name is the track ID)
        track_id = os.path.splitext(os.path.basename(json_file_path))[0]

        tags = data.get('tags', [])

        # Extract tags that are in valid_genres
        genres = [tag[0].lower() for tag in tags if tag[0].lower() in valid_genres]

        return (track_id, genres)
    except Exception as e:
        print(f"Error processing {json_file_path}: {e}")
        return None

---
# Building genre DF

now that we have our genre information extracted from the `lastfm_subset` folder we can now put it in a datframe to visualize what were working with

In [189]:
def build_msd_genre_dataframe(lastfm_dir, valid_genres):
    """
    Builds a DataFrame mapping track IDs to genres.

    Returns:
        pd.DataFrame: DataFrame with 'track_id' and 'genre' columns.
    """
    genre_data = []

    # Walk through the directory structure
    for root, dirs, files in os.walk(lastfm_dir):
        for file in files:
            if file.lower().endswith('.json'):
                json_file_path = os.path.join(root, file)
                result = extract_genre_from_json(json_file_path, valid_genres)
                if result:
                    track_id, genres = result
                    for genre in genres:
                        genre_data.append({'track_id': track_id, 'genre': genre})

    if not genre_data:
        print("No genre data extracted.")
        return pd.DataFrame()

    msd_genres_df = pd.DataFrame(genre_data)
    return msd_genres_df

---
# Selecting Specific Genres

From our `Data` --> `genres_original` subfolder we specifically want the genres based on what our NN was built on

In [228]:
valid_genres = set([
    'rock', 'pop', 'hip-hop', 'jazz', 'disco',
    'classical', 'country', 'metal', 'blues', 'reggae'
])

In [191]:
msd_subset_dir = 'MillionSongSubset'  # Directory containing MSD .h5 files
test_songs_dir = 'Test_Songs'         # Directory containing user's .wav files
lastfm_dir = 'lastfm_subset'          # Directory containing Last.fm JSON files

---
# Building each dataframe

In [206]:
# Build MSD features DataFrame
msd_features_df = build_msd_features_dataframe(msd_subset_dir)
print(f"Extracted features from {len(msd_features_df)} MSD songs.")

# Build user features DataFrame
user_features_df = build_user_features_dataframe(test_songs_dir)
print(f"Extracted features from {len(user_features_df)} user songs.")

# Build the genre DataFrame
msd_genres_df = build_msd_genre_dataframe(lastfm_dir, valid_genres)
print(f"Extracted genres for {msd_genres_df['track_id'].nunique()} tracks.")

Total MSD files to process: 10000


Processing MSD .h5 files: 100%|██████████| 10000/10000 [00:30<00:00, 330.41it/s]


Extracted features from 10000 MSD songs.
Total user songs to process: 16
Extracted features from 16 user songs.
Extracted genres for 2444 tracks.


In [207]:
print("MSD Genre DF:")
msd_genres_df.head()

MSD Genre DF:


Unnamed: 0,track_id,genre
0,TRARREF128F422FD96,rock
1,TRARUOP12903CF2384,metal
2,TRARUOP12903CF2384,rock
3,TRARINE128F4280B9C,rock
4,TRARIRG128F147FC96,pop


In [208]:
print("MSD Features DF:")
msd_features_df.head()

MSD Features DF:


Unnamed: 0,track_id,artist_name,song_title,0,1,2,3,4,5,6,...,14,15,16,17,18,19,20,21,22,23
0,TRARRZU128F4253CA2,Raphaël,Je Sais Que La Terre Est Plate,44.303553,-5.497527,-13.431835,-8.424728,-8.452409,-13.410788,12.765696,...,0.189315,0.219311,0.280203,0.182322,0.173269,0.163438,0.121626,0.158794,0.143032,0.247655
1,TRARRJL128F92DED0E,Julie Zenatti,On Efface,40.842959,-37.80359,54.445222,0.69218,25.061545,-12.36697,-18.276061,...,0.144745,0.348158,0.277078,0.105659,0.237481,0.223863,0.398764,0.255276,0.134022,0.143274
2,TRARRUZ128F9307C57,The Baltimore Consort,Howells Delight,29.594024,-59.840088,26.853805,-13.406868,-13.89458,-2.407649,-6.070068,...,0.4736,0.387059,0.210527,0.248849,0.168312,0.3752,0.345712,0.221254,0.190839,0.228405
3,TRARRWA128F42A0195,I Hate Sally,Martha Served,51.354689,29.592192,36.705763,-1.993846,-21.519207,-27.862433,7.420505,...,0.422876,0.419156,0.432793,0.45413,0.545194,0.518192,0.437224,0.38697,0.449785,0.538041
4,TRARRPG12903CD1DE9,Orlando Pops Orchestra,Zip-A-Dee-Doo-Dah (Song of the South),35.141368,-71.545376,27.618004,-21.010282,-2.18766,-13.01988,-22.567939,...,0.409202,0.338914,0.227782,0.55045,0.256027,0.336616,0.145906,0.216826,0.498927,0.202615


In [209]:
print("User Features DF:")
user_features_df.head()

User Features DF:


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_14,feature_15,feature_16,feature_17,feature_18,feature_19,feature_20,feature_21,feature_22,feature_23
Die With a Smile.wav,-98.599808,96.48587,-14.750042,7.125236,-4.520143,-0.183903,-4.793099,0.909005,-4.207834,5.561567,...,0.299907,0.303318,0.341033,0.232503,0.367716,0.234977,0.428613,0.376863,0.231098,0.289637
Heart To Heart.wav,-273.463928,147.279144,51.137894,36.550465,14.893621,9.92991,6.789459,8.467576,4.899261,4.872872,...,0.425025,0.384189,0.34896,0.351834,0.440942,0.364073,0.451867,0.444617,0.512134,0.595365
The Bravery - Believe.wav,9.772176,83.606133,-14.504303,24.469784,5.09048,10.645214,-4.849209,3.386054,-3.261358,7.821845,...,0.306923,0.416885,0.306801,0.460344,0.281478,0.386047,0.500861,0.45949,0.596705,0.353673
Bennie And The Jets.wav,-56.856064,67.554993,-0.577841,20.146826,0.682751,7.891598,-4.224215,0.763109,-8.604556,2.517297,...,0.485957,0.412012,0.527974,0.384662,0.427079,0.42551,0.34254,0.380496,0.368237,0.464063
My Chemical Romance.wav,11.036662,70.489258,-11.634374,32.498356,-1.139831,7.493962,-2.068378,9.410398,-5.127173,0.973122,...,0.386449,0.453625,0.531772,0.447067,0.471254,0.475668,0.535725,0.452447,0.409431,0.467772


---
### Preparing the Music Data

1. **Add Genre Information**:  
   We combine `msd_features_df` with `msd_genres_df` to add genre details for each song based on the `track_id`. This keeps all rows in `msd_features_df` and fills in genre data where it’s available.

2. **Remove Songs without Genre**:  
   Any songs missing a genre are removed from `msd_features_df` to make sure we're only working with songs that have genre information.

3. **Find Feature Columns**:  
   We use the columns that are actual features since thats the dataset were working with from the `Data` folder we have

4. **Rename Features**:  
   Each feature column is renamed to a simple format (`feature_1`, `feature_2`, etc.), making it easier to work with them.

5. **List of Features**:  
   We create a list called `msd_feature_columns` that has all the renamed feature columns. This list helps us quickly find and work with these features.

With this, `msd_features_df` is ready for analysis by keeping only songs with genres and organizing the features.

In [210]:
# Merge genre information with msd_features_df
msd_features_df = msd_features_df.merge(msd_genres_df, on='track_id', how='left')

# Drop songs without genre
msd_features_df = msd_features_df.dropna(subset=['genre'])
print(f"\nMSD songs with genre information: {len(msd_features_df)}")

msd_features_df.head()


MSD songs with genre information: 3584


Unnamed: 0,track_id,artist_name,song_title,0,1,2,3,4,5,6,...,15,16,17,18,19,20,21,22,23,genre
8,TRARREF128F422FD96,Dead Kennedys,Halloween,51.660966,27.266711,58.586305,3.364532,-45.861889,-6.946116,-2.702805,...,0.315906,0.332867,0.408261,0.40235,0.509071,0.401992,0.398654,0.45557,0.380523,rock
12,TRARUOP12903CF2384,Shadows Fall,What Drives The Weak,51.455478,60.703364,5.728911,-5.926448,-23.673065,-24.285592,12.544189,...,0.387431,0.29444,0.349226,0.284251,0.456537,0.414778,0.283694,0.415018,0.308693,metal
13,TRARUOP12903CF2384,Shadows Fall,What Drives The Weak,51.455478,60.703364,5.728911,-5.926448,-23.673065,-24.285592,12.544189,...,0.387431,0.29444,0.349226,0.284251,0.456537,0.414778,0.283694,0.415018,0.308693,rock
22,TRARIRG128F147FC96,Phil Collins,I'm Not Moving,34.094961,3.196366,-21.487197,-1.977982,4.372649,1.335322,-5.05178,...,0.314434,0.233979,0.452492,0.267236,0.471991,0.319859,0.245541,0.321288,0.274183,pop
23,TRARIRG128F147FC96,Phil Collins,I'm Not Moving,34.094961,3.196366,-21.487197,-1.977982,4.372649,1.335322,-5.05178,...,0.314434,0.233979,0.452492,0.267236,0.471991,0.319859,0.245541,0.321288,0.274183,rock


In [211]:
# Identify feature columns (exclude 'track_id', 'artist_name', 'song_title', 'genre')
feature_columns = msd_features_df.columns.difference(['track_id', 'artist_name', 'song_title', 'genre'])

# Rename feature columns to 'feature_1', 'feature_2', ..., 'feature_N'
msd_features_df = msd_features_df.rename(columns={col: f'feature_{i}' for i, col in enumerate(feature_columns, 1)})

# List of renamed feature columns
msd_feature_columns = [col for col in msd_features_df.columns if col.startswith('feature_')]

---
# Normalizing our Data

In [None]:
print("\nNormalizing MSD features...")
scaler_msd = StandardScaler()

# Check if all feature columns are numeric before normalization
if msd_features_df[msd_feature_columns].select_dtypes(include=['number']).shape[1] == len(msd_feature_columns):
    msd_features_normalized = scaler_msd.fit_transform(msd_features_df[msd_feature_columns])
    print("MSD features normalized successfully.")
else:
    print("Error: Non-numeric data found in feature columns. Please check the feature columns.")
    # Optionally, inspect which columns contain non-numeric data
    non_numeric_features = msd_features_df[msd_feature_columns].columns[
        msd_features_df[msd_feature_columns].dtypes == 'object'
    ].tolist()
    print("Non-numeric feature columns:", non_numeric_features)
    # Handle non-numeric columns if any
    # For this scenario, we'll exclude them from normalization
    msd_features_normalized = scaler_msd.fit_transform(msd_features_df[msd_feature_columns].select_dtypes(include=['number']))
    # Update msd_feature_columns to only include numeric columns
    msd_feature_columns = msd_features_df[msd_feature_columns].select_dtypes(include=['number']).columns.tolist()

print("Normalizing user features...")
scaler_user = StandardScaler()

if user_features_df.select_dtypes(include=['number']).shape[1] == user_features_df.shape[1]:
    user_features_normalized = scaler_user.fit_transform(user_features_df.values)
    print("User features normalized successfully.")
else:
    print("Error: Non-numeric data found in user feature columns.")
    user_features_normalized = scaler_user.fit_transform(user_features_df.select_dtypes(include=['number']).values)


Normalizing MSD features...
MSD features normalized successfully.
Normalizing user features...
User features normalized successfully.


In [229]:
# Filter MSD songs by user's genres
msd_filtered_df = msd_features_df[msd_features_df['genre'].isin(valid_genres)]
if msd_filtered_df.empty:
    print("No MSD songs match the user's preferred genres.")
else:
    # Normalize filtered MSD features
    msd_features_normalized_filtered = scaler_msd.transform(msd_filtered_df[msd_feature_columns])

    # Compute cosine similarities
    print("\nComputing similarities...")
    similarities = cosine_similarity(user_features_normalized, msd_features_normalized_filtered)

    # Average similarities across user's songs
    average_similarities = similarities.mean(axis=0)

    # Get indices of top similar songs
    num_recommendations = 10
    top_indices = average_similarities.argsort()[::-1][:num_recommendations]

    # Retrieve recommended songs' metadata
    recommended_songs = msd_filtered_df.iloc[top_indices][['artist_name', 'song_title', 'genre']]

    # Display recommendations
    print("\nRecommended Songs (Filtered by Genre):")
    for idx, row in recommended_songs.iterrows():
        print(f"{row['song_title']} by {row['artist_name']} - Genre: {row['genre']}")


Computing similarities...

Recommended Songs (Filtered by Genre):
Sangre americana by Bacilos - Genre: rock
Just Like Lightnin' by Joe Satriani - Genre: rock
Just Like Lightnin' by Joe Satriani - Genre: metal
Disappeared by John Wesley_ John Wesley - Genre: rock
Master Of The Universe by Sick Puppies - Genre: rock
Life Is Real by Ayo - Genre: reggae
Life Is Real by Ayo - Genre: jazz
Life Is Real by Ayo - Genre: pop
Life Is Real by Ayo - Genre: blues
Thrill Me by Simply Red - Genre: pop
