Get million song subset data song list
Get metadata and join the data

use artist similarity and artists to train the model on similarity

use last.fm to get additional data on each song to augment this

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install python-dotenv



In [3]:
# imports
import pandas as pd
import h5py
import os
from sqlalchemy import create_engine
import requests
import time
from dotenv import load_dotenv

In [4]:
pd.set_option('display.max_rows', 100)

In [5]:
os.getcwd()

'/content'

In [6]:
os.chdir('/content/drive/MyDrive/CMPE-258: Team Neurobytes/Neurobytes/mlops/notebooks')

In [7]:
! ls

label_encoder.joblib  model_training.ipynb  scaler.joblib	    tracks_eda.ipynb
model.pth	      README.md		    test_spotify_api.ipynb  users_eda.ipynb


# Loading Data

## Loading million song subset data

In [9]:
# load the data (only loading song_id, metadata contains the rest)
def read_song_features(file_path):
    with h5py.File(file_path, 'r') as f:
        song_id = f['metadata']['songs']['song_id'][0].decode('utf-8')
        return {'song_id': song_id}


# process all files in a directory into a df
def process_all_files_to_dataframe(root_dir):
    data = []
    print(f"Checking directory: {root_dir}")

    for subdir, dirs, files in os.walk(root_dir):
        print(f"Currently scanning {subdir} with {len(files)} files")
        for file in files:
            if file.endswith('.h5'):
                file_path = os.path.join(subdir, file)
                print(f"Processing file: {file_path}")
                song_data = read_song_features(file_path)
                data.append(song_data)

    if not data:
        print("No data to process.")

    df = pd.DataFrame(data)
    return df

In [11]:
root_dir = 'data/MillionSongSubset'
df = process_all_files_to_dataframe(root_dir)

Checking directory: ../../../data/
No data to process.


### Loading million song subset metadata from sqlite db

In [None]:
# load metadata from sqlite
def load_data_from_sqlite(db_path, table_name):
    engine = create_engine(f'sqlite:///{db_path}')
    query = f"SELECT * FROM {table_name}"
    df = pd.read_sql_query(query, engine)
    return df

# load metadata and merge with song data
db_path3 = 'data/MillionSongSubsetMetadata/track_metadata.db'
df3 = load_data_from_sqlite(db_path3, 'songs')
df = df.merge(df3, on='song_id', how='left')


In [None]:
columns_to_drop = ['track_id', 'artist_id', 'song_id', 'artist_mbid', 'track_7digitalid', 'shs_perf', 'shs_work']

for column in columns_to_drop:
    if column in df.columns:
        df.drop(columns=[column], inplace=True)

In [None]:
df.columns

In [None]:
df.head()

## Loading last.fm data

In [8]:
def fetch_data(api_key, method, params):
    base_url = "http://ws.audioscrobbler.com/2.0/"
    params['api_key'] = api_key
    params['method'] = method
    params['format'] = 'json'
    response = requests.get(base_url, params=params)
    return response.json()


def get_artist_info(api_key, artist_name):
    params = {'artist': artist_name}
    return fetch_data(api_key, 'artist.getInfo', params)


def get_track_info(api_key, artist_name, track_name):
    params = {'artist': artist_name, 'track': track_name}
    return fetch_data(api_key, 'track.getInfo', params)


def batch_fetch_data(api_key, items, fetch_function, sleep_time=1):
    results = []
    for item in items:
        result = fetch_function(api_key, *item)
        results.append(result)
        # time.sleep(sleep_time)
    return results

In [9]:
# load LASTFM_API_KEY from .env
import requests
load_dotenv()
api_key = os.getenv('LASTFM_API_KEY')


def fetch_lastfm_data(api_key, artist_name, track_name):
    base_url = "http://ws.audioscrobbler.com/2.0/"
    params = {
        'method': 'track.getInfo',
        'api_key': api_key,
        'artist': artist_name,
        'track': track_name,
        'format': 'json'
    }
    response = requests.get(base_url, params=params)
    if response.status_code == 200 and response.text.strip():
        return response.json()
    else:
        return None


def parse_lastfm_data(data):
    if data and 'track' in data:
        track = data['track']
        return {
            'listeners': track.get('listeners', '0'),
            'playcount': track.get('playcount', '0'),
            'tags': ', '.join(tag['name'] for tag in track.get('toptags', {}).get('tag', [])),
        }
    return None

In [10]:
from tqdm import tqdm
tqdm.pandas()

load_dotenv()
api_key = os.getenv('LASTFM_API_KEY')
subset_df = df.head(1000)

tracks_skipped = 0


def fetch_and_parse(row):
    global tracks_skipped
    data = fetch_lastfm_data(api_key, row['artist_name'], row['title'])
    if data is None:
        tracks_skipped += 1
        return None
    parsed_data = parse_lastfm_data(data)
    if parsed_data is None:
        tracks_skipped += 1
    return parsed_data


# Use progress_apply instead of apply
subset_df['lastfm_data'] = subset_df.progress_apply(fetch_and_parse, axis=1)

# Remove rows where lastfm_data is None
subset_df = subset_df[subset_df['lastfm_data'].notna()]

subset_df.reset_index(drop=True, inplace=True)
track_details_df = pd.json_normalize(subset_df['lastfm_data'])
mixed = pd.concat(
    [subset_df.drop(columns=['lastfm_data']), track_details_df], axis=1)

print(f"Tracks skipped: {tracks_skipped}")

mixed.to_csv('data/music_data_small.csv', index=False)

NameError: name 'df' is not defined

## Data processing

In [91]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/CMPE-258: Team Neurobytes/Neurobytes/db/data/music_data.csv')
df.dropna(inplace=True)

In [92]:
df.head()

Unnamed: 0,title,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,listeners,playcount,tags
0,100 Club 1996 ''We Love You Beatles'' - Live,Sex Pistols - The Interviews,Sex Pistols,88.73751,0.731184,0.549204,0,172,210,"The Beatles, title is a full sentence"
1,Yo Quiero Contigo,Sentenciados - Platinum Edition,Baby Rasta & Gringo,167.36608,0.610186,0.35532,0,9753,16911,"Reggaeton, alexis y fido, Eliana, mis videos, ..."
4,Emerald,Emerald,Bedrock,501.86404,0.654039,0.390625,2004,973,2247,dance
6,Karma,The Diary Of Alicia Keys,Alicia Keys,255.99955,0.933916,0.778674,2003,250304,1028356,"rnb, soul, Alicia Keys, female vocalists, Karma"
7,Money Blues,Slidetime,Joanna Connor,243.66975,0.479218,0.332857,0,429,1008,"guitar girl, blues"


In [93]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import torch.optim as optim

def label_encode_data(df):
  df = df.copy(deep=True)
  # Encode categorical data
  label_encoders = {}
  unknown_label = 'unknown'  # Define an unknown label

  for column in ['artist_name', 'tags', 'title']:
      le = LabelEncoder()

      # Get unique categories plus an 'unknown' category
      unique_categories = df[column].unique().tolist()
      # Add 'unknown' to the list of categories
      unique_categories.append(unknown_label)

      # Fit the LabelEncoder to these categories
      le.fit(unique_categories)
      df[column] = le.transform(df[column].astype(str))

      # Store the encoder
      label_encoders[column] = le

  return df, label_encoders


# Normalize numerical features
scaler = MinMaxScaler()
df[['listeners', 'playcount']] = scaler.fit_transform(
    df[['listeners', 'playcount']])

# Label encode categorical features
df_scaled, label_encoder_training = label_encode_data(df)

# Split data into features and target
X = df_scaled[['artist_name', 'listeners', 'playcount', 'tags']]
y = df_scaled['title']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [97]:
class SongRecommender(nn.Module):
    def __init__(self):
        super(SongRecommender, self).__init__()
        self.fc1 = nn.Linear(4, 128)  # Adjust input features if needed
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, 128)
        # Output size = number of unique titles including 'unknown'
        # Add 1 for the 'unknown' label
        self.output = nn.Linear(128, len(y.unique()) + 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.output(x)
        return x


model = SongRecommender()
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [98]:
def train_model(model, X_train, y_train, X_test, y_test):
    train_loader = DataLoader(
        list(zip(X_train.values.astype(float), y_train)), batch_size=50, shuffle=True)
    test_loader = DataLoader(
        list(zip(X_test.values.astype(float), y_test)), batch_size=50, shuffle=False)

    model.train()
    for epoch in range(50):  # Number of epochs
        train_loss = 0
        for features, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(torch.tensor(features).float())
            # Ensure labels are long type
            loss = criterion(outputs, torch.tensor(labels).long())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()

        # Validation phase
        model.eval()
        validation_loss = 0
        for features, labels in test_loader:
            outputs = model(torch.tensor(features).float())
            loss = criterion(outputs, torch.tensor(labels).long())
            validation_loss += loss.item()

        print(f'Epoch {epoch+1}, Training Loss: {train_loss / len(train_loader)}, Validation Loss: {validation_loss / len(test_loader)}')

In [99]:
train_model(model, X_train, y_train, X_test, y_test)

  outputs = model(torch.tensor(features).float())
  loss = criterion(outputs, torch.tensor(labels).long())
  outputs = model(torch.tensor(features).float())
  loss = criterion(outputs, torch.tensor(labels).long())


Epoch 1, Training Loss: 19.32844744788276, Validation Loss: 8.51949759892055
Epoch 2, Training Loss: 8.476903479776265, Validation Loss: 8.578226861499605
Epoch 3, Training Loss: 8.463194564536765, Validation Loss: 8.635373478844052
Epoch 4, Training Loss: 8.45077790154351, Validation Loss: 8.691407203674316
Epoch 5, Training Loss: 8.439464957625777, Validation Loss: 8.74635106041318
Epoch 6, Training Loss: 8.429126657085654, Validation Loss: 8.800388427007766
Epoch 7, Training Loss: 8.419632758623289, Validation Loss: 8.853472210112072
Epoch 8, Training Loss: 8.410879876878527, Validation Loss: 8.905590920221238
Epoch 9, Training Loss: 8.402834150526258, Validation Loss: 8.956880069914318
Epoch 10, Training Loss: 8.395392759346668, Validation Loss: 9.007283165341331
Epoch 11, Training Loss: 8.388489110970204, Validation Loss: 9.05686582837786
Epoch 12, Training Loss: 8.382096290588379, Validation Loss: 9.105663435799736
Epoch 13, Training Loss: 8.376147870664242, Validation Loss: 9.15

In [100]:
# save the model
torch.save(model.state_dict(), './model.pth')

In [101]:
# load the model
model = SongRecommender()

In [102]:
df.loc[:, ['artist_name', 'title', 'tags', 'listeners', 'playcount']].head()

Unnamed: 0,artist_name,title,tags,listeners,playcount
0,Sex Pistols,100 Club 1996 ''We Love You Beatles'' - Live,"The Beatles, title is a full sentence",7e-05,9e-06
1,Baby Rasta & Gringo,Yo Quiero Contigo,"Reggaeton, alexis y fido, Eliana, mis videos, ...",0.003978,0.000729
4,Bedrock,Emerald,dance,0.000397,9.7e-05
6,Alicia Keys,Karma,"rnb, soul, Alicia Keys, female vocalists, Karma",0.102103,0.044359
7,Joanna Connor,Money Blues,"guitar girl, blues",0.000175,4.3e-05


In [106]:
def label_encode_data(df):
  df = df.copy(deep=True)
  # Encode categorical data
  label_encoders = {}
  unknown_label = 'unknown'  # Define an unknown label

  for column in ['artist', 'tags', 'song']:
      le = LabelEncoder()

      # Get unique categories plus an 'unknown' category
      unique_categories = df[column].unique().tolist()
      # Add 'unknown' to the list of categories
      unique_categories.append(unknown_label)

      # Fit the LabelEncoder to these categories
      le.fit(unique_categories)
      df[column] = le.transform(df[column].astype(str))

      # Store the encoder
      label_encoders[column] = le

  return df, label_encoders


def recommend_songs(model, user_data, full_data = df, train_encoder=label_encoder_training):
    model.eval()
    full_data = full_data.copy(deep=True)
    with torch.no_grad():

        # Create a DataFrame with feature names
        numeric_features = user_data.loc[:, ['listeners', 'playcount']]
        text_features = user_data.loc[:, ['artist', 'tags', 'song']]

        #encoding using concatenated full dataset and evaluation set for inference
        df = full_data.loc[:, ['artist_name', 'title', 'tags', 'listeners', 'playcount']]
        df.columns = ['artist', 'song', 'tags', 'listeners', 'playcount']
        text_features_full = df.loc[:, ['artist', 'tags']]


        all_labels = pd.concat([text_features, text_features_full], axis=0)
        all_labels.reset_index(drop=True, inplace=True)

        # Get the encoder based on all categorical features
        _, label_encoders = label_encode_data(all_labels)
        scaled_features = scaler.transform(numeric_features)
        # drop the song variable from text_features
        text_features = text_features.drop('song', axis=1)

        # encode the user data
        label_encoded_data = text_features.copy(deep=True)
        for column in ['artist', 'tags']:
            label_encoded_data[column] = label_encoders[column].transform(
                label_encoded_data[column].astype(str))



        all_features = pd.concat([label_encoded_data, pd.DataFrame(scaled_features)], axis=1)
        all_features.columns = ['artist', 'tags', 'listeners', 'playcount']
        all_features.reset_index(drop=True, inplace=True)


        # converting all_features into a torch tensor as float dtype
        all_features = torch.tensor(all_features.to_numpy()).float().unsqueeze(0)

        # Make predictions
        predictions = model(all_features)

        predictions = predictions[0, :5, :] # selecting top 5
        for row in predictions:
          top_5_values, top_5_indices = row.topk(5)
          recommended_song_ids = top_5_indices.squeeze().tolist()

        try:
          return label_encoders['song'].inverse_transform(recommended_song_ids)
        except:
          return train_encoder['title'].inverse_transform(recommended_song_ids)[:5]



In [51]:
import requests


def fetch_song_data(api_key, artist_name, track_name):
    url = "http://ws.audioscrobbler.com/2.0/"
    params = {
        'method': 'track.getInfo',
        'api_key': api_key,
        'artist': artist_name,
        'track': track_name,
        'format': 'json'
    }
    response = requests.get(url, params=params)
    print(response.content)
    return response.json() if response.status_code == 200 else {}


def parse_song_data(song_data):
    if song_data and 'track' in song_data:
        track = song_data['track']
        return {
            'artist_name': track['artist']['name'],
            'tags': ', '.join([tag['name'] for tag in track.get('toptags', {}).get('tag', [])]),
            'duration': float(track.get('duration', 0)),
            'listeners': int(track.get('listeners', 0)),
            'playcount': int(track.get('playcount', 0)),
            'album': track.get('album', {}).get('title', 'Unknown')
        }
    return {}

# Importing the User Data and Making Recommendations
Let's make recommendations using the sample user's preferences.

In [63]:
import numpy as np

In [64]:
user_preferences = pd.read_csv('../../../user_preferences.csv')
user_preferences.drop('level_0', axis=1, inplace=True)

In [65]:
user_preferences.head()

Unnamed: 0,songID,artist,song,link,text,userID,listeners,playcount,tags
0,19632,Toto,You Are The Flower,/t/toto/you+are+the+flower_20139737.html,"You never lose a minute, if in it there is lov...",0,25307,87344,"AOR, rock, soft rock, 70s, pop rock"
1,19632,Toto,You Are The Flower,/t/toto/you+are+the+flower_20139737.html,"You never lose a minute, if in it there is lov...",0,25307,87344,"AOR, rock, soft rock, 70s, pop rock"
2,25284,Billie Holiday,I Only Have Eyes For You,/b/billie+holiday/i+only+have+eyes+for+you_200...,"My love must be a kind of blind love, \r\nI c...",0,60356,178625,"jazz, female vocal, vocal jazz, blues, female ..."
3,43594,Michael Bolton,Only A Woman Like You,/m/michael+bolton/only+a+woman+like+you_101792...,"It's beautiful, your honesty \r\nYou cry when...",0,4595,13266,"Ballad, romantic, soul, pop, cool"
4,50200,Rascal Flatts,The Day Before You,/r/rascal+flatts/the+day+before+you_10238985.html,I had all but given up \r\nOn finding the one...,0,22077,86012,"country, rock, contemporary country, seen live..."


In [66]:
sample_user = user_preferences.where(user_preferences['userID'] == np.random.randint(*(0, 9))).dropna()

Hopefully, the neural network makes recommendations of artists that fall into the top 5 for the user.

In [112]:
top_5 = sample_user.groupby('artist').count().mean(axis=1).sort_values(ascending=False)[:5]
top_5

artist
Diana Ross         11.0
Kenny Rogers        8.0
Backstreet Boys     8.0
One Direction       7.0
Glee                7.0
dtype: float64

In [109]:
print("#### RECOMMENDATIONS ###")
song_recs = recommend_songs(model, sample_user, df) # requires giving main song df for finding embeddings
song_recs

#### RECOMMENDATIONS ###


array(['Todo A Pulmón',
       'You Know What They Do To Guys Like Us In Prison (Album Version)',
       'Lotion (LP Version)', 'Zydeco In D-Minor', 'Sag Ihnen Bescheid'],
      dtype='<U97')

In [110]:
# finding the song artist in the main dataset
df.loc[df['title'].isin(song_recs)]

Unnamed: 0,title,release,artist_name,duration,artist_familiarity,artist_hotttnesss,year,listeners,playcount,tags
690,Todo A Pulmón,20 Años,Alejandro Lerner,267.31057,0.556751,0.361588,1998,0.00155,0.000391,"alejandro lerner, Romantica, ale"
1085,Zydeco In D-Minor,Nouveau Zydeco,"Myrick ""Freeze"" Guillory",300.82567,0.334457,0.216829,0,2.2e-05,3e-06,"zydeco, novea zydeco"
2916,Sag Ihnen Bescheid,Sag Ihnen Bescheid,Headliners,216.89424,0.530631,0.326236,2005,0.004224,0.001162,"Deutschrap, hip hop, german, Hip-Hop, rap"
3354,Lotion (LP Version),Around The Fur,Deftones,237.73995,0.821697,0.60173,1997,0.001156,0.000168,"Nu Metal, alternative, hard rock, alternative ..."
8322,You Know What They Do To Guys Like Us In Priso...,Three Cheers For Sweet Revenge,My Chemical Romance,173.73995,0.864868,0.766076,2004,0.000475,0.000199,"punk, emo, super"


In [114]:
# lets see how it compares to sample user
sample_user.where(sample_user['artist'].isin(top_5.index)).dropna()

Unnamed: 0,songID,artist,song,link,text,userID,listeners,playcount,tags
5,29811.0,Diana Ross,Love On The Line,/d/diana+ross/love+on+the+line_20040142.html,When I don't see you \r\nAnd I don't know whe...,0.0,3682.0,8541.0,"soul, female vocalists, Disco, pop, love songs"
7,40093.0,Kenny Rogers,Love Me Tender,/k/kenny+rogers/love+me+tender_20248721.html,Love me tender \r\nLove me sweet \r\nNever l...,0.0,1188.0,3911.0,"country, soft"
62,34076.0,Glee,Hold On,/g/glee/hold+on_21079209.html,I know this pain \r\nWhy do lock yourself up ...,0.0,808.0,7780.0,"epic, Favorite, glee, Demi Lovato, Adam lambert"
66,10473.0,Kenny Rogers,Love Will Turn You Around,/k/kenny+rogers/love+will+turn+you+around_2025...,You can run you can hide never let it inside ...,0.0,21232.0,71368.0,"country, male vocals, turning around, 80s, ken..."
117,46868.0,One Direction,Change Your Ticket,/o/one+direction/change+your+ticket_21090568.html,HARRY: \r\nWatching you get dressed \r\nMess...,0.0,132418.0,1052116.0,"indie pop, indie rock, the 1975, one direction..."
127,997.0,Backstreet Boys,Shining Star,/b/backstreet+boys/shining+star_20011554.html,[Chorus] \r\nYou're my shining star \r\nThat...,0.0,32959.0,136029.0,"pop, backstreet boys, american, male vocalists..."
145,29811.0,Diana Ross,Love On The Line,/d/diana+ross/love+on+the+line_20040142.html,When I don't see you \r\nAnd I don't know whe...,0.0,3682.0,8541.0,"soul, female vocalists, Disco, pop, love songs"
165,34032.0,Glee,Control,/g/glee/control_20989104.html,"This is a story about control, \r\nMy control...",0.0,18462.0,94090.0,"cover, pop, glee, Darren Criss, dianna agron"
179,46881.0,One Direction,Home,/o/one+direction/home_21103642.html,[Verse 1: Liam] \r\nMake a little conversatio...,0.0,123436.0,1302811.0,"pop, Ballad, love at first listen, larry song"
199,24471.0,Backstreet Boys,Give Me Your Heart,/b/backstreet+boys/give+me+your+heart_20011603...,[Repeat x4] \r\nShoo do do op \r\nShoo do do...,0.0,2715.0,12762.0,"pop, soul, rnb, dance pop, 1990s"
