In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import TruncatedSVD
from tqdm import tqdm
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [10]:
# Load and preprocess the data
data = pd.read_csv('cleaned_df.csv')
scaler = MinMaxScaler()

# Create ID's
playlist_encoder = LabelEncoder()
track_encoder = LabelEncoder()

data['playlist_encoded'] = playlist_encoder.fit_transform(data['playlist_name'])
data['track_encoded'] = track_encoder.fit_transform(data['track_name'])

clean_sorted = data.sort_values(by=['playlist_name', 'date_added'], ascending=[True, False])

# Number of songs to test_set
m = 1

# Create test and train set
test_set = clean_sorted.groupby('playlist_name').head(m).reset_index(drop=True)
train_set = clean_sorted.groupby('playlist_name').apply(lambda x: x.iloc[m:]).reset_index(drop=True)

test_set_filtered = test_set.drop(columns=[
    'playlist_name', 'track_name', 'count', 'artists', 'album_name', 
    'explicit', 'key', 'mode', 'speechiness', 'time_signature', 
    'track_genre', 'artist_name', 'date_added', 'ones', 'track_encoded'
])
test_set_filtered = test_set_filtered.set_index('playlist_encoded')
test_set_filtered_normalized = scaler.fit_transform(test_set_filtered)

# Create the interaction matrix where the rows are the playlists and the columns are the songs
interaction_matrix = train_set.pivot_table(index='playlist_encoded', columns='track_encoded', values='ones', aggfunc='sum', fill_value=0)

# SVD to get the components 
svd = TruncatedSVD(n_components=10) 
latent_matrix = svd.fit_transform(interaction_matrix)

def get_best_new_song_per_playlist(interaction_matrix, svd):
    """
    Given Latent factorization, returns a list where the i-th element is the recommended song for the i-th playlist.
    """
    latent_matrix = svd.transform(interaction_matrix)
    predicted_scores = latent_matrix.dot(svd.components_)  

    best_songs = [None] * len(interaction_matrix)  

    for playlist_pos, playlist_idx in enumerate(interaction_matrix.index):
        existing_songs = np.nonzero(interaction_matrix.loc[playlist_idx].to_numpy())[0]
        masked_scores = np.ma.masked_array(predicted_scores[playlist_pos], mask=np.isin(np.arange(predicted_scores.shape[1]), existing_songs))
        best_song_idx = np.argmax(masked_scores)
        best_songs[playlist_pos] = best_song_idx

    return best_songs

# Recommended songs
recommended_songs = get_best_new_song_per_playlist(interaction_matrix, svd)

prediction = data.groupby('track_encoded').first().loc[recommended_songs].drop(columns=[
    'playlist_name', 'track_name', 'count', 'artists', 'album_name', 
    'explicit', 'key', 'mode', 'speechiness', 'time_signature', 
    'track_genre', 'artist_name', 'date_added', 'ones', 'playlist_encoded'
])

prediction_normalized = scaler.transform(prediction)

# Calculating the Loss
def loss_function_sing_rmse(predicted, song):
    """
    Given a recommended song and a song, returns the calculated RMSE given song properties.
    """
    return np.sqrt(np.mean((predicted - song) ** 2))

# Function to calculate the model loss as a whole
def model_loss(list_prediction, list_songs, loss_function):
    """
    Given a list of recommended songs and a list of songs, returns the calculated RMSE given song properties.
    """
    loss = 0
    for i in range(len(list_prediction)):
        loss += loss_function(list_prediction[i], list_songs[i])
    return np.sqrt(loss / len(list_prediction))

# Actual model RMSE
print(f'Model RMSE: {model_loss(prediction_normalized, test_set_filtered_normalized, loss_function_sing_rmse)}')


Model RMSE: 0.5425499683836216


In [14]:
data
data['track_genre']

0           acoustic
1        alternative
2           alt-rock
3        alternative
4           alt-rock
            ...     
50581       alt-rock
50582       alt-rock
50583       acoustic
50584       afrobeat
50585       acoustic
Name: track_genre, Length: 50586, dtype: object

In [16]:
best_songs

[np.int64(601),
 np.int64(868),
 np.int64(809),
 np.int64(726),
 np.int64(868),
 np.int64(1408),
 np.int64(314),
 np.int64(1036),
 np.int64(824),
 np.int64(604),
 np.int64(601),
 np.int64(601),
 np.int64(589),
 np.int64(868),
 np.int64(1036),
 np.int64(1573),
 np.int64(583),
 np.int64(1036),
 np.int64(594),
 np.int64(601),
 np.int64(601),
 np.int64(454),
 np.int64(1083),
 np.int64(661),
 np.int64(1372),
 np.int64(601),
 np.int64(274),
 np.int64(1049),
 np.int64(1036),
 np.int64(980),
 np.int64(1603),
 np.int64(868),
 np.int64(1036),
 np.int64(1418),
 np.int64(274),
 np.int64(274),
 np.int64(1487),
 np.int64(594),
 np.int64(274),
 np.int64(824),
 np.int64(824),
 np.int64(1036),
 np.int64(974),
 np.int64(1036),
 np.int64(868),
 np.int64(568),
 np.int64(1408),
 np.int64(668),
 np.int64(314),
 np.int64(594),
 np.int64(601),
 np.int64(1566),
 np.int64(274),
 np.int64(594),
 np.int64(568),
 np.int64(1372),
 np.int64(1573),
 np.int64(974),
 np.int64(568),
 np.int64(1573),
 np.int64(589),
 np.