In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics.pairwise import cosine_similarity


In [3]:
# Load main Spotify feature dataset used as input for the model
spotify_df = pd.read_csv("data/spotify_df_cleaned.csv")

# Load recommendation metadata
recommendationInfo_df = pd.read_csv("data/recommendation_info.csv")

In [4]:
# Neural network that maps song feature vectors to an embedded vector
class MusicEmbeddingNet(nn.Module):
    def __init__(self, input_dim, embedding_dim=128):
        super(MusicEmbeddingNet, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
            
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
            
            nn.Linear(128, embedding_dim),
            nn.ReLU()
        )
        
    def forward(self, x):
        return self.encoder(x)

In [5]:
# Extract the feature matrix and initliaze the embedding model
X = spotify_df.values
input_dim = X.shape[1]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = MusicEmbeddingNet(input_dim, embedding_dim=128).to(device)

X_tensor = torch.FloatTensor(X).to(device)

In [6]:
# Put the model in evaluation mode and log the shape of the resulting embedding matrix; number of songs and embedding dimensions)
model.eval()
with torch.no_grad():
    embeddings = model(X_tensor).cpu().numpy()

print(f"Neural Network model initialized. Embedding shape: {embeddings.shape}")


Neural Network model initialized. Embedding shape: (32828, 128)


In [7]:
# Given a list of songs/artists, return the top reccommended songs
def recommend_song(song_list, n_recs=10):
    seed_indices = []

    # Find indices of input songs
    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)
        else:
            print(f"Song not found: {track_name} — {artist_name}")

    if len(seed_indices) == 0:
        return "No valid input songs found in dataset."

    # Average seed song embeddings to create query vector
    query_embedding = embeddings[seed_indices].mean(axis=0).reshape(1, -1)

    # Calculate cos similarity between songs
    similarities = cosine_similarity(query_embedding, embeddings)[0]
    
    # Get top similar songs
    top_indices = np.argsort(similarities)[::-1]
    
    # Remove duplicates from recommendations
    filtered = []
    seen = set()
    

    for idx in top_indices:
        # Skip songs that are part of the original input list
        if idx not in seed_indices:
            # Identify song by artist/track name pair
            song_id = (
                recommendationInfo_df.loc[idx, "track_artist"],
                recommendationInfo_df.loc[idx, "track_name"]
            )
            # Only keep songs we haven't added before
            if song_id not in seen:
                seen.add(song_id)
                filtered.append(idx)
            # Stop when we've collected the specifief number of recommendations
            if len(filtered) == n_recs:
                break

    # Return a cleaned DataFrame containing only the recommended tracks           
    return recommendationInfo_df.iloc[filtered].reset_index(drop=True)

In [9]:
songs = [
    ("Everybody Wants to Rule the World", "Tears for Fears"),
]

# Print input songs
print("\nInput Songs:")
for title, artist in songs:
    print(f"- {title} — {artist}")

print("\nRecommended Songs:")

# Generate top 10 recommended songs and display them
recommended_song_list = recommend_song(songs, 10)
print(recommended_song_list)


Input Songs:
- Everybody Wants to Rule the World — Tears for Fears

Recommended Songs:
        track_artist                         track_name  \
0    Tears For Fears  Everybody Wants To Rule The World   
1      Bruce Hornsby                      The Way It Is   
2        David Bowie        Rebel Rebel - 2016 Remaster   
3        Bad Company     Ready for Love - 2015 Remaster   
4        The Buggles        Video Killed The Radio Star   
5        Cheap Trick                          The Flame   
6          Chris Rea                      Ace of Hearts   
7  Bruce Springsteen                     Hello Sunshine   
8     Spandau Ballet                 TRUE - Single Edit   
9        Bad English               When I See You Smile   

                                  track_album_name  
0  Songs From The Big Chair (Super Deluxe Edition)  
1                                    The Way It Is  
2           Diamond Dogs (2016 Remastered Version)  
3                             Bad Company (Deluxe)

In [10]:
# Compute similarity scores between the given songs and the recommended songs
def cosine_similarity_score(song_list, recommended_song_list):
    seed_indices = []

    # Locate each track/artist_name in the recommendation song set
    for track_name, artist_name in song_list:
        t = track_name.lower()
        a = artist_name.lower()

        mask = (
            recommendationInfo_df["track_name"].str.lower() == t
        ) & (
            recommendationInfo_df["track_artist"].str.lower() == a
        )

        # If a match is found store the  index
        if mask.sum() > 0:
            idx = recommendationInfo_df[mask].index[0]
            seed_indices.append(idx)

    # Average embedding of all given seed songs to build a single query vector and embeddings of the recommended songs
    query_embedding = embeddings[seed_indices].mean(axis=0).reshape(1, -1)
    rec_embeddings = embeddings[recommended_song_list.index]

    # Return similarity scores
    scores = cosine_similarity(query_embedding, rec_embeddings)
    return scores.flatten()

# Calculate similarity scores for the given songs and recommendation set
similarity_scores = cosine_similarity_score(songs, recommended_song_list)
print("\nCosine Similarity Scores:")
print(similarity_scores)


Cosine Similarity Scores:
[0.662156   0.6755989  0.63919795 0.66537225 0.6286304  0.66951275
 0.6313503  0.6048805  0.62222064 0.66491264]
