# Playlist Maker
## Joseph Moore
## Anirud Shrestha

# Load in the data

In [2]:
import kagglehub

# Get lyrics dataset from Kaggle
lyrics_path = kagglehub.dataset_download("carlosgdcj/genius-song-lyrics-with-language-information") + '/song_lyrics.csv'

print("Path to lyrics dataset:", lyrics_path)

# Get playlists dataset from Kaggle
playlists_path = kagglehub.dataset_download("asifsadmine/spotify-playlists-dataset") + '/spotify_dataset.csv'

print("Path to lyrics dataset:", playlists_path)

Path to lyrics dataset: C:\Users\Joseph\.cache\kagglehub\datasets\carlosgdcj\genius-song-lyrics-with-language-information\versions\1/song_lyrics.csv
Path to lyrics dataset: C:\Users\Joseph\.cache\kagglehub\datasets\asifsadmine\spotify-playlists-dataset\versions\1/spotify_dataset.csv


In [3]:
import pandas as pd

# Load in to dataframe
df_lyrics = pd.read_csv(lyrics_path)
df_lyrics.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [4]:
# Load in to dataframe
df_playlists = pd.read_csv(playlists_path, on_bad_lines='skip')
df_playlists.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


In [5]:
df_lyrics.describe()

Unnamed: 0,year,views,id
count,5134856.0,5134856.0,5134856.0
mean,2010.303,3060.939,3830088.0
std,45.01192,47309.8,2305657.0
min,1.0,0.0,1.0
25%,2009.0,22.0,1625220.0
50%,2016.0,85.0,3866618.0
75%,2019.0,448.0,5820614.0
max,2100.0,23351420.0,7882848.0


In [6]:
df_playlists.describe()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
count,12891680,12858108,12891592,12890434
unique,15918,289820,2032043,157504
top,4398de6902abde3351347b048fcdc287,Daft Punk,Intro,Starred
freq,295275,36086,6676,1337085


# Clean the data

In [7]:
import re

def clean_lyrics(input_str):
    # Remove content within square brackets, including brackets
    no_brackets = re.sub(r'\[.*?\]', '', input_str)
    # Remove all newline characters
    cleaned = no_brackets.replace('\n', ' ')
    return cleaned

df_lyrics['lyrics'] = [clean_lyrics(lyrics) for lyrics in df_lyrics['lyrics']]

In [8]:
df_lyrics['lyrics'][0:10]

0     Killa Cam, Killa Cam, Cam Killa Cam, Killa Ca...
1       Yeah, hah, yeah, Roc-A-Fella We invite you ...
2    Maybe cause I'm eatin And these bastards fiend...
3       Ugh, Killa! Baby! Kanye, this that 1970s He...
4     So they ask me "Young boy What you gon' do th...
5     Haha Uh-huh No homo (Young Mula, baby!) I say...
6     No, no, no! I told you, I lived this shit I a...
7     Killa, Dipset Man I spit that pimp talk, you ...
8     Ay yo you wonder who I are I guzzle up at the...
9     Now Lord you know, just how hard I try To liv...
Name: lyrics, dtype: object

In [9]:
# Rename all columns at once
df_playlists.columns = ['user_id', 'artist', 'title', 'playlist_name']

In [10]:
df_lyrics = df_lyrics[['title','artist','lyrics']]
df_lyrics.drop_duplicates(inplace=True)
df_lyrics.dropna(inplace=True)

In [11]:
df_playlists.drop_duplicates(inplace=True)
df_playlists.dropna(inplace=True)

In [12]:
df_merged = pd.merge(df_playlists, df_lyrics, on=['title', 'artist'], how='inner')
df_merged.head()

Unnamed: 0,user_id,artist,title,playlist_name,lyrics
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010,"Oh, it's so funny to be seeing you after so l..."
1,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010,I'm gonna break down the walls That keep us f...
2,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Bright Side,HARD ROCK 2010,It's just me and my puppy Together we're so ve...
3,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Dance Tonight,HARD ROCK 2010,Everybody gonna dance tonight Everybody gonna...
4,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Everywhere I Go,HARD ROCK 2010,And I'll fall on my knees Tell me how's the wa...


In [13]:
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('gaunernst/bert-mini-uncased')
model = BertModel.from_pretrained('gaunernst/bert-mini-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)

def encode_text(batch):
    """
    Encodes a line of text into a vector
    """

    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    vectors = []
    with torch.no_grad():
        outputs = model(**inputs)

    cls_embeddings = outputs.last_hidden_state[:, 0, :]
    vectors.append(cls_embeddings.cpu())

    return torch.cat(vectors, dim=0)

cuda


In [14]:
sampled_df = df_merged.sample(frac = 0.01)

playlist_names = sampled_df['playlist_name'].values
titles = sampled_df['title'].values
lyrics = sampled_df['lyrics'].values

pos_title_vectors = []
pos_lyrics_vectors = []
pos_vectors = []

batch_size = 64
for i in tqdm(range(0, len(sampled_df), batch_size)):
    playlist_vecs = encode_text(list(playlist_names[i:i+batch_size]))
    title_vecs = encode_text(list(titles[i:i+batch_size]))
    lyrics_vecs = encode_text(list(lyrics[i:i+batch_size]))
    for j in range(len(playlist_vecs)):
        pos_vectors.append(playlist_vecs[j].tolist() + title_vecs[j].tolist() + lyrics_vecs[j].tolist())

print(pos_vectors[0])

  0%|          | 0/746 [00:00<?, ?it/s]

[0.8500780463218689, 0.8893623352050781, -0.05139964073896408, 1.1737929582595825, 0.15702125430107117, -0.9461417198181152, -0.8768520355224609, 0.25261902809143066, -2.9356045722961426, -0.2374022752046585, -0.3848058879375458, -0.468397855758667, 0.5334969758987427, -0.30338144302368164, -0.2778244912624359, 0.0876455008983612, -0.48488399386405945, 0.1234649196267128, -1.2874811887741089, 0.2744855284690857, -0.030293112620711327, 1.552945613861084, -0.48623451590538025, -0.38780245184898376, 0.14743207395076752, 0.6014063954353333, -0.3828173279762268, 0.15166178345680237, 1.3177427053451538, -0.4124968945980072, -0.33330532908439636, 0.6256544589996338, -0.17904669046401978, -0.5141372680664062, 0.15089987218379974, -0.2814899981021881, -0.3251306414604187, -0.17565374076366425, -0.6771166920661926, 0.5710405111312866, -0.31719762086868286, 1.0591222047805786, 0.9301469922065735, -0.5536755919456482, 1.2488276958465576, -0.5895162224769592, -0.3146459460258484, -0.496755212545394

In [15]:
def get_negative_examples(df, current_playlist, n=1):
    """
    Returns n negative examples (rows) where playlist_name != current_playlist
    """
    samples = df.sample(n=n)
    while (current_playlist in samples['playlist_name']):
        samples = df.sample(n=n)
    return samples

In [16]:
neg_sample_titles = []
neg_sample_lyrics = []
neg_sample_playlists = []

for i in tqdm(range(len(sampled_df))):
    row = sampled_df.iloc[i]
    current_playlist = row['playlist_name']
    samples = get_negative_examples(sampled_df, current_playlist)

    sample_titles = list(samples['title'])
    sample_lyrics = list(samples['lyrics'])

    for j in range(len(samples)):
        neg_sample_titles.append(sample_titles[j])
        neg_sample_lyrics.append(sample_lyrics[j])
        neg_sample_playlists.append(current_playlist)

  0%|          | 0/47704 [00:00<?, ?it/s]

In [17]:
neg_title_vectors = []
neg_lyric_vectors = []
neg_vectors = []
for i in tqdm(range(0, len(neg_sample_titles), batch_size)):
    playlist_vecs = encode_text(list(neg_sample_playlists[i:i+batch_size]))
    title_vecs = encode_text(list(neg_sample_titles[i:i+batch_size]))
    lyric_vecs = encode_text(list(neg_sample_lyrics[i:i+batch_size]))
    for j in range(len(playlist_vecs)):
        neg_vectors.append(playlist_vecs[j].tolist() + title_vecs[j].tolist() + lyric_vecs[j].tolist())

  0%|          | 0/746 [00:00<?, ?it/s]

In [18]:
X = pos_vectors + neg_vectors
y = ([1] * len(pos_vectors)) + ([0] * len(neg_vectors))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(len(X))

95408


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

title_model = LogisticRegression(max_iter=10000, penalty='l2', C=0.001)
title_model.fit(X_train, y_train)

y_pred = title_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

0.46897599832302694


In [20]:
import torch
import torch.nn as nn

class SimpleBinaryClassifier(nn.Module):
    def __init__(self):
        super(SimpleBinaryClassifier, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(256 * 3, 128),   # 512 -> 128
            nn.ReLU(),
            nn.Linear(128, 64),    # 128 -> 64
            nn.ReLU(),
            nn.Linear(64, 1),      # 64 -> 1
            nn.Sigmoid()           # Binary output
        )
        
    def forward(self, x):
        return self.model(x)
    
loss_fn = nn.BCELoss()

bin_model = SimpleBinaryClassifier()
optimizer = torch.optim.Adam(bin_model.parameters(), lr=0.001)
loss_fn = nn.BCELoss()

epochs = 1000

X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)

for epoch in tqdm(range(epochs)):
    bin_model.train()  # Set model to training mode

    optimizer.zero_grad()  # Clear gradients
    
    y_pred = bin_model(X_train_tensor)  # Forward pass
    loss = loss_fn(y_pred, y_train_tensor.unsqueeze(1))  # Compute loss
    
    loss.backward()  # Backward pass
    optimizer.step()  # Update weights

    #print(f'Epoch {epoch+1}/{epochs}, Loss: {loss.item():.4f}')


X_test_tensor = torch.tensor(X_test, dtype=torch.float32)

# Set model to evaluation mode
bin_model.eval()

# Run without gradients
with torch.no_grad():
    y_pred = bin_model(X_test_tensor)  # outputs probabilities

# Convert probabilities to 0 or 1
predicted_labels = (y_pred >= 0.5).float()

# (Optional) Compare to true labels
correct = len([True for idx in range(len(y_test)) if predicted_labels[idx] == y_test[idx]])
accuracy = correct / len(y_test)

print(f'Accuracy on test data: {accuracy:.4f}')


# Run without gradients
with torch.no_grad():
    y_pred = bin_model(X_train_tensor)  # outputs probabilities

# Convert probabilities to 0 or 1
predicted_labels = (y_pred >= 0.5).float()

# (Optional) Compare to true labels
correct = len([True for idx in range(len(y_train)) if predicted_labels[idx] == y_train[idx]])
accuracy = correct / len(y_train)

print(f'Accuracy on train data: {accuracy:.4f}')

  0%|          | 0/1000 [00:00<?, ?it/s]

Accuracy on test data: 0.4989
Accuracy on train data: 0.7885


## Cosine Similarity way

In [21]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np

# Download tokenizer if not done already
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
# Deduplicate for training
unique_playlists = df_merged['playlist_name'].dropna().drop_duplicates()

In [23]:
# Tokenize unique playlist titles
tokenized_playlists = [word_tokenize(title.lower()) for title in unique_playlists]

In [24]:
w2v = Word2Vec(sentences=tokenized_playlists, vector_size=100, min_count=1, window=5, workers=4)

In [25]:
def get_avg_vector(tokens, model):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
    

In [26]:
query = "Chill songs"
query_tokens = word_tokenize(query.lower())
query_vec = get_avg_vector(query_tokens, w2v)
query_vec

array([-0.42066258,  0.676485  , -0.02537536,  0.0697619 ,  0.77319676,
       -1.6023817 ,  0.14132759,  1.7284477 , -0.7481021 , -0.49754873,
       -0.30181774, -2.3202019 , -0.36047986,  0.38129795,  0.46691138,
       -0.778419  ,  0.8021773 , -0.3037146 ,  0.09185217, -1.9464877 ,
        1.5079829 , -0.11145283,  0.8390622 , -0.79898137,  0.07352323,
        0.21172328, -1.0265982 , -0.08948873, -0.6310284 , -0.3071261 ,
        0.27164203,  0.30648226, -0.08565103, -0.58427715, -0.46929744,
        1.212352  ,  0.5715098 ,  0.30763292, -0.7480915 , -0.4697941 ,
        0.02798036, -0.99698913,  0.16556397,  0.55670077, -0.20966354,
       -0.0057836 , -0.7842448 ,  0.09545594,  0.528535  ,  0.78354585,
        0.23373404, -0.9590487 , -0.30028078, -0.02695134,  0.56718314,
        0.47270888,  0.9051168 , -0.0915748 , -0.7735496 ,  0.39751354,
       -0.610865  , -0.47814453,  0.55304015,  0.27167344, -0.44748628,
        1.0186105 ,  0.49904537,  1.315412  , -0.6865101 ,  0.96

In [27]:
tokenized_all = [
    word_tokenize(str(title).lower()) if pd.notna(title) else []
    for title in df_merged['playlist_name']
]

#playlist_vecs = [get_avg_vector(tokens, w2v) for tokens in tokenized_all]

In [28]:

def get_cosine_similarity_song(query):
    query_tokens = word_tokenize(query.lower())
    query_vec = get_avg_vector(query_tokens, w2v)
    
    # Cosine similarity
    similarities = cosine_similarity([query_vec], playlist_vecs).flatten()

    # Sort with argsort
    sorted_indices = similarities.argsort()[::-1]
    df_sorted = df_merged.iloc[sorted_indices].copy()
    df_sorted['similarity'] = similarities[sorted_indices]

    return df_sorted[:5]

def get_top_cosine_similarity_for_tracks(lyrics_list, playlist_name):
    name_tokens = word_tokenize(playlist_name)
    name_vec = get_avg_vector(name_tokens, w2v)

    results = []
    for lyrics in lyrics_list:
        lyrics_tokens = word_tokenize(lyrics)
        lyrics_vec = get_avg_vector(lyrics_tokens, w2v)
        lyrics_sim = cosine_similarity([name_vec], [lyrics_vec])[0][0]
        results.append(lyrics_sim)

    return results

def get_top_cosine_similarity_for_tracks_bert(lyrics_list, playlist_name):
    name_vec = encode_text([playlist_name])

    results = []
    for lyrics in lyrics_list:
        lyrics_vec = encode_text([lyrics])
        lyrics_sim = cosine_similarity(name_vec, lyrics_vec)[0][0]
        results.append(lyrics_sim)

    return results

In [29]:
import random

unique_users = df_merged['user_id'].unique()
random_user = random.choice(unique_users)
scores = []
baselines = []
for user in tqdm(random.sample(list(unique_users), 200)):
    user_rows = df_merged[df_merged['user_id'] == user]
    if len(user_rows['playlist_name'].unique()) > 1:
        first_name = user_rows['playlist_name'].values[random.randint(0, len(user_rows) - 1)]
        in_playlist = user_rows[user_rows['playlist_name'] == first_name]
        out_playlist = user_rows[
                            (user_rows['playlist_name'] != first_name) &
                            (~user_rows['title'].isin(in_playlist['title']))
                        ]
        full_list = list(in_playlist['lyrics'].values) + list(out_playlist['lyrics'].values)
        results = get_top_cosine_similarity_for_tracks(full_list, first_name)
        zipped = list(zip(in_playlist['title'], results[0:len(in_playlist)]))
        zipped_2 = list(zip(out_playlist['title'], results[len(in_playlist):]))
        sort_zipped = sorted(zipped + zipped_2, key=lambda x: x[1])
        #sort_zipped.reverse()
        titles = [item[0] for item in sort_zipped[0:len(in_playlist)]]
        same = set(titles).intersection(set(in_playlist['title']))
        scores.append(len(same)/float(len(in_playlist)))
        baselines.append(len(in_playlist)/float(len(in_playlist) + len(out_playlist)))
print(np.mean(scores))
print(np.mean(baselines))

  0%|          | 0/200 [00:00<?, ?it/s]

0.6568897045878955
0.36818140705698793


# Hybrid Model

In [30]:
class SongData:
    def __init__(self, song_id, title, lyrics):
        self.song_id = song_id
        self.title = title
        self.lyrics = lyrics

class HybridModel:
    def __init__(self, nn_model: SimpleBinaryClassifier, nn_weight = 0.1, cos_weight = 0.9):
        self.nn_model = nn_model
        self.nn_weight = nn_weight
        self.cos_weight = cos_weight
    
    def fit_predict(self, playlist_name, song_data : list[SongData], top_n = None):
        ids = [song.song_id for song in song_data]
        titles = [song.title for song in song_data]
        lyrics = [song.lyrics for song in song_data]
        playlist_vec = encode_text([playlist_name])[0]
        title_vecs = encode_text(titles)
        lyrics_vecs = encode_text(lyrics)
        combined_vecs = []
        for i in range(len(title_vecs)):
            combined_vecs.append(playlist_vec.tolist() + title_vecs[i].tolist() + lyrics_vecs[i].tolist())
        
        combined_vecs_tensor = torch.tensor(combined_vecs, dtype=torch.float32)

        # Set model to evaluation mode
        self.nn_model.eval()

        # Run without gradients
        with torch.no_grad():
            nn_results = self.nn_model(combined_vecs_tensor).flatten()  # outputs probabilities

        # Cosine similarity
        cos_results = []
        lyrics_vecs = encode_text(lyrics)
        for lyrics_vec in lyrics_vecs:
            lyrics_sim = cosine_similarity([playlist_vec.tolist(), lyrics_vec.tolist()])[0][0]
            cos_results.append(lyrics_sim)

        results = [((nn_results[i] * self.nn_weight) + (cos_results[i] * self.cos_weight)) for i in range(len(song_data))]
        scored_songs = zip(song_data, results)
        sorted_songs = sorted(scored_songs, key=lambda x: x[1])

        # returns the sorted list of songs
        return [song for song in sorted_songs[0]][0:top_n if top_n != None else len(sorted_songs)]

In [31]:
hybrid_model = HybridModel(nn_model = bin_model)

results = hybrid_model.fit_predict("Happy", [SongData("123", "sad song", "I am so sad and unfortunate."), SongData("234","Glee", "Feeling like a joyous soul.")])
print(results[0].title)

Glee


In [32]:
import joblib

#export to file
joblib.dump(hybrid_model, 'model.joblib')

['model.joblib']