# Playlist Maker
## Joseph Moore
## Anirud Shrestha

# Load in the data

In [2]:
import os

import kagglehub

# Get lyrics dataset from Kaggle. dataset_download returns the local cache
# directory, so the CSV file name is joined on with the platform separator.
lyrics_path = os.path.join(
    kagglehub.dataset_download("carlosgdcj/genius-song-lyrics-with-language-information"),
    'song_lyrics.csv',
)

print("Path to lyrics dataset:", lyrics_path)

# Get playlists dataset from Kaggle
playlists_path = os.path.join(
    kagglehub.dataset_download("asifsadmine/spotify-playlists-dataset"),
    'spotify_dataset.csv',
)

# Label fixed: this line previously claimed to print the lyrics path.
print("Path to playlists dataset:", playlists_path)

Path to lyrics dataset: C:\Users\Joseph\.cache\kagglehub\datasets\carlosgdcj\genius-song-lyrics-with-language-information\versions\1/song_lyrics.csv
Path to lyrics dataset: C:\Users\Joseph\.cache\kagglehub\datasets\asifsadmine\spotify-playlists-dataset\versions\1/spotify_dataset.csv


In [3]:
import pandas as pd

# Read the lyrics CSV into a DataFrame and preview its first rows
df_lyrics = pd.read_csv(filepath_or_buffer=lyrics_path)
df_lyrics.head(n=5)

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [None]:
# Read the playlists CSV into a DataFrame and preview its first rows.
# on_bad_lines='skip' makes the parser drop malformed lines instead of raising.
df_playlists = pd.read_csv(filepath_or_buffer=playlists_path, on_bad_lines='skip')
df_playlists.head(n=5)

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


# Clean the data

In [5]:
import re

def clean_lyrics(input_str):
    """Strip bracketed annotations from lyrics and flatten them onto one line.

    Every ``[...]`` span that opens and closes on the same line is deleted,
    brackets included (the pattern's ``.`` does not cross newlines). After
    that, each newline character is replaced by a single space.

    Args:
        input_str: Raw lyrics text.

    Returns:
        The lyrics without same-line bracketed spans and without newlines.
    """
    without_tags = re.compile(r'\[.*?\]').sub('', input_str)
    # Splitting on '\n' and re-joining with ' ' swaps each newline for a space.
    return ' '.join(without_tags.split('\n'))

# Run clean_lyrics over every entry of the lyrics column and store the result back
df_lyrics['lyrics'] = df_lyrics['lyrics'].map(clean_lyrics)

In [None]:
# Inspect the cleaned lyrics column (pandas abbreviates the display of a long Series)
df_lyrics['lyrics']

0           Killa Cam, Killa Cam, Cam Killa Cam, Killa Ca...
1             Yeah, hah, yeah, Roc-A-Fella We invite you ...
2          Maybe cause I'm eatin And these bastards fiend...
3             Ugh, Killa! Baby! Kanye, this that 1970s He...
4           So they ask me "Young boy What you gon' do th...
                                 ...                        
5134851     Dance for me now Keeping yourself moving You'...
5134852      Ja, ja    R-A-H, Merhaba, alles was ich mach...
5134853     Here our purpose feels alive We are more than...
5134854    Jestem CEO w tym To jara twojÄ… bitch Nikt na m...
5134855     You need a new number, one that ain't burned ...
Name: lyrics, Length: 5134856, dtype: object

In [16]:
# Replace every column label in one assignment; 'artist' and 'title' are the
# keys the later merge with df_lyrics joins on.
df_playlists.columns = pd.Index(['user_id', 'artist', 'title', 'playlist_name'])

In [19]:
# Inner join: keep only playlist entries whose (title, artist) pair also
# appears in the lyrics table.
df_merged = df_playlists.merge(df_lyrics, how='inner', on=['title', 'artist'])
df_merged.head(n=5)

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010,rock,1977,32827,{},"Oh, it's so funny to be seeing you after so l...",457424,en,en,en
1,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010,pop,2021,434,{},I'm gonna break down the walls That keep us f...,1976715,en,en,en
2,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Bright Side,HARD ROCK 2010,pop,2007,167,{},It's just me and my puppy Together we're so ve...,1846022,en,en,en
3,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Dance Tonight,HARD ROCK 2010,rock,2007,1761,{},Everybody gonna dance tonight Everybody gonna...,387833,en,en,en
4,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Everywhere I Go,HARD ROCK 2010,pop,2010,9454,{},And I'll fall on my knees Tell me how's the wa...,934327,en,en,en


In [27]:
from transformers import BertTokenizer, BertModel
import torch

In [53]:
from tqdm.notebook import tqdm


# Tokenizer and encoder are loaded from the same pretrained checkpoint;
# encode_text reads both as module-level names.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

def encode_text(text):
    """Embed a piece of text as the encoder's [CLS]-position hidden state.

    Uses the module-level ``tokenizer`` and ``model``. The text is tokenized
    (truncated to the tokenizer's maximum length), passed through the model
    with gradients disabled, and the last-layer hidden state at position 0
    is returned.

    Args:
        text: The string to embed.

    Returns:
        A flat list of Python floats, one per hidden dimension.
    """
    # Calling the tokenizer directly is the supported replacement for the
    # legacy encode_plus method.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Position 0 of the sequence axis is the [CLS] token. The mean-pooled
    # embedding that used to be computed here was never returned, so it was
    # removed to avoid wasted work on every call.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    return cls_embedding.flatten().tolist()

# Embed the first 1% of the merged rows. Each feature vector is the
# playlist-name embedding with the title embedding appended to it.
sample_size = int(len(df_merged) / 100)
vectors = []
for row_label in tqdm(range(sample_size)):
    name_embedding = encode_text(df_merged.at[row_label, 'playlist_name'])
    title_embedding = encode_text(df_merged.at[row_label, 'title'])
    vectors.append(name_embedding + title_embedding)

print(vectors[0])

  0%|          | 0/47704 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [40]:
def get_negative_examples(df, current_playlist, n=5, random_state=42):
    """
    Draw n random rows of df that do not belong to current_playlist.

    A row qualifies when its 'playlist_name' differs from current_playlist.
    random_state is forwarded to DataFrame.sample, so identical arguments
    yield the identical draw.
    """
    is_other_playlist = df['playlist_name'].ne(current_playlist)
    candidates = df.loc[is_other_playlist]
    return candidates.sample(n=n, random_state=random_state)

In [55]:
from tqdm.notebook import tqdm
import random

In [None]:
# Step 1: Bucket rows by playlist
buckets = {k: v.reset_index(drop=True) for k, v in df_merged.groupby('playlist_name')}

# Step 2: Create a list of all playlist names
all_playlists = list(buckets.keys())

# Step 3: Build negatives efficiently
N_NEGATIVE_PLAYLISTS = 3  # negative playlists drawn per anchor row
neg_samples = []

for idx, row in tqdm(df_merged.iterrows(), total=len(df_merged)):
    current_playlist = row['playlist_name']

    # Choose other playlists (negatives). Drawing one extra name and then
    # discarding the anchor's own playlist gives a uniform sample of the
    # *other* playlists without rebuilding the full list of names for every
    # row, which was rows x playlists work.
    drawn = random.sample(all_playlists, k=N_NEGATIVE_PLAYLISTS + 1)
    sampled_playlists = [p for p in drawn if p != current_playlist][:N_NEGATIVE_PLAYLISTS]

    # From each playlist, randomly pick one row
    for pl in sampled_playlists:
        neg_row = buckets[pl].sample(n=1).iloc[0]

        neg_samples.append({
            'anchor_title': row['title'],
            'anchor_playlist': current_playlist,
            'negative_title': neg_row['title'],
            'negative_playlist': neg_row['playlist_name']
        })

# Encode the negative pairs with the same layout as the positives:
# playlist-name embedding followed by title embedding.
#
# Two fixes relative to the previous version:
#   * each sample is a dict, so fields are read with ['key'] (attribute
#     access raised AttributeError);
#   * the pair is (anchor playlist, title drawn from a different playlist).
#     Pairing negative_playlist with negative_title reproduces a real row of
#     df_merged, i.e. a true pair that would be labelled as a negative.
neg_vectors = []
for sample in tqdm(neg_samples):
    playlist_vec = encode_text(sample['anchor_playlist'])
    title_vec = encode_text(sample['negative_title'])
    neg_vectors.append(playlist_vec + title_vec)


  0%|          | 0/4770458 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Labels: 1 for every positive vector, 0 for every negative vector.
# The lists are concatenated so y is flat and lines up one-to-one with X
# (append() added the whole list of zeros as a single nested element).
y = [1] * len(vectors) + [0] * len(neg_vectors)

X = vectors + neg_vectors

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Use a dedicated name for the classifier: encode_text reads the module-level
# `model` (the BERT encoder), so rebinding `model` here would break any later
# call to encode_text.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)