# Playlist Maker
## Joseph Moore
## Anirud Shrestha

# Load in the data

In [1]:
import kagglehub

# Get lyrics dataset from Kaggle
lyrics_path = kagglehub.dataset_download("carlosgdcj/genius-song-lyrics-with-language-information") + '/song_lyrics.csv'

print("Path to lyrics dataset:", lyrics_path)

# Get playlists dataset from Kaggle
playlists_path = kagglehub.dataset_download("asifsadmine/spotify-playlists-dataset") + '/spotify_dataset.csv'

print("Path to lyrics dataset:", playlists_path)

Path to lyrics dataset: C:\Users\Joseph\.cache\kagglehub\datasets\carlosgdcj\genius-song-lyrics-with-language-information\versions\1/song_lyrics.csv
Path to lyrics dataset: C:\Users\Joseph\.cache\kagglehub\datasets\asifsadmine\spotify-playlists-dataset\versions\1/spotify_dataset.csv


In [2]:
import pandas as pd

# Load in to dataframe
df_lyrics = pd.read_csv(lyrics_path)
df_lyrics.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [3]:
# Load in to dataframe
df_playlists = pd.read_csv(playlists_path, on_bad_lines='skip')
df_playlists.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


# Clean the data

In [4]:
import re

def clean_lyrics(input_str):
    # Remove content within square brackets, including brackets
    no_brackets = re.sub(r'\[.*?\]', '', input_str)
    # Remove all newline characters
    cleaned = no_brackets.replace('\n', ' ')
    return cleaned

df_lyrics['lyrics'] = [clean_lyrics(lyrics) for lyrics in df_lyrics['lyrics']]

In [5]:
df_lyrics['lyrics']

0           Killa Cam, Killa Cam, Cam Killa Cam, Killa Ca...
1             Yeah, hah, yeah, Roc-A-Fella We invite you ...
2          Maybe cause I'm eatin And these bastards fiend...
3             Ugh, Killa! Baby! Kanye, this that 1970s He...
4           So they ask me "Young boy What you gon' do th...
                                 ...                        
5134851     Dance for me now Keeping yourself moving You'...
5134852      Ja, ja    R-A-H, Merhaba, alles was ich mach...
5134853     Here our purpose feels alive We are more than...
5134854    Jestem CEO w tym To jara twoją bitch Nikt na m...
5134855     You need a new number, one that ain't burned ...
Name: lyrics, Length: 5134856, dtype: object

In [6]:
# Rename all columns at once
df_playlists.columns = ['user_id', 'artist', 'title', 'playlist_name']

In [7]:
df_merged = pd.merge(df_playlists, df_lyrics, on=['title', 'artist'], how='inner')
df_merged.head()

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010,rock,1977,32827,{},"Oh, it's so funny to be seeing you after so l...",457424,en,en,en
1,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010,pop,2021,434,{},I'm gonna break down the walls That keep us f...,1976715,en,en,en
2,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Bright Side,HARD ROCK 2010,pop,2007,167,{},It's just me and my puppy Together we're so ve...,1846022,en,en,en
3,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Dance Tonight,HARD ROCK 2010,rock,2007,1761,{},Everybody gonna dance tonight Everybody gonna...,387833,en,en,en
4,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Everywhere I Go,HARD ROCK 2010,pop,2010,9454,{},And I'll fall on my knees Tell me how's the wa...,934327,en,en,en


In [None]:
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# tokenizer = BertTokenizer.from_pretrained('distilbert-base-uncased')
# model = BertModel.from_pretrained('distilbert-base-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)

def encode_text(batch):
    """
    Encodes a line of text into a vector
    """
    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    vectors = []
    with torch.no_grad():
        outputs = model(**inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        vectors.append(cls_embeddings.cpu())

    return torch.cat(vectors, dim=0)

cuda


In [54]:
sampled_df = df_merged.sample(frac=0.001)
print(len(sampled_df))
sampled_df.head()

4770


Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language
763100,6c2c592733cdd41a8de7527deccd4603,Taylor Swift,I Know Places,Favourites,pop,2014,242314,{},"I, I, I, I, I, I, I, I-I I, I, I, I, I, I, I...",551086,en,en,en
4037526,bdcf539e8e77b705259c1110db88ce9b,Led Zeppelin,Thank You,Dios salve a la Reina?,rock,1969,153161,{},If the sun refused to shine I would still be ...,107991,en,en,en
50445,e4bb63911a09261dcea90cc89c4132af,Iron & Wine,Free Until They Cut Me Down,Starred,rock,2004,3617,{},When the men take me to the devil tree I will...,133028,en,en,en
3231744,923c7ff3ade0af8f4a32d43720d17620,Eagles,Take It Easy,Chill Tunes,rock,1972,779490,{},"Well, I'm running down the road tryin' to loo...",106895,en,en,en
1271753,4a92c6735b0ffd41572a58b215c2bc85,Timbuktu,Alla vill till himmelen men ingen vill dö,For Nanda,rap,2004,49224,{},För man vill ha vad man vill ha och man vill ...,472686,sv,sv,sv


In [None]:
from tqdm.notebook import tqdm

pos_vectors = []

playlist_names = sampled_df['playlist_name'].values
titles = sampled_df['title'].values

batch_size = 64
for i in tqdm(range(0, len(sampled_df), batch_size)):
    playlist_vec = encode_text(list(playlist_names[i:i+batch_size]))
    title_vec = encode_text(list(titles[i:i+batch_size]))
    for j in range(len(playlist_vec)):
        pos_vectors.append(playlist_vec[j] + title_vec[j])

print(pos_vectors[0])

  0%|          | 0/75 [00:00<?, ?it/s]

[[-0.20668116211891174, 0.2969786524772644, -0.13368602097034454, -0.055402640253305435, -0.1357887089252472, -0.05139589309692383, 0.23789294064044952, 0.25488924980163574, -0.24373409152030945, -0.0815589427947998, 0.08518191426992416, 0.1344822198152542, 0.09451105445623398, 0.2085465043783188, 0.06526067852973938, 0.002059415215626359, -0.5037162899971008, 0.4684324562549591, 0.11005610227584839, -0.160478875041008, 0.1276787519454956, -0.12281467765569687, -0.14652664959430695, -0.1480330377817154, 0.047277193516492844, -0.11847414821386337, -0.22825273871421814, 0.18489311635494232, 0.03334340825676918, -0.04706549271941185, 0.13200797140598297, 0.09680734574794769, -0.05280419439077377, 0.0993162989616394, -0.10184910148382187, 0.025783322751522064, 0.15711326897144318, -0.23064792156219482, -0.01421539019793272, 0.035523999482393265, 0.07469343394041061, -0.05254284664988518, 0.13089552521705627, 0.06684646755456924, -0.14707061648368835, -0.24223299324512482, -1.50474858283996

In [None]:
def get_negative_examples(df, current_playlist, n=3, random_state=42):
    """
    Returns n negative examples (rows) where playlist_name != current_playlist
    """
    negative_df = df[df['playlist_name'] != current_playlist]
    return negative_df.sample(n=n, random_state=random_state)

In [73]:
import random

# Step 1: Bucket rows by playlist
#buckets = {k: v.reset_index(drop=True) for k, v in sampled_df.groupby('user_id', 'playlist_name')}

# Step 2: Create a list of all playlist names
#all_playlists = list(buckets.keys())

# Step 3: Build negatives efficiently
neg_sample_titles: list[str] = []
neg_sample_playlists = []

for idx, row in tqdm(sampled_df.iterrows(), total=len(sampled_df)):
    current_playlist = row['playlist_name']
    samples = get_negative_examples(sampled_df, current_playlist)
    for title in samples['title']:
        neg_sample_titles.append(title)
        neg_sample_playlists.append(current_playlist)
    # current_playlist = row['playlist_name']
    
    # # Choose other playlists (negatives)
    # other_playlists = [p for p in all_playlists if p != current_playlist]
    # sampled_playlists = random.sample(range(len(other_playlists)), k=3)  # 3 negative playlists

    # # From each playlist, randomly pick one row
    # for pl in sampled_playlists:
    #     neg_row = buckets[pl].sample(n=1).iloc[0]
    #     neg_sample_titles[]
    #     neg_samples.append({
    #         'anchor_title': row['title'],
    #         'anchor_playlist': current_playlist,
    #         'negative_title': neg_row['title'],
    #         'negative_playlist': neg_row['playlist_name']
    #     })

neg_vectors = []
for i in tqdm(range(0, len(neg_sample_titles), batch_size)):
    playlist_vec = encode_text(list(neg_sample_playlists[i:i+batch_size]))
    title_vec = encode_text(list(neg_sample_titles[i:i+batch_size]))
    neg_vectors.append(playlist_vec + title_vec)


  0%|          | 0/4770 [00:00<?, ?it/s]

  0%|          | 0/373 [00:00<?, ?it/s]

In [None]:
X = pos_vectors + neg_vectors
y = ([1] * len(pos_vectors)) + ([0] * len(neg_vectors))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(X_train)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)