# Playlist Maker
## Joseph Moore
## Anirud Shrestha

# Load in the data

In [1]:
import kagglehub

# Get lyrics dataset from Kaggle. dataset_download returns the local directory
# of the downloaded dataset version; the CSV file name is appended to it.
lyrics_path = kagglehub.dataset_download("carlosgdcj/genius-song-lyrics-with-language-information") + '/song_lyrics.csv'

print("Path to lyrics dataset:", lyrics_path)

# Get playlists dataset from Kaggle
playlists_path = kagglehub.dataset_download("asifsadmine/spotify-playlists-dataset") + '/spotify_dataset.csv'

# Label corrected: this path points at the playlists file, not the lyrics file.
print("Path to playlists dataset:", playlists_path)

Path to lyrics dataset: C:\Users\aniru\.cache\kagglehub\datasets\carlosgdcj\genius-song-lyrics-with-language-information\versions\1/song_lyrics.csv
Path to lyrics dataset: C:\Users\aniru\.cache\kagglehub\datasets\asifsadmine\spotify-playlists-dataset\versions\1/spotify_dataset.csv


In [2]:
import pandas as pd

# Read the lyrics CSV into a DataFrame and preview the first five rows
df_lyrics = pd.read_csv(filepath_or_buffer=lyrics_path)
df_lyrics.head(n=5)

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [3]:
# Read the playlists CSV into a DataFrame; malformed lines are skipped
# instead of aborting the load. Preview the first five rows.
df_playlists = pd.read_csv(filepath_or_buffer=playlists_path, on_bad_lines='skip')
df_playlists.head(n=5)

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


# Clean the data

In [5]:
import re

def clean_lyrics(input_str):
    """Strip bracketed section markers and newlines from a lyrics string.

    Args:
        input_str: Raw lyrics text. Any non-string value (for example ``None``
            or the float NaN that pandas uses for a missing CSV cell) is
            treated as missing lyrics.

    Returns:
        The text with every ``[...]`` span (brackets included) removed and
        each newline replaced by a single space. Returns ``''`` when the
        input is not a string.
    """
    # re.sub raises TypeError on non-strings, which would abort a column-wide
    # cleaning pass on the first missing value.
    if not isinstance(input_str, str):
        return ''
    # Non-greedy so each tag is removed on its own. '.' does not match '\n',
    # so a bracket pair that spans several lines is left as is.
    no_brackets = re.sub(r'\[.*?\]', '', input_str)
    # Remove all newline characters
    return no_brackets.replace('\n', ' ')

# Run clean_lyrics over every entry and overwrite the raw lyrics column
df_lyrics['lyrics'] = df_lyrics['lyrics'].map(clean_lyrics)

In [None]:
# Display the lyrics column (pandas truncates the rendering of long Series)
df_lyrics.loc[:, 'lyrics']

0           Killa Cam, Killa Cam, Cam Killa Cam, Killa Ca...
1             Yeah, hah, yeah, Roc-A-Fella We invite you ...
2          Maybe cause I'm eatin And these bastards fiend...
3             Ugh, Killa! Baby! Kanye, this that 1970s He...
4           So they ask me "Young boy What you gon' do th...
                                 ...                        
5134851     Dance for me now Keeping yourself moving You'...
5134852      Ja, ja    R-A-H, Merhaba, alles was ich mach...
5134853     Here our purpose feels alive We are more than...
5134854    Jestem CEO w tym To jara twoją bitch Nikt na m...
5134855     You need a new number, one that ain't burned ...
Name: lyrics, Length: 5134856, dtype: object

In [16]:
# Rename all columns at once, replacing the raw CSV headers with plain names.
# The list is assigned positionally, so it needs one entry per existing column,
# in column order. 'artist' and 'title' are the keys used to join with the lyrics.
df_playlists.columns = ['user_id', 'artist', 'title', 'playlist_name']

In [17]:
# Inner join: keep only playlist entries whose exact (title, artist) pair
# also appears in the lyrics data, then preview the result.
df_merged = df_playlists.merge(df_lyrics, how='inner', on=['title', 'artist'])
df_merged.head(n=5)

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010,rock,1977,32827,{},"[Verse 1]\nOh, it's so funny to be seeing you ...",457424,en,en,en
1,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010,pop,2021,434,{},[Verse 1]\nI'm gonna break down the walls\nTha...,1976715,en,en,en
2,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Bright Side,HARD ROCK 2010,pop,2007,167,{},It's just me and my puppy\nTogether we're so v...,1846022,en,en,en
3,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Dance Tonight,HARD ROCK 2010,rock,2007,1761,{},[Refrain]\nEverybody gonna dance tonight\nEver...,387833,en,en,en
4,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Everywhere I Go,HARD ROCK 2010,pop,2010,9454,{},And I'll fall on my knees\nTell me how's the w...,934327,en,en,en


In [27]:
from transformers import BertTokenizer, BertModel
import torch

In [53]:
from tqdm.notebook import tqdm


# Tokenizer and encoder must come from the same pretrained checkpoint
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

def encode_text(text):
    """Embed a string with the module-level BERT model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: The string to embed. Over-long input is truncated by the
            tokenizer (``truncation=True``).

    Returns:
        A flat Python list of floats: the final-layer hidden state at
        sequence position 0 (the [CLS] position).
    """
    # Calling the tokenizer directly replaces the deprecated encode_plus;
    # for a single string the resulting tensors are the same.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: skip gradient bookkeeping
    with torch.no_grad():
        outputs = model(**inputs)

    # The attention-masked mean pool that used to be computed here was never
    # returned, so it has been removed; only the position-0 vector is used.
    cls_embedding = outputs.last_hidden_state[:, 0, :]

    return cls_embedding.flatten().tolist()

# Only the first 1/100 of the merged rows are encoded (integer division, so no
# float round-trip). Rows are taken by position, so this does not depend on
# df_merged having a default 0..n-1 index.
n_positive = len(df_merged) // 100

vectors = []
positive_pairs = zip(
    df_merged['playlist_name'].iloc[:n_positive],
    df_merged['title'].iloc[:n_positive],
)
for playlist_text, title_text in tqdm(positive_pairs, total=n_positive):
    playlist_vec = encode_text(playlist_text)
    title_vec = encode_text(title_text)
    # List concatenation: playlist embedding followed by title embedding
    vectors.append(playlist_vec + title_vec)

print(vectors[0])

  0%|          | 0/47704 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [40]:
def get_negative_examples(df, current_playlist, n=5, random_state=42):
    """
    Draw n random rows that belong to any playlist other than current_playlist.

    The draw is seeded with random_state, so repeated calls with the same
    arguments return the same rows.
    """
    belongs_elsewhere = df['playlist_name'].ne(current_playlist)
    candidates = df.loc[belongs_elsewhere]
    return candidates.sample(n=n, random_state=random_state)

In [55]:
from tqdm.notebook import tqdm
import random

In [None]:
# Step 1: Bucket rows by playlist
buckets = {k: v.reset_index(drop=True) for k, v in df_merged.groupby('playlist_name')}

# Step 2: Create a list of all playlist names
all_playlists = list(buckets.keys())

# Step 3: Build negatives efficiently
NEGATIVES_PER_ROW = 3  # negative playlists drawn for every anchor row
neg_samples = []

anchor_pairs = zip(df_merged['title'], df_merged['playlist_name'])
for anchor_title, current_playlist in tqdm(anchor_pairs, total=len(df_merged)):
    # Draw one extra candidate and discard the anchor's own playlist if it was
    # drawn. The survivors are a uniform sample of the *other* playlists, and
    # the full playlist list is no longer copied and filtered for every row.
    candidates = random.sample(all_playlists, k=NEGATIVES_PER_ROW + 1)
    sampled_playlists = [p for p in candidates if p != current_playlist][:NEGATIVES_PER_ROW]

    # From each playlist, randomly pick one row
    for pl in sampled_playlists:
        neg_row = buckets[pl].sample(n=1).iloc[0]

        neg_samples.append({
            'anchor_title': anchor_title,
            'anchor_playlist': current_playlist,
            'negative_title': neg_row['title'],
            'negative_playlist': neg_row['playlist_name']
        })

neg_vectors = []
for neg_sample in tqdm(neg_samples):
    # neg_samples holds plain dicts, so fields are read by key; attribute
    # access (neg_sample.negative_playlist) raises AttributeError.
    playlist_vec = encode_text(neg_sample['negative_playlist'])
    title_vec = encode_text(neg_sample['negative_title'])
    neg_vectors.append(playlist_vec + title_vec)


  0%|          | 0/4770458 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Labels: 1 for real (playlist, title) pairs, 0 for sampled negatives.
# extend keeps y flat with one label per feature row; append would add the
# whole list of zeros as a single nested element.
y = [1] * len(vectors)
y.extend([0] * len(neg_vectors))

X = vectors + neg_vectors

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Named clf rather than model: encode_text reads the module-level BERT `model`,
# and rebinding that name to a classifier breaks every later encode_text call.
clf = LogisticRegression()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

## Cosine similarity approach

In [15]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np

# Download tokenizer if not done already (the call reports "already up-to-date"
# and does nothing further when the package is cached).
# NOTE(review): newer NLTK releases may look for the 'punkt_tab' resource in
# word_tokenize instead — confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\aniru\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
# Deduplicate for training: one entry per distinct, non-missing playlist name
unique_playlists = (
    df_merged['playlist_name']
    .drop_duplicates()
    .dropna()
)

In [28]:
# Tokenize unique playlist titles.
# str() keeps this consistent with the per-row tokenization of playlist_name
# later in the notebook, so a non-string name (e.g. one parsed as a number)
# is tokenized instead of raising AttributeError on .lower().
tokenized_playlists = [word_tokenize(str(title).lower()) for title in unique_playlists]

In [30]:
# Train word embeddings on the tokenized playlist names.
# min_count=1 keeps every token, including ones that occur only once.
w2v = Word2Vec(
    sentences=tokenized_playlists,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

In [32]:
def get_avg_vector(tokens, model):
    """Average the embeddings of the tokens that the model knows.

    Args:
        tokens: Iterable of token strings.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the known tokens' vectors, or a zero vector
        of length ``model.vector_size`` when no token is known.
    """
    embeddings = model.wv
    known = []
    for token in tokens:
        # Out-of-vocabulary tokens are ignored
        if token in embeddings:
            known.append(embeddings[token])
    if known:
        return np.mean(known, axis=0)
    return np.zeros(model.vector_size)
    

In [35]:
# Embed a sample query: lowercase, tokenize, then average the token vectors
query = "Chill songs"
query_tokens = word_tokenize(query.lower())
query_vec = get_avg_vector(tokens=query_tokens, model=w2v)
query_vec

array([-0.40211633,  0.49770546, -0.06018797,  0.47975257,  0.4409672 ,
       -1.7178718 ,  0.30247933,  1.752794  , -1.0481502 , -0.57120657,
       -0.26584294, -1.9541409 , -0.1700099 ,  0.6309916 ,  0.30916572,
       -0.8280403 ,  0.6873963 , -0.20991166, -0.10473429, -1.6494293 ,
        1.2099946 ,  0.16711217,  0.6284453 , -0.748235  ,  0.47427645,
       -0.01936241, -0.97873425,  0.0394406 , -0.23651612, -0.28884754,
        0.51354647,  0.08268362,  0.40246615, -0.89566964, -0.4600961 ,
        1.4260952 ,  0.4365483 ,  0.24312666, -0.670693  , -0.46866864,
        0.1010094 , -0.93168795, -0.01901323,  0.12683307, -0.17713012,
       -0.04141244, -0.5597807 ,  0.5631971 ,  0.2382963 ,  0.85512185,
        0.3523497 , -0.9190556 , -0.46670175, -0.27154177,  0.59139013,
        0.1684408 ,  0.9520826 , -0.13712955, -0.69848317,  0.48197204,
       -0.6475824 , -0.26139048,  0.62170494,  0.04408996, -0.4163286 ,
        1.1623037 ,  0.43397638,  1.4954872 , -0.5548661 ,  0.79

In [49]:
# Tokenize the playlist name of every merged row; a missing name becomes an
# empty token list.
tokenized_all = []
for playlist_title in df_merged['playlist_name']:
    if pd.isna(playlist_title):
        tokenized_all.append([])
    else:
        tokenized_all.append(word_tokenize(str(playlist_title).lower()))

# One averaged embedding per row, in the same order as df_merged
playlist_vecs = list(map(lambda row_tokens: get_avg_vector(row_tokens, w2v), tokenized_all))

In [50]:

def get_cosine_similarity_song(query, top_n=5):
    """Return the merged rows whose playlist name is most similar to a query.

    The query is lowercased, tokenized and averaged into one vector with the
    module-level ``w2v`` model, then compared by cosine similarity against
    ``playlist_vecs`` (one vector per row of ``df_merged``).

    Args:
        query: Free-text description of the desired playlist.
        top_n: Number of rows to return (default 5).

    Returns:
        A copy of the ``top_n`` best-matching rows of ``df_merged`` in
        descending similarity order, with an added ``similarity`` column.
    """
    query_tokens = word_tokenize(query.lower())
    query_vec = get_avg_vector(query_tokens, w2v)

    # Cosine similarity of the query against every row's playlist vector
    similarities = cosine_similarity([query_vec], playlist_vecs).flatten()

    # Pick the best positions first so that only top_n rows are copied,
    # instead of reordering and copying the entire merged frame.
    top_indices = similarities.argsort()[::-1][:top_n]
    df_top = df_merged.iloc[top_indices].copy()
    df_top['similarity'] = similarities[top_indices]

    return df_top

In [51]:
# Example query: a multi-word phrase
get_cosine_similarity_song(query="Love story")

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language,similarity
986405,58da90ea2640a507cc4ada6441bbd214,Taylor Swift,Love Story,Love Story,country,2008,978791,{},[Verse 1]\nWe were both young when I first saw...,98867,en,en,en,1.0
4278373,658159ed7b84bf407b1edd7bf7ce30c8,Hayseed Dixie,Cat Scratch Fever,Mountain Love,pop,2002,55,{},"Well, I don't know where they come from but th...",1702826,en,en,en,0.998227
4278378,658159ed7b84bf407b1edd7bf7ce30c8,Hayseed Dixie,Walk This Way,Mountain Love,pop,2002,190,{},Backstroke lover\nAlways hidin' 'neath the cov...,1255709,en,en,en,0.998227
4278377,658159ed7b84bf407b1edd7bf7ce30c8,Hayseed Dixie,The Perfect Woman,Mountain Love,pop,2002,154,{},You know I like it so you play the game\nYou p...,1420375,en,en,en,0.998227
4278376,658159ed7b84bf407b1edd7bf7ce30c8,Hayseed Dixie,Feel Like Making Love,Mountain Love,pop,2002,37,{},"Baby, when I think about you, I think about lo...",1324261,en,en,en,0.998227


In [52]:
# Example query: a genre phrase
get_cosine_similarity_song(query="Hard rock")

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language,similarity
617655,6649c6b017eec9ffc91f83d6f386c1f6,Intwine,Cruel Man,Hard Rock,pop,2004,1550,{},"Sick of preaching this to you\nNobody listens,...",1329717,en,en,en,1.0
3585797,e979c447d3747b097b58a83f1fb4e894,Uriah Heep,Hot Persuasion,Hard Rock,pop,1982,102,{},You know you light my candle\nYou know you loa...,914695,en,en,en,1.0
3585799,e979c447d3747b097b58a83f1fb4e894,Godsmack,I Am,Hard Rock,rock,2003,1831,{},[Verse 1]\nI am your spoken truth\nI am the li...,273374,en,en,en,1.0
3585800,e979c447d3747b097b58a83f1fb4e894,Godsmack,I Stand Alone,Hard Rock,rock,2002,34945,{},"[Verse 1]\nI've told you this once before, you...",209485,en,en,en,1.0
3585801,e979c447d3747b097b58a83f1fb4e894,Uriah Heep,I Wanna Be Free,Hard Rock,pop,1971,829,{},[Verse 1]\nAs I wake up every day\nWith no new...,1573185,en,en,en,1.0


In [53]:
# Example query: a hyphenated term plus a mood word
get_cosine_similarity_song(query="Lo-fi chill")

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language,similarity
1017465,a1636d750f67c7e6754165a2f7856860,Dave Matthews Band,Belly Full,Chill DMB,rock,2012,2037,{},"[Verse 1]\nOh, my love\nIf I had my way\nThen ...",186079,en,en,en,0.999739
1017475,a1636d750f67c7e6754165a2f7856860,Dave Matthews Band,Snow Outside,Chill DMB,rock,2012,2805,{},"[Verse 1]\nWell, you are like a secret garden\...",186113,en,en,en,0.999739
1017463,a1636d750f67c7e6754165a2f7856860,Jack Johnson,A Pirate Looks At Forty,Chill DMB,pop,2012,3211,{},"Mother, mother ocean, well I have heard you ca...",1798565,en,en,en,0.999739
1017464,a1636d750f67c7e6754165a2f7856860,Dave Matthews Band,Baby Blue,Chill DMB,rock,2009,5764,{},[Verse 1]\nConfess your kiss\nStill knocks me ...,68711,en,en,en,0.999739
1017466,a1636d750f67c7e6754165a2f7856860,Dave Matthews Band,Big Eyed Fish,Chill DMB,rock,2002,7708,{},[Verse 1]\nLook at this big-eyed fish swimmin'...,187932,en,en,en,0.999739


In [54]:
# Example query: a single lowercase word
get_cosine_similarity_song(query="rap")

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language,similarity
4169870,a5993960605e6586e5dffabcd30c2757,YG,Do It To Ya,Rap,rap,2014,168895,{TeeFlii},"[Intro: YG]\nHey, hey, hey, hey\nMustard on th...",381670,en,en,en,1.0
693664,57004eb5876f347f53d9d5c237e8bde6,D12,Fight Music,rap,rap,2001,244185,{},"[Chorus: Eminem]\nThis kind of music, use it, ...",1890,en,en,en,1.0
693675,57004eb5876f347f53d9d5c237e8bde6,Lil Wayne,Go DJ,rap,rap,2004,156237,"{""Mannie Fresh""}","[Intro: Mannie Fresh]\nYeah, yeah, yeah, yeah,...",250,en,en,en,1.0
693674,57004eb5876f347f53d9d5c237e8bde6,D12,Girls,rap,rap,2001,275,{},[Eminem]\nHey yo dawg\nI got some shit on my m...,7682026,en,en,en,1.0
693673,57004eb5876f347f53d9d5c237e8bde6,Busta Rhymes,Gimme Some More,rap,rap,1998,88091,{},"[Intro]\nYeah, as a shorty, playing in the fro...",9010,en,en,en,1.0
