# Playlist Maker
## Joseph Moore
## Anirud Shrestha

# Load in the data

In [1]:
import kagglehub

# Get lyrics dataset from Kaggle
lyrics_path = kagglehub.dataset_download("carlosgdcj/genius-song-lyrics-with-language-information") + '/song_lyrics.csv'

print("Path to lyrics dataset:", lyrics_path)

# Get playlists dataset from Kaggle
playlists_path = kagglehub.dataset_download("asifsadmine/spotify-playlists-dataset") + '/spotify_dataset.csv'

print("Path to lyrics dataset:", playlists_path)

Path to lyrics dataset: C:\Users\Joseph\.cache\kagglehub\datasets\carlosgdcj\genius-song-lyrics-with-language-information\versions\1/song_lyrics.csv
Path to lyrics dataset: C:\Users\Joseph\.cache\kagglehub\datasets\asifsadmine\spotify-playlists-dataset\versions\1/spotify_dataset.csv


In [2]:
import pandas as pd

# Load in to dataframe
df_lyrics = pd.read_csv(lyrics_path)
df_lyrics.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,Killa Cam,rap,Cam'ron,2004,173166,"{""Cam\\'ron"",""Opera Steve""}","[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...",1,en,en,en
1,Can I Live,rap,JAY-Z,1996,468624,{},"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...",3,en,en,en
2,Forgive Me Father,rap,Fabolous,2003,4743,{},Maybe cause I'm eatin\nAnd these bastards fien...,4,en,en,en
3,Down and Out,rap,Cam'ron,2004,144404,"{""Cam\\'ron"",""Kanye West"",""Syleena Johnson""}",[Produced by Kanye West and Brian Miller]\n\n[...,5,en,en,en
4,Fly In,rap,Lil Wayne,2005,78271,{},"[Intro]\nSo they ask me\n""Young boy\nWhat you ...",6,en,en,en


In [3]:
# Load in to dataframe
df_playlists = pd.read_csv(playlists_path, on_bad_lines='skip')
df_playlists.head()

Unnamed: 0,user_id,"""artistname""","""trackname""","""playlistname"""
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,(The Angels Wanna Wear My) Red Shoes,HARD ROCK 2010
1,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,"(What's So Funny 'Bout) Peace, Love And Unders...",HARD ROCK 2010
2,9cc0cfd4d7d7885102480dd99e7a90d6,Tiffany Page,7 Years Too Late,HARD ROCK 2010
3,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello & The Attractions,Accidents Will Happen,HARD ROCK 2010
4,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010


# Clean the data

In [4]:
import re

def clean_lyrics(input_str):
    # Remove content within square brackets, including brackets
    no_brackets = re.sub(r'\[.*?\]', '', input_str)
    # Remove all newline characters
    cleaned = no_brackets.replace('\n', ' ')
    return cleaned

df_lyrics['lyrics'] = [clean_lyrics(lyrics) for lyrics in df_lyrics['lyrics']]

In [5]:
df_lyrics['lyrics'][0:10]

0     Killa Cam, Killa Cam, Cam Killa Cam, Killa Ca...
1       Yeah, hah, yeah, Roc-A-Fella We invite you ...
2    Maybe cause I'm eatin And these bastards fiend...
3       Ugh, Killa! Baby! Kanye, this that 1970s He...
4     So they ask me "Young boy What you gon' do th...
5     Haha Uh-huh No homo (Young Mula, baby!) I say...
6     No, no, no! I told you, I lived this shit I a...
7     Killa, Dipset Man I spit that pimp talk, you ...
8     Ay yo you wonder who I are I guzzle up at the...
9     Now Lord you know, just how hard I try To liv...
Name: lyrics, dtype: object

In [6]:
# Rename all columns at once
df_playlists.columns = ['user_id', 'artist', 'title', 'playlist_name']

In [7]:
df_lyrics = df_lyrics[['title','artist','lyrics']]
df_lyrics.drop_duplicates(inplace=True)
df_lyrics.dropna(inplace=True)

In [8]:
df_playlists.drop_duplicates(inplace=True)
df_playlists.dropna(inplace=True)

In [9]:
df_merged = pd.merge(df_playlists, df_lyrics, on=['title', 'artist'], how='inner')
df_merged.head()

Unnamed: 0,user_id,artist,title,playlist_name,lyrics
0,9cc0cfd4d7d7885102480dd99e7a90d6,Elvis Costello,Alison,HARD ROCK 2010,"Oh, it's so funny to be seeing you after so l..."
1,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,All Be Okay,HARD ROCK 2010,I'm gonna break down the walls That keep us f...
2,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Bright Side,HARD ROCK 2010,It's just me and my puppy Together we're so ve...
3,9cc0cfd4d7d7885102480dd99e7a90d6,Paul McCartney,Dance Tonight,HARD ROCK 2010,Everybody gonna dance tonight Everybody gonna...
4,9cc0cfd4d7d7885102480dd99e7a90d6,Lissie,Everywhere I Go,HARD ROCK 2010,And I'll fall on my knees Tell me how's the wa...


In [56]:
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel
import torch

tokenizer = BertTokenizer.from_pretrained('gaunernst/bert-mini-uncased')
model = BertModel.from_pretrained('gaunernst/bert-mini-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = model.to(device)

def encode_text(batch):
    """
    Encodes a line of text into a vector
    """

    inputs = tokenizer(batch, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    vectors = []
    with torch.no_grad():
        outputs = model(**inputs)

    cls_embeddings = outputs.last_hidden_state[:, 0, :]
    vectors.append(cls_embeddings.cpu())

    return torch.cat(vectors, dim=0)

cuda


In [11]:
sampled_df = df_merged.sample(frac = 0.01)

playlist_names = sampled_df['playlist_name'].values
titles = sampled_df['title'].values
lyrics = sampled_df['lyrics'].values

pos_title_vectors = []
pos_lyrics_vectors = []
pos_vectors = []

batch_size = 64
for i in tqdm(range(0, len(sampled_df), batch_size)):
    playlist_vecs = encode_text(list(playlist_names[i:i+batch_size]))
    title_vecs = encode_text(list(titles[i:i+batch_size]))
    # lyrics_vecs = encode_text(list(lyrics[i:i+batch_size]))
    for j in range(len(playlist_vecs)):
        pos_vectors.append(playlist_vecs[j].tolist() + title_vecs[j].tolist())# + lyrics_vecs[j].tolist())

print(pos_vectors[0])

  0%|          | 0/746 [00:00<?, ?it/s]

[0.6221179366111755, 0.1559619903564453, -0.0999159961938858, -0.18995171785354614, -0.05494416505098343, -1.3493572473526, -1.5225886106491089, 0.09442138671875, -2.9157962799072266, 0.5095318555831909, -0.8919844627380371, -0.4141472280025482, -0.24264544248580933, 0.25625741481781006, 0.8635625243186951, 0.06430843472480774, 0.2854312062263489, -0.2941000163555145, -0.9147794246673584, -0.28013738989830017, -0.14950215816497803, 1.181396722793579, -0.7103758454322815, 0.25682491064071655, -0.14834371209144592, 0.19156259298324585, -0.2443944215774536, 0.2775861918926239, 1.113974928855896, -1.032404899597168, -1.0068572759628296, 1.055361270904541, -0.19889797270298004, 0.5234076976776123, 0.7205423712730408, 0.3091171681880951, -0.15540394186973572, -0.35815632343292236, 0.8608970046043396, 0.43655845522880554, -1.0591851472854614, 0.19230827689170837, 1.478803277015686, 0.10885284841060638, 0.6318399906158447, 0.3918987214565277, 0.6599422693252563, -0.6255181431770325, 0.04019214

In [14]:
def get_negative_examples(df, current_playlist, n=1):
    """
    Returns n negative examples (rows) where playlist_name != current_playlist
    """
    samples = df.sample(n=n)
    while (current_playlist in samples['playlist_name']):
        samples = df.sample(n=n)
    return samples

In [16]:
neg_sample_titles = []
neg_sample_lyrics = []
neg_sample_playlists = []

for i in tqdm(range(len(sampled_df))):
    row = sampled_df.iloc[i]
    current_playlist = row['playlist_name']
    samples = get_negative_examples(sampled_df, current_playlist)

    sample_titles = list(samples['title'])
    sample_lyrics = list(samples['lyrics'])

    for j in range(len(samples)):
        neg_sample_titles.append(sample_titles[j])
        neg_sample_lyrics.append(sample_lyrics[j])
        neg_sample_playlists.append(current_playlist)

  0%|          | 0/47704 [00:00<?, ?it/s]

In [17]:
neg_title_vectors = []
neg_lyric_vectors = []
neg_vectors = []
for i in tqdm(range(0, len(neg_sample_titles), batch_size)):
    playlist_vecs = encode_text(list(neg_sample_playlists[i:i+batch_size]))
    title_vecs = encode_text(list(neg_sample_titles[i:i+batch_size]))
    #lyric_vecs = encode_text(list(neg_sample_lyrics[i:i+batch_size]))
    for j in range(len(playlist_vecs)):
        neg_vectors.append(playlist_vecs[j].tolist() + title_vecs[j].tolist())# + lyric_vecs[j].tolist())

  0%|          | 0/746 [00:00<?, ?it/s]

In [24]:
X = pos_vectors + neg_vectors
y = ([1] * len(pos_vectors)) + ([0] * len(neg_vectors))

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

title_model = LogisticRegression(max_iter=10000)
title_model.fit(X_train, y_train)

y_pred = title_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)
print(sum(y_pred))
print(len(y_pred))

0.4708625930195996
9249
19082


In [26]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense

In [27]:
model = keras.Sequential([
    Dense(2, activation='relu', input_shape=(512,)), # Input layer with 128 neurons and ReLU activation
    Dense(1, activation='sigmoid')  # Output layer with softmax for classification
])

print('compiling')
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print('fitting')
model.fit(X_train, y_train, epochs=2, verbose=1, batch_size=32)

predictions = model.predict(X_test)

loss, accuracy = model.evaluate(X_test, y_test)
print(f"Loss: {loss:.4f}")
print(f"Accuracy: {accuracy:.4f}")

binary_predictions = (predictions > 0.5).astype(int)
accuracy = accuracy_score(binary_predictions, y_pred)
print(accuracy)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


compiling
fitting


KeyboardInterrupt: 

In [28]:
import joblib
filename = 'logistic_regression_model_joblib.pkl'
joblib.dump(model, filename)

['logistic_regression_model_joblib.pkl']

## Cosine Similarity way

In [29]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import numpy as np

# Download tokenizer if not done already
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Joseph\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [30]:
# Deduplicate for training
unique_playlists = df_merged['playlist_name'].dropna().drop_duplicates()

In [31]:
# Tokenize unique playlist titles
tokenized_playlists = [word_tokenize(title.lower()) for title in unique_playlists]

In [32]:
w2v = Word2Vec(sentences=tokenized_playlists, vector_size=100, min_count=1, window=5, workers=4)

In [33]:
def get_avg_vector(tokens, model):
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)
    

In [34]:
query = "Chill songs"
query_tokens = word_tokenize(query.lower())
query_vec = get_avg_vector(query_tokens, w2v)
query_vec

array([-0.5234494 ,  0.64355826,  0.02636933,  0.37991285,  0.6501976 ,
       -1.6298294 ,  0.14230575,  1.8487517 , -0.902862  , -0.65073663,
       -0.26667398, -1.9376837 , -0.25658116,  0.69909835,  0.3144016 ,
       -0.7736651 ,  0.47572982, -0.10156021, -0.32301176, -1.5575962 ,
        1.279217  ,  0.36618954,  0.5707018 , -0.8233291 ,  0.3548858 ,
       -0.15026505, -0.98910195,  0.16739424, -0.33454823, -0.29526678,
        0.48546708,  0.00445797,  0.1328669 , -0.7900886 , -0.5289084 ,
        1.4921468 ,  0.7211741 ,  0.3939488 , -0.66215086, -0.2959023 ,
        0.06076444, -0.95169365, -0.09683402,  0.26281422, -0.03049245,
       -0.07783057, -0.5591881 ,  0.5622478 ,  0.56233674,  0.8041507 ,
        0.47017905, -1.1458592 , -0.33654067, -0.3816585 ,  0.59345365,
        0.23812923,  0.989583  , -0.16221492, -0.7135481 ,  0.3782798 ,
       -0.60050106, -0.2017516 ,  0.764869  , -0.04847962, -0.3460202 ,
        1.1983538 ,  0.5233567 ,  1.4562445 , -0.4536441 ,  0.77

In [35]:
tokenized_all = [
    word_tokenize(str(title).lower()) if pd.notna(title) else []
    for title in df_merged['playlist_name']
]

#playlist_vecs = [get_avg_vector(tokens, w2v) for tokens in tokenized_all]

In [54]:

def get_cosine_similarity_song(query):
    query_tokens = word_tokenize(query.lower())
    query_vec = get_avg_vector(query_tokens, w2v)
    
    # Cosine similarity
    similarities = cosine_similarity([query_vec], playlist_vecs).flatten()

    # Sort with argsort
    sorted_indices = similarities.argsort()[::-1]
    df_sorted = df_merged.iloc[sorted_indices].copy()
    df_sorted['similarity'] = similarities[sorted_indices]

    return df_sorted[:5]

def get_top_cosine_similarity_for_tracks(lyrics_list, playlist_name):
    name_tokens = word_tokenize(playlist_name)
    name_vec = get_avg_vector(name_tokens, w2v)

    results = []
    for lyrics in lyrics_list:
        lyrics_tokens = word_tokenize(lyrics)
        lyrics_vec = get_avg_vector(lyrics_tokens, w2v)
        lyrics_sim = cosine_similarity([name_vec], [lyrics_vec])[0][0]
        results.append(lyrics_sim)

    return results

def get_top_cosine_similarity_for_tracks_bert(lyrics_list, playlist_name):
    name_vec = encode_text([playlist_name])

    results = []
    for lyrics in lyrics_list:
        lyrics_vec = encode_text([lyrics])
        lyrics_sim = cosine_similarity(name_vec, lyrics_vec)[0][0]
        results.append(lyrics_sim)

    return results

In [67]:
import random

unique_users = df_merged['user_id'].unique()
random_user = random.choice(unique_users)
scores = []
baselines = []
for user in tqdm(random.sample(list(unique_users), 200)):
    user_rows = df_merged[df_merged['user_id'] == user]
    if len(user_rows['playlist_name'].unique()) > 1:
        first_name = user_rows['playlist_name'].values[random.randint(0, len(user_rows) - 1)]
        in_playlist = user_rows[user_rows['playlist_name'] == first_name]
        out_playlist = user_rows[
                            (user_rows['playlist_name'] != first_name) &
                            (~user_rows['title'].isin(in_playlist['title']))
                        ]
        full_list = list(in_playlist['lyrics'].values) + list(out_playlist['lyrics'].values)
        results = get_top_cosine_similarity_for_tracks(full_list, first_name)
        zipped = list(zip(in_playlist['title'], results[0:len(in_playlist)]))
        zipped_2 = list(zip(out_playlist['title'], results[len(in_playlist):]))
        sort_zipped = sorted(zipped + zipped_2, key=lambda x: x[1])
        #sort_zipped.reverse()
        titles = [item[0] for item in sort_zipped[0:len(in_playlist)]]
        same = set(titles).intersection(set(in_playlist['title']))
        scores.append(len(same)/float(len(in_playlist)))
        baselines.append(len(in_playlist)/float(len(in_playlist) + len(out_playlist)))
print(np.mean(scores))
print(np.mean(baselines))

  0%|          | 0/200 [00:00<?, ?it/s]

0.5606976067885431
0.29667158105184466


In [66]:
get_cosine_similarity_song("Love story")

MemoryError: Unable to allocate 3.55 GiB for an array with shape (4770458, 100) and data type float64

In [52]:
get_cosine_similarity_song("Hard rock")

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language,similarity
617655,6649c6b017eec9ffc91f83d6f386c1f6,Intwine,Cruel Man,Hard Rock,pop,2004,1550,{},"Sick of preaching this to you\nNobody listens,...",1329717,en,en,en,1.0
3585797,e979c447d3747b097b58a83f1fb4e894,Uriah Heep,Hot Persuasion,Hard Rock,pop,1982,102,{},You know you light my candle\nYou know you loa...,914695,en,en,en,1.0
3585799,e979c447d3747b097b58a83f1fb4e894,Godsmack,I Am,Hard Rock,rock,2003,1831,{},[Verse 1]\nI am your spoken truth\nI am the li...,273374,en,en,en,1.0
3585800,e979c447d3747b097b58a83f1fb4e894,Godsmack,I Stand Alone,Hard Rock,rock,2002,34945,{},"[Verse 1]\nI've told you this once before, you...",209485,en,en,en,1.0
3585801,e979c447d3747b097b58a83f1fb4e894,Uriah Heep,I Wanna Be Free,Hard Rock,pop,1971,829,{},[Verse 1]\nAs I wake up every day\nWith no new...,1573185,en,en,en,1.0


In [53]:
get_cosine_similarity_song("Lo-fi chill")

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language,similarity
1017465,a1636d750f67c7e6754165a2f7856860,Dave Matthews Band,Belly Full,Chill DMB,rock,2012,2037,{},"[Verse 1]\nOh, my love\nIf I had my way\nThen ...",186079,en,en,en,0.999739
1017475,a1636d750f67c7e6754165a2f7856860,Dave Matthews Band,Snow Outside,Chill DMB,rock,2012,2805,{},"[Verse 1]\nWell, you are like a secret garden\...",186113,en,en,en,0.999739
1017463,a1636d750f67c7e6754165a2f7856860,Jack Johnson,A Pirate Looks At Forty,Chill DMB,pop,2012,3211,{},"Mother, mother ocean, well I have heard you ca...",1798565,en,en,en,0.999739
1017464,a1636d750f67c7e6754165a2f7856860,Dave Matthews Band,Baby Blue,Chill DMB,rock,2009,5764,{},[Verse 1]\nConfess your kiss\nStill knocks me ...,68711,en,en,en,0.999739
1017466,a1636d750f67c7e6754165a2f7856860,Dave Matthews Band,Big Eyed Fish,Chill DMB,rock,2002,7708,{},[Verse 1]\nLook at this big-eyed fish swimmin'...,187932,en,en,en,0.999739


In [54]:
get_cosine_similarity_song("rap")

Unnamed: 0,user_id,artist,title,playlist_name,tag,year,views,features,lyrics,id,language_cld3,language_ft,language,similarity
4169870,a5993960605e6586e5dffabcd30c2757,YG,Do It To Ya,Rap,rap,2014,168895,{TeeFlii},"[Intro: YG]\nHey, hey, hey, hey\nMustard on th...",381670,en,en,en,1.0
693664,57004eb5876f347f53d9d5c237e8bde6,D12,Fight Music,rap,rap,2001,244185,{},"[Chorus: Eminem]\nThis kind of music, use it, ...",1890,en,en,en,1.0
693675,57004eb5876f347f53d9d5c237e8bde6,Lil Wayne,Go DJ,rap,rap,2004,156237,"{""Mannie Fresh""}","[Intro: Mannie Fresh]\nYeah, yeah, yeah, yeah,...",250,en,en,en,1.0
693674,57004eb5876f347f53d9d5c237e8bde6,D12,Girls,rap,rap,2001,275,{},[Eminem]\nHey yo dawg\nI got some shit on my m...,7682026,en,en,en,1.0
693673,57004eb5876f347f53d9d5c237e8bde6,Busta Rhymes,Gimme Some More,rap,rap,1998,88091,{},"[Intro]\nYeah, as a shorty, playing in the fro...",9010,en,en,en,1.0
