In [2]:
import pandas as pd
import numpy as np
import re
import string
import pickle

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D, Dense

from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity


In [4]:
dataset = pd.read_csv(r'Spotify Million Song Dataset_exported.csv')
dataset.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [5]:
dataset.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [6]:
(dataset.isnull().sum()/dataset.shape[0])*100

artist    0.0
song      0.0
link      0.0
text      0.0
dtype: float64

In [7]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


In [8]:
dataset.describe()

Unnamed: 0,artist,song,link,text
count,57650,57650,57650,57650
unique,643,44824,57650,57494
top,Donna Summer,Have Yourself A Merry Little Christmas,/a/abba/ahes+my+kind+of+girl_20598417.html,I just came back from a lovely trip along the ...
freq,191,35,1,6


In [9]:
dataset.shape

(57650, 4)

In [10]:
dataset.drop(columns=["link"], inplace=True)

In [11]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"\n", " ", text)
    text = re.sub(r"\d+", "", text)
    text = text.translate(str.maketrans("", "", string.punctuation))
    text = re.sub(r"\s+", " ", text).strip()
    return text

In [12]:
dataset["clean_text"] = dataset["text"].apply(clean_text)

In [13]:
dataset["label"] = dataset["song"] + " - " + dataset["artist"]

In [14]:
label_encoder = {label: idx for idx, label in enumerate(dataset["label"].unique())}
dataset["label_id"] = dataset["label"].map(label_encoder)

num_classes = len(label_encoder)
print("Total unique songs:", num_classes)

Total unique songs: 57648


In [16]:
MAX_WORDS = 30000

tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(dataset["clean_text"])

sequences = tokenizer.texts_to_sequences(dataset["clean_text"])

In [18]:
MAX_LEN = 100

X = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
    padding="post",
    truncating="post"
)

y = np.array(dataset["label_id"])

In [21]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [22]:
VOCAB_SIZE = MAX_WORDS
EMBED_DIM = 128

input_layer = Input(shape=(MAX_LEN,))
x = Embedding(VOCAB_SIZE, EMBED_DIM)(input_layer)
x = GlobalAveragePooling1D()(x)
x = Dense(128, activation="relu")(x)

embedding_model = Model(inputs=input_layer, outputs=x)
embedding_model.summary()


In [23]:
embedding_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

In [26]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GlobalAveragePooling1D

VOCAB_SIZE = MAX_WORDS
EMBED_DIM = 128
MAX_LEN = 100

input_layer = Input(shape=(MAX_LEN,))
x = Embedding(
    input_dim=VOCAB_SIZE,
    output_dim=EMBED_DIM,
    input_length=MAX_LEN
)(input_layer)

x = GlobalAveragePooling1D()(x)
embedding_model = Model(inputs=input_layer, outputs=x)
embedding_model.summary()



In [27]:
embedding_model.compile(
    optimizer="adam",
    loss="mse"
)

In [28]:
song_embeddings = embedding_model.predict(
    X,
    batch_size=256,
    verbose=1
)

[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


In [29]:
similarity_matrix = cosine_similarity(song_embeddings)

In [30]:
def identify_song(lyric_snippet, top_k=3):
    # Clean input
    snippet = clean_text(lyric_snippet)

    # Tokenize & pad
    seq = tokenizer.texts_to_sequences([snippet])
    padded = pad_sequences(seq, maxlen=MAX_LEN, padding="post")

    # Generate embedding
    snippet_embedding = embedding_model.predict(padded)

    # Cosine similarity
    similarity_scores = cosine_similarity(
        snippet_embedding, song_embeddings
    )[0]

    # Top matches
    top_indices = similarity_scores.argsort()[-top_k:][::-1]

    results = []
    for idx in top_indices:
        results.append({
            "Song": dataset.iloc[idx]["song"],
            "Artist": dataset.iloc[idx]["artist"],
            "Similarity": round(similarity_scores[idx], 3)
        })

    return results


In [31]:
identify_song(
    "hello from the other side i must have called a thousand times",
    top_k=3
)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 249ms/step


[{'Song': 'Canarsie', 'Artist': 'Frank Zappa', 'Similarity': 0.978},
 {'Song': 'Pivotal Film', 'Artist': 'Guided By Voices', 'Similarity': 0.975},
 {'Song': 'I Just Love Halloween', 'Artist': 'Halloween', 'Similarity': 0.971}]