In [6]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [7]:
# Load the song-lyrics CSV into a DataFrame.
# NOTE(review): absolute /content path — presumably a Colab runtime; adjust when running elsewhere.
spotify_file_path ="/content/spotify_songs_dataset.csv"
data = pd.read_csv(spotify_file_path)

In [8]:
# from google.colab import drive
# drive.mount('/content/drive')

In [9]:
data.head(4)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...


In [10]:
data.shape

(57650, 4)

In [11]:
data.isnull().sum()

Unnamed: 0,0
artist,0
song,0
link,0
text,0


In [12]:
data[data['text'].isnull()].head()

Unnamed: 0,artist,song,link,text


In [13]:
data[data['text'] == ''].head()

Unnamed: 0,artist,song,link,text


In [14]:
# Count songs per artist. value_counts() sorts by frequency (descending by default),
# so the resulting index lists each distinct artist from most to fewest songs.
artist_counts = data['artist'].value_counts()
unique_artists = artist_counts.index

In [15]:
print(unique_artists)
print(len(unique_artists))

Index(['Donna Summer', 'Gordon Lightfoot', 'Bob Dylan', 'George Strait',
       'Loretta Lynn', 'Cher', 'Alabama', 'Reba Mcentire', 'Chaka Khan',
       'Dean Martin',
       ...
       'Exo', 'Soundtracks', 'Various Artists', 'Exo-K', 'Ungu', 'Zazie',
       'Zed', 'Zoe', 'X-Treme', 'U-Kiss'],
      dtype='object', name='artist', length=643)
643


## **NLP APPROACH**

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import re

In [17]:
# Fill missing lyrics and cast to str *before* lowercasing: calling astype(str) on a
# column that still contains NaN would turn those entries into the literal text "nan".
data['text'] = data['text'].fillna("").astype(str).str.lower()

In [18]:
data['text'].head()

Unnamed: 0,text
0,"look at her face, it's a wonderful face \nand..."
1,"take it easy with me, please \ntouch me gentl..."
2,i'll never know why i had to go \nwhy i had t...
3,making somebody happy is a question of give an...
4,making somebody happy is a question of give an...


In [19]:
# Pull the columns into plain Python lists; all three come from the same DataFrame,
# so position i in each list refers to the same song.
lyrics = data['text'].fillna("").tolist()  # any missing lyric becomes an empty string
song_titles = data['song'].tolist()
artists = data['artist'].tolist()

In [22]:
# Download the NLTK resources and build the English stopword set used by preprocess_text.
# NOTE(review): 'punkt' backs word_tokenize, which no visible cell calls — confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # set gives constant-time membership checks

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [23]:
def preprocess_text(text):
    """Normalize a lyric for matching: lowercase, drop stopwords, strip punctuation.

    Args:
        text (str): Raw lyric text.

    Returns:
        str: The surviving tokens joined by single spaces ("" if none survive).

    Uses the notebook-level ``stop_words`` set and the ``string`` module.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    tokens = []
    for raw in text.lower().split():
        # Contraction stopwords such as "you're" or "don't" contain an apostrophe,
        # so they must be tested before punctuation is deleted; afterwards
        # "youre"/"dont" would no longer match the stopword list. Only the
        # surrounding punctuation (e.g. a trailing comma) is trimmed for this check.
        if raw.strip(string.punctuation) in stop_words:
            continue
        word = raw.translate(strip_punct)  # Remove punctuation inside the token
        # Skip tokens that were pure punctuation or that reduce to a stopword.
        if word and word not in stop_words:
            tokens.append(word)
    return " ".join(tokens)

In [24]:
# Run every lyric through preprocess_text, keeping the original song order.
cleaned_lyrics = list(map(preprocess_text, lyrics))

#### **Jaccard Similarity**
Jaccard Similarity compares two text samples by treating them as sets of words.

**Formula:**
$$J(A, B) = \frac{|A \cap B|}{|A \cup B|}$$

where A and B are the two sets being compared, ∣A∩B∣ is the size of the intersection of A and B, and ∣A∪B∣ is the size of the union of A and B.

In [25]:
def jaccard_similarity(text1, text2):
    """Return the Jaccard similarity between the word sets of two texts.

    Each text is split on whitespace and treated as a set of distinct words;
    the score is ``|A ∩ B| / |A ∪ B|``.

    Args:
        text1 (str): First text.
        text2 (str): Second text.

    Returns:
        float: A value in [0, 1]. Returns 0.0 when neither text contains any
        word, instead of dividing by zero.
    """
    words_text1 = set(text1.split())
    words_text2 = set(text2.split())
    union = words_text1 | words_text2
    if not union:
        # Both texts are empty/whitespace-only: nothing to compare.
        return 0.0
    return len(words_text1 & words_text2) / len(union)

In [26]:
# Convert lyrics into TF-IDF vectors
vectorizer = TfidfVectorizer(ngram_range=(1,3))  # Using unigrams, bigrams, and trigrams
tfidf_matrix = vectorizer.fit_transform(cleaned_lyrics)


#### Steps for finding match
- Preprocess the Input Lyric
- Convert the Processed Lyric to a TF-IDF Vector
- Compute Cosine Similarity Between Input & All Song Lyrics
- Compute Jaccard Similarity for Each Song in Dataset
- Combine Cosine & Jaccard Scores
- Find the Top-k Most Similar Songs (Sorting & Selection)
- Print the Most Similar Song(s)

In [39]:
def find_matching_song(input_lyric, top_k=1, cosine_weight=0.7, jaccard_weight=0.3):
    """Print the songs whose lyrics best match a lyric snippet.

    The snippet is preprocessed like the corpus, then scored against every
    song with a weighted blend of TF-IDF cosine similarity and word-set
    Jaccard similarity.

    Args:
        input_lyric (str): Lyric fragment to search for.
        top_k (int): Number of best matches to print. Values <= 0 print
            nothing; values above the corpus size are capped.
        cosine_weight (float): Weight of the TF-IDF cosine score.
        jaccard_weight (float): Weight of the Jaccard score.

    Returns:
        None. Results are printed, one line per match, best first.

    Uses the notebook-level ``vectorizer``, ``tfidf_matrix``,
    ``cleaned_lyrics``, ``song_titles`` and ``artists``.
    """
    input_lyric = preprocess_text(input_lyric)  # Preprocess input text
    if not input_lyric:
        # Nothing left after stopword/punctuation removal: every score would be
        # zero (or the Jaccard step could divide by zero on an empty lyric).
        print("No match: the input lyric has no searchable words.")
        return

    input_vector = vectorizer.transform([input_lyric])  # Convert to TF-IDF vector

    # Compute cosine similarity against every song (one score per corpus row)
    cosine_scores = cosine_similarity(input_vector, tfidf_matrix).flatten()

    # Compute Jaccard similarity against every song
    jaccard_scores = np.array([jaccard_similarity(input_lyric, song_lyric) for song_lyric in cleaned_lyrics])

    # Combine Cosine & Jaccard Scores (Weighted)
    combined_scores = cosine_weight * cosine_scores + jaccard_weight * jaccard_scores

    # Clamp top_k: a slice of [-0:] would otherwise select the whole corpus,
    # and a negative value would drop entries from the wrong end.
    top_k = min(max(int(top_k), 0), len(combined_scores))
    if top_k == 0:
        return

    # Get top-k most similar songs, highest score first
    top_indices = combined_scores.argsort()[-top_k:][::-1]

    # Print
    for idx in top_indices:
        print(f"Matched: {song_titles[idx]} by {artists[idx]} (Confidence: {combined_scores[idx]:.2f})")

In [40]:
query = "sing us a song you're the piano man sing us a song tonight"
find_matching_song(query)

Matched: Piano Man by Billy Joel (Confidence: 0.33)


## =================================================================

## **BM25 - Best Matching 25**

In [None]:
!pip install rank_bm25

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [29]:
from collections import defaultdict
from rank_bm25 import BM25Okapi

In [30]:
# Rebuild the parallel lists for the BM25 section.
# Note: from here on `lyrics` holds the *preprocessed* text, not the raw lyrics.
artists = data['artist'].tolist()
song_titles = data['song'].tolist()
lyrics = [preprocess_text(raw_text) for raw_text in data['text'].fillna("")]

In [31]:
# BM25 expects each document as a list of tokens; whitespace-split every cleaned lyric.
tokenized_corpus = list(map(str.split, lyrics))

In [32]:
# Build the BM25 index over the tokenized lyrics.
# k1 and b are the usual BM25 tuning knobs (term-frequency saturation and
# document-length normalization respectively).
bm25 = BM25Okapi(tokenized_corpus, k1=1.5, b=0.75)
print("BM25 initialized, corpus size:", len(tokenized_corpus))

BM25 initialized, corpus size: 57650


In [35]:
def find_matching_song_BM(input_lyric, top_k=1):
    """Print the songs whose lyrics best match a lyric snippet, ranked by BM25.

    Args:
        input_lyric (str): Lyric fragment to search for.
        top_k (int): Number of best matches to print. Values <= 0 print nothing.

    Returns:
        None. Results are printed, one line per match, best first.

    The printed confidence is each score divided by the best score in the
    corpus (floored at 1), so it is a relative ranking signal, not a
    probability: the top hit reads 1.00 whenever its raw score is >= 1.

    Uses the notebook-level ``bm25``, ``song_titles`` and ``artists``.
    """
    cleaned_lyric = preprocess_text(input_lyric)  # Preprocessing
    tokenized_query = cleaned_lyric.split()
    scores = bm25.get_scores(tokenized_query)  # Get BM25 scores

    best_score = scores.max()
    if not tokenized_query or best_score <= 0:
        # No query term scored against any song; ranking would only pick an
        # arbitrary song and report it as a match.
        print("No match: none of the input words were found in the corpus.")
        return

    top_k = max(int(top_k), 0)  # a negative slice bound would drop entries instead
    top_indices = np.argsort(scores)[::-1][:top_k]  # Sort and get top k

    normalizer = max(best_score, 1)  # loop-invariant, computed once
    for idx in top_indices:
        confidence = min(scores[idx] / normalizer, 1)  # Normalize to 0-1
        print(f"Matched: {song_titles[idx]} by {artists[idx]} (Confidence: {confidence:.2f})")

In [None]:
query1 = "sing us a song you're the piano man sing us a song tonight"
find_matching_song_BM(query1)

Matched: Ahe's My Kind Of Girl by ABBA (Confidence: 1.00)


In [38]:
query2 = "She's just my kind of girl, she makes me feel fine Who could ever believe that"
find_matching_song_BM(query2)

Matched: Ahe's My Kind Of Girl by ABBA (Confidence: 1.00)
