In [1]:
import pandas as pd
import numpy as np
import nltk
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
df = pd.read_csv("spotify_millsongdata.csv")

# Keep only required columns
df = df[['song', 'artist', 'text']]
df.dropna(inplace=True)

df.head()


Unnamed: 0,song,artist,text
0,Ahe's My Kind Of Girl,ABBA,"Look at her face, it's a wonderful face \r\nA..."
1,"Andante, Andante",ABBA,"Take it easy with me, please \r\nTouch me gen..."
2,As Good As New,ABBA,I'll never know why I had to go \r\nWhy I had...
3,Bang,ABBA,Making somebody happy is a question of give an...
4,Bang-A-Boomerang,ABBA,Making somebody happy is a question of give an...


In [4]:
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)


In [8]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [9]:
df['clean_lyrics'] = df['text'].apply(clean_text)

df[['song', 'artist', 'clean_lyrics']].head()



Unnamed: 0,song,artist,clean_lyrics
0,Ahe's My Kind Of Girl,ABBA,look face wonderful face means something speci...
1,"Andante, Andante",ABBA,take easy please touch gently like summer even...
2,As Good As New,ABBA,ill never know go put lousy rotten show boy to...
3,Bang,ABBA,making somebody happy question give take learn...
4,Bang-A-Boomerang,ABBA,making somebody happy question give take learn...


In [15]:
df['clean_lyrics'] = df['clean_lyrics'].str[:3000]


In [24]:
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    min_df=2,
    max_df=0.8
)

X = vectorizer.fit_transform(df['clean_lyrics'])

print("Model trained. Shape:", X.shape)




Model trained. Shape: (57650, 10000)


In [25]:
import pickle

with open("tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

with open("lyrics_vectors.pkl", "wb") as f:
    pickle.dump(X, f)


In [26]:
def predict_song(lyrics_snippet):
    snippet_clean = clean_text(lyrics_snippet)
    snippet_vector = vectorizer.transform([snippet_clean])

    similarity_scores = cosine_similarity(snippet_vector, X)
    best_index = similarity_scores.argmax()

    return {
        "Song Title": df.iloc[best_index]['song'],
        "Artist": df.iloc[best_index]['artist'],
        "Similarity Score": similarity_scores[0][best_index]
    }


In [27]:
test_snippet = "hello darkness my old friend"
result = predict_song(test_snippet)

print(result)


{'Song Title': 'Hello My Old Friend', 'Artist': 'Electric Light Orchestra', 'Similarity Score': np.float64(0.6926617917644478)}


In [28]:
import random

correct = 0
tests = 100

for _ in range(tests):
    idx = random.randint(0, len(df)-1)
    lyrics = df.iloc[idx]['text']

    if len(lyrics) < 300:
        continue

    start = random.randint(0, len(lyrics)-200)
    snippet = lyrics[start:start+200]

    prediction = predict_song(snippet)

    if prediction['Song Title'] == df.iloc[idx]['song']:
        correct += 1

accuracy = (correct / tests) * 100
print(f"Improved Accuracy: {accuracy:.2f}%")



Improved Accuracy: 79.00%
