In [None]:
import numpy as np
import pandas as pd


# Data Preprocessing

In [20]:
songs=pd.read_csv('spotify_millsongdata.csv')

In [21]:
songs.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [22]:
songs.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [23]:
songs.duplicated().sum()

np.int64(0)

In [24]:
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [25]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shreyasharan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shreyasharan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/shreyasharan/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [26]:
stop_words=set(stopwords.words('english'))
lemmatizer=WordNetLemmatizer()
def preprocess_lyrics(text):
    if pd.isna(text):
        return ""
    
    text=text.lower()
    text=text.translate(str.maketrans('','',string.punctuation)) #remove punctuation
    tokens=text.split()
    tokens=[lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

songs['clean_lyrics']=songs['text'].apply(preprocess_lyrics)

In [27]:
songs[['song','clean_lyrics']]

Unnamed: 0,song,clean_lyrics
0,Ahe's My Kind Of Girl,look face wonderful face mean something specia...
1,"Andante, Andante",take easy please touch gently like summer even...
2,As Good As New,ill never know go put lousy rotten show boy to...
3,Bang,making somebody happy question give take learn...
4,Bang-A-Boomerang,making somebody happy question give take learn...
...,...,...
57645,Good Old Days,irie day come play let angel fly let devil die...
57646,Hand To Mouth,power worker power power worker need power pow...
57647,Come With Me,need something ill believe flashlight hall cal...
57648,Desire,northern star frightened go rest cant sleep im...


In [28]:
songs=songs[['artist','song','clean_lyrics']]

# Vectorization

In [59]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(stop_words='english',max_features=5000)
tfidf_matrix=tfidf.fit_transform(songs['clean_lyrics'])

In [60]:
#KMeans groups similar lyrics together based upon word usage patterns

from sklearn.cluster import KMeans

num_clusters=8 #choose no. of clusters 
kmeans=KMeans(n_clusters=num_clusters,random_state=42)
songs['genre_cluster']=kmeans.fit_predict(tfidf_matrix)

#Now every song has a genre_cluster (0 to 7) — its auto-discovered group.

In [61]:
order_centroids =kmeans.cluster_centers_.argsort()[:,::-1]
terms=tfidf.get_feature_names_out()

print("Top words per genre cluster:\n")
for i in range(num_clusters):
    print(f"Clusters {i}: ",end='')
    top_words=[terms[ind] for ind in order_centroids[i, :10]]
    print(", ".join(top_words))


Top words per genre cluster:

Clusters 0: love, heart, know, im, dont, need, time, ill, oh, like
Clusters 1: oh, yeah, love, im, know, dont, like, come, got, time
Clusters 2: im, gonna, like, got, know, dont, aint, nigga, cause, ive
Clusters 3: dont, know, want, youre, time, say, let, im, ill, love
Clusters 4: christmas, merry, year, santa, day, tree, bell, mistletoe, snow, time
Clusters 5: like, come, time, day, love, life, night, know, im, away
Clusters 6: baby, love, dont, im, know, oh, want, come, youre, got
Clusters 7: shes, girl, got, know, like, love, shell, dont, im, woman


In [62]:
songs[songs['genre_cluster'] == 0][['song', 'artist']].head(10)


Unnamed: 0,song,artist
2,As Good As New,ABBA
3,Bang,ABBA
4,Bang-A-Boomerang,ABBA
24,Gonna Sing You My Lovesong,ABBA
28,He Is Your Brother,ABBA
30,Here We'll Stay,ABBA
36,"I Do, I Do, I Do, I Do, I Do",ABBA
50,Lay All Your Love On Me,ABBA
52,Love Has It's Ways,ABBA
53,Love Isn't Easy,ABBA


In [63]:
cluster_names = {
    0: "Love Songs",
    1: "Party Anthems",
    2: "Sad / Emotional",
    3: "Rap / Hip-Hop",
    4: "Country Vibes",
    5: "Rock / Rebellion",
    6: "Soulful / R&B",
    7: "Electronic / Dance"
}

songs['genre_label'] = songs['genre_cluster'].map(cluster_names)


In [34]:
songs[['song','artist','genre_label']].head(10)

Unnamed: 0,song,artist,genre_label
0,Ahe's My Kind Of Girl,ABBA,Rock / Rebellion
1,"Andante, Andante",ABBA,Electronic / Dance
2,As Good As New,ABBA,Love Songs
3,Bang,ABBA,Love Songs
4,Bang-A-Boomerang,ABBA,Love Songs
5,Burning My Bridges,ABBA,Country Vibes
6,Cassandra,ABBA,Party Anthems
7,Chiquitita,ABBA,Soulful / R&B
8,Crazy World,ABBA,Soulful / R&B
9,Crying Over You,ABBA,Rock / Rebellion


In [64]:
songs['genre_cluster'] = kmeans.fit_predict(tfidf_matrix)

# (Optional) human-friendly labels
songs['genre_label'] = songs['genre_cluster'].map(cluster_names)


In [65]:

from sklearn.neighbors import NearestNeighbors
knn_model = NearestNeighbors(n_neighbors=11, metric='cosine', algorithm='brute')
knn_model.fit(tfidf_matrix)  # This is the crucial missing step

In [66]:
# Take a smaller sample of your data for prototyping
# Create a sample - make sure it's not empty
songs_sample = songs.dropna().sample(n=20000, random_state=42) if len(songs) > 20000 else songs.copy()
tfidf_matrix_sample = tfidf.transform(songs_sample['clean_lyrics'])

# Recommendation Model

In [67]:
def recommend_songs(song_title, artist_name=None, n_recommendations=10):
    """
    Robust song recommendation function with comprehensive error handling
    """
    try:
        # Clean and validate inputs
        song_title = str(song_title).strip().lower()
        artist_name = str(artist_name).strip().lower() if artist_name else None
        
        # First try exact matching
        if artist_name:
            exact_mask = (songs_sample['song'].str.lower() == song_title) & \
                        (songs_sample['artist'].str.lower() == artist_name)
        else:
            exact_mask = (songs_sample['song'].str.lower() == song_title)
        
        exact_matches = songs_sample[exact_mask]
        
        # If no exact matches, try partial matching
        if len(exact_matches) == 0:
            if artist_name:
                partial_mask = (songs_sample['song'].str.lower().str.contains(song_title)) & \
                              (songs_sample['artist'].str.lower().str.contains(artist_name))
            else:
                partial_mask = songs_sample['song'].str.lower().str.contains(song_title)
            
            partial_matches = songs_sample[partial_mask]
            
            if len(partial_matches) == 0:
                sample = songs_sample.sample(min(5, len(songs_sample)))[['song', 'artist']]
                return f"Song not found. Try these: {sample.values.tolist()}"
            
            matching_songs = partial_matches
        else:
            matching_songs = exact_matches
        
        # Safely get the first match's index
        if len(matching_songs) == 0:
            return "No matching songs found"
        
        idx = matching_songs.index[0]
        
        # Get recommendations with bounds checking
        distances, indices = knn_model.kneighbors(tfidf_matrix[idx], n_neighbors=n_recommendations+1)
        
        # Ensure we don't exceed available indices
        valid_indices = [i for i in indices[0] if i < len(songs_sample)]
        if len(valid_indices) <= 1:  # Only contains the song itself
            return "No similar songs found in the sample"
        
        # Exclude the song itself and get top recommendations
        similar_indices = valid_indices[1:n_recommendations+1]
        recommendations = songs_sample.iloc[similar_indices]
        
        return recommendations[['song', 'artist', 'genre_label']] if not recommendations.empty else "No recommendations available"
    
    except Exception as e:
        return f"Error occurred: {str(e)}"

In [68]:
print(f"Total songs in sample: {len(songs_sample)}")
print("Sample songs:")
print(songs_sample[['song', 'artist']].sample(5))

Total songs in sample: 20000
Sample songs:
                                song          artist
44236               Independent Girl  Modern Talking
49035                  Computer Blue          Prince
7329                       Civil War   Guns N' Roses
33830                  Where Or When  George Michael
55966  Can't Lose What You Never Had        Westlife


In [69]:
print(recommend_songs("Can't Lose What You Never Had", "Westlife"))

                      song   artist    genre_label
23934  What Does It Matter  America  Party Anthems
31964    Treated Bad Again   Europe  Rap / Hip-Hop


In [70]:
print(recommend_songs("Civil War", "Guns N' Roses"))

                       song    artist       genre_label
40805               Play Me      Korn     Rap / Hip-Hop
12083              Hop Frog  Lou Reed  Rock / Rebellion
3282                  Moses  Coldplay     Party Anthems
26226  Nightflight To Venus  Boney M.  Rock / Rebellion


In [71]:
print(f"Total songs in sample: {len(songs_sample)}")
print("Sample songs:")
print(songs_sample[['song', 'artist']].sample(5))

Total songs in sample: 20000
Sample songs:
                    song              artist
43188        Cut By Wire          Mary Black
35808           All I Do            Hillsong
56843         Melancholy          X-Ray Spex
48675            So What                P!nk
27704  Let There Be Love  Christina Aguilera


In [73]:
print(recommend_songs("All I Do", "Hillsong"))

                                 song        artist       genre_label
48236                  Transformation  Phil Collins  Rock / Rebellion
48936  Nobody Loves Me (Like My Baby)    Pretenders     Soulful / R&B
51246                      The Garden          Rush  Rock / Rebellion


In [58]:
print(recommend_songs("Sea And Sky", "Dusty Springfield"))

                                song           artist         genre_label
38111                       Sapphire      John Martyn  Electronic / Dance
7840                   Free To Dance  Hillsong United       Party Anthems
51142               Pledging My Love      Roy Orbison          Love Songs
38144  Somewhere A Child Is Sleeping   John McDermott     Sad / Emotional
18159         It Must Have Been Love          Roxette          Love Songs
2395               Keep Your Head Up       Chaka Khan       Soulful / R&B
27919                Bon Soir Cherie      Chuck Berry       Party Anthems
27861     Percy, The Puny Poinsettia  Christmas Songs     Sad / Emotional
57537        Can't Leave Drank Alone             Z-Ro       Country Vibes
13853                   Dreamin' Man       Neil Young       Party Anthems
