In [1]:
import pandas as pd

# NOTE(review): machine-specific absolute path; adjust it (or make it relative
# to the project folder) before running this notebook on another machine.
file_path = 'C:\\Users\\vardh\\.vscode\\Spotify-Music-Recommendation-System-using-KNN\\spotify_millsongdata.csv'
data = pd.read_csv(file_path)

# info() prints its summary and returns None, so it is called as a statement
# instead of being packed into a tuple; head() stays the cell's last
# expression so the first rows are rendered as the cell output.
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   link    57650 non-null  object
 3   text    57650 non-null  object
dtypes: object(4)
memory usage: 1.8+ MB


(  artist                   song                                        link  \
 0   ABBA  Ahe's My Kind Of Girl  /a/abba/ahes+my+kind+of+girl_20598417.html   
 1   ABBA       Andante, Andante       /a/abba/andante+andante_20002708.html   
 2   ABBA         As Good As New        /a/abba/as+good+as+new_20003033.html   
 3   ABBA                   Bang                  /a/abba/bang_20598415.html   
 4   ABBA       Bang-A-Boomerang      /a/abba/bang+a+boomerang_20002668.html   
 
                                                 text  
 0  Look at her face, it's a wonderful face  \r\nA...  
 1  Take it easy with me, please  \r\nTouch me gen...  
 2  I'll never know why I had to go  \r\nWhy I had...  
 3  Making somebody happy is a question of give an...  
 4  Making somebody happy is a question of give an...  ,
 None)

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Preprocessing
def clean_text(text):
    """Return a normalised copy of a lyric string.

    Each '\\r\\n' line break becomes a single space, every character that is
    neither an ASCII letter nor whitespace is dropped, and the result is
    lower-cased.
    """
    # Stripping happens before lower-casing, so only characters that were
    # ASCII letters or whitespace to begin with ever reach .lower().
    return re.sub(r'[^a-zA-Z\s]', '', re.sub(r'\r\n', ' ', text)).lower()

# Store a normalised copy of every lyric next to the raw text
data['cleaned_text'] = data['text'].apply(clean_text)

# Vectorise the cleaned lyrics with TF-IDF, using the built-in English
# stop-word list and a vocabulary capped at 5000 features
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(data['cleaned_text'])

# Show the shape of the resulting TF-IDF matrix
tfidf_matrix.shape

(57650, 5000)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Fit a nearest-neighbour index on the TF-IDF rows. The 'cosine' metric is a
# distance, so smaller values mean more similar lyrics. fit() returns the
# estimator itself, so construction and fitting can be chained.
knn = NearestNeighbors(metric='cosine', n_neighbors=5).fit(tfidf_matrix)

# Recommendation Function
def recommend_songs(song_title, n_recommendations=5):
    """Recommend the songs whose lyrics are closest to those of `song_title`.

    The title is matched case-insensitively against data['song']; when several
    rows share the title, the first one is used as the query.

    Parameters
    ----------
    song_title : str
        Title of the song to look up.
    n_recommendations : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list of dict or str
        One dict per recommendation with the keys "song", "artist" and
        "distance" (as reported by the fitted `knn` model), in the order the
        model returned them. If the title is not in the dataset, a
        "not found" message string is returned instead.
    """
    # Work with row positions rather than index labels: both tfidf_matrix and
    # data.iloc are positional, and labels only coincide with positions while
    # `data` keeps its default RangeIndex.
    title_matches = (data['song'].str.lower() == song_title.lower()).to_numpy().nonzero()[0]
    if len(title_matches) == 0:
        return f"Song '{song_title}' not found in the dataset."
    query_row = int(title_matches[0])

    # Request one extra neighbour so the query song can be dropped, but never
    # more neighbours than there are rows in the matrix.
    n_query = min(n_recommendations + 1, tfidf_matrix.shape[0])
    distances, indices = knn.kneighbors(tfidf_matrix[query_row], n_neighbors=n_query)

    # Drop the query row explicitly instead of assuming it is the first hit:
    # when other songs tie with it at distance 0 (duplicate lyrics), it is not
    # guaranteed to come first.
    recommended_songs = []
    for distance, row in zip(distances[0], indices[0]):
        if row == query_row:
            continue
        recommended_songs.append({
            "song": data.iloc[row]['song'],
            "artist": data.iloc[row]['artist'],
            "distance": distance,
        })

    # If the query row was not among the neighbours, one extra entry remains.
    return recommended_songs[:n_recommendations]

# Example: Get recommendations for a song
# recommend_songs("Bang-A-Boomerang")

[{'song': 'Bang',
  'artist': 'ABBA',
  'distance': np.float64(0.010192173513129732)},
 {'song': 'Bang Bang',
  'artist': 'Stevie Wonder',
  'distance': np.float64(0.29816408613252365)},
 {'song': 'Bang-Bang',
  'artist': 'Cher',
  'distance': np.float64(0.3177692391600685)},
 {'song': 'Bang Bang',
  'artist': 'David Bowie',
  'distance': np.float64(0.31859633542433285)},
 {'song': 'Bang Bang (My Baby Shot Me Down)',
  'artist': 'Frank Sinatra',
  'distance': np.float64(0.32060721704310413)}]