In [1]:
# Import necessary libraries and modules
import pandas as pd  # for data manipulation
import nltk  # Natural Language Toolkit for text processing
from nltk.stem.porter import PorterStemmer  # for stemming words
from sklearn.feature_extraction.text import TfidfVectorizer  # to convert text data into TF-IDF feature vectors
from sklearn.metrics.pairwise import cosine_similarity  # to compute similarity between vectors
import pickle  # for saving and loading serialized models
import csv  # for reading and writing CSV files
from sklearn.model_selection import train_test_split  # to split data into training and testing sets

In [2]:
# Load the data into a pandas DataFrame
df = pd.read_csv('spotify_mil_song_dataset.csv')
df.head(5)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [3]:
# checks for nulls
df.isnull().sum()

artist    0
song      0
link      0
text      0
dtype: int64

In [4]:
# Randomly sample 5000 rows, remove 'link' column, and reset the index
df = df.sample(5000).drop('link', axis=1).reset_index(drop=True)
df.head(5)

Unnamed: 0,artist,song,text
0,Kanye West,Southside,"And everybody say, say \r\nI know you, I know..."
1,Kiss,Black Diamond,Out on the street for a living \r\nPicture's ...
2,Point Of Grace,Chalk In The Rain,"We want it all, we want it now \r\nWe want li..."
3,Kenny Loggins,She's Dangerous,She's dangerous yeah yeah yeah. \r\nShe's hot...
4,Pet Shop Boys,One In A Million,Why \r\nThis uncertainty? \r\nIt's not clear...


In [5]:
# Preprocess the 'text' column in the DataFrame
df['text'] = df['text'].str.lower().replace(r'^a-zA-Z0-9','').replace(r'\\n','',regex=True)  # Convert text to lowercase, remove special characters and newlines

In [6]:
df.head(5)

Unnamed: 0,artist,song,text
0,Kanye West,Southside,"and everybody say, say \r\ni know you, i know..."
1,Kiss,Black Diamond,out on the street for a living \r\npicture's ...
2,Point Of Grace,Chalk In The Rain,"we want it all, we want it now \r\nwe want li..."
3,Kenny Loggins,She's Dangerous,she's dangerous yeah yeah yeah. \r\nshe's hot...
4,Pet Shop Boys,One In A Million,why \r\nthis uncertainty? \r\nit's not clear...


In [7]:
# Initialize the Porter Stemmer for text processing
stemmer = PorterStemmer()

In [8]:
# Tokenizes & stems the input text & returns the processed text string with each word tokenized and stemmed
def token(x):
    token = nltk.word_tokenize(x) # Tokenize the input text into individual words
    y = [stemmer.stem(i) for i in token] # Stem each word in the tokenized list using the previously initialized PorterStemmer
    return " ".join(y) # Join the stemmed words back into a single string and return it

In [9]:
# Apply the tokenization and stemming function to the 'text' column of the DataFrame
df['text'] = df['text'].apply(lambda x: token(x))

In [10]:
# Initialize the TfidfVectorizer for converting text data into TF-IDF vectors
tfid = TfidfVectorizer(analyzer='word', stop_words='english')

In [11]:
# Generate the TF-IDF matrix for the text data
matrix = tfid.fit_transform(df['text'])

In [12]:
# Calculate the cosine similarity matrix from the TF-IDF matrix
similarity = cosine_similarity(matrix)

In [13]:
df[df['song']=='Southside'].index[0]

0

In [14]:
# Recommends similar songs based on the input song
def recommender(song):
    idx = df[df['song']==song].index[0] # Find the index of the given song in the DataFrame
    distance = sorted(list(enumerate(similarity[idx])), reverse=True, key=lambda x:x[1]) # Calculate similarity scores for the given song with all other songs
    song= [] # Initialize an empty list to store recommended songs

    # Loop over the top 4 similar songs (excluding the given song itself)
    for song_id in distance[1:5]: 
        song.append(df.iloc[song_id[0]].song) # Append the song title to the recommended songs list
    return song

In [15]:
recommender("She's Dangerous")

['Room Full Of Mirrors', 'If You Had My Love', 'Land Of 1,000 Dances', 'Yeah!']

In [16]:
pickle.dump(similarity, open('similarity.pkl', 'wb'))
pickle.dump(df, open('df.pkl', 'wb'))