In [1]:
# Clear the interactive namespace without asking for confirmation (-f), so the
# notebook starts from a clean state.
%reset -f

# **Import Libraries**

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# **Mounting the Drive**
**To read the file (Dataset)**

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset path used later in the
# notebook lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
path = '/content/drive/My Drive/Datasets/spotify_millsongdata.csv'  # dataset CSV on the mounted Google Drive

# **Data Collection**

In [5]:
# Read the lyrics dataset from the CSV file into a DataFrame.
df = pd.read_csv(filepath_or_buffer=path)

In [6]:
df.head(3)

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...


In [7]:
df.tail(5)

Unnamed: 0,artist,song,link,text
57645,Ziggy Marley,Good Old Days,/z/ziggy+marley/good+old+days_10198588.html,Irie days come on play \r\nLet the angels fly...
57646,Ziggy Marley,Hand To Mouth,/z/ziggy+marley/hand+to+mouth_20531167.html,Power to the workers \r\nMore power \r\nPowe...
57647,Zwan,Come With Me,/z/zwan/come+with+me_20148981.html,all you need \r\nis something i'll believe \...
57648,Zwan,Desire,/z/zwan/desire_20148986.html,northern star \r\nam i frightened \r\nwhere ...
57649,Zwan,Heartsong,/z/zwan/heartsong_20148991.html,come in \r\nmake yourself at home \r\ni'm a ...


**A Record of `text` feature**

In [8]:
df.loc[1000,'text']

"Here we are, seven days  \r\nAnd seven nights of empty tries  \r\nIt's ritual, habitual  \r\nBut it's never gonna work this time  \r\n  \r\nWe're to the point of no return  \r\nAnd along the way the only thing we've learned  \r\nIs how to hurt each other  \r\n  \r\nI'm looking back and wondering why  \r\nIt took so long to realize  \r\nThat nothing's changed, it never will  \r\nAll these years of standing still  \r\nAnd still we stay in all this pain  \r\nAnd nothing's gonna make it go away  \r\n  \r\nI don't wanna wait another minute  \r\nPut me out of my misery  \r\nI can read your mind baby you're not in it  \r\nAnd we're not what we used to be  \r\nNo you wouldn't have to lie to me  \r\nIf you would only let me go  \r\nAnd I don't wanna wait another minute to hear  \r\nSomething that I already know  \r\nI know, I know, I know  \r\nSomething that I already know  \r\nI know, I know, I know  \r\n  \r\nSo save your voice  \r\nDon't waste your breath  \r\nCan't you see we're at the end

**A Record of `link` feature**

In [9]:
df.loc[1000,'link']

'/b/backstreet+boys/something+that+i+already+know_20658413.html'

**A Record of `song` feature**

In [10]:
df.loc[1000,'song']

'Something That I Already Know'

**A Record of `artist` feature**

In [11]:
df.loc[1000,'artist']

'Backstreet Boys'

**Hence we have:**

1. `text`: This feature contains the lyrics of the songs.
2. `link`: The `link` feature provides URLs or links to access the songs.
3. `song`: The `song` feature contains the name of the songs.
4. `artist`: The `artist` feature contains the name of the artist who performed the songs.

# **Data Preprocessing**

## **Removing Unnecessary Column**

**We can safely drop the `link` feature since it is not needed for building our music recommendation model.**

In [12]:
# Drop the unused `link` column by reassignment instead of in place, and ignore
# the column if it is already gone, so re-running this cell does not raise KeyError.
df = df.drop(columns='link', errors='ignore')

## **Counting Duplicates**

In [13]:
# duplicated() flags a row when all of its column values repeat an earlier row;
# summing the boolean flags gives the number of duplicate records.
print(f'The Number of Duplicate Records Are: {df.duplicated().sum()}')

The Number of Duplicate Records Are: 0


## **Data Shape**

**Total Instances:** `57650` <br>
**with zero duplicate records, and** `03` **Features** (after dropping `link`)

In [14]:
df.shape

(57650, 3)

## **Features Overview**

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57650 entries, 0 to 57649
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   artist  57650 non-null  object
 1   song    57650 non-null  object
 2   text    57650 non-null  object
dtypes: object(3)
memory usage: 1.3+ MB


**Basic Statistics**

In [16]:
df.describe()

Unnamed: 0,artist,song,text
count,57650,57650,57650
unique,643,44824,57494
top,Donna Summer,Have Yourself A Merry Little Christmas,I just came back from a lovely trip along the ...
freq,191,35,6


**`count` suggests there are no missing values.**

## **Missing Values Count**

In [17]:
# Cross-check the `count` row above: number of null entries in each column.
df.isnull().sum()

artist    0
song      0
text      0
dtype: int64

# **Training Limitation and Data Sampling**

**I attempted to train the model on the entire dataset using a Google Colab GPU runtime, but the process ran out of RAM because the dataset exceeded the available memory capacity. Given these resource limits, I will proceed with a subset of `20,000` randomly selected samples to build the model.**

In [18]:
# Number of rows to keep when the dataset is randomly subsampled in the next cell.
new_instances = 20000

In [19]:
# Draw a reproducible random subset (fixed seed) and renumber the rows from 0,
# so index labels coincide with row positions for the positional lookups later on.
df = df.sample(n=new_instances, random_state=42).reset_index(drop=True)

In [20]:
df.shape

(20000, 3)

In [21]:
df.head(3)

Unnamed: 0,artist,song,text
0,Wishbone Ash,Right Or Wrong,Like to have you 'round \r\nWith all the lies...
1,Aerosmith,This Little Light Of Mine,"This Little Light of Mine (Light of Mine), \r..."
2,Fall Out Boy,"Dance, Dance",She says she's no good with words but I'm wors...


**New Shape**<br>
**Total Instances:** `20,000` <br>
**with zero duplicate records, and** `03` **Features**

## **Data Cleaning**

In [22]:
# Lyrics of the row labelled 0, before cleaning.
df.loc[0, 'text']

"Like to have you 'round  \r\nWith all the lies that you make  \r\nThe things or darkness and you  \r\nSome people say, have just a taste  \r\nRight or wrong, you might get burned  \r\nWhat you gain is what you learn  \r\n  \r\nGot one too many women  \r\nDon't know quite which way to go  \r\nThey're all gettin' so expensive  \r\nWhen they walk by themselves  \r\nRight or wrong, don't regret  \r\nWhat you went for is what you get  \r\n  \r\nNo point in bitter tears  \r\nWhen someone else has cut you down  \r\n'Cause there's a time for leavin'  \r\nAnd there's a time for stickin' around, hey  \r\nRight or wrong, you've got to live  \r\nSo what you collect is what you give\r\n\r\n"

In [23]:
import re

In [24]:
def clean_text(text):
    """Normalize raw lyrics into lowercase alphanumeric words separated by single spaces.

    Steps, in order: lowercase the input, delete every character other than
    ``a-z``, ``0-9`` and whitespace, then collapse each whitespace run into one
    space and trim both ends.

    Whitespace is collapsed *after* punctuation is deleted. Doing it first
    leaves double spaces wherever a stand-alone punctuation token sat between
    two words (``"a - b"`` -> ``"a  b"``) and stray spaces at the ends when the
    text starts or ends with such a token.

    Args:
        text: The raw lyrics string.

    Returns:
        The cleaned string (possibly empty).
    """
    # Convert to lowercase
    lowered = text.lower()

    # Remove punctuation and special characters. They are deleted, not replaced
    # by a space, so contractions such as "don't" become "dont".
    alnum_only = re.sub(r'[^a-z0-9\s]', '', lowered)

    # Collapse whitespace runs (spaces, \r\n, tabs) and trim the ends
    return re.sub(r'\s+', ' ', alnum_only).strip()

In [25]:
# Clean every lyrics string element-wise.
df['text'] = df['text'].map(clean_text)

In [26]:
# Lyrics of the row labelled 0, after cleaning.
df.loc[0, 'text']

'like to have you round with all the lies that you make the things or darkness and you some people say have just a taste right or wrong you might get burned what you gain is what you learn got one too many women dont know quite which way to go theyre all gettin so expensive when they walk by themselves right or wrong dont regret what you went for is what you get no point in bitter tears when someone else has cut you down cause theres a time for leavin and theres a time for stickin around hey right or wrong youve got to live so what you collect is what you give'

## **Text Tokenization and Stemming**

In [27]:
from nltk.stem.porter import PorterStemmer

In [28]:
# Single Porter stemmer instance, reused by tokenization() for every token.
stemmer = PorterStemmer()

In [29]:
import nltk

In [30]:
def tokenization(txt):
    """Tokenize ``txt`` with NLTK, stem each token, and re-join with single spaces.

    Uses the module-level ``stemmer`` and ``nltk.word_tokenize``.

    Args:
        txt: The text to process.

    Returns:
        A string of the stemmed tokens separated by single spaces.
    """
    stemmed_words = map(stemmer.stem, nltk.word_tokenize(txt))
    return " ".join(stemmed_words)

In [31]:
# word_tokenize() needs the Punkt tokenizer data. Newer NLTK releases load it
# from the 'punkt_tab' resource rather than 'punkt', so fetch both to keep the
# notebook runnable on a fresh kernel regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [32]:
# Tokenize and stem every cleaned lyrics string.
df['text'] = df['text'].apply(tokenization)

## **TF-IDF Vectorization and Cosine Similarity**

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Build word-level TF-IDF features from the lyrics, dropping English stop words.
# NOTE(review): the stop-word list is applied to already-stemmed tokens, so stop
# words whose stem differs from the listed form may survive — confirm this is acceptable.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
)
tfidf_features = tfidf_vectorizer.fit_transform(df['text'])

In [35]:
from sklearn.metrics.pairwise import cosine_similarity

In [36]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix. The result has
# one row and one column per song, so its size grows quadratically with the row count.
similarity_matrix = cosine_similarity(tfidf_features)

In [37]:
similarity_matrix.shape

(20000, 20000)

In [38]:
# Look up a title; the same title can belong to more than one row.
df.loc[df['song'] == 'I Can See It In Your Eyes']

Unnamed: 0,artist,song,text
7558,Journey,I Can See It In Your Eyes,babi too bad i use to love you girl i dont car...
13406,Randy Travis,I Can See It In Your Eyes,the first time i saw you i felt weak insid i s...


## **Recommendation Function**

In [39]:
desired_recommendations = 10  # default number of songs returned by recommend_similar_songs

In [40]:
def recommend_similar_songs(target_song, similarity_matrix=similarity_matrix, data=df, num_recommendations=desired_recommendations):
    """Return the titles of the songs whose lyrics are most similar to ``target_song``.

    Args:
        target_song: Title to look up in ``data['song']``. When several rows
            share the title, the first matching row is used.
        similarity_matrix: Square matrix whose entry ``[i, j]`` is the similarity
            between row ``i`` and row ``j`` of ``data`` (by position).
        data: DataFrame with a ``song`` column, row-aligned with
            ``similarity_matrix``.
        num_recommendations: Maximum number of titles to return.

    Returns:
        A list of up to ``num_recommendations`` song titles, most similar first.
        The looked-up row itself is never included.

    Raises:
        IndexError: If ``target_song`` does not occur in ``data['song']``.
    """
    # Locate the target by *position* so the lookup stays aligned with the
    # matrix rows even when ``data`` does not carry a default 0..n-1 index.
    matching_positions = np.flatnonzero((data['song'] == target_song).to_numpy())
    if matching_positions.size == 0:
        raise IndexError(f'Song {target_song!r} was not found in the dataset.')
    target_pos = int(matching_positions[0])

    # Similarity of every song to the target song
    scores = np.asarray(similarity_matrix[target_pos]).ravel()

    # Rank by descending similarity; the stable sort keeps row order among ties
    ranked_positions = np.argsort(-scores, kind='stable')

    # Exclude the target explicitly instead of assuming it is ranked first: a row
    # with identical lyrics (or an all-zero vector) ties with it and can sort
    # ahead of it, which would return the input song and drop a real match.
    ranked_positions = ranked_positions[ranked_positions != target_pos]

    top_positions = ranked_positions[:max(num_recommendations, 0)]
    return data['song'].iloc[top_positions].tolist()


## **Usage/Demo**

**Let's randomly select a song from the available list and view the recommendations list generated by the model.**

In [58]:
import random

In [59]:
# random.choice() picks seq[i] for a random integer i. On a pandas Series that is
# a label lookup, which only works while the index is exactly 0..n-1. Choosing
# from a plain list of titles removes that dependency.
song = random.choice(df['song'].tolist())

In [60]:
print(f'The Randomly Selected Song is: "{song}"')

The Randomly Selected Song is: "Mourning Glory"


In [61]:
#Get the recommendations by the model
recommended_songs = recommend_similar_songs(song)

**Display the Recommendations**

In [62]:
# Use a dedicated loop variable: reusing `song` would overwrite the randomly
# selected title, so re-running the earlier cells would query the wrong song.
for song_number, recommended_title in enumerate(recommended_songs, start=1):
    print(f'Recommended Song-{song_number}: {recommended_title}\n')

Recommended Song-1: Roses Are Free

Recommended Song-2: Dead Pumpkins

Recommended Song-3: Pumpkin Head

Recommended Song-4: A Little You

Recommended Song-5: Over The River And Through The Woods

Recommended Song-6: Into The Woods - Moments In The Woods

Recommended Song-7: If I Could

Recommended Song-8: Little By Little

Recommended Song-9: Big Fat Love

Recommended Song-10: One Little Christmas Tree

