In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import re
import nltk #natural language toolkit, for tokenization

In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
spotify_data = pd.read_csv('../data/spotify_millsongdata.csv')
spotify_data.head()

Unnamed: 0,artist,song,link,text
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \r\nA..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \r\nTouch me gen..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \r\nWhy I had...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...


In [36]:

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
print(stop_words)


{"weren't", 'in', 'up', 'and', 'with', 'ain', 'why', 'now', "you'll", 'while', 'needn', "we'll", 'this', 'myself', 'my', "i'd", "you'd", 'them', "shan't", 'ma', 'on', 'whom', "you've", 'hadn', 'further', 'nor', 'our', 'so', "you're", 'haven', "he'll", 'their', "didn't", 'but', 'has', 'she', "they're", 'between', 'own', 'an', 'too', "won't", "we're", 'which', "he'd", 'such', 'can', 'were', "it'd", 'aren', 'or', "don't", 'down', 'mightn', 'through', 'the', 'will', 'where', 'that', 'ours', 'all', "wasn't", 'other', 'those', 'be', 'have', 'both', 'is', 'very', 'it', 'by', 'doesn', 'into', 'if', 'over', "i'm", 'we', "she's", 'until', 'out', 'wouldn', "we've", "wouldn't", "they've", 'hers', 'didn', 'as', "needn't", 't', 'wasn', 'there', 'himself', 'her', 'any', 'once', "i'll", "hadn't", 'than', 'couldn', 're', 'isn', 'weren', 'before', 'from', 'having', "mustn't", 'theirs', 'm', 'at', 'again', 'below', 'herself', "mightn't", "doesn't", 'am', "they'll", 'do', "hasn't", 'during', 'been', 'your

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/aakashshrestha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
#check for any null entries
spotify_data.isna().sum().sort_values(ascending=True)

artist            0
song              0
link              0
text              0
cleaned_lyrics    0
dtype: int64

In [37]:
def clean_text(text):
    text = str(text).lower() #lower case
    text = re.sub(r'\r\n', ' ', text) #remove new lines
    text = re.sub(r'[^a-z\s]', '', text) #remove special chars/numbers
    tokens = text.split() #tokenize
    tokens = [t for t in tokens if t not in stop_words] #remove unwanted stopwords
    return ' '.join(tokens)

In [38]:
#clean the spotify dataframe
spotify_data['cleaned_lyrics'] = spotify_data['text'].apply(clean_text) #apply clean_text function to clean the data
spotify_data[['song', 'cleaned_lyrics']].head(10)


Unnamed: 0,song,cleaned_lyrics
0,Ahe's My Kind Of Girl,look face wonderful face means something speci...
1,"Andante, Andante",take easy please touch gently like summer even...
2,As Good As New,ill never know go put lousy rotten show boy to...
3,Bang,making somebody happy question give take learn...
4,Bang-A-Boomerang,making somebody happy question give take learn...
5,Burning My Bridges,well hoot holler make mad ive always heel holy...
6,Cassandra,street theyre singing shouting staying alive t...
7,Chiquitita,chiquitita tell whats wrong youre enchained so...
8,Crazy World,morning sun couldnt sleep thought id take walk...
9,Crying Over You,im waitin baby im sitting alone feel cold with...



TF-IDF
Tf : Term Frequency - how often does the word appears in the document
IDF : Inverse Document Frequency - how rare that word is across all songs
<br>
$TF-IDF = TF * IDF$ => It gives high weight to words that are important to one song but not common in all song

In [46]:
#TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=1000)
X_tfidf = vectorizer.fit_transform(spotify_data['cleaned_lyrics'])
X_tfidf.shape
print(X_tfidf.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.05939632 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.07460932 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [47]:
#KMeans clustering
kmeans = KMeans(n_clusters=5, random_state=7)
kmeans.fit(X_tfidf)
spotify_data['cluster'] = kmeans.predict(X_tfidf)

#view the clusters
spotify_data[['artist', 'song', 'cluster']].head(10)


Unnamed: 0,artist,song,cluster
0,ABBA,Ahe's My Kind Of Girl,3
1,ABBA,"Andante, Andante",0
2,ABBA,As Good As New,4
3,ABBA,Bang,4
4,ABBA,Bang-A-Boomerang,4
5,ABBA,Burning My Bridges,1
6,ABBA,Cassandra,3
7,ABBA,Chiquitita,3
8,ABBA,Crazy World,0
9,ABBA,Crying Over You,1


In [None]:
#Cosine Similarity-based recommendation