In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import spotipy
import spotipy.util as util
from bs4 import BeautifulSoup
from kneed import KneeLocator
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from yellowbrick.cluster import KElbowVisualizer

# Enable tqdm's pandas integration; `progress_apply` is used on the dataframe further down.
tqdm.pandas()

## DDR Project

Spotify Developer Credentials and API call to establish an authorised connection for data loading and transfer

In [None]:
# Spotify credentials are read from environment variables when they are set, so
# real secrets never have to be typed into (and saved with) the notebook. The
# placeholder values are only used as a fallback when a variable is missing.
config = {'username': os.environ.get('SPOTIFY_USERNAME', "<user name>"),
          'scope': 'playlist-read-private',
          'client_id': os.environ.get('SPOTIPY_CLIENT_ID', "<id>"),
          'client_secret': os.environ.get('SPOTIPY_CLIENT_SECRET', "<>"),
          'redirect_uri': os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:8080/callback')}


def _request_token(scope):
    """Prompt the configured user for an access token restricted to ``scope``."""
    return util.prompt_for_user_token(config['username'],
                                      scope=scope,
                                      client_id=config['client_id'],
                                      client_secret=config['client_secret'],
                                      redirect_uri=config['redirect_uri'])


# One token for reading playlists, a second one for creating/modifying public playlists.
token = _request_token(config['scope'])
token_write = _request_token("playlist-modify-public")


sp = spotipy.Spotify(auth=token)
sp_write = spotipy.Spotify(auth=token_write)

Using the function below, we make the API calls needed to extract the data for every song in the specified playlist.

In [None]:
def api_call(username, playlist_uri,limit=0,songs=None):
    """Fetch the name and track metadata of a Spotify playlist.

    Parameters
    ----------
    username : str
        User name passed to the playlist endpoints.
    playlist_uri : str
        Playlist URI; the playlist id is taken from the last ``:``-separated
        segment, so both ``spotify:playlist:<id>`` and
        ``spotify:user:<user>:playlist:<id>`` work.
    limit : int, optional
        Offset of the first track to request (kept under its original name).
    songs : list, optional
        List that the raw playlist items are appended to. A fresh list is
        created on every call when omitted, so repeated calls do not
        accumulate tracks from earlier calls.

    Returns
    -------
    tuple
        ``(playlist_name, names, artists, uris)`` where the last three are
        parallel lists with one entry per track.
    """
    if songs is None:
        songs = []  # never share one default list between calls

    _pid = playlist_uri.split(":")[-1] # extract the playlist id from the uri
    _pname = sp.user_playlist(username,_pid)['name'] # extract name of the playlist by making the API call

    while True:
        # Request the next page of tracks, starting at the current offset.
        results = sp.user_playlist_tracks(username, _pid, offset=limit)
        songs += results['items']
        if results['next'] is None:
            break
        limit = limit + 100 # pages are requested 100 songs at a time until no next page is reported

    # Items without track data (the "track" entry is missing or None) are skipped
    # so that the three metadata lists stay aligned.
    tracks = [song['track'] for song in songs if song.get('track')]

    names = [track['name'] for track in tracks] # extracting the metadata (name, artist name and song uri)
    artists = [track['artists'][0]['name'] for track in tracks]
    uris = [track['uri'] for track in tracks]

    return _pname, names, artists, uris

# Owner of the playlist and the Spotify URI of the playlist to analyse.
username = "<user name>"
playlist_uri = "<playlist uri>"

# Playlist name plus three parallel lists: track names, first artists and track URIs.
playlist, names, artists, uris = api_call(username=username, playlist_uri=playlist_uri)

Saving the API returned data into a dataframe before we write it into Mongo DB

In [None]:
# One (name, artist, uri) tuple per track, built from the three parallel lists.
track_rows = list(zip(names, artists, uris))
df = pd.DataFrame(track_rows, columns=['Name', 'Artist', 'URI'])

df.head(10)

Each song is described by Spotify on 9 parameters, namely: 
1. Danceability
2. Energy
3. Loudness
4. Speechiness
5. Acousticness
6. Instrumentalness
7. Liveness
8. Valence
9. Tempo

So, we try and extract these features of each and every song in our playlist. Later, we use these features to cluster these songs

In [None]:
# Audio features of the first track; only its keys are used below, as the list
# of feature names to copy onto every row of the dataframe.
reference_dict = sp.audio_features(uris[0])[0]

def feature_extraction(row):
    """Add the Spotify audio features of one song to its dataframe row.

    The features are requested once per row (using ``row['URI']``) and then
    every key listed in ``reference_dict`` is copied onto the row. Requesting
    them inside the key loop would repeat the same API call for every key.

    Parameters
    ----------
    row : mapping
        Row with a ``'URI'`` entry identifying the track.

    Returns
    -------
    The same row object, with one additional entry per feature key.
    """
    features = sp.audio_features(row['URI'])[0]  # single API request for this song
    for key in reference_dict:
        row[key] = features[key]
    return row

# Apply the feature extraction row by row (axis=1) with a tqdm progress bar,
# then display the enriched dataframe.
df = df.progress_apply(feature_extraction, axis = 1)
df

Now we save these records into a local MongoDB instance.

In [None]:
from pymongo import MongoClient

# Connection settings: server URL, database name and collection name.
url = "mongodb://localhost:27017"
db = "spotify"
collection = "my_playlist"
client = MongoClient(host=url)
database = client[db]
# NOTE: `collection` is rebound here from the collection *name* to the collection object.
collection = database[collection]

# Insert one document per dataframe row.
collection.insert_many(df.to_dict("records"))

There are a few irrelevant features in our data, such as `type`, `id`, `uri`, `track_href`, `analysis_url`, `key` and `mode`, which do not play any role in clustering similar songs. So we drop those features as part of our data preprocessing and push the updated dataframe into a new collection in MongoDB.

In [None]:
# Switch to the collection that holds the preprocessed records.
collection = database['processed_data']

# Remove the columns that are not used for clustering.
df = df.drop(columns=['type','id','uri','track_href','analysis_url','key','mode'])

processed_records = df.to_dict("records")
collection.insert_many(processed_records)

In [None]:
# Create an index on the "duration_ms" field of the current collection.

collection.create_index("duration_ms")

## Machine Learning Project

### Principal Component Analysis

Next, we perform PCA to compress the 9 available features into a lower-dimensional space. Before performing PCA, we scale the data to standardize it. If this step is skipped, the clustering algorithm might give extra weight to features like loudness and tempo, as they vary over a much larger range.

In [None]:
# The nine audio features used for clustering, in the order used by the loadings
# table further down. They are selected by name rather than by position because
# later cells append columns to `df`, which would shift a positional slice when
# this cell is re-run.
feature_columns = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                   'instrumentalness', 'liveness', 'valence', 'tempo']

scaler = StandardScaler()
X = scaler.fit_transform(df[feature_columns]) # standardize the numeric features only

In [None]:
# Fit a PCA that keeps every component of the standardized feature matrix.
pca = PCA().fit(X)
# Share of the total variance explained by each principal component
variance_explain = pca.explained_variance_ratio_
variance_explain

In [None]:
# Cumulative explained variance against the number of principal components.
# The x values are derived from the fitted PCA itself, so the plot stays valid
# regardless of the current column layout of `df`.
fig = plt.figure(figsize=(10,8))
component_counts = range(1, len(variance_explain) + 1)
plt.plot(component_counts, variance_explain.cumsum(), marker='o', linestyle='-')
plt.xlabel('Number of Components', fontsize=18)
plt.ylabel('Cumulative Explained Variance',fontsize=18)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.show()

In [None]:
# Smallest number of leading principal components whose cumulative explained
# variance reaches at least 80%.

n_vectors = next(count
                 for count, cumulative in enumerate(variance_explain.cumsum(), start=1)
                 if cumulative >= 0.8)
print("Number of components:", n_vectors)

# Refit the PCA with only that many components and project the data onto them.
pca = PCA(n_components=n_vectors).fit(X)
scores_pca = pca.transform(X)

In [None]:
# Loadings table: one row per original feature, one column per principal
# component, with the sign of every loading flipped.
loading_labels = ['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                  'instrumentalness', 'liveness', 'valence', 'tempo']
pca_df = pd.DataFrame(-1 * pca.components_.T, index=loading_labels)
# Blank out every (sign-flipped) loading below 0.15 so only the larger ones stay visible.
# NOTE(review): this also hides loadings that are strongly negative after the flip — confirm that is intended.
pca_df[pca_df < 0.15] = None
pca_df

So we narrow down on 6 principal components and the PCs can be renamed as follows:      
      
Principal Component 1: Contains Energy, Loudness (abb: EL)      
Principal Component 2: Contains Danceability, Valence (abb: DV)        
Principal Component 3: Contains Danceability and Instrumentalness (abb: DI)       
Principal Component 4: Contains Instrumentalness and Tempo (abb: IT)          
Principal Component 5: Contains Acousticness (abb: A)          
Principal Component 6: Contains Danceability and Tempo (abb: DT)     

In [None]:
# Run the elbow visualizer over the PCA scores with k=(1,21) and read off the
# elbow value it reports as the number of clusters.
elbow_model = KMeans(init='k-means++', random_state=42)
visualizer = KElbowVisualizer(elbow_model, k=(1,21), timings=False)
visualizer.fit(scores_pca)
visualizer.show()

n_clusters = visualizer.elbow_value_
print("Optimal number of clusters:", n_clusters)

In [None]:
# Cluster in the PCA space: the number of clusters was chosen on `scores_pca`,
# so the final model is fitted on the same representation (not on the full
# standardized feature matrix `X`).
kmeans_ = KMeans(n_clusters=n_clusters, init='k-means++', random_state=42)
kmeans_.fit(scores_pca);

In [None]:
# Combine the original columns with the PCA scores and the cluster labels.
features_df = df.iloc[:,2:-2]

# Name the PCA score columns when the frame is created instead of overwriting
# `columns.values` afterwards (an Index is meant to be immutable).
component_names = ["Component " + str(i+1) for i in range(n_vectors)]
scores_df = pd.DataFrame(scores_pca, columns=component_names)

df_kmpca = pd.concat([features_df.reset_index(drop=True), scores_df], axis=1)
df_kmpca['Cluster'] = kmeans_.labels_

# Short names for the components, following the interpretation given above.
df_kmpca = df_kmpca.rename(columns= {'Component 1':'EL','Component 2':'DV','Component 3':'DI',
                                     'Component 4':'IT','Component 5':'A','Component 6':'DT'})
df_kmpca

In [None]:
# Copy the cluster label of every song onto the main dataframe and display it.
df = df.assign(Cluster=df_kmpca['Cluster'])
df

In [None]:
# Component short names: 'Component 1':'EL', 'Component 2':'DV', 'Component 3':'DI',
#                        'Component 4':'IT', 'Component 5':'A',  'Component 6':'DT'

df['Cluster'] = df_kmpca['Cluster']
df['EL'] = df_kmpca['EL']
df['DV']= df_kmpca['DV']
x = df['EL']
y = df['DV']
n = df['Cluster']

# Use exactly as many colours as there are clusters; fall back to a generated
# palette when there are more clusters than hand-picked colours.
cluster_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'goldenrod', 'tab:cyan']
n_labels = n.nunique()
palette = cluster_colors[:n_labels] if n_labels <= len(cluster_colors) else sns.color_palette('husl', n_labels)

fig = plt.figure(figsize=(10, 8))
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=x, y=y, hue=n, palette=palette)
plt.title('Clusters by PCA Components', fontsize=20)
plt.xlabel("Energy/Loudness", fontsize=18)
plt.ylabel("Danceability/Valence", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Annotate every point with its cluster label (iterating by value, independent of the index).
for x_val, y_val, txt in zip(x, y, n):
    plt.text(x_val, y_val, txt)
plt.show()

In [None]:
# Component short names: 'Component 1':'EL', 'Component 2':'DV', 'Component 3':'DI',
#                        'Component 4':'IT', 'Component 5':'A',  'Component 6':'DT'

df['Cluster'] = df_kmpca['Cluster']
df['A'] = df_kmpca['A']
df['DT']= df_kmpca['DT']
x = df['A']
y = df['DT']
n = df['Cluster']

# Use exactly as many colours as there are clusters; fall back to a generated
# palette when there are more clusters than hand-picked colours.
cluster_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'goldenrod', 'tab:cyan']
n_labels = n.nunique()
palette = cluster_colors[:n_labels] if n_labels <= len(cluster_colors) else sns.color_palette('husl', n_labels)

fig = plt.figure(figsize=(10, 8))
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=x, y=y, hue=n, palette=palette)
plt.title('Clusters by PCA Components', fontsize=20)
plt.xlabel("Acousticness", fontsize=18)
plt.ylabel("Danceability/Tempo", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Annotate every point with its cluster label (iterating by value, independent of the index).
for x_val, y_val, txt in zip(x, y, n):
    plt.text(x_val, y_val, txt)
plt.show()

In [None]:
# Component short names: 'Component 1':'EL', 'Component 2':'DV', 'Component 3':'DI',
#                        'Component 4':'IT', 'Component 5':'A',  'Component 6':'DT'

df['Cluster'] = df_kmpca['Cluster']
df['A'] = df_kmpca['A']
df['IT']= df_kmpca['IT']
x = df['A']
y = df['IT']
n = df['Cluster']

# Use exactly as many colours as there are clusters; fall back to a generated
# palette when there are more clusters than hand-picked colours.
cluster_colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:red', 'tab:purple', 'goldenrod', 'tab:cyan']
n_labels = n.nunique()
palette = cluster_colors[:n_labels] if n_labels <= len(cluster_colors) else sns.color_palette('husl', n_labels)

fig = plt.figure(figsize=(10, 8))
# x and y are passed by keyword: recent seaborn releases no longer accept them positionally.
sns.scatterplot(x=x, y=y, hue=n, palette=palette)
plt.title('Clusters by PCA Components', fontsize=20)
plt.xlabel("Acousticness", fontsize=18)
plt.ylabel("Instrumental/Tempo", fontsize=18)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
# Annotate every point with its cluster label (iterating by value, independent of the index).
for x_val, y_val, txt in zip(x, y, n):
    plt.text(x_val, y_val, txt)
plt.show()

In [None]:
# Playlist name for every cluster label. Labels that are not listed here get no playlist.
clusters = {0:"Cluster 0 - Party", 1:"Cluster 1 - SAD", 2:"Cluster 2 - STUDY", 3:"Cluster 3 - GYM", 4:"Cluster 4 - TRAVEL", 
            5:"Cluster 5 - HAPPY", 6:"Cluster 6 - OTHER"}

# Tracks are added in batches of at most this many items per request.
batch_size = 100

for key,value in clusters.items():
    songs = list(df.loc[df['Cluster'] == key, 'URI'])
    if not songs:
        # No song was assigned to this label: skip it instead of creating an
        # empty playlist and sending an empty add request.
        continue

    temp = sp_write.user_playlist_create(config['username'], value, public=True, collaborative=False, description='')
    _pid = temp['id']

    # Works for any number of songs, not only for up to two batches.
    for start in range(0, len(songs), batch_size):
        sp_write.playlist_add_items(_pid, songs[start:start + batch_size])

In [None]:
# Persist the final dataframe (including the columns added in the cells above) to data.csv.
df.to_csv("data.csv")