In [None]:
# !pip install spotipy

In [None]:
# !conda install -c conda-forge umap-learn

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth
import os
import re
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
import umap.umap_ as umap
import altair as alt

In [None]:
# np.__version__

In [None]:
# pd.__version__

In [None]:
# Documentation:
# Client credentials:
#   https://developer.spotify.com/documentation/general/guides/app-settings/#register-your-app
#
# What each audio features means:
#   https://developer.spotify.com/documentation/web-api/reference/#object-audiofeaturesobject
#   https://medium.com/@FinchMF/praise-questions-and-critique-spotify-api-38e984a4174b
# LUFS (Loudness):
#   https://www.sweetwater.com/insync/what-is-lufs-and-why-should-i-care/
# Scopes:
#   https://developer.spotify.com/documentation/general/guides/scopes/#user-library-read


# Examples:
#   https://www.researchgate.net/profile/Scott-Wolf/publication/342854806_Just_the_Way_You_Are_Linking_Music_Listening_on_Spotify_and_Personality/links/5f0bc62a92851c52d62f8a3a/Just-the-Way-You-Are-Linking-Music-Listening-on-Spotify-and-Personality.pdf
#   https://www.diva-portal.org/smash/get/diva2:1108465/FULLTEXT02
#   https://ieeexplore.ieee.org/abstract/document/7987199
#   https://www.math.vu.nl/~sbhulai/papers/paper-vandenhoven.pdf
#   https://towardsdatascience.com/discovering-your-music-taste-with-python-and-spotify-api-b51b0d2744d
#   https://towardsdatascience.com/a-music-taste-analysis-using-spotify-api-and-python-e52d186db5fc
#   https://rareloot.medium.com/extracting-spotify-data-on-your-favourite-artist-via-python-d58bc92a4330

# Cluster
# https://martinfleischmann.net/clustergam-visualisation-of-cluster-analysis/

In [None]:
# Spotify API credentials.
# Register an app to obtain your own client ID / secret (see the Spotipy docs:
# https://spotipy.readthedocs.io/en/2.18.0/) and export them in the environment
# *before* starting Jupyter, so that no secret is ever stored in the notebook:
#   export SPOTIPY_CLIENT_ID=...
#   export SPOTIPY_CLIENT_SECRET=...
_missing_credentials = [
    name
    for name in ("SPOTIPY_CLIENT_ID", "SPOTIPY_CLIENT_SECRET")
    if not os.environ.get(name)
]
if _missing_credentials:
    # Fail right here with an actionable message instead of at the first API call
    raise EnvironmentError(
        "Missing Spotify credentials; set these environment variables first: "
        + ", ".join(_missing_credentials)
    )
os.environ['SPOTIPY_REDIRECT_URI'] = "http://localhost:8080"   # Needed for user authorization

In [None]:
  # acousticness: 0.0 to 1.0,   1 is acoustic

  # danceability: 0.0 to 1.0,   1 is most danceable

  # energy: 0.0 to 1.0, perceptual measure of intensity and activity. 1.0 energetic tracks feel fast, loud and noisy. 
  #     Perceptual features contributing to this attribute include dynamic range, perceived loudness, timbre, onset rate, and general entropy.  
  
  # instrumentalness: 0.0 to 1.0, predicts whether a track contains no vocals (is instrumental). 1.0 is purely instrumental. Values above 0.5 are intended to represent instrumental tracks.
  
  # key: integers map to pitches using standard Pitch Class notation, i.e. the 12 semitones of an octave numbered upwards from C: 0 = C, 1 = C♯/D♭, 2 = D, ..., 11 = B. A value of -1 means no key was detected.

  # liveness: 0.0 to 1.0, presence of audience in the recording. 0.8 or above strong likelihood that the track is live.

  # loudness: typically -60.0 to 0.0 dB, averaged over the whole track. Values are negative; the closer to 0, the louder.

  # mode: 0 or 1, Minor=0 Major=1, Major is happier

  # speechiness: 0.0 to 1.0, detects presence of spoken words. 1.0 is talk show or audio book. 
  #     Values above 0.66 describe tracks that are probably made entirely of spoken words. 
  #     Values between 0.33 and 0.66 describe tracks that may contain both music and speech, either in sections or layered, including such cases as rap music. 
  #     Values below 0.33 most likely represent music and other non-speech-like tracks.

  # tempo: BPM, Beats per minute 
  
  # valence: 0.0 to 1.0, describing musical positiveness, the higher the more positive. 
  
  # analysis_url: 
  # duration_ms: 238854,
  # id: '1eT2CjXwFXNx6oY5ydvzKU',
  # time_signature: 4,
  # track_href: 'https://api.spotify.com/v1/tracks/1eT2CjXwFXNx6oY5ydvzKU',
  # type: 'audio_features',
  # uri: 'spotify:track:1eT2CjXwFXNx6oY5ydvzKU',

In [None]:
# Example code without user authorization (client-credentials flow):
# only public catalogue data can be requested with this client.
urn = 'spotify:artist:3jOstUTkEu2JkjvRdBA5Gu'

# Build the client once. The notebook previously created a client with
# `client_credentials_manager=` and immediately replaced it with this one.
sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials())

# One sample request per kind of object
artist = sp.artist(urn)

user = sp.user('plamere')

track = sp.track('2nLtzopw4rPReszdYBJU6h')

audio_features = sp.audio_features('1eT2CjXwFXNx6oY5ydvzKU')

In [None]:
# Authenticate with user authorization (OAuth), requesting the
# `user-library-read` and `user-follow-read` scopes
scope = 'user-library-read user-follow-read'
oauth_manager = SpotifyOAuth(scope=scope)
sp = spotipy.Spotify(auth_manager=oauth_manager)

In [None]:
# # Quick display of all saved songs
# def show_tracks(results):
#     for item in results['items']:
#         track = item['track']
#         print("%32.32s %s" % (track['artists'][0]['name'], track['name']))
        
# results = sp.current_user_saved_tracks()
# show_tracks(results)

# while results['next']:
#     results = sp.next(results)
#     show_tracks(results)

In [None]:
# Collect every saved track into three parallel lists (song, artist, uri)
# that are later turned into a dataframe.
SAVED_TRACKS_PAGE_SIZE = 50   # largest page the endpoint allows -> fewer requests
TRACK_URI_PATTERN = re.compile(r'^spotify:track:(\S+)')   # raw string: '\s' is not a valid str escape

song = []
artist = []
uri = []   # NOTE: despite the name this holds bare track IDs (the URI without 'spotify:track:')

tracks = sp.current_user_saved_tracks(limit=SAVED_TRACKS_PAGE_SIZE)
while True:
    for item in tracks['items']:
        saved_track = item['track']
        uri_match = TRACK_URI_PATTERN.search(saved_track['uri'])
        if uri_match is None:
            # Not a 'spotify:track:' URI, so there is no track ID to extract; skip
            # the entry so the three lists stay aligned.
            continue
        song.append(saved_track['name'])
        artist.append(saved_track['artists'][0]['name'])   # first listed artist only
        uri.append(uri_match.group(1))

    # The results are paginated: keep following 'next' until there is none
    if not tracks['next']:
        break
    tracks = sp.next(tracks)

In [None]:
# Assemble the per-track lists (song, artist and uri) into a single dataframe
songs_df = pd.DataFrame(dict(song=song, artist=artist, uri=uri))

In [None]:
# Display the dataframe built from the saved tracks (columns: song, artist, uri)
songs_df

In [None]:
# Sanity check: audio features of one saved track, shown as a one-row dataframe.
# The track ID is taken explicitly from the collected list rather than from a
# loop variable (`i`) left over from another cell, which is not a track ID when
# the notebook is run top to bottom.
example_id = uri[0]
pd.DataFrame(sp.audio_features(example_id)[0], index=[example_id])

In [None]:
# Fetch the audio features of every saved track and collect them in a dataframe.
# The audio-features endpoint accepts several track IDs per request, so the IDs
# are sent in batches instead of issuing one HTTP request per song.
AUDIO_FEATURES_BATCH_SIZE = 100   # maximum number of IDs per audio-features request

feature_rows = []
for start in range(0, len(uri), AUDIO_FEATURES_BATCH_SIZE):
    batch_ids = uri[start:start + AUDIO_FEATURES_BATCH_SIZE]
    # Results come back in request order, with None where a track has no features
    for track_id, features in zip(batch_ids, sp.audio_features(batch_ids)):
        if features is None:
            # Leave the track out rather than carrying an all-NaN row downstream
            continue
        row = {'uri': track_id}   # keyed by the bare track ID, matching songs_df['uri']
        # The full 'spotify:track:...' URI from the API is dropped in favour of the ID
        row.update((key, val) for key, val in features.items() if key != 'uri')
        feature_rows.append(row)

# Keep a 'uri' column even when nothing was fetched so a merge on 'uri' still works
uri_df = pd.DataFrame(feature_rows) if feature_rows else pd.DataFrame(columns=['uri'])

In [None]:
# Attach the audio features to each song by joining on the shared 'uri' column
saved_songs_df = songs_df.merge(uri_df, on="uri")

In [None]:
# Preview the first rows of the merged song + audio-feature table
saved_songs_df.head()

In [None]:
# Summary statistics of the merged table
saved_songs_df.describe()

In [None]:
# Features shown on the radar chart. An explicit copy is taken because a column
# of this frame is overwritten later on (loudness is rescaled); without the copy
# that write targets a slice of saved_songs_df and triggers pandas'
# SettingWithCopyWarning (currently hidden by the global warnings filter).
radar_df = saved_songs_df[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
                           'instrumentalness', 'liveness', 'valence']].copy()

In [None]:
# Summary statistics of the radar-chart features (before loudness is rescaled)
radar_df.describe()

In [None]:
# Bring loudness (the only radar feature not already on a 0-1 scale) into [0, 1]
min_max_scaler = MinMaxScaler()
loudness_column = radar_df['loudness'].to_numpy()[:, np.newaxis]   # scaler expects a 2-D column
radar_df['loudness'] = min_max_scaler.fit_transform(loudness_column)
radar_df.describe()

In [None]:
# Radar axes: one per feature column of radar_df
categories = radar_df.columns.tolist()
# how many axes the chart has
N = len(categories)

# Radial values: the mean of every feature over all saved songs
value = list(radar_df.mean())

In [None]:
# Closed, filled polar line chart of the average value of each feature
fig = px.line_polar(
    radar_df,
    r=value,
    theta=categories,
    line_close=True,
    range_r=[0, 1],
).update_traces(fill='toself')   # update_traces returns the same figure
fig.show()

In [None]:
# Another look at the first rows of the merged table before plotting it
saved_songs_df.head()

In [None]:
# Pairwise scatter-plot matrix of the merged table (large 50x50-inch canvas, faint points)
pd.plotting.scatter_matrix(saved_songs_df, alpha=0.2, figsize=(50,50))


In [None]:
# Preview the track id together with the audio-feature columns of interest
saved_songs_df.loc[
    :,
    ['uri', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
     'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
     'duration_ms', 'time_signature'],
].head()

In [None]:
# Feature columns that go into scaling, PCA and the 2-D embeddings
feature_names = [
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
    'duration_ms', 'time_signature',
]
df = saved_songs_df.loc[:, feature_names]
df.head()

In [None]:
# Standardize every feature column. Note that `StdScaler` holds the scaled
# array, not the scaler object.
# Open question: not sure this should be kept, since songs added later might
# be on a different scale.
StdScaler = StandardScaler().fit_transform(df)

In [None]:
# Fit a 3-component PCA on the standardized features
pca = PCA(n_components=3, random_state=0)
pca.fit(StdScaler)

In [None]:
# Variance captured by each principal component:
# squared singular value divided by (number of samples - 1)
singularValues = pca.singular_values_
n_samples = StdScaler.shape[0]
variance = singularValues ** 2 / (n_samples - 1)
variance

In [None]:
def plot_pca_heatmap(pca, feature_names):
    """Draw the PCA loadings as a heatmap: one row per component, one column per feature.

    Parameters
    ----------
    pca : fitted PCA-like object
        Only its ``components_`` attribute (2-D, components x features) is read.
    feature_names : iterable of str
        Labels for the x axis, one per feature column of ``components_``.

    Notes
    -----
    The number of rows is taken from ``pca.components_`` instead of being fixed
    at three, so every component gets exactly one y tick and label.
    """
    feature_names = list(feature_names)
    n_components = np.shape(pca.components_)[0]

    # Keep the original wording for the first three components and fall back
    # to a generic label for any further ones.
    ordinal_names = ['First PC', 'Second PC', 'Third PC']
    pc_names = [
        ordinal_names[k] if k < len(ordinal_names) else 'PC {}'.format(k + 1)
        for k in range(n_components)
    ]

    # One inch per feature; at least the original height of 3 inches
    plt.subplots(figsize=(len(feature_names), max(3, n_components)))
    plt.imshow(pca.components_, interpolation='none', cmap='plasma')
    plt.xticks(np.arange(len(feature_names)), feature_names, rotation=75, fontsize=16)
    plt.yticks(np.arange(n_components), pc_names, fontsize=16)
    plt.colorbar()



# Heatmap of the fitted PCA's component loadings, labelled with the feature names
plot_pca_heatmap(pca, feature_names)

In [None]:
def biplot(score, coeff, maxdim, pcax, pcay, labels=None):
    """Scatter PCA scores and overlay the feature loadings as labelled arrows.

    Draws on the current matplotlib figure/axes.

    Parameters
    ----------
    score : ndarray, 2-D
        Projected data; column ``pcax - 1`` / ``pcay - 1`` supply x / y.
    coeff : ndarray, 2-D
        Loadings with one row per feature; the same two columns are used.
    maxdim : int
        Upper bound on how many rows of ``coeff`` are drawn.
    pcax, pcay : int
        1-based numbers of the components shown on the x and y axes.
    labels : sequence of str, optional
        Text for each arrow; defaults to ``"Var1"``, ``"Var2"``, ...
    """
    pca1 = pcax - 1   # 1-based component number -> column index
    pca2 = pcay - 1
    xs = score[:, pca1]
    ys = score[:, pca2]
    n = min(coeff.shape[0], maxdim)

    # Rescale the scores so the point cloud spans a width of 2 on each axis,
    # comparable with the [-1, 1] axis limits set below. A zero range (all
    # scores equal) would divide by zero, so leave such an axis unscaled.
    x_range = xs.max() - xs.min()
    y_range = ys.max() - ys.min()
    scalex = 2.0 / x_range if x_range else 1.0
    scaley = 2.0 / y_range if y_range else 1.0

    text_scale_factor = 1.5   # push labels out beyond the arrow tips
    plt.scatter(xs * scalex, ys * scaley, s=1)
    for i in range(n):
        plt.arrow(0, 0, coeff[i, pca1], coeff[i, pca2], color='r', alpha=0.5)
        label = "Var" + str(i + 1) if labels is None else labels[i]
        # Both coordinates are scaled, for custom labels as well as default ones
        plt.text(coeff[i, pca1] * text_scale_factor, coeff[i, pca2] * text_scale_factor,
                 label, color='g', ha='center', va='center')
    plt.xlim(-1, 1)
    plt.ylim(-1, 1)
    plt.xlabel("PC{}".format(pcax))
    plt.ylabel("PC{}".format(pcay))
    plt.grid()

# Biplot of the first two principal components on a single 10x10-inch figure
# (a bare plt.figure() before this one only produced an extra, empty figure)
plt.figure(figsize=(10, 10))

# Project the standardized data onto the fitted components and draw it with
# the loadings of PC1 and PC2 (one row per feature after transposing)
X_normalized = StdScaler
X_pca = pca.transform(X_normalized)
biplot(X_pca, pca.components_[0:2, :].T, len(feature_names), 1, 2, labels=feature_names)

In [None]:
# Reduce the standardized features to two dimensions with three techniques.
# Note: X_pca is re-bound here to a 2-component projection.
pca_2d = PCA(n_components=2)
tsne_2d = TSNE(n_components=2, random_state=42)
mds_2d = MDS(n_components=2, random_state=42)

X_pca = pca_2d.fit_transform(StdScaler)
X_tsne = tsne_2d.fit_transform(StdScaler)
X_mds = mds_2d.fit_transform(StdScaler)

In [None]:
# 2x2 grid comparing the raw standardized data (its first two columns) with
# the PCA, t-SNE and MDS projections
fig, axes = plt.subplots(2, 2, figsize=(10, 8))

panels = [
    ("Original", StdScaler),
    ("PCA", X_pca),
    ("t-SNE", X_tsne),
    ("MDS", X_mds),
]
# axes.flat walks the grid row by row: [0, 0], [0, 1], [1, 0], [1, 1]
for panel_ax, (panel_title, points) in zip(axes.flat, panels):
    panel_ax.scatter(points[:, 0], points[:, 1], s=10)
    panel_ax.set_title(panel_title)
del fig, axes

In [None]:
# Look up the songs that fall in a region of the t-SNE plot
# (here: x <= -20 and y >= 15) via their row positions
in_region = (X_tsne[:, 0] <= -20) & (X_tsne[:, 1] >= 15)
songs_index = np.where(in_region)
saved_songs_df.iloc[songs_index]

In [None]:
# Cluster the 2-D t-SNE coordinates into 6 groups.
# Caveat: k-means might not be the best choice here because the clusters
# differ in size.
kmeans = KMeans(n_clusters=6, random_state=0)
kmeans.fit(X_tsne)

cluster = pd.DataFrame(X_tsne, columns=['x', 'y']).assign(cluster=kmeans.labels_)

# Scatter of the embedding, coloured by cluster label (treated as nominal)
alt.Chart(cluster).mark_point().encode(x='x', y='y', color='cluster:N')