This notebook shows the different ways we analysed how many clusters (k) to use for the playlist clustering.

In [None]:
import os
import json
import pandas as pd
import numpy as np
from tqdm import tqdm
import spotipy
import sqlite3
from sklearn.cluster import KMeans
from sqlite3 import Error
import matplotlib.pyplot as plt
from spotipy.oauth2 import SpotifyClientCredentials
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from openTSNE import TSNE as openTSNE
from sklearn.metrics import silhouette_score, silhouette_samples
from wordcloud import WordCloud
from wordcloud import STOPWORDS as STOPWORDS
from sklearn.decomposition import PCA
from yellowbrick.cluster import SilhouetteVisualizer
import seaborn as sns

In [None]:
def create_tables(conn):
    """Create every table used by the analysis if it does not exist yet.

    Tables created: ``playlists``, ``tracks``, ``tracks_in_playlist``,
    ``features_by_track`` and ``avg_features_by_playlist``. Every statement
    uses ``CREATE TABLE IF NOT EXISTS``, so calling this repeatedly is safe
    and never alters a table that is already present.

    Args:
        conn: Open ``sqlite3`` connection.

    Returns:
        None. A ``sqlite3.Error`` raised while creating the tables is printed
        rather than propagated; statements after the failing one are skipped.
    """
    # The 13 audio-feature columns are identical in both feature tables.
    feature_columns = """danceability real,
            energy real,
            key real,
            loudness real,
            mode real,
            speechiness real,
            acousticness real,
            instrumentalness real,
            liveness real,
            valence real,
            tempo real,
            duration_ms integer,
            time_signature integer"""

    ddl_statements = (
        # playlist metadata
        """CREATE TABLE IF NOT EXISTS playlists (
            name text NOT NULL,
            collaborative text,
            pid integer NOT NULL primary key,
            modified_at integer,
            num_tracks integer,
            num_albums integer,
            num_followers integer,
            num_edits integer,
            duration_ms integer,
            num_artists integer
        );""",
        # one row per distinct track
        """CREATE TABLE IF NOT EXISTS tracks (
            artist_name text,
            track_uri text NOT NULL primary key,
            artist_uri text,
            track_name text NOT NULL,
            album_uri text,
            album_name text,
            track_id integer,
            pid integer
        );""",
        # playlist <-> track membership; track_uri is declared as text so it
        # has the same type affinity as the track_uri columns it is joined to
        """CREATE TABLE IF NOT EXISTS tracks_in_playlist (
            pid integer NOT NULL,
            track_uri text
        );""",
        # audio features per track
        """CREATE TABLE IF NOT EXISTS features_by_track (
            track_uri text primary key,
            """ + feature_columns + """
        );""",
        # audio features averaged per playlist; name is declared as text to
        # match playlists.name
        """CREATE TABLE IF NOT EXISTS avg_features_by_playlist (
            pid integer NOT NULL primary key,
            name text NOT NULL,
            """ + feature_columns + """
        );""",
    )

    try:
        cur = conn.cursor()
        for ddl in ddl_statements:
            cur.execute(ddl)
    except Error as e:
        print(e)


In [None]:
def process_json_data(json_data, num_playlists, conn):
    """Load one playlist slice into the database.

    Appends rows to three tables:

    * ``playlists`` - one row per playlist, with the name lower-cased;
    * ``tracks_in_playlist`` - one ``(pid, track_uri)`` row per playlist entry;
    * ``tracks`` - one row per track URI not already stored, numbered with
      consecutive ``track_id`` values after the current maximum.

    Args:
        json_data: Parsed slice; ``json_data['playlists']`` is a list of
            playlist dicts, each holding a ``tracks`` list.
        num_playlists: Unused; kept so existing callers keep working.
        conn: Open ``sqlite3`` connection. The ``tracks`` table must exist.

    Returns:
        None.
    """
    playlists = json_data['playlists']
    if not playlists:
        return

    # --- playlists --------------------------------------------------------
    playlists_df = pd.json_normalize(playlists)
    # 'description' is only present when at least one playlist carries it, so
    # a missing column must not abort the import.
    playlists_df = playlists_df.drop(columns=['tracks', 'description'], errors='ignore')
    # str.lower() returns a new Series; assign it back or nothing changes.
    playlists_df['name'] = playlists_df['name'].str.lower()
    playlists_df.to_sql(name='playlists', con=conn, if_exists='append', index=False)

    # Highest track_id handed out so far (None while the table is empty).
    max_track_id = conn.cursor().execute("select max(track_id) from tracks").fetchone()[0]
    if max_track_id is None:
        max_track_id = 0

    # --- playlist membership ----------------------------------------------
    tracks_df = pd.json_normalize(playlists, record_path=['tracks'], meta=['pid', 'num_followers'])
    if tracks_df.empty:
        return
    # Keep only the id part of "spotify:<kind>:<id>".
    for uri_col in ('track_uri', 'album_uri', 'artist_uri'):
        tracks_df[uri_col] = tracks_df[uri_col].str.split(':').str[2]

    tracks_df[['pid', 'track_uri']].to_sql(
        name='tracks_in_playlist', con=conn, if_exists='append', index=False)

    # --- new tracks ---------------------------------------------------------
    known_uris = pd.read_sql('select track_uri from tracks', conn)['track_uri']
    new_tracks_df = tracks_df[~tracks_df['track_uri'].isin(known_uris)]
    new_tracks_df = new_tracks_df.drop_duplicates(subset='track_uri', keep='first')
    new_tracks_df = new_tracks_df.drop(columns=['pos', 'duration_ms', 'pid', 'num_followers'])
    if new_tracks_df.empty:
        return
    # groupby sorts its keys, so ngroup() numbers the new URIs 0..n-1 in
    # sorted order; offset past the ids that are already taken.
    new_ids = new_tracks_df.groupby('track_uri').ngroup() + max_track_id + 1
    new_tracks_df = new_tracks_df.assign(track_id=new_ids.astype('int64'))
    new_tracks_df.to_sql(name='tracks', con=conn, if_exists='append', index=False)


In [None]:
def process_playlists(path, num_files, num_playlists, conn, playlists_per_file=1000):
    """Load ``mpd.slice.*.json`` files from ``path`` into the database.

    The number of slice files already loaded is estimated from the row count
    of ``playlists`` (``playlists_per_file`` playlists per slice). Those
    files - the first ones in sorted filename order - are skipped, so the
    function can be re-run to add more data without inserting the same
    playlists twice.

    Args:
        path: Directory containing the slice files.
        num_files: Total number of slice files that should be in the database
            once this call returns. ``0`` means nothing to do; a negative
            value means "no limit, load every remaining file".
        num_playlists: Forwarded unchanged to ``process_json_data``.
        conn: Open ``sqlite3`` connection with the ``playlists`` table present.
        playlists_per_file: Playlists stored in each slice file.

    Returns:
        None.
    """
    cur = conn.cursor()
    cur.execute('select count(pid) from playlists')
    # Integer count of whole files already loaded.
    count = cur.fetchone()[0] // playlists_per_file
    if 0 <= num_files <= count:
        return

    slice_names = sorted(
        fname for fname in os.listdir(path)
        if fname.startswith("mpd.slice.") and fname.endswith(".json")
    )
    # Skip the files loaded by earlier runs instead of starting over.
    for fname in slice_names[count:]:
        count += 1
        with open(os.path.join(path, fname), encoding='utf-8') as f:
            js = json.load(f)
        print("Processing playlist file", count)
        process_json_data(js, num_playlists, conn)
        if 0 < num_files <= count:
            break

In [None]:
def connect_db(db_file):
    """Open a SQLite connection to ``db_file``.

    Returns the connection, or ``None`` when sqlite3 raises an error (the
    error is printed, not re-raised).
    """
    try:
        return sqlite3.connect(db_file)
    except Error as err:
        print(err)
    return None

In [None]:
# Connect to the database. connect_db() returns None when the file cannot be
# opened, so fail here with a clear message instead of an AttributeError later.
conn = connect_db('data.db')
if conn is None:
    raise RuntimeError("Could not open SQLite database 'data.db'")

# Make sure the schema exists before any cell queries it; every statement is
# CREATE TABLE IF NOT EXISTS, so this is a no-op on a populated database.
create_tables(conn)
cur = conn.cursor()

In [None]:
# Load playlist slice files into the database with num_files=1; the 0 is
# num_playlists, which process_playlists only forwards to process_json_data.
# NOTE(review): `path` is not assigned in any cell shown here - it must name the
# directory holding the mpd.slice.*.json files before this cell is run.
process_playlists(path, 1, 0, conn)

In [None]:
def connect_to_spotify():
    """Return a ``spotipy.Spotify`` client using client-credentials auth.

    ``SPOTIPY_CLIENT_ID``, ``SPOTIPY_CLIENT_SECRET`` and
    ``SPOTIPY_REDIRECT_URI`` are read from the environment when already set;
    otherwise they are filled with the defaults below, so the notebook keeps
    working without any configuration.

    Returns:
        spotipy.Spotify: client authenticated with the resolved credentials.
    """
    # SECURITY(review): these credentials are committed in source. They are
    # kept only as a fallback so existing runs do not break - rotate them and
    # supply real values through the environment instead.
    default_cid = "5cffc2676cd44b35bc6af81faeb8e69a"
    default_secret = "f9fdae88362349b992ab2714ea91a094"

    # setdefault keeps values the user exported and returns the effective one.
    cid = os.environ.setdefault("SPOTIPY_CLIENT_ID", default_cid)
    secret = os.environ.setdefault("SPOTIPY_CLIENT_SECRET", default_secret)
    os.environ.setdefault('SPOTIPY_REDIRECT_URI', "http://localhost:8080")

    # Use the manager built here rather than constructing a second one.
    client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
    sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

    return sp

In [None]:
def get_spotify_features():
    """Fetch missing audio features and refresh the per-playlist averages.

    Uses the module-level ``conn``.

    1. Finds every track in ``tracks`` with no row in ``features_by_track``.
    2. Requests their audio features from Spotify in batches of 100 and
       appends the ones Spotify returned to ``features_by_track``.
    3. Recomputes ``avg_features_by_playlist`` (one row per playlist with the
       mean of every feature). When no track was missing features, this is
       done only if the averages table is still empty.

    Tracks for which Spotify returns no features are skipped; they are
    requested again on the next call. Nothing is committed here - the caller
    commits.

    Returns:
        None.
    """
    features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                'duration_ms', 'time_signature']

    # Column names are double-quoted: in SQLite single quotes make a string
    # literal, and AVG() of a string literal is 0 instead of the column mean.
    avg_exprs = ', '.join(f'AVG(features_by_track."{name}")' for name in features)
    # "insert or replace" so re-running after more data was loaded updates
    # existing playlists instead of violating the pid primary key.
    refresh_averages_sql = (
        'insert or replace into avg_features_by_playlist '
        f'select tracks_in_playlist.pid, playlists.name, {avg_exprs} '
        'from features_by_track '
        'join tracks_in_playlist on tracks_in_playlist.track_uri = features_by_track.track_uri '
        'join playlists on tracks_in_playlist.pid = playlists.pid '
        'group by tracks_in_playlist.pid'
    )

    cur = conn.cursor()
    # Don't load features for songs that are already loaded
    cur.execute('''select tracks.track_uri from tracks
    where tracks.track_uri not in (select features_by_track.track_uri from features_by_track)''')
    uris = [row[0] for row in cur.fetchall()]

    if uris:
        print(len(uris), " songs do not have features loaded yet.")
        sp = connect_to_spotify()
        feature_rows = []
        for start in range(0, len(uris), 100):
            batch = uris[start:start + 100]
            # Pair each URI with its own result before dropping empty results,
            # so a track without features cannot shift the rows after it.
            # NOTE(review): relies on audio_features() returning one entry per
            # requested id, None where no features exist - confirm with spotipy.
            for uri, feats in zip(batch, sp.audio_features(batch) or []):
                if feats:
                    row = {'track_uri': uri}
                    row.update((name, feats[name]) for name in features)
                    feature_rows.append(row)
        if feature_rows:
            features_df = pd.DataFrame(feature_rows, columns=['track_uri'] + features)
            features_df.to_sql(name='features_by_track', con=conn, if_exists='append', index=False)
        cur.execute(refresh_averages_sql)
    elif cur.execute('select 1 from avg_features_by_playlist limit 1').fetchone() is None:
        cur.execute(refresh_averages_sql)
        

In [None]:
# Load any missing audio features and refresh the playlist averages, then
# commit so the rows inserted through `conn` are persisted to the database.
get_spotify_features()
conn.commit()

In [None]:
# Sanity checks on the loaded data: every playlist should have an averages row
# and membership rows, and every track should have a features row.

cur = conn.cursor()

num_pl_avg_f = cur.execute('select count(pid) from avg_features_by_playlist').fetchall()[0][0]
num_pl = cur.execute('select count(pid) from playlists').fetchall()[0][0]

# The grouped query yields one row per distinct pid, so the number of rows is
# the number of playlists that have at least one track.
num_pl_tip = len(cur.execute('select count(pid) from tracks_in_playlist group by pid').fetchall())
print(num_pl_tip)

num_trks_fbt = cur.execute('select count(track_uri) from features_by_track').fetchall()[0][0]
num_trks = cur.execute('select count(track_uri) from tracks').fetchall()[0][0]

playlists_ok = num_pl > 0 and num_pl == num_pl_avg_f == num_pl_tip
print("All playlists were successfully processed" if playlists_ok
      else "Error: not all playlists were processed")

tracks_ok = num_trks == num_trks_fbt
print("All songs in the playlists and their features were processed" if tracks_ok
      else "Error: not all songs or not all features were processed")

In [None]:
# Build the feature matrices. The playlist with the smallest pid is held out:
# `test_data` holds its per-track features, `data` holds the averaged features
# of every other playlist.
feature_cols = ['danceability','energy','key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','duration_ms','time_signature']
data_cols = ['pid', 'name'] + feature_cols
test_cols = ['pid', 'track_uri'] + feature_cols

# X data: average features of all playlists except the held-out one
data_rows = cur.execute('select * from avg_features_by_playlist where pid > (select MIN(pid) from avg_features_by_playlist) order by pid').fetchall()
data = pd.DataFrame(data_rows, columns=data_cols)

# Per-track features of the held-out playlist
test_pl = cur.execute('''select tracks_in_playlist.pid, tracks_in_playlist.track_uri, danceability, energy, key, loudness, mode, speechiness, acousticness, instrumentalness, liveness, valence, tempo, duration_ms, time_signature 
from features_by_track left join tracks_in_playlist on
features_by_track.track_uri = tracks_in_playlist.track_uri 
where pid = (select MIN(pid) from avg_features_by_playlist)''').fetchall()
test_data = pd.DataFrame(test_pl, columns=test_cols)

print(test_data.head())
print(data.head())

In [None]:
# Scale the data

# Mean feature vector of the held-out playlist's tracks.
y = test_data[feature_cols].mean()

# Fit the scaler on the training playlists only, then apply the same scaling
# to the held-out playlist.
scaler = StandardScaler(with_mean=True, with_std=True).fit(data[feature_cols].values)

scaled_x = scaler.transform(data[feature_cols].values)
scaled_y = scaler.transform(np.array(y).reshape(1,-1))
scaled_features = pd.DataFrame(scaled_x)

# 2-D t-SNE embedding of the scaled playlists, used for plotting.
tsne = openTSNE(perplexity=30, metric='euclidean', n_jobs=-1, random_state=0, verbose=False)
# fit() already returns the embedding of scaled_x. Its transform() is meant
# for placing *new* points into that embedding, so calling it on the training
# data again only re-embeds them approximately and costs a second optimisation.
tsne_transformer = tsne.fit(scaled_x)
data_df = pd.DataFrame(np.asarray(tsne_transformer), columns=['X', 'Y'])

print(data_df)

In [None]:
# Elbow method: fit KMeans for k = 1..29 on the scaled features and record the
# within-cluster sum of squares (inertia) for each k.
ks = range(1, 30)
wcss = []
for number_of_clusters in ks:
    kmeans = KMeans(n_clusters=number_of_clusters, random_state=42).fit(scaled_x)
    wcss.append(kmeans.inertia_)

# Inertia against k, with a dashed marker at the chosen k = 18.
plt.plot(ks, wcss)
plt.axvline(18, linestyle='--', color='r')

In [None]:
# Project the scaled features onto their first two principal components.
pca_num_components = 2
pca = PCA(n_components=pca_num_components, svd_solver='full')
# Keep the projected samples: `reduced` is what the clustering and silhouette
# cells consume, so it must be the transformed array, not the PCA estimator.
reduced = pca.fit_transform(scaled_x)
# Share of the total variance captured by each retained component.
print(pca.explained_variance_ratio_)

In [None]:
# initialize KMeans
# 18 clusters, matching the k marked on the elbow plot; random_state fixes the
# centroid initialisation.
kmeans = KMeans(n_clusters=18, random_state=0)
# KMeans.fit needs array-like samples, so `reduced` has to hold the projected
# data at this point.
clusters = kmeans.fit(reduced)
labels = clusters.labels_
# Attach the cluster id of each playlist to its row in data_df.
data_df['cluster'] = pd.Categorical(labels)


In [None]:
# Mean silhouette coefficient of the KMeans labelling, measured on `reduced`
# with Euclidean distance.
score = silhouette_score(reduced, clusters.labels_, metric='euclidean')
print('Silhouetter Score: %.3f' % score)


In [None]:
from sklearn import datasets
from yellowbrick.cluster import KElbowVisualizer

# Elbow plot on `reduced`, reusing the `kmeans` estimator with the k range
# (2, 24) handed to yellowbrick.
# NOTE(review): whether the upper bound 24 is included depends on how
# yellowbrick expands the tuple - confirm against its documentation.
visualizer = KElbowVisualizer(kmeans, k=(2,24))
 
visualizer.fit(reduced)        # Fit the data to the visualizer
visualizer.show() 


In [None]:
# Silhouette analysis: cluster the scaled features for several values of k and
# compare the mean silhouette score of each clustering.
range_n_clusters = [16, 18, 20, 22, 24, 26, 28, 30]
silhouette_avg = []
for num_clusters in range_n_clusters:
    # initialise kmeans; a fixed seed makes the curve reproducible between runs
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(scaled_x)
    cluster_labels = kmeans.labels_

    # silhouette score - computed on the same data the labels were fitted on;
    # scoring these labels against a different representation of the samples
    # would measure a clustering that was never made
    silhouette_avg.append(silhouette_score(scaled_x, cluster_labels))
plt.plot(range_n_clusters, silhouette_avg)
plt.xlabel("Values of K")
plt.ylabel("Silhouette score")
plt.title("Silhouette analysis For Optimal k")
plt.show()

In [None]:
# Second elbow plot, this time on the full scaled feature matrix and over the
# narrower k range (17, 24). `kmeans` is whichever estimator the previously
# run cell left bound to that name.
visualizer = KElbowVisualizer(kmeans, k=(17, 24))
 
visualizer.fit(scaled_x)        # Fit the data to the visualizer
visualizer.show() 

In [None]:
# Silhouette plots for k = 2, 3, 4, 5 arranged on a 2x2 grid of axes.
fig, ax = plt.subplots(2, 2, figsize=(15,8))
for i in [2, 3, 4, 5]:
    '''
    Create KMeans instance for different number of clusters
    '''
    km = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=100, random_state=42)
    # divmod(i, 2) maps k = 2, 3, 4, 5 to (1, 0), (1, 1), (2, 0), (2, 1);
    # ax[q-1][mod] therefore fills the grid row by row.
    q, mod = divmod(i, 2)
    '''
    Create SilhouetteVisualizer instance with KMeans instance
    Fit the visualizer
    '''
    visualizer = SilhouetteVisualizer(km, colors='yellowbrick', ax=ax[q-1][mod])
    visualizer.fit(scaled_x)