In [1]:
import os
import re
import sys
import json
import time
import spotipy
import datetime
import collections
import pandas as pd

from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

In [2]:
from datetime import datetime
from tqdm import tqdm


In [3]:
# Change the path to where your Spotify Million Playlist is located.
# The expected file structure is to have a data folder /data .csv files will be placed there.

path = 'data/spotify_million_playlist_dataset/data'

def loop_slices(path, num_slices=20):
    """
    Each slice is a .json file containing 1000 playlists i.e.: 1 slice is 1000 playlists 20 slices is: 20,000 playlists.
    Parameters:
        num_slices (int): Number of slices to return, max 1000.
        path (str): Path to the Spotify Million Playlist.
        
    Output:
        mpd_playlists (list): a list of dictionaries of all the playlists.
    """
    cnt=0
    mpd_playlists = []
    filenames = os.listdir(path)
    for filename in sorted(filenames):
        print(filename)
        cnt+=1
        if filename.startswith("mpd.slice.") and filename.endswith(".json"):
            fullpath = os.sep.join((path, filename))
            f = open(fullpath)
            js = f.read()
            f.close()
            current_slice = json.loads(js)
            
            # Create a list of all playlists
            for playlist in current_slice['playlists']:
                mpd_playlists.append(playlist)


            if cnt == num_slices:
                break
    return mpd_playlists

In [4]:
def create_csv(playlists, extended=False):
    """
    This function will write a .csv file for all the input playlists, this .csv will have a single cell with all the tracks
    in the playlist. 
    An extended parameter is available to extend the tracks in a list to be a single cell per song, this will return an 
    additional .csv file
    Parameters:
        playlists (list): a list of dictionaries such as that from the loop_slices() function.
        extended (boolean): boolean to enable the extended .csv file generation
        
    Output:
        MPD.csv: .csv file with the playlists
        MPD.csv: .csv file with extended song columns
    """
    df = pd.DataFrame(playlists)
    df.to_csv('data/MPD.csv', index=False)
    
    if extended:
        df_list = []
        for playlist in playlists:
            df_list.append(pd.DataFrame(playlist))
            
        df_extended = pd.concat(df_list, axis=0)
         
        cols_to_keep = ['name', 'collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums', 
                'num_followers','num_edits', 'duration_ms', 'num_artists']
        df_extended = df_extended.reset_index().pivot(values='tracks',index=cols_to_keep, columns='index')
        df_extended.reset_index(inplace=True)
        df_extended = df_extended.rename_axis(None, axis=1)    
        df_extended.sort_values('pid', inplace=True)
        df_extended.to_csv('data/MPD_Extended.csv', index=False)        

In [5]:
%%time
#####################################################################################################################
# STOP, STOP, STOP, STOP, STOP                                                                                      #
# num_slices=1000 (all playlists) will take considerable time (>30min) and it will eat all your disk storage 30Gb+  #
# recommended to work with default num_slices=20, this will output 400Mb for each file                              #
#####################################################################################################################

playlists = loop_slices(path, num_slices=20)
create_csv(playlists, extended=True)

mpd.slice.0-999.json
mpd.slice.1000-1999.json
mpd.slice.10000-10999.json
mpd.slice.100000-100999.json
mpd.slice.101000-101999.json
mpd.slice.102000-102999.json
mpd.slice.103000-103999.json
mpd.slice.104000-104999.json
mpd.slice.105000-105999.json
mpd.slice.106000-106999.json
mpd.slice.107000-107999.json
mpd.slice.108000-108999.json
mpd.slice.109000-109999.json
mpd.slice.11000-11999.json
mpd.slice.110000-110999.json
mpd.slice.111000-111999.json
mpd.slice.112000-112999.json
mpd.slice.113000-113999.json
mpd.slice.114000-114999.json
mpd.slice.115000-115999.json
CPU times: user 1min 7s, sys: 4.86 s, total: 1min 12s
Wall time: 1min 18s


In [None]:
# Read MPD
df = pd.read_csv('data/MPD.csv')
df

In [None]:
# Read MPD
df = pd.read_csv('data/MPD_Extended.csv')
df

# Get song features from playlists extracted

In [6]:
# Spotify credentials
os.environ["SPOTIPY_CLIENT_ID"] = "e93bb9b7b8aa4ba8a817537a0f7696f8"
os.environ["SPOTIPY_CLIENT_SECRET"] = "bd54ecc86e3b4bc08b33f3bed6e89dee"
os.environ['SPOTIPY_REDIRECT_URI'] = "http://localhost:8080"   # Needed for user authorization
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [30]:
df_list = []
feats_df = pd.read_csv('Playlist_Feats.csv')
idx = len(feats_df)
df_list.append(feats_df)
feats_df

Unnamed: 0,name,pid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Throwbacks,0,0.664077,0.781077,5.038462,-4.891212,0.692308,0.103698,0.083674,0.000674,0.187087,0.642750,121.157500,221777.461538,4.000000
1,Awesome Playlist,1,0.492382,0.695923,4.461538,-8.107974,0.538462,0.091010,0.162227,0.223708,0.179344,0.476667,124.987128,298837.641026,3.769231
2,korean,2,0.671062,0.692953,5.000000,-4.875594,0.515625,0.096425,0.269100,0.000638,0.168894,0.565078,114.595984,219373.953125,4.000000
3,mat,3,0.514349,0.620901,5.103175,-9.618754,0.714286,0.067004,0.273514,0.203148,0.188278,0.451258,125.523048,229575.055556,3.952381
4,90s,4,0.576235,0.650418,3.352941,-7.634529,0.823529,0.041218,0.177189,0.081759,0.166524,0.490294,127.725412,255014.352941,3.941176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9157,Gym,106157,0.534879,0.727061,5.333333,-5.763152,0.757576,0.052520,0.133434,0.008525,0.173132,0.505697,127.168500,232338.651515,3.969697
9158,chill,106158,0.710600,0.611800,3.800000,-6.888400,0.500000,0.129800,0.195260,0.087900,0.193550,0.379500,119.845400,201980.100000,4.000000
9159,80s Music,106159,0.653651,0.740007,5.157895,-8.693954,0.743421,0.047501,0.111790,0.049378,0.169228,0.716559,120.838592,255292.046053,4.000000
9160,Get hyped,106160,0.712686,0.680088,5.360825,-6.199005,0.577320,0.182556,0.097947,0.014718,0.234056,0.439419,123.119649,229499.386598,4.000000


In [34]:
%%time

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

cols_to_keep = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
dfs = []
cnt=0

for playlist in tqdm(playlists[idx:]):
    audio_feats = []
    for track in playlist['tracks']:
        uri = track['track_uri'].split("k:")[1]
        
#         tries = 10
#         for i in range(tries):
        try:
            curr_song_audio_feat = sp.audio_features(uri)[0]
            if curr_song_audio_feat == None:
                print('Empty uri: {}, in playlist: {}'.format(track['track_uri'], playlist['name']))
            else:
                audio_feats.append(curr_song_audio_feat)
        except Exception as e: print(e)
#                 if i == tries:
#                     print('Playlist: {}'.format(playlist['name']))
#                     print('uri: {}'.format(track['track_uri']))
#                     print('Something went wrong, retrying {}'.format(i))
#                     continue
#                 else:
#                     raise
#             break

        cnt+=1
#         if cnt%100==0:
#             time.sleep(1)
    name = playlist['name']
    pid = playlist['pid']
    s1 = pd.Series([name, pid], index=['name', 'pid'])
    s2 = pd.DataFrame(audio_feats)[cols_to_keep].mean()

    dfs.append(pd.DataFrame(s1.append(s2)).T)

Start Time = 09:37:05


  2%|▏         | 164/10718 [20:29<25:29:34,  8.70s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


  2%|▏         | 190/10718 [24:27<21:55:38,  7.50s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


  7%|▋         | 743/10718 [1:23:28<14:02:16,  5.07s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 12%|█▏        | 1301/10718 [2:22:28<18:48:22,  7.19s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 15%|█▍        | 1563/10718 [2:52:58<13:58:05,  5.49s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)
HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 16%|█▌        | 1696/10718 [3:06:46<17:31:21,  6.99s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 17%|█▋        | 1799/10718 [3:18:49<17:35:07,  7.10s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 17%|█▋        | 1827/10718 [3:21:29<9:49:51,  3.98s/it] 

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 21%|██        | 2255/10718 [4:10:07<8:20:47,  3.55s/it] 

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 22%|██▏       | 2338/10718 [4:20:25<17:03:58,  7.33s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 22%|██▏       | 2371/10718 [4:23:06<8:22:14,  3.61s/it] 

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 23%|██▎       | 2442/10718 [4:30:58<27:37:45, 12.02s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 27%|██▋       | 2857/10718 [5:19:36<12:27:00,  5.70s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 29%|██▊       | 3056/10718 [5:41:38<22:57:10, 10.78s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 31%|███       | 3271/10718 [6:05:25<15:53:01,  7.68s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 31%|███▏      | 3374/10718 [6:18:26<15:33:41,  7.63s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 33%|███▎      | 3562/10718 [6:39:50<21:33:05, 10.84s/it]

Empty uri: spotify:track:6RQ97mq9F7QFRecMxmmdxS, in playlist: Current


 34%|███▍      | 3625/10718 [6:47:20<11:46:50,  5.98s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 34%|███▍      | 3629/10718 [6:47:47<12:24:00,  6.30s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 35%|███▌      | 3769/10718 [7:04:05<11:07:23,  5.76s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 36%|███▋      | 3903/10718 [7:16:56<14:32:42,  7.68s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 36%|███▋      | 3912/10718 [7:17:26<8:23:50,  4.44s/it] 

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 39%|███▉      | 4176/10718 [7:47:28<21:18:46, 11.73s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 39%|███▉      | 4224/10718 [7:52:25<9:12:20,  5.10s/it] 

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 41%|████▏     | 4445/10718 [8:16:36<15:59:40,  9.18s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 47%|████▋     | 4986/10718 [9:15:38<7:39:10,  4.81s/it] 

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 52%|█████▏    | 5528/10718 [10:14:30<9:14:50,  6.41s/it] 

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 56%|█████▋    | 6054/10718 [11:13:41<6:04:19,  4.69s/it] 

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 58%|█████▊    | 6266/10718 [11:34:58<5:30:01,  4.45s/it] 

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 62%|██████▏   | 6606/10718 [12:12:45<8:50:13,  7.74s/it] 

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 67%|██████▋   | 7137/10718 [13:11:45<11:03:36, 11.12s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 72%|███████▏  | 7680/10718 [14:10:41<7:19:30,  8.68s/it] 

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 77%|███████▋  | 8208/10718 [15:09:47<5:03:10,  7.25s/it] 

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 78%|███████▊  | 8318/10718 [15:22:00<2:47:02,  4.18s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 82%|████████▏ | 8736/10718 [16:08:47<3:02:45,  5.53s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 87%|████████▋ | 9272/10718 [17:07:49<4:00:03,  9.96s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 90%|█████████ | 9649/10718 [17:51:38<3:14:32, 10.92s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 91%|█████████ | 9718/10718 [17:59:17<2:20:38,  8.44s/it]

HTTPSConnectionPool(host='api.spotify.com', port=443): Read timed out. (read timeout=5)


 91%|█████████▏| 9781/10718 [18:06:38<1:21:45,  5.23s/it]

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


 96%|█████████▌| 10313/10718 [19:05:51<38:31,  5.71s/it]  

('Connection aborted.', ConnectionResetError(54, 'Connection reset by peer'))


100%|██████████| 10718/10718 [19:50:40<00:00,  6.67s/it]  

CPU times: user 57min 31s, sys: 8min 55s, total: 1h 6min 26s
Wall time: 19h 50min 40s





In [35]:
df1 = pd.concat(dfs, axis=0)
df_list.append(df1)
feats_df = pd.concat(df_list, axis=0)
feats_df.to_csv('data/Playlist_Feats.csv', index=False)
idx = len(feats_df)
feats_df

Unnamed: 0,name,pid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Throwbacks,0,0.664077,0.781077,5.03846,-4.89121,0.692308,0.103698,0.0836741,0.000674382,0.187087,0.64275,121.157,221777,4
1,Awesome Playlist,1,0.492382,0.695923,4.46154,-8.10797,0.538462,0.0910103,0.162227,0.223708,0.179344,0.476667,124.987,298838,3.76923
2,korean,2,0.671062,0.692953,5,-4.87559,0.515625,0.096425,0.2691,0.000637812,0.168894,0.565078,114.596,219374,4
3,mat,3,0.514349,0.620901,5.10317,-9.61875,0.714286,0.067004,0.273514,0.203148,0.188278,0.451258,125.523,229575,3.95238
4,90s,4,0.576235,0.650418,3.35294,-7.63453,0.823529,0.0412176,0.177189,0.0817588,0.166524,0.490294,127.725,255014,3.94118
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,gang gang,115995,0.804615,0.529026,4.20513,-8.16418,0.589744,0.219638,0.217947,0.00862146,0.166574,0.363818,129.452,230348,4
0,Chaos,115996,0.6889,0.6955,5.7,-8.23265,0.4,0.104215,0.243258,0.225892,0.15177,0.7178,135.314,283091,4.05
0,Spring 2014,115997,0.646571,0.603214,5.35714,-6.89093,0.714286,0.0787429,0.227477,0.0243475,0.146629,0.442571,125.139,268989,4
0,autumn,115998,0.549943,0.562457,5.11429,-8.66606,0.657143,0.06552,0.273361,0.0180656,0.16158,0.390406,114.145,237815,3.97143
