In [7]:
import os
import re
import sys
import json
import time
import spotipy
import pandas as pd

from tqdm import tqdm
from datetime import datetime
from spotipy.oauth2 import SpotifyClientCredentials, SpotifyOAuth

In [2]:
# Path to the folder that holds the Spotify Million Playlist Dataset slice files (mpd.slice.*.json).
# Change it to wherever you extracted the dataset; it has to be downloaded first from
# https://www.aicrowd.com/challenges/spotify-million-playlist-dataset-challenge
# The notebook also expects a data/ folder in the working directory: the generated .csv files are written there.

path = 'data/spotify_million_playlist_dataset/data'

def loop_slices(path, num_slices=20):
    """
    Load the playlists stored in the first ``num_slices`` slice files found in ``path``.

    Each slice is a .json file containing 1000 playlists, i.e. 1 slice is 1,000 playlists and
    20 slices are 20,000 playlists. Slice files are visited in lexicographic filename order, so
    ``mpd.slice.10000-10999.json`` is read before ``mpd.slice.2000-2999.json``.

    Parameters:
        path (str): Path to the folder holding the Spotify Million Playlist slice files.
        num_slices (int): Number of slices to return, max 1000. Values <= 0 return an empty list.

    Output:
        mpd_playlists (list): a list of dictionaries of all the playlists.
    """
    mpd_playlists = []
    slices_loaded = 0

    for filename in sorted(os.listdir(path)):
        # Only real slice files count towards num_slices. Counting every directory entry
        # (README, .DS_Store, ...) would either return too few slices or skip the stop
        # condition entirely and load the whole dataset.
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue
        if slices_loaded >= num_slices:
            break

        print(filename)
        # The context manager closes the file even if the JSON turns out to be malformed.
        with open(os.path.join(path, filename), encoding="utf-8") as f:
            current_slice = json.load(f)

        # Flatten: keep one list with the playlists of every slice read so far.
        mpd_playlists.extend(current_slice['playlists'])
        slices_loaded += 1

    return mpd_playlists

In [3]:
def create_csv(playlists, extended=False):
    """
    Write the input playlists to .csv files inside the data/ folder (created if missing).

    data/MPD.csv has one row per playlist, with all the tracks of the playlist in a single cell.
    When ``extended`` is enabled an additional file is written where the track list is spread
    out so that every track gets its own cell (columns 0, 1, 2, ... after the playlist columns).

    Parameters:
        playlists (list): a list of dictionaries such as that from the loop_slices() function.
        extended (boolean): boolean to enable the extended .csv file generation

    Output:
        data/MPD.csv: .csv file with the playlists
        data/MPD_Extended.csv: .csv file with extended song columns (only when extended=True)
    """
    # DataFrame.to_csv does not create missing folders; without this the function fails
    # only after the (slow) slice loading has already been done.
    os.makedirs('data', exist_ok=True)

    pd.DataFrame(playlists).to_csv('data/MPD.csv', index=False)

    if not extended:
        return

    cols_to_keep = ['name', 'collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums',
                    'num_followers', 'num_edits', 'duration_ms', 'num_artists']

    # Assumes each playlist dict holds scalar metadata plus a 'tracks' list: pandas then builds
    # one row per track, repeats the metadata, and uses the track position as the row label.
    per_track = pd.concat([pd.DataFrame(playlist) for playlist in playlists], axis=0)

    df_extended = (
        per_track
        .reset_index()                                                  # track position -> 'index' column
        .pivot(values='tracks', index=cols_to_keep, columns='index')    # one column per track position
        .reset_index()                                                  # playlist fields back to columns
        .rename_axis(None, axis=1)                                      # drop the leftover 'index' axis name
        .sort_values('pid')
    )
    df_extended.to_csv('data/MPD_Extended.csv', index=False)

In [4]:
%%time
#####################################################################################################################
# STOP, STOP, STOP, STOP, STOP
# num_slices=1000 (all playlists) takes considerable time (>30 min) and needs 30+ GB of free disk space.
# Working with the default num_slices=20 is recommended; each output file is then about 400 MB.
#####################################################################################################################

playlists = loop_slices(path, num_slices=20)
create_csv(playlists, extended=True)

mpd.slice.0-999.json
mpd.slice.1000-1999.json
mpd.slice.10000-10999.json
mpd.slice.100000-100999.json
mpd.slice.101000-101999.json
mpd.slice.102000-102999.json
mpd.slice.103000-103999.json
mpd.slice.104000-104999.json
mpd.slice.105000-105999.json
mpd.slice.106000-106999.json
mpd.slice.107000-107999.json
mpd.slice.108000-108999.json
mpd.slice.109000-109999.json
mpd.slice.11000-11999.json
mpd.slice.110000-110999.json
mpd.slice.111000-111999.json
mpd.slice.112000-112999.json
mpd.slice.113000-113999.json
mpd.slice.114000-114999.json
mpd.slice.115000-115999.json
CPU times: user 38.2 s, sys: 2.34 s, total: 40.5 s
Wall time: 42.4 s


In [None]:
# Read MPD: the file written by create_csv(), one row per playlist with all its tracks in a single cell
df = pd.read_csv('data/MPD.csv')
df

In [None]:
# Read MPD_Extended: the file written by create_csv(extended=True), one column per track position
# NOTE: this rebinds `df`, replacing the frame loaded from data/MPD.csv
df = pd.read_csv('data/MPD_Extended.csv')
df

# Get song features from the extracted playlists

In [5]:
# Spotify credentials
# Secrets must never be stored in the notebook. Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET
# in the environment that launches Jupyter (or load them from an untracked .env file) before running
# this cell; SpotifyClientCredentials() reads both variables from the environment.
# NOTE(review): a client id/secret pair used to be committed in this cell - it should be rotated in
# the Spotify developer dashboard.
missing_vars = [var for var in ("SPOTIPY_CLIENT_ID", "SPOTIPY_CLIENT_SECRET") if not os.environ.get(var)]
if missing_vars:
    raise RuntimeError(
        "Missing Spotify credentials, set these environment variables first: " + ", ".join(missing_vars)
    )

os.environ['SPOTIPY_REDIRECT_URI'] = "http://localhost:8080"   # Needed for user authorization
sp = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials())

In [14]:
# Code to retrieve and add data to an existing df
# df_list = []
# feats_df = pd.read_csv('data/Playlist_Feats.csv')
# idx = len(feats_df)
# df_list.append(feats_df)
# feats_df

In [None]:
%%time

##################################################################################
# STOP - This computation will take ~ 40hrs to finish for every 20,000 playlists #
##################################################################################

################################################################################################################
# Code to retrieve song features from each playlist passed, average of song features in each playlist is       #
# computed in such a way that the final computation is each playlist is a row and it has the average of        #
# all songs in the playlist.                                                                                   #
# This code can be optimized in different ways:                                                                #
#      - Remove outliers from each playlist                                                                    #
#      - Keep only those playlist where the song variation is within 3 standard deviation for all features     #
#      - Add for loop to keep trying in case of an exception occurs while extracting features.                 #
################################################################################################################

now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

cols_to_keep = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 
                'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms', 'time_signature']
dfs = []

for playlist in tqdm(playlists):
    audio_feats = []
    for track in playlist['tracks']:
        uri = track['track_uri'].split("k:")[1]
        
        try:
            curr_song_audio_feat = sp.audio_features(uri)[0]
            if curr_song_audio_feat == None:
                print('Empty uri: {}, in playlist: {}'.format(track['track_uri'], playlist['name']))
            else:
                audio_feats.append(curr_song_audio_feat)
        except Exception as e: print(e)

    name = playlist['name']
    pid = playlist['pid']
    s1 = pd.Series([name, pid], index=['name', 'pid'])
    s2 = pd.DataFrame(audio_feats)[cols_to_keep].mean()

    dfs.append(pd.DataFrame(s1.append(s2)).T)

In [15]:
# Every frame in `dfs` is a single row labelled 0, so a plain concat yields a frame whose index is
# all zeros; ignore_index=True renumbers the rows 0..n-1. The saved .csv is unaffected (index=False).
df1 = pd.concat(dfs, axis=0, ignore_index=True)
df1.to_csv('data/Playlist_Feats.csv', index=False)
df1

# Code to add data on top of an existing df
# df_list.append(df1)
# feats_df = pd.concat(df_list, axis=0)
# feats_df.to_csv('data/Playlist_Feats_Copy.csv', index=False)
# idx = len(feats_df)
# feats_df

Unnamed: 0,name,pid,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Throwbacks,0,0.664077,0.781077,5.03846,-4.89121,0.692308,0.103698,0.0836741,0.000674382,0.187087,0.64275,121.158,221777,4.0
0,Awesome Playlist,1,0.492382,0.695923,4.46154,-8.10797,0.538462,0.0910103,0.162227,0.223708,0.179344,0.476667,124.987,298838,3.76923
0,korean,2,0.671062,0.692953,5.0,-4.87559,0.515625,0.096425,0.2691,0.000637812,0.168894,0.565078,114.596,219374,4.0
0,mat,3,0.514349,0.620901,5.10317,-9.61875,0.714286,0.067004,0.273514,0.203148,0.188278,0.451258,125.523,229575,3.95238
