In [None]:
import pandas as pd
import sys
import json
import re
import collections
import os
import datetime

In [None]:
# Change `path` to the folder where your Spotify Million Playlist Dataset slice (.json) files are located.
# Expected file structure: a `data/` folder in the working directory; the generated .csv files are written there.

path = 'data/spotify_million_playlist_dataset/data'

def loop_slices(path, num_slices=20):
    """
    Load the playlists from the first `num_slices` slice files found in `path`.

    Each slice is a .json file named ``mpd.slice.*.json`` containing 1000 playlists, i.e.:
    1 slice is 1,000 playlists, 20 slices is 20,000 playlists.

    Only files matching that naming pattern count towards `num_slices`; any other file in
    the folder (README, .DS_Store, ...) is ignored. Slices are visited in lexicographic
    filename order, so e.g. ``mpd.slice.1000-1999.json`` is read before
    ``mpd.slice.2000-2999.json`` but also before ``mpd.slice.999000-999999.json``.

    Parameters:
        path (str): Path to the folder holding the Spotify Million Playlist slice files.
        num_slices (int): Number of slices to load, max 1000. Values <= 0 load nothing.

    Output:
        mpd_playlists (list): a list of dictionaries of all the playlists.

    Raises:
        OSError: if `path` cannot be listed or a slice file cannot be opened.
        json.JSONDecodeError: if a slice file is not valid JSON.
        KeyError: if a slice file has no top-level 'playlists' key.
    """
    mpd_playlists = []
    slices_loaded = 0

    for filename in sorted(os.listdir(path)):
        # Skip anything that is not a slice file so it cannot consume the slice budget.
        if not (filename.startswith("mpd.slice.") and filename.endswith(".json")):
            continue
        if slices_loaded >= num_slices:
            break

        print(filename)
        # JSON is UTF-8 by specification; be explicit so playlist names with non-ASCII
        # characters decode the same way regardless of the platform's default encoding.
        with open(os.path.join(path, filename), encoding="utf-8") as f:
            current_slice = json.load(f)

        mpd_playlists.extend(current_slice['playlists'])
        slices_loaded += 1

    return mpd_playlists

In [None]:
def create_csv(playlists, extended=False, output_dir='data'):
    """
    Write the input playlists to .csv files inside `output_dir`.

    MPD.csv always gets written: one row per playlist, with all the tracks of the playlist
    held in a single cell.
    When `extended` is True an additional MPD_Extended.csv is written in which the track list
    is spread out to a single cell per song: one column per track position (0, 1, 2, ...),
    next to the playlist-level columns. Rows are sorted by 'pid'. Playlists shorter than the
    longest one get empty cells in the trailing track columns.

    Parameters:
        playlists (list): a list of dictionaries such as that from the loop_slices() function.
        extended (boolean): boolean to enable the extended .csv file generation.
        output_dir (str): folder the .csv files are written to; created if it does not exist.

    Output:
        MPD.csv: .csv file with the playlists
        MPD_Extended.csv: .csv file with extended song columns (only when extended=True)
    """
    # to_csv does not create missing folders, so make sure the target exists first.
    os.makedirs(output_dir, exist_ok=True)

    pd.DataFrame(playlists).to_csv(os.path.join(output_dir, 'MPD.csv'), index=False)

    if not extended:
        return

    cols_to_keep = ['name', 'collaborative', 'pid', 'modified_at', 'num_tracks', 'num_albums',
                    'num_followers', 'num_edits', 'duration_ms', 'num_artists']
    extended_path = os.path.join(output_dir, 'MPD_Extended.csv')

    if len(playlists) == 0:
        # pd.concat raises on an empty list; emit a header-only file instead.
        pd.DataFrame(columns=cols_to_keep).to_csv(extended_path, index=False)
        return

    # Building a frame from one playlist dict broadcasts the scalar fields over the
    # 'tracks' list, giving one row per track whose index is the track position.
    per_track = pd.concat([pd.DataFrame(playlist) for playlist in playlists], axis=0)

    df_extended = (
        per_track
        .reset_index()                                           # track position -> 'index' column
        .pivot(values='tracks', index=cols_to_keep, columns='index')
        .reset_index()
        .rename_axis(None, axis=1)                               # drop the leftover 'index' axis label
        .sort_values('pid')
    )
    df_extended.to_csv(extended_path, index=False)

In [None]:
%%time
#####################################################################################################################
# STOP, STOP, STOP, STOP, STOP                                                                                      #
# num_slices=1000 (all playlists) will take considerable time (>30 min) and will use 30 GB+ of disk storage.        #
# It is recommended to work with the default num_slices=20, which outputs about 400 MB for each file.               #
#####################################################################################################################

playlists = loop_slices(path, num_slices=20)
create_csv(playlists, extended=True)

In [None]:
# Read the playlist-level MPD.csv (one row per playlist) back in and display it.
df = pd.read_csv('data/MPD.csv')
df

In [None]:
# Read the extended MPD_Extended.csv (one column per track position) back in and display it.
# Note: this rebinds `df`, replacing the frame loaded from MPD.csv in the previous cell.
df = pd.read_csv('data/MPD_Extended.csv')
df