# Data Constructor

In [1]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import pandas as pd
import joblib
import os

In [None]:
# Set these environment variables to your own Spotify API credentials
# (client ID and client secret) before running the cells below.
# NOTE: the right-hand sides are blank in this file; each one has to be
# filled in with a string value, otherwise this cell is a syntax error.
os.environ['SPOTIPY_CLIENT_ID'] = 
os.environ['SPOTIPY_CLIENT_SECRET'] = 

In [3]:
# Shared Spotify API client used by every function below; it is built from
# the client ID and secret stored in the environment.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=os.environ['SPOTIPY_CLIENT_ID'],
        client_secret=os.environ['SPOTIPY_CLIENT_SECRET'],
    ),
)

### Function definitions

In [12]:
"""
Take the uri of a playlist (of the website http://everynoise.com/everynoise1d.cgi?scope=all) and returns 2 list:
- the first contains the uri of each track
- the second contains, for each track, the list of associated genres (the genres of the author of the track).
"""
def process_playlist(playlist_uri):
    tracks_uri = []
    tracks_genres = []
    total_tracks = spotify.playlist(playlist_uri)['tracks']['total']
    
    for offset in range(0, total_tracks, 100):
        for track in spotify.playlist_items(playlist_id=playlist_uri, offset=offset)['items']:
            track_uri = track['track']['uri']
            artist_uri = track['track']['artists'][0]['uri']
            track_genres = spotify.artist(artist_uri)['genres']
            
            tracks_uri.append(track_uri)
            tracks_genres.append(track_genres)
    return tracks_uri, tracks_genres

In [13]:
"""
This function takes a playlist uri as argument and return a data set with its tracks, their genres, their audio features and
their audio analysis features.
"""
def playlist_to_dataset(playlist_uri):
    tracks_uri, tracks_genres = process_playlist(playlist_uri)
    
    # audio analysis
    num_samples, duration, sample_md5, offset_seconds, window_seconds = [], [], [], [], []
    analysis_sample_rate, analysis_channels, end_of_fade_in, start_of_fade_out, loudness = [], [], [], [], []
    tempo, tempo_confidence, time_signature, time_signature_confidence, key = [], [], [], [], []
    key_confidence, mode, mode_confidence = [], [], []
    bars, beats, sections, segments, tatums = [], [], [], [], []
    # we don't take into account the next 2 lines because these features don't seem interesting
    #codestring, code_version, echoprintstring = [], [], []
    #echoprint_version, synchstring, synch_version, rhythmstring, rhythm_version = [], [], [], [], []
    
    # audio features
    danceability, energy = [], []
    key, loudness, mode = [], [], []
    speechiness, acousticness, instrumentalness, liveness, valence = [], [], [], [], []
    tempo, duration_ms, time_signature = [], [], []
    
    for i, uri in enumerate(tracks_uri):
        audio_analysis = spotify.audio_analysis(uri)
        audio_features = spotify.audio_features(uri)[0]
    
        #bars.append(audio_analysis['bars'])
        #beats.append(audio_analysis['beats'])
        sections.append(audio_analysis['sections'])
        segments.append(audio_analysis['segments'])
        #tatums.append(audio_analysis['tatums'])
    
        #num_samples.append(audio_analysis['track']['num_samples'])
        #duration.append(audio_analysis['track']['duration'])
        #sample_md5.append(audio_analysis['track']['sample_md5'])
        #offset_seconds.append(audio_analysis['track']['offset_seconds'])
        #window_seconds.append(audio_analysis['track']['window_seconds'])
        #analysis_sample_rate.append(audio_analysis['track']['analysis_sample_rate'])
        #analysis_channels.append(audio_analysis['track']['analysis_channels'])
        #end_of_fade_in.append(audio_analysis['track']['end_of_fade_in'])
        #start_of_fade_out.append(audio_analysis['track']['start_of_fade_out'])
        #loudness.append(audio_analysis['track']['loudness'])
        #tempo.append(audio_analysis['track']['tempo'])
        #tempo_confidence.append(audio_analysis['track']['tempo_confidence'])
        #time_signature.append(audio_analysis['track']['time_signature'])
        #time_signature_confidence.append(audio_analysis['track']['time_signature_confidence'])
        #key.append(audio_analysis['track']['key'])
        #key_confidence.append(audio_analysis['track']['key_confidence'])
        #mode.append(audio_analysis['track']['mode'])
        #mode_confidence.append(audio_analysis['track']['mode_confidence'])
        #codestring.append(audio_analysis['track'][])
        #code_version.append(audio_analysis['track'][])
        #echoprintstring.append(audio_analysis['track'][])
        #echoprint_version.append(audio_analysis['track'][])
        #synchstring.append(audio_analysis['track'][])
        #synch_version.append(audio_analysis['track'][])
        #rhythmstring.append(audio_analysis['track'][])
        #rhythm_version.append(audio_analysis['track'][])
    
        #the lines commented correspond to duplicates
        danceability.append(audio_features['danceability'])
        energy.append(audio_features['energy'])
        key.append(audio_features['key'])
        loudness.append(audio_features['loudness'])
        mode.append(audio_features['mode'])
        speechiness.append(audio_features['speechiness'])
        acousticness.append(audio_features['acousticness'])
        instrumentalness.append(audio_features['instrumentalness'])
        liveness.append(audio_features['liveness'])
        valence.append(audio_features['valence'])
        tempo.append(audio_features['tempo'])
        duration_ms.append(audio_features['duration_ms'])
        time_signature.append(audio_features['time_signature'])
        
        if i%50 == 0:
            print("Done:", i)
           
    data = pd.DataFrame({'track_uri': tracks_uri, 'track_genres': tracks_genres,
                         'danceability':danceability, 'energy':energy, 'speechiness':speechiness,
                         'acousticness':acousticness, 'instrumentalness':instrumentalness,
                         'liveness':liveness, 'valence':valence, 'tempo':tempo, 'mode':mode,
                         'duration_ms':duration_ms, 'time_signature':time_signature, 'key':key,
                         'sections':sections, 'segments':segments, 'loudness':loudness
                         #'num_samples':num_samples, 'duration':duration, 'sample_md5':sample_md5,
                         #'offset_seconds':offset_seconds, 'window_seconds':window_seconds,
                         #'analysis_sample_rate':analysis_sample_rate, 'analysis_channels':analysis_channels,
                         #'end_of_fade_in':end_of_fade_in, 'start_of_fade_out':start_of_fade_out,
                         #'tempo':tempo, 'tempo_confidence':tempo_confidence,
                         #'time_signature':time_signature, 'time_signature_confidence':time_signature_confidence,
                         #'key':key, 'key_confidence':key_confidence, 'mode':mode, 'mode_confidence':mode_confidence,
                         #'bars':bars, 'beats':beats, 'tatums':tatums
                        })
        
    return data

In [14]:
def merge_playlists_data(playlists_uri):
    """Build the data set of several playlists and merge them into one.

    Each playlist is converted with playlist_to_dataset, the results are
    stacked, and only the first occurrence of each track_uri is kept. The row
    positions from before the de-duplication are preserved in an 'index'
    column of the returned DataFrame.
    """
    frames = []
    for uri in playlists_uri:
        frames.append(playlist_to_dataset(uri))
        print("One playlist done")

    merged = pd.concat(frames, axis=0, ignore_index=True)
    unique_tracks = merged.drop_duplicates(subset='track_uri')
    return unique_tracks.reset_index()

### Build the dataset

In [15]:
"""
# uri of the first 10 playlists of the website (~10000 songs but ~7000 different songs)
# uncomment if you want to build the full dataset
playlists_uri = ['6gS3HhOiI17QNojjPuPzqc',
                 '2ZIRxkFuqNPMnlY7vL54uK',
                 #'6s5MoZzR70Qef7x4bVxDO1',
                 #'2HhaArHsOiofpUheCRPkLa',
                 #'10FCW9lj0NdeoYI5VVvVtY',
                 #'7dowgSWOmvdpwNkGFMUs6e',
                 #'5SrYLEPXnsfmK4ZuOCIKKm',
                 #'1IGB0Uz7x2VY28qMagUC24',
                 #'6MXkE0uYF4XwU4VTtyrpfP',
                 #'7MIkj5EbBCaUutUBEfGpEJ'
                ]
"""

# 10 playlists with quite various genres
playlists_uri = ['6gS3HhOiI17QNojjPuPzqc',
                 '6s5MoZzR70Qef7x4bVxDO1',
                 '7dowgSWOmvdpwNkGFMUs6e',
                 '3pDxuMpz94eDs7WFqudTbZ',
                 '3HYK6ri0GkvRcM6GkKh0hJ',
                 '1IGB0Uz7x2VY28qMagUC24',
                 '0zJrEnj3O8CohOpFFUVSo9'
                ]

In [16]:
# Fetch every playlist listed in playlists_uri and merge the results into a
# single DataFrame with duplicate tracks removed (progress is printed while
# this runs).
dataset = merge_playlists_data(playlists_uri)

Done: 0
Done: 50
Done: 100
Done: 150
Done: 200
Done: 250
Done: 300
Done: 350
Done: 400
Done: 450
Done: 500
Done: 550
Done: 600
Done: 650
Done: 700
Done: 750
Done: 800
Done: 850
Done: 900
Done: 950
Done: 1000
Done: 1050
One playlist done
Done: 0
Done: 50
Done: 100
Done: 150
Done: 200
Done: 250
Done: 300
Done: 350
Done: 400
Done: 450
Done: 500
Done: 550
Done: 600
Done: 650
Done: 700
Done: 750
Done: 800
Done: 850
Done: 900
One playlist done
Done: 0
Done: 50
Done: 100
Done: 150
Done: 200
Done: 250
Done: 300
Done: 350
Done: 400
Done: 450
Done: 500
Done: 550
Done: 600
Done: 650
Done: 700
Done: 750
Done: 800
Done: 850
Done: 900
Done: 950
Done: 1000
Done: 1050
Done: 1100
Done: 1150
One playlist done
Done: 0
Done: 50
Done: 100
Done: 150
Done: 200
Done: 250
Done: 300
Done: 350
Done: 400
Done: 450
Done: 500
Done: 550
Done: 600
Done: 650
Done: 700
Done: 750
Done: 800
Done: 850
Done: 900
Done: 950
One playlist done
Done: 0
Done: 50
Done: 100
Done: 150
Done: 200
Done: 250
Done: 300
Done: 350
Done: 4

In [17]:
# Save the dataset to 'spotify_dataset.csv', using ';' as the field separator.
dataset.to_csv('spotify_dataset.csv', sep=';')

In [18]:
# Bare expression: shows the DataFrame as the output of this notebook cell.
dataset

Unnamed: 0,index,track_uri,track_genres,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,tempo,mode,duration_ms,time_signature,key,sections,segments,loudness
0,0,spotify:track:0prNGof3XqfTvNDxHonvdK,"[canadian contemporary r&b, canadian pop, danc...",0.573,0.739,0.1290,0.028500,0.000000,0.1110,0.4510,97.085,1,230227,4,0,"[{'start': 0.0, 'duration': 22.66574, 'confide...","[{'start': 0.0, 'duration': 0.1, 'confidence':...",-5.740
1,1,spotify:track:3yOlyBJuViE2YSGn3nVE1K,"[dance pop, electropop, pop, pop dance, post-t...",0.724,0.491,0.0296,0.018000,0.000013,0.0887,0.3830,105.046,1,170746,4,8,"[{'start': 0.0, 'duration': 5.00813, 'confiden...","[{'start': 0.0, 'duration': 0.12776, 'confiden...",-6.024
2,2,spotify:track:4l0Mvzj72xxOpRrp6h8nHi,"[dance pop, pop, pop dance, post-teen pop]",0.488,0.343,0.0436,0.556000,0.000000,0.2100,0.0978,102.819,1,206459,4,4,"[{'start': 0.0, 'duration': 15.46056, 'confide...","[{'start': 0.0, 'duration': 0.24844, 'confiden...",-8.985
3,3,spotify:track:3e7sxremeOE3wTySiOhGiP,"[dance pop, electropop, pop, pop dance, post-t...",0.259,0.437,0.0386,0.102000,0.000001,0.1060,0.0951,180.042,0,239000,4,11,"[{'start': 0.0, 'duration': 10.22381, 'confide...","[{'start': 0.0, 'duration': 0.20308, 'confiden...",-6.589
4,4,spotify:track:4tCtwWceOPWzenK2HAIJSb,"[dance pop, electropop, girl group, pop, pop d...",0.803,0.585,0.0432,0.103000,0.000004,0.0644,0.5930,105.017,1,214480,4,8,"[{'start': 0.0, 'duration': 6.37032, 'confiden...","[{'start': 0.0, 'duration': 0.07633, 'confiden...",-5.861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6096,6343,spotify:track:4VYHaLVaCspuhjHEmfXtL3,"[alternative metal, nu metal, post-grunge]",0.554,0.979,0.0995,0.000916,0.057400,0.0292,0.2250,136.032,0,236293,4,9,"[{'start': 0.0, 'duration': 41.57958, 'confide...","[{'start': 0.0, 'duration': 0.98617, 'confiden...",-4.804
6097,6344,spotify:track:0QRxtcxL31dRAeiUUuENPu,"[alternative metal, brazilian groove metal, br...",0.335,0.946,0.1260,0.000278,0.030500,0.1360,0.3050,153.429,1,287480,4,2,"[{'start': 0.0, 'duration': 8.16704, 'confiden...","[{'start': 0.0, 'duration': 0.222, 'confidence...",-6.233
6098,6345,spotify:track:1KCAEG6JcsTzi8ddZzs0nk,"[alternative metal, melodic metalcore, metalco...",0.295,0.980,0.0854,0.000032,0.000000,0.3590,0.2830,134.335,0,254613,4,2,"[{'start': 0.0, 'duration': 12.28979, 'confide...","[{'start': 0.0, 'duration': 0.1449, 'confidenc...",-4.075
6099,6346,spotify:track:6uqFHC4QCqdMgEe34JuVCO,"[alternative metal, glam metal, hard rock, nu ...",0.529,0.770,0.0289,0.032600,0.000000,0.2000,0.3810,130.841,1,227120,4,2,"[{'start': 0.0, 'duration': 8.34365, 'confiden...","[{'start': 0.0, 'duration': 0.52195, 'confiden...",-3.596
