# Data Constructor

In [85]:
import os

import joblib
import pandas as pd
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [86]:
# The Spotify application credentials are read from the environment instead of
# being committed with the notebook: anyone holding the client secret can call
# the API on behalf of this application.
# NOTE(review): the secret that used to be hard-coded here should be considered
# leaked and be rotated.
SPOTIPY_CLIENT_ID = os.environ.get('SPOTIPY_CLIENT_ID')
SPOTIPY_CLIENT_SECRET = os.environ.get('SPOTIPY_CLIENT_SECRET')
if not SPOTIPY_CLIENT_ID or not SPOTIPY_CLIENT_SECRET:
    raise RuntimeError(
        "Set the SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET environment "
        "variables before running this notebook."
    )

# Shared API client used by every function below.
spotify = spotipy.Spotify(client_credentials_manager=SpotifyClientCredentials(client_id=SPOTIPY_CLIENT_ID,
                                                                              client_secret=SPOTIPY_CLIENT_SECRET))

### Function definitions

In [87]:
def process_playlist(playlist_uri):
    """
    Collect the tracks of a playlist together with the genres of their artist.

    Takes the uri of a playlist (for instance one listed on
    http://everynoise.com/everynoise1d.cgi?scope=all) and returns 2 lists of
    the same length:
    - the first contains the uri of each track
    - the second contains, for each track, the list of genres of the first
      artist credited on the track.

    Playlist entries that carry no track object are skipped, so both lists
    may be shorter than the total reported for the playlist.
    """
    page_size = 100
    tracks_uri = []
    tracks_genres = []
    # Genres are a property of the artist, so each artist is fetched only once.
    genres_by_artist = {}
    total_tracks = spotify.playlist(playlist_uri)['tracks']['total']

    for offset in range(0, total_tracks, page_size):
        # The limit is passed explicitly so that it always matches the step
        # used to advance the offset.
        page = spotify.playlist_items(playlist_id=playlist_uri, offset=offset, limit=page_size)
        for item in page['items']:
            track = item['track']
            if track is None:
                # No track object for this entry: there is nothing to record.
                continue

            artist_uri = track['artists'][0]['uri']
            if artist_uri not in genres_by_artist:
                genres_by_artist[artist_uri] = spotify.artist(artist_uri)['genres']

            tracks_uri.append(track['uri'])
            # Copy so that tracks by the same artist do not share one list object.
            tracks_genres.append(list(genres_by_artist[artist_uri]))
    return tracks_uri, tracks_genres

In [90]:
def playlist_to_dataset(playlist_uri):
    """
    Build a data set describing every track of a playlist.

    Takes a playlist uri and returns a pandas DataFrame with one row per
    track and the following columns, in this order:
    - 'track_uri' and 'track_genres', as returned by process_playlist
    - the audio features 'danceability', 'energy', 'speechiness',
      'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
      'mode', 'duration_ms', 'time_signature' and 'key'
    - the audio analysis parts 'sections' and 'segments'
    - the audio feature 'loudness'.

    When no audio features are returned for a track, its feature columns
    hold None instead of aborting the whole playlist.
    Progress is printed every 50 tracks.
    """
    tracks_uri, tracks_genres = process_playlist(playlist_uri)

    # Audio-feature fields copied as they are into the data set.
    feature_names = ('danceability', 'energy', 'speechiness', 'acousticness',
                     'instrumentalness', 'liveness', 'valence', 'tempo', 'mode',
                     'duration_ms', 'time_signature', 'key', 'loudness')
    # Only these parts of the audio analysis are kept: bars, beats, tatums and
    # the track-level summary are not collected (the summary duplicates the
    # audio features above).
    analysis_names = ('sections', 'segments')

    # Keys are inserted in the column order of the resulting DataFrame.
    columns = {'track_uri': tracks_uri, 'track_genres': tracks_genres}
    for name in feature_names[:-1] + analysis_names + feature_names[-1:]:
        columns[name] = []

    for i, uri in enumerate(tracks_uri):
        audio_analysis = spotify.audio_analysis(uri)
        # The returned list holds None for a track without audio features.
        audio_features = spotify.audio_features(uri)[0] or {}

        for name in analysis_names:
            columns[name].append(audio_analysis[name])
        for name in feature_names:
            columns[name].append(audio_features.get(name))

        if i % 50 == 0:
            print("Done:", i)

    return pd.DataFrame(columns)

In [91]:
def merge_playlists_data(playlists_uri):
    """
    Build one data set out of several playlists.

    Each playlist uri is converted with playlist_to_dataset, the resulting
    frames are stacked, and only the first occurrence of every 'track_uri'
    is kept. A line is printed after each playlist has been processed.

    Note: the final reset_index() keeps the previous row labels in a new
    'index' column of the returned DataFrame.
    """
    frames = []
    for uri in playlists_uri:
        frame = playlist_to_dataset(uri)
        frames.append(frame)
        print("One playlist done")

    stacked = pd.concat(frames, axis=0, ignore_index=True)
    unique_tracks = stacked.drop_duplicates(subset='track_uri')
    return unique_tracks.reset_index()

### Build the dataset

In [92]:
# Disabled alternative selection: it is kept inside a bare string, so it is never executed.
"""
# uri of the first 10 playlists of the website (~10000 songs but ~7000 different songs)
# uncomment if you want to build the full dataset
playlists_uri = ['6gS3HhOiI17QNojjPuPzqc',
                 '2ZIRxkFuqNPMnlY7vL54uK',
                 #'6s5MoZzR70Qef7x4bVxDO1',
                 #'2HhaArHsOiofpUheCRPkLa',
                 #'10FCW9lj0NdeoYI5VVvVtY',
                 #'7dowgSWOmvdpwNkGFMUs6e',
                 #'5SrYLEPXnsfmK4ZuOCIKKm',
                 #'1IGB0Uz7x2VY28qMagUC24',
                 #'6MXkE0uYF4XwU4VTtyrpfP',
                 #'7MIkj5EbBCaUutUBEfGpEJ'
                ]
"""

# 7 playlists covering quite varied genres (this is the selection actually used)
playlists_uri = ['6gS3HhOiI17QNojjPuPzqc',
                 '6s5MoZzR70Qef7x4bVxDO1',
                 '7dowgSWOmvdpwNkGFMUs6e',
                 '3pDxuMpz94eDs7WFqudTbZ',
                 '3HYK6ri0GkvRcM6GkKh0hJ',
                 '1IGB0Uz7x2VY28qMagUC24',
                 '0zJrEnj3O8CohOpFFUVSo9']

In [93]:
# Download and merge every selected playlist; this issues several Spotify API calls per track.
dataset = merge_playlists_data(playlists_uri)

One playlist done
One playlist done


In [33]:
# Save the dataset as a semicolon-separated file in the working directory.
# The DataFrame index is written too, as a first column without a header.
dataset.to_csv('spotify_dataset.csv', sep=';')

In [34]:
# Display the dataset as the cell output.
# NOTE(review): the table pasted below lists columns such as num_samples, bars,
# beats and tatums, which playlist_to_dataset no longer produces - it presumably
# comes from an earlier run; confirm before relying on it.
dataset

Unnamed: 0,track_uri,track_genres,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,num_samples,...,time_signature_confidence,key,key_confidence,mode,mode_confidence,bars,beats,sections,segments,tatums
0,spotify:track:0prNGof3XqfTvNDxHonvdK,"[canadian contemporary r&b, canadian pop, danc...",0.573,0.739,0.1290,0.02850,0.000000,0.1110,0.4510,5076498,...,1.000,0,0.322,1,0.411,"[{'start': 1.94072, 'duration': 2.46921, 'conf...","[{'start': 0.04098, 'duration': 0.65377, 'conf...","[{'start': 0.0, 'duration': 22.66574, 'confide...","[{'start': 0.0, 'duration': 0.1, 'confidence':...","[{'start': 0.04098, 'duration': 0.32689, 'conf..."
1,spotify:track:3yOlyBJuViE2YSGn3nVE1K,"[dance pop, electropop, pop, pop dance, post-t...",0.724,0.491,0.0296,0.01800,0.000013,0.0887,0.3830,3764960,...,1.000,8,0.543,1,0.516,"[{'start': 0.43715, 'duration': 2.25695, 'conf...","[{'start': 0.43715, 'duration': 0.55564, 'conf...","[{'start': 0.0, 'duration': 5.00813, 'confiden...","[{'start': 0.0, 'duration': 0.12776, 'confiden...","[{'start': 0.43715, 'duration': 0.27782, 'conf..."
2,spotify:track:4l0Mvzj72xxOpRrp6h8nHi,"[dance pop, pop, pop dance, post-teen pop]",0.488,0.343,0.0436,0.55600,0.000000,0.2100,0.0978,4552413,...,0.875,4,0.599,1,0.529,"[{'start': 2.06318, 'duration': 2.31226, 'conf...","[{'start': 0.28703, 'duration': 0.6184, 'confi...","[{'start': 0.0, 'duration': 15.46056, 'confide...","[{'start': 0.0, 'duration': 0.24844, 'confiden...","[{'start': 0.28703, 'duration': 0.3092, 'confi..."
3,spotify:track:3e7sxremeOE3wTySiOhGiP,"[dance pop, electropop, pop, pop dance, post-t...",0.259,0.437,0.0386,0.10200,0.000001,0.1060,0.0951,5269950,...,0.951,11,0.296,0,0.612,"[{'start': 1.10337, 'duration': 1.33454, 'conf...","[{'start': 0.42937, 'duration': 0.33374, 'conf...","[{'start': 0.0, 'duration': 10.22381, 'confide...","[{'start': 0.0, 'duration': 0.20308, 'confiden...","[{'start': 0.42937, 'duration': 0.16687, 'conf..."
4,spotify:track:4tCtwWceOPWzenK2HAIJSb,"[dance pop, electropop, girl group, pop, pop d...",0.803,0.585,0.0432,0.10300,0.000004,0.0644,0.5930,4729284,...,1.000,8,0.522,1,0.641,"[{'start': 1.22283, 'duration': 2.28352, 'conf...","[{'start': 0.0776, 'duration': 0.5738, 'confid...","[{'start': 0.0, 'duration': 6.37032, 'confiden...","[{'start': 0.0, 'duration': 0.07633, 'confiden...","[{'start': 0.0776, 'duration': 0.2869, 'confid..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1240,spotify:track:6iudA5joUv7hmPQYFIVEB5,"[dance pop, electropop, indie poptimism, metro...",0.566,0.619,0.0266,0.34500,0.000002,0.1210,0.1750,5280534,...,0.929,6,0.490,1,0.635,"[{'start': 0.53223, 'duration': 3.083, 'confid...","[{'start': 0.53223, 'duration': 0.76999, 'conf...","[{'start': 0.0, 'duration': 8.22928, 'confiden...","[{'start': 0.0, 'duration': 0.11624, 'confiden...","[{'start': 0.53223, 'duration': 0.38499, 'conf..."
1241,spotify:track:1hBM2D1ULT3aeKuddSwPsK,"[dance pop, electropowerpop, pop, pop punk, po...",0.607,0.805,0.0608,0.00175,0.000000,0.2310,0.2320,4468800,...,1.000,11,0.273,0,0.384,"[{'start': 0.39307, 'duration': 1.71709, 'conf...","[{'start': 0.39307, 'duration': 0.42568, 'conf...","[{'start': 0.0, 'duration': 8.12096, 'confiden...","[{'start': 0.0, 'duration': 0.37782, 'confiden...","[{'start': 0.39307, 'duration': 0.21284, 'conf..."
1242,spotify:track:0p0ljM6RxgpGt7wthGqBZa,"[big room, dance pop, edm, electro house, elec...",0.656,0.729,0.0765,0.07890,0.000000,0.1330,0.4380,4757976,...,1.000,5,0.163,0,0.436,"[{'start': 1.09829, 'duration': 2.54628, 'conf...","[{'start': 0.43653, 'duration': 0.66176, 'conf...","[{'start': 0.0, 'duration': 12.39889, 'confide...","[{'start': 0.0, 'duration': 0.3249, 'confidenc...","[{'start': 0.43653, 'duration': 0.33088, 'conf..."
1243,spotify:track:3JMAdPq5TUOKBGsTATjLEH,"[dance pop, electropop, pop, pop rap, pop rock...",0.554,0.767,0.0497,0.11400,0.000000,0.2310,0.4970,4977714,...,0.989,0,0.601,1,0.561,"[{'start': 1.08976, 'duration': 2.10187, 'conf...","[{'start': 0.53183, 'duration': 0.55793, 'conf...","[{'start': 0.0, 'duration': 13.07205, 'confide...","[{'start': 0.0, 'duration': 0.39184, 'confiden...","[{'start': 0.53183, 'duration': 0.27897, 'conf..."
