In [32]:
import pandas as pd
import json
import requests
import os
from dotenv import load_dotenv

load_dotenv()
API_TOKEN = str(os.getenv('API_TOKEN'))

In [33]:
def read_json_file(path: str=None, url: str=None) -> dict:
    # with open(path, 'r') as f:
    #     return json.load(f)
    response = requests.get(url)
    data = response.json()
    return data

In [34]:
path = 'https://raw.githubusercontent.com/Ciencia-de-Dados-IMT-2023/Datasets/main/spotify_million_playlist_dataset/mpd.slice.0-999.json'
data = read_json_file(url=path)

In [35]:
df_musicas = pd.DataFrame()

# Itera sobre as playlists
for playlist in data['playlists']:
    name_playlist = playlist['name']
    id_playlist = playlist['pid']

    print(f'{id_playlist} - Playlist: {name_playlist}')

    # Itera sobre as músicas da playlist
    musicas_playlist = playlist['tracks']

    for musica in musicas_playlist:
        name_musica = musica['track_name']
        id_musica = musica['track_uri']

        # print(f'\tMúsica: {name_musica} - ID: {id_musica}')
        df_musica_aux = pd.DataFrame.from_dict(musica, orient='index')
        df_musica_aux = df_musica_aux.T

        # Adiciona o nome da playlist e o id da playlist


        df_musica_aux['playlist_name'] = name_playlist
        df_musica_aux['playlist_id'] = id_playlist

        df_musica_aux = df_musica_aux.T
        df_musicas = pd.concat([df_musicas, df_musica_aux], axis=1)
        
        # break
    break

df_musicas = df_musicas.T
# df_musicas.to_csv('data/df_musicas_parte_1.csv', index=False)

0 - Playlist: Throwbacks


In [36]:
# scripts/api_spotify.py

def make_req(route: str, id: str) -> requests.models.Response:
    url = f'https://api.spotify.com/v1/{route}/{id}'

    headers = {
        'Authorization': f'Bearer {API_TOKEN}'
    }

    response = requests.get(url, headers=headers)

    if response.status_code != 200:
        print(f'Erro: {response.status_code}')
        print(f'Erro: {response.json()}')
        import sys
        sys.exit(1)

    else:
        return response


def get_track(id_track: str) -> dict:

    return make_req('tracks', id_track).json()


def get_album(id_album: str) -> dict:

    return make_req('albums', id_album).json()


def get_artist_data(id_artist: str) -> dict:

    return make_req('artists', id_artist).json()


def get_audio_features(id_track: str) -> dict:

    return make_req('audio-features', id_track).json()



In [37]:

def get_music_features(id_music: str) -> dict:
    
    # 1. Get track data
    track_data = get_track(id_music)

    # 2. Get audio features
    audio_features = get_audio_features(id_music)

    # 3. Merge data
    track_data.update(audio_features)

    # 4. Define relevant features
    relevant_features = [
        'id', 'name', 'explicit', 'duration_ms', 
        'popularity', 'danceability', 'energy',
        'key', 'loudness', 'mode', 'speechiness', 'acousticness',
        'instrumentalness', 'liveness', 'valence', 'tempo',
        'time_signature'
    ]

    track_data = {k: track_data[k] for k in relevant_features}

    return track_data


def get_album_features(id_album: str) -> dict:

    # 1. Get album data
    album_data = get_album(id_album)

    # 2. Define relevant features
    relevant_features = [
        'id', 'genres', 'popularity', 'name', 'release_date', 'total_tracks', 'type'
    ]

    album_data = {k: album_data[k] for k in relevant_features}

    # 3. Add "album_" prefix to each key
    album_data = {f'album_{k}': v for k, v in album_data.items()}

    return album_data


def get_artist_features(id_artist: str) -> dict:
    # pass # get_artist_data

    # 1. Get artist data
    artist_data = get_artist_data(id_artist)

    # 2. Define relevant features
    relevant_features = [
        'id', 'genres', 'popularity', 'name', 'type', 'followers'
    ]

    artist_data = {k: artist_data[k] for k in relevant_features}

    # 3. At artist_followers key, keep only total
    artist_data['followers'] = artist_data['followers']['total']

    # 4. Add "artist_" prefix to each key
    artist_data = {f'artist_{k}': v for k, v in artist_data.items()}

    return artist_data

In [38]:
get_music_features('7sWRlDoTDX8geTR8zzr2vt')

{'id': '7sWRlDoTDX8geTR8zzr2vt',
 'name': "Hollywood's Bleeding",
 'explicit': False,
 'duration_ms': 156267,
 'popularity': 79,
 'danceability': 0.405,
 'energy': 0.646,
 'key': 4,
 'loudness': -3.206,
 'mode': 0,
 'speechiness': 0.0476,
 'acousticness': 0.331,
 'instrumentalness': 0,
 'liveness': 0.103,
 'valence': 0.17,
 'tempo': 130.218,
 'time_signature': 4}

In [39]:
get_album_features('4aawyAB9vmqN3uQ7FjRGTy')

{'album_id': '4aawyAB9vmqN3uQ7FjRGTy',
 'album_genres': [],
 'album_popularity': 58,
 'album_name': 'Global Warming',
 'album_release_date': '2012-11-16',
 'album_total_tracks': 18,
 'album_type': 'album'}

In [40]:
get_artist_features('2wIVse2owClT7go1WT98tk')

{'artist_id': '2wIVse2owClT7go1WT98tk',
 'artist_genres': ['dance pop',
  'hip hop',
  'hip pop',
  'neo soul',
  'pop rap',
  'r&b',
  'rap',
  'urban contemporary',
  'virginia hip hop'],
 'artist_popularity': 72,
 'artist_name': 'Missy Elliott',
 'artist_type': 'artist',
 'artist_followers': 2218538}

In [41]:
df_musicas = df_musicas[['track_uri', 'artist_uri', 'album_uri']]

df_musicas['track_uri'] = df_musicas['track_uri'].str.replace('spotify:track:', '')
df_musicas['artist_uri'] = df_musicas['artist_uri'].str.replace('spotify:artist:', '')
df_musicas['album_uri'] = df_musicas['album_uri'].str.replace('spotify:album:', '')

df_musicas.head(5)

Unnamed: 0,track_uri,artist_uri,album_uri
0,0UaMYEvWZi0ZqiDOoHU3YI,2wIVse2owClT7go1WT98tk,6vV5UrXcfyQD1wu4Qo2I9K
0,6I9VzXrHxO9rA9A5euc8Ak,26dSoYclwsYLMAKD3tpOr4,0z7pVBGOD7HCIB7S8eLkLI
0,0WqIKmW4BTrj3eJFmnCKMv,6vWDO969PvNqNYHIOW5v0m,25hVFAxTlDvXbx2X2QkUkE
0,1AWQoqb9bSvzTjaLralEkT,31TPClRtHm23RisEBtV3X7,6QPkyl04rXwTGlGlcYaRoW
0,1lzr43nnXAijIGYnCT8M8H,5EvFsr3kj42KNv97ZEnqij,6NmFmPX56pcLBOFMhIiKvF


In [42]:
get_music_features('0UaMYEvWZi0ZqiDOoHU3YI')

{'id': '0UaMYEvWZi0ZqiDOoHU3YI',
 'name': 'Lose Control (feat. Ciara & Fat Man Scoop)',
 'explicit': True,
 'duration_ms': 226864,
 'popularity': 69,
 'danceability': 0.904,
 'energy': 0.813,
 'key': 4,
 'loudness': -7.105,
 'mode': 0,
 'speechiness': 0.121,
 'acousticness': 0.0311,
 'instrumentalness': 0.00697,
 'liveness': 0.0471,
 'valence': 0.81,
 'tempo': 125.461,
 'time_signature': 4}

In [43]:
df_musicas

Unnamed: 0,track_uri,artist_uri,album_uri
0,0UaMYEvWZi0ZqiDOoHU3YI,2wIVse2owClT7go1WT98tk,6vV5UrXcfyQD1wu4Qo2I9K
0,6I9VzXrHxO9rA9A5euc8Ak,26dSoYclwsYLMAKD3tpOr4,0z7pVBGOD7HCIB7S8eLkLI
0,0WqIKmW4BTrj3eJFmnCKMv,6vWDO969PvNqNYHIOW5v0m,25hVFAxTlDvXbx2X2QkUkE
0,1AWQoqb9bSvzTjaLralEkT,31TPClRtHm23RisEBtV3X7,6QPkyl04rXwTGlGlcYaRoW
0,1lzr43nnXAijIGYnCT8M8H,5EvFsr3kj42KNv97ZEnqij,6NmFmPX56pcLBOFMhIiKvF
0,0XUfyU2QviPAs6bxSpXYG4,23zg3TcAtWQy7J6upgbUnj,0vO0b1AvY49CPQyVisJLj0
0,68vgtRHr7iZHpzGpon6Jlo,23zg3TcAtWQy7J6upgbUnj,1RM6MGv6bcl6NrAG8PGoZk
0,3BxWKCI06eQ5Od8TY2JBeA,6wPhSqRtPu1UhRCDX5yaDJ,5x8e8UcCeOgrOzSnDGuPye
0,7H6ev70Weq6DdpZyyTmUXk,1Y8cdNmUJH7yBTd9yOvr5i,283NWqNsCA9GwVHrJk59CG
0,2PpruBYCo4H7WOBJ7Q2EwM,1G9G7WwrXka3Z1r7aIDjI7,1UsmQ3bpJTyK6ygoOOjG1r


In [None]:
df_musicas = df_musicas.sample(5)
df_musicas = df_musicas[['track_uri', 'artist_uri', 'album_uri']]

# Adiciona as features de música
df_musicas = df_musicas.merge(
    df_musicas['track_uri'].apply(get_music_features).apply(pd.Series),
    left_index=True,
    right_index=True
)

# Adiciona as features de álbum
df_musicas = df_musicas.merge(
    df_musicas['album_uri'].apply(get_album_features).apply(pd.Series),
    left_index=True,
    right_index=True
)

# Adiciona as features de artista
df_musicas = df_musicas.merge(
    df_musicas['artist_uri'].apply(get_artist_features).apply(pd.Series),
    left_index=True,
    right_index=True
)


In [None]:
df_musicas

Unnamed: 0,track_uri,artist_uri,album_uri,id,name,explicit,duration_ms,popularity,danceability,energy,...,album_name,album_release_date,album_total_tracks,album_type,artist_id,artist_genres,artist_popularity,artist_name,artist_type,artist_followers
0,1AWQoqb9bSvzTjaLralEkT,31TPClRtHm23RisEBtV3X7,6QPkyl04rXwTGlGlcYaRoW,1AWQoqb9bSvzTjaLralEkT,Rock Your Body,False,267267,79,0.892,0.714,...,Justified,2002-11-04,13,album,31TPClRtHm23RisEBtV3X7,"[dance pop, pop]",80,Justin Timberlake,artist,12910531


In [None]:
df_musicas.to_csv('df_musicas.csv', index=False)