In [22]:
import requests
import base64
import pandas as pd


CLIENT_ID = 'fa9f3d44584944cf8cd7d988ffe47c6d'
CLIENT_SECRET = '04e5d80274194150b3ee5d9cfbdcec36'

SPOTIFY_TOKEN_URL = "https://accounts.spotify.com/api/token"
SPOTIFY_SEARCH_URL = "https://api.spotify.com/v1/search"


def get_access_token(client_id, client_secret):
    client_creds = f"{client_id}:{client_secret}"
    client_creds_b64 = base64.b64encode(client_creds.encode())

    token_data = {
        'grant_type': 'client_credentials'
    }
    token_headers = {
        'Authorization': f'Basic {client_creds_b64.decode()}'
    }

    r = requests.post(SPOTIFY_TOKEN_URL, data=token_data,
                      headers=token_headers)
    token_response_data = r.json()
    
    return token_response_data.get('access_token')

access_token = get_access_token(CLIENT_ID, CLIENT_SECRET)

#### Fetch Albums

In [23]:
def fetch_albums(access_token, offset=0):
    headers = {'Authorization': f'Bearer {access_token}'}

    search_params = {
        'q': 'year:2022',
        'type': 'album',
        'limit': 50,
        'offset': offset
    }
    
    response = requests.get("https://api.spotify.com/v1/search", headers=headers, params=search_params)
    
    if response.ok:
        results = response.json()['albums']['items']
        return results

    return response.status_code

def get_album_details(access_token, album_ids:list):
    url = f"https://api.spotify.com/v1/albums?ids={','.join(album_ids)}"
    headers = {'Authorization': f'Bearer {access_token}'}
    response = requests.get(url, headers=headers)
    return response.json()


def get_artist_genres(access_token, artist_id):
    url = f"https://api.spotify.com/v1/artists/{artist_id}"
    headers = {'Authorization': f'Bearer {access_token}'}
    response = requests.get(url, headers=headers)
    return response.json().get('genres', [])


def get_album_artists(access_token, album_id):
    url = f"https://api.spotify.com/v1/albums/{album_id}"
    headers = {'Authorization': f'Bearer {access_token}'}
    response = requests.get(url, headers=headers)
    album_data = response.json()
    artist_ids = [artist['id'] for artist in album_data['artists']]
    return artist_ids

#### Fetch Songs

In [31]:
def fetch_songs(access_token, offset):
    headers = {'Authorization': f'Bearer {access_token}'}

    search_params = {
        'q': 'year:2023',
        'type': 'track',
        'limit': 50,
        'offset': offset
    }
    
    response = requests.get("https://api.spotify.com/v1/search", headers=headers, params=search_params)
    
    if response.ok:
        results = response.json()['tracks']['items']
        return results

    return response.status_code


def get_song_features(access_token, track_ids):
    features_url = f"https://api.spotify.com/v1/audio-features?ids={','.join(track_ids)}"
    header = {
        "Authorization": f"Bearer {access_token}"
    }

    res = requests.get(features_url, headers=header)
    return res.json()


def map_key_to_pitch_class(key):
    key_map = {
        -1: "",
        0: "C",
        1: "C#",
        2: "D",
        3: "D#",
        4: "E",
        5: "F",
        6: "F#",
        7: "G",
        8: "G#",
        9: "A",
        10: "A#",
        11: "B"
    }
    return key_map.get(key, "")


def map_mode_to_name(mode):
    mode_map = {
        0: "Minor",
        1: "Major"
    }
    return mode_map.get(mode, "")

In [34]:
import time
import math
import numpy as np

all_new_songs = pd.DataFrame()
offset = 0

while len(all_new_songs) < 2000:
    songs = fetch_songs(access_token, offset=offset)
    song_ids = [song["id"] for song in songs]

    song_details = []
    for chunk in [song_ids[x:x+20] for x in range(0, len(song_ids), 20)]:
        details = get_song_features(access_token, chunk)['audio_features']

        for i, song_id in enumerate(chunk):
            song_info = songs[i]
            detail = details[i]  

            if detail is not None and song_info is not None:
                merged_song_detail = {**song_info, **detail}
                song_details.append(merged_song_detail)

    print(len(all_new_songs) + len(song_details))

    songs_df = pd.DataFrame([{
        'track_name': song.get('name', ''),
        'artist(s)_name': ", ".join([artist['name'] for artist in song['artists']]),
        'artist_count': len(song.get('artists')),
        'album': song.get('album', {}).get('name', ""),
        'explicit': song.get('explicit', ''),
        'popularity': song.get('popularity', np.NaN),
        'release_date': song.get('album', {}).get('release_date', ""),
        'streams': np.NaN,
        'duration_in_min': song.get('duration_ms', np.NaN) / 60000,
        'bpm': math.floor(song.get('tempo', np.NaN)),
        'key': map_key_to_pitch_class(song.get('key', -1)),
        'mode': map_mode_to_name(song.get('mode', np.NaN)),
        'danceability_%': math.ceil(song.get('danceability', np.NaN) * 100),
        'valence_%': math.ceil(song.get('valence', np.NaN) * 100),
        'energy_%': math.ceil(song.get('energy', np.NaN) * 100),
        'acousticness_%': math.ceil(song.get('acousticness', np.NaN) * 100),
        'instrumentalness_%': math.ceil(song.get('instrumentalness', np.NaN) * 100),
        'liveness_%': math.ceil(song.get('liveness', np.NaN) * 100),
        'speechiness_%': math.ceil(song.get('speechiness', np.NaN) * 100),
    } for song in song_details])

    all_new_songs = pd.concat(
        [all_new_songs, songs_df], ignore_index=True)

    time.sleep(1)
    offset += 50

50
100


KeyboardInterrupt: 

In [38]:
all_new_songs.head(n=10)

Unnamed: 0,track_name,artist(s)_name,artist_count,album,explicit,popularity,release_date,streams,duration_in_min,bpm,key,mode,danceability_%,valence_%,energy_%,acousticness_%,instrumentalness_%,liveness_%,speechiness_%
0,I Remember Everything (feat. Kacey Musgraves),"Zach Bryan, Kacey Musgraves",2,Zach Bryan,False,93,2023-08-25,,3.7866,77,C,Major,43,16,46,56,1,11,5
1,My Love Mine All Mine,Mitski,1,The Land Is Inhospitable and So Are We,False,97,2023-09-15,,2.296217,113,A,Major,51,13,31,87,14,16,4
2,IDGAF (feat. Yeat),"Drake, Yeat",2,For All The Dogs,True,94,2023-10-06,,4.335183,136,G#,Major,67,14,67,5,1,21,28
3,fukumean,Gunna,1,a Gift & a Curse,True,94,2023-06-16,,2.084,130,C#,Minor,85,22,63,12,0,29,10
4,greedy,Tate McRae,1,greedy,True,100,2023-09-15,,2.197867,111,F#,Minor,75,85,74,26,0,12,4
5,MONACO,Bad Bunny,1,nadie sabe lo que va a pasar mañana,True,98,2023-10-13,,4.453233,139,E,Minor,79,13,63,15,1,58,7
6,Paint The Town Red,Doja Cat,1,Paint The Town Red,True,97,2023-08-04,,3.8625,99,F,Major,87,74,54,27,1,10,18
7,Is It Over Now? (Taylor's Version) (From The V...,Taylor Swift,1,1989 (Taylor's Version),False,94,2023-10-26,,3.824633,100,C,Major,60,18,66,6,0,13,4
8,First Person Shooter (feat. J. Cole),"Drake, J. Cole",2,For All The Dogs,True,92,2023-10-06,,4.124067,163,D,Major,47,25,64,3,0,39,32
9,I KNOW ?,Travis Scott,1,UTOPIA,True,92,2023-07-28,,3.526367,117,F,Minor,93,82,62,2,0,11,6


#### Get Data

In [40]:
import time

import numpy as np

all_new_albums = pd.DataFrame()
offset = 0

while len(all_new_albums) < 5000:
    new_albums = fetch_albums(access_token, offset=offset)

    new_album_ids = [album["id"] for album in new_albums]

    new_album_details = []
    for chunk in [new_album_ids[x:x+20] for x in range(0, len(new_album_ids), 20)]:
        album_details = get_album_details(access_token, chunk)['albums'] 
        
        for album in album_details:
            if album["album_type"] == "album":
                artist_ids = get_album_artists(access_token, album["id"])
                genres = []
                
                for artist_id in artist_ids:
                    genres += get_artist_genres(access_token, artist_id)
                    
                album["genres"] = list(set(genres))
                new_album_details.append(album)

    print(len(all_new_albums) + len(new_album_details))

    new_albums_df = pd.DataFrame([{
        'album_name': album['name'],
        'artist_name': ", ".join([artist["name"] for artist in album['artists']]),
        'release_date': album['release_date'],
        'genres': ", ".join(album['genres']),
        'descriptors': "",
        'avg_rating': np.NaN,
        'rating_count': np.NaN,
        'review_count': np.NaN,
        'popularity': album['popularity'],
        'total_tracks': album['total_tracks'],
    } for album in new_album_details])

    all_new_albums = pd.concat(
        [all_new_albums, new_albums_df], ignore_index=True)
    
    time.sleep(1)
    offset += 50

46


KeyboardInterrupt: 

In [41]:
all_new_albums.head(n=50)

Unnamed: 0,album_name,artist_name,release_date,genres,descriptors,avg_rating,rating_count,review_count,popularity,total_tracks
0,SOS,SZA,2022-12-09,"rap, pop, r&b",,,,,90,23
1,Midnights,Taylor Swift,2022-10-21,pop,,,,,91,13
2,HEROES & VILLAINS,Metro Boomin,2022-12-02,rap,,,,,90,15
3,Un Verano Sin Ti,Bad Bunny,2022-05-06,"urbano latino, reggaeton, trap latino",,,,,93,23
4,Her Loss,"Drake, 21 Savage",2022-11-04,"hip hop, rap, pop rap, canadian pop, atl hip h...",,,,,85,16
5,American Heartbreak,Zach Bryan,2022-05-20,"classic oklahoma country, modern country pop",,,,,82,34
6,Stick Season,Noah Kahan,2022-10-14,pov: indie,,,,,84,14
7,Harry's House,Harry Styles,2022-05-20,pop,,,,,89,13
8,WASTELAND,Brent Faiyaz,2022-07-08,"rap, r&b",,,,,82,19
9,Midnights (3am Edition),Taylor Swift,2022-10-22,pop,,,,,82,20
