LSE Data Science Institute | ME204 (2024) | Final Project

# 🎼 Data Collection via Spotify API

<span style="display: inline-block; padding: 0 10px; font-size: 1.15em;line-height: 1.5em; white-space: nowrap; border: 1px solid #E69F25; border-radius: .5em; color: #fcfcfc; background-color: #E69F25; vertical-align: middle;font-weight: 600 !important;">Data Collection NOTEBOOK</span>

**DATE:** 26 July 2024

**AUTHOR:** [David Cho](https://github.com/LSE-ME204/me204-2024-project-Chodav?tab=readme-ov-file)

-----


# ⚙️ **Setup**

In [43]:
import json
import requests
from lxml import html
import base64
import time
import unicodedata

# 🤐 **1. Setting up (secret) credentials**

In [46]:
credentials_file_path = './credentials.json'

# Load the Spotify app credentials into a variable. The file is expected to
# provide the 'app_client_id' and 'app_client_secret' keys that are read later on.
# The encoding is given explicitly so the read does not depend on the platform default.
with open(credentials_file_path, "r", encoding="utf-8") as f:
    credentials = json.load(f)

## **Getting Tokens**

In [47]:
# Pull the Spotify app's client id and secret out of the loaded credentials
client_id, client_secret = (
    credentials['app_client_id'],
    credentials['app_client_secret'],
)

def get_token():
    """
    Get an API token using client_id and client_secret. Followed Spotify API documentation for setup.

    Reads the module-level client_id and client_secret and posts them to the
    Spotify token endpoint with grant_type "client_credentials".

    Returns:
    str: The access token taken from the "access_token" field of the response.

    Raises:
    Exception: If the token request does not return status code 200.
    """

    # The endpoint expects "client_id:client_secret", base64-encoded, as Basic auth
    auth_string = client_id + ":" + client_secret
    auth_base64 = base64.b64encode(auth_string.encode("utf-8")).decode("utf-8")

    url = "https://accounts.spotify.com/api/token"
    headers = {
        "Authorization": "Basic " + auth_base64,
        "Content-Type": "application/x-www-form-urlencoded"
    }
    data = {"grant_type": "client_credentials"}

    response = requests.post(url, headers=headers, data=data)
    # Fail with the status code instead of a bare KeyError on "access_token"
    if response.status_code != 200:
        raise Exception(
            f"Error: Received status code {response.status_code} while requesting an access token"
        )

    return response.json()["access_token"]

def get_auth_header(token):
    """
    Build the authorization header that accompanies each query.

    Parameters:
    token (str): The access token to send.

    Returns:
    dict: A single "Authorization" entry holding "Bearer " followed by the token.
    """

    bearer_value = "Bearer " + token
    return {"Authorization": bearer_value}

[Cell output redacted: a Spotify API access token was printed here.]


# ⛏️ **2. Making Extraction Tools**

### **Get Artist Info**

In [25]:
def search_for_artist(artist_name):
    """
    Find the artist information available from the Spotify API.

    Parameters:
    artist_name (str): The name of the artist.

    Returns:
    dict or None: The first artist item returned by the search, or None
    (after printing a message) when the search returns no artists.

    Raises:
    Exception: If the search request does not return status code 200.
    """
    token = get_token()
    url = "https://api.spotify.com/v1/search"
    headers = get_auth_header(token)
    # Let requests build the query string so that names containing '&', '#', '+'
    # or other reserved characters are percent-encoded instead of breaking the query
    params = {"q": artist_name, "type": "artist,track", "limit": 1}

    response = requests.get(url, headers=headers, params=params)
    if response.status_code != 200:
        raise Exception(f"Error: Received status code {response.status_code}")

    items = response.json()["artists"]["items"]

    if not items:
        print("No artist with this name exists...")
        return None

    return items[0]

### **Get Albums from Artist**

In [26]:
def get_albums_by_artist(artist_id):
    """
    Find all the album release dates, ids, and names from an artist.

    NOTE: a function with this same name is defined again in a later cell
    ("Get All Albums Info from Artist") and returns the raw album list instead;
    whichever of the two cells ran last is the one in effect.

    Parameters:
    artist_id (str): The id of the artist in question.

    Returns:
    dict: Parallel lists under the keys release_date, album_id, and album_name,
    with one entry per distinct album name (the last album seen for each name is kept).

    Raises:
    Exception: If any page request does not return status code 200.
    """

    token = get_token()
    url = f"https://api.spotify.com/v1/artists/{artist_id}/albums"
    headers = get_auth_header(token)
    params = {
        "include_groups": "album,single,compilation,appears_on",  # Types of albums to include
        "limit": 50  # Maximum number of items per page
    }

    albums = []

    while url:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            raise Exception(f"Error: Received status code {response.status_code}")

        result = response.json()
        albums.extend(result['items'])

        # Get the next page URL. It is used as returned, so the initial params are
        # dropped rather than appended to it a second time.
        url = result.get('next')
        params = None

        # Sleep to avoid hitting the rate limit (only when another page follows)
        if url:
            time.sleep(1)

    # Remove duplicates: map each album name to the index of its last occurrence
    last_index_by_name = {album['name']: idx for idx, album in enumerate(albums)}
    unique_albums = [albums[idx] for idx in last_index_by_name.values()]

    return {
        "release_date": [album['release_date'] for album in unique_albums],
        "album_id": [album['id'] for album in unique_albums],
        "album_name": [album['name'] for album in unique_albums]
    }


### **Get Track Info**

In [27]:
def get_track_info(track_id):
    """
    Look up a single track and keep a fixed selection of its fields.

    Parameters:
    track_id (str): The id of the track in question.

    Returns:
    dict: track_name, track_id, album_name, album_id, artists (list of names),
    duration_ms, popularity, and number_of_available_markets for the track.

    Raises:
    Exception: If the request does not return status code 200.
    """

    auth_headers = get_auth_header(get_token())
    endpoint = f"https://api.spotify.com/v1/tracks/{track_id}"

    response = requests.get(endpoint, headers=auth_headers)
    if response.status_code != 200:
        raise Exception(f"Error: Received status code {response.status_code}")

    payload = response.json()

    return {
        "track_name": payload['name'],
        "track_id": payload['id'],
        "album_name": payload['album']['name'],
        "album_id": payload['album']['id'],
        "artists": [performer['name'] for performer in payload['artists']],
        "duration_ms": payload['duration_ms'],
        "popularity": payload['popularity'],
        # Only the count of markets is kept, not the market codes themselves
        "number_of_available_markets": len(payload['available_markets']),
    }


### **Get Track Audio Features**

In [28]:
def get_track_audio_features(track_id):
    """
    Find the audio features of a track.

    Parameters:
    track_id (str): The id of the track in question.

    Returns:
    dict: danceability, energy, key, loudness, speechiness, acousticness, instrumentalness, liveness, valence, tempo, id, duration_ms of the track in question.

    Raises:
    Exception: If the request does not return status code 200.
    """

    token = get_token()
    url = f"https://api.spotify.com/v1/audio-features/{track_id}"
    headers = get_auth_header(token)

    response = requests.get(url, headers=headers)
    # Same status check as get_track_info, so a failed request reports its
    # status code instead of a KeyError on the first missing feature
    if response.status_code != 200:
        raise Exception(f"Error: Received status code {response.status_code}")

    features = response.json()

    # Keep only the fields used downstream, in a fixed order
    wanted_keys = (
        "danceability",
        "energy",
        "key",
        "loudness",
        "speechiness",
        "acousticness",
        "instrumentalness",
        "liveness",
        "valence",
        "tempo",
        "id",
        "duration_ms",
    )
    return {key: features[key] for key in wanted_keys}

### **Get All Albums Info from Artist**

In [42]:
def get_albums_by_artist(artist_id):
    """
    Find all albums produced by an artist.

    NOTE: this definition shares its name with a function from an earlier cell
    ("Get Albums from Artist") and replaces it once this cell has run.

    Parameters:
    artist_id (str): The id of the artist in question.

    Returns:
    list: All the album items returned for the artist, across every page.

    Raises:
    Exception: If any page request does not return status code 200.
    """

    token = get_token()
    url = f"https://api.spotify.com/v1/artists/{artist_id}/albums"
    headers = get_auth_header(token)
    params = {
        "include_groups": "album,single,compilation,appears_on", # Types of albums to include
        "limit": 50  # Maximum number of items per page
    }

    albums = []

    while url:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            raise Exception(f"Error: Received status code {response.status_code}")

        result = response.json()
        albums.extend(result['items'])

        # Get the next page URL. It is used as returned, so the initial params are
        # dropped rather than appended to it a second time.
        url = result.get('next')
        params = None

        # Sleep to avoid hitting the rate limit (only when another page follows)
        if url:
            time.sleep(1)

    return albums

def get_album_details(album_id):
    """
    Get the details of an album.

    Parameters:
    album_id (str): The id of the album in question.

    Returns:
    dict: The decoded JSON body of the album request.

    Raises:
    Exception: If the request does not return status code 200.
    """

    token = get_token()
    url = f"https://api.spotify.com/v1/albums/{album_id}"
    headers = get_auth_header(token)

    response = requests.get(url, headers=headers)
    # Without this check an error payload would be returned as if it were album
    # details and end up in the saved data
    if response.status_code != 200:
        raise Exception(f"Error: Received status code {response.status_code}")

    return response.json()

def get_all_album_features(artist_id):
    """
    Collect the detailed record of every album listed for an artist.

    Parameters:
    artist_id (str): The id of the artist in question.

    Returns:
    list: One album-details result per album, in the order the albums were listed.
    """

    collected = []

    for entry in get_albums_by_artist(artist_id):
        collected.append(get_album_details(entry['id']))
        # Pause between album lookups to avoid hitting the rate limit
        time.sleep(3)

    return collected

### **Get Audio Features of Tracks from 5 Most Recent Albums**

In [57]:
def get_recent_albums_and_ids(artist_id, num_albums=5):
    """
    Pick out an artist's most recently released albums together with their ids.

    Parameters:
    artist_id (str): The id of the artist in question.
    num_albums (integer): The number of most recent albums to extract.

    Returns:
    list : The album entries with the latest release_date values, newest first.
    list : The ids of those same albums, in the same order.
    """

    # Work on a copy and order it newest-first by the release_date field
    newest_first = list(get_albums_by_artist(artist_id))
    newest_first.sort(key=lambda album: album['release_date'], reverse=True)

    selected = newest_first[:num_albums]
    selected_ids = [album['id'] for album in selected]

    return selected, selected_ids

In [56]:
def get_tracks_by_album(album_id):
    """
    Find the track ids from an album.

    Parameters:
    album_id (str): The id of the album in question.

    Returns:
    list : Collection of the track ids from an album, across every page.

    Raises:
    Exception: If any page request does not return status code 200.
    """

    token = get_token()
    url = f'https://api.spotify.com/v1/albums/{album_id}/tracks'
    headers = get_auth_header(token)
    params = {
        'limit': 50  # Maximum number of items per page
    }

    tracks = []

    while url:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            raise Exception(f"Error: Received status code {response.status_code}")

        result = response.json()
        tracks.extend(result['items'])

        # Get the next page URL. It is used as returned, so the initial params are
        # dropped rather than appended to it a second time.
        url = result.get('next')
        params = None

        # Sleep to avoid hitting the rate limit (only when another page follows)
        if url:
            time.sleep(1)

    # Extract track IDs
    return [track['id'] for track in tracks]

In [55]:
def get_recent_tracks(artist_id):
    """
    Gather the track ids from an artist's most recent albums.

    Parameters:
    artist_id (str): The id of the artist in question.

    Returns:
    list : The track ids of every recent album, album by album.
    """

    # Only the ids are needed here; the album entries themselves are ignored
    _, album_ids = get_recent_albums_and_ids(artist_id)

    return [
        track_id
        for album_id in album_ids
        for track_id in get_tracks_by_album(album_id)
    ]

In [54]:
def get_recent_audio_features(artist_id):
    """
    Fetch audio features for every track on an artist's most recent albums.

    Parameters:
    artist_id (str): The id of the artist in question.

    Returns:
    list : The audio-feature records that could be retrieved; tracks whose
    lookup failed are reported on stdout and left out.
    """

    collected = []

    for track_id in get_recent_tracks(artist_id):
        try:
            collected.append(get_track_audio_features(track_id))
            time.sleep(1)  # Add delay to avoid hitting rate limits
        except Exception as e:
            # A failing track is reported and skipped so the rest still get collected
            print(f"Error retrieving audio features for track {track_id}: {e}")

    return collected

### **Get Track Info from 5 Most Recent Albums**

In [53]:
def get_recent_track_info(artist_id):
    """
    Fetch track information for every track on an artist's most recent albums.

    Parameters:
    artist_id (str): The id of the artist in question.

    Returns:
    list : The track-info records that could be retrieved; tracks whose
    lookup failed are reported on stdout and left out.
    """

    collected = []

    for track_id in get_recent_tracks(artist_id):
        try:
            collected.append(get_track_info(track_id))
            time.sleep(1)  # Add delay to avoid hitting rate limits
        except Exception as e:
            # A failing track is reported and skipped so the rest still get collected
            print(f"Error retrieving info for track {track_id}: {e}")

    return collected

# 🔨 **3. Putting it All Together: Data Collection**
The following functions combine the extraction tools written above into helpers that create the necessary JSON files for all the artists, tracks, and albums and store them locally.

In [32]:
def normalize_string(input_string):
    """
    Turn a string into a lowercase, accent-free form with underscores in place of spaces.

    Parameters:
    input_string (str): The string to be normalized.

    Returns:
    str: The normalized string.
    """

    underscored = input_string.replace(' ', '_')

    # NFKD splits accented letters into base letter + combining mark; encoding to
    # ASCII with 'ignore' then drops every character that is not plain ASCII
    ascii_bytes = unicodedata.normalize('NFKD', underscored).encode('ASCII', 'ignore')

    return ascii_bytes.decode('utf-8').lower()

In [51]:
def save_to_json_file(data, file_path):
    """
    Write data to file_path as JSON indented by 4 spaces, overwriting any existing file.

    Parameters:
    data: The JSON-serializable object to store.
    file_path (str): Where to write the file.
    """
    with open(file_path, 'w') as handle:
        json.dump(data, handle, indent=4)

def create_artist_albums_json(artist_name):
    """
    Fetch the details of every album by an artist and store them as a JSON file.

    The file is written to ../data/raw/<normalized artist name>_albums.json.
    If the artist search finds no match, nothing is fetched or written.

    Parameters:
    artist_name (str): The name of the artist.
    """
    artist = search_for_artist(artist_name)
    if artist is None:
        # search_for_artist returns None when nothing matches; indexing it would raise a TypeError
        print(f"Skipping albums for '{artist_name}': artist not found.")
        return

    file_path = f'../data/raw/{normalize_string(artist_name)}_albums.json'
    data = get_all_album_features(artist["id"])

    # Save the data to a JSON file
    save_to_json_file(data, file_path)

def create_artist_infos_json(artist_name):
    """
    Store the search result for an artist as a JSON file.

    The file is written to ../data/raw/<normalized artist name>_info.json.

    Parameters:
    artist_name (str): The name of the artist.
    """
    target_path = f'../data/raw/{normalize_string(artist_name)}_info.json'
    artist_record = search_for_artist(artist_name)

    save_to_json_file(artist_record, target_path)

def create_artist_audio_features5_json(artist_name):
    """
    Fetch the audio features of the tracks on an artist's most recent albums and store them as a JSON file.

    The file is written to ../data/raw/<normalized artist name>_5audio_features.json.
    If the artist search finds no match, nothing is fetched or written.

    Parameters:
    artist_name (str): The name of the artist.
    """
    artist = search_for_artist(artist_name)
    if artist is None:
        # search_for_artist returns None when nothing matches; indexing it would raise a TypeError
        print(f"Skipping audio features for '{artist_name}': artist not found.")
        return

    file_path = f'../data/raw/{normalize_string(artist_name)}_5audio_features.json'
    data = get_recent_audio_features(artist["id"])

    # Save the data to a JSON file
    save_to_json_file(data, file_path)

def create_artist_track_info5_json(artist_name):
    """
    Fetch the track information for the tracks on an artist's most recent albums and store it as a JSON file.

    The file is written to ../data/raw/<normalized artist name>_5track_info.json.
    If the artist search finds no match, nothing is fetched or written.

    Parameters:
    artist_name (str): The name of the artist.
    """
    artist = search_for_artist(artist_name)
    if artist is None:
        # search_for_artist returns None when nothing matches; indexing it would raise a TypeError
        print(f"Skipping track info for '{artist_name}': artist not found.")
        return

    file_path = f'../data/raw/{normalize_string(artist_name)}_5track_info.json'
    data = get_recent_track_info(artist["id"])

    # Save the data to a JSON file
    save_to_json_file(data, file_path)

## **Context**
- Laufey's influences: https://www.classicfm.com/discover-music/laufey-singer-instruments-family-concerts/
    - "Laufey uses a symphony orchestra, solo cello and the instrumentation of the classical world just as often as she incorporates drum brushes and moody electric guitar you’d hear in a **Chet Baker** recording."
    - "Her songwriting, on the other hand, has the relatability and modern-feel of chart-topping hits by pop stars like **Taylor Swift**."
- Wikipedia: Influences and music style
    - "Although Laufey was influenced by classical music, classical artists such as **Frédéric Chopin**,[29] and played it from an early age, she turned to her father's records of female jazz musicians such as **Ella Fitzgerald** and **Billie Holiday** to develop her musical style.[4] She has cited Fitzgerald and **Chet Baker** as her biggest artistic influences, naming the former as her favorite musician. Laufey also cited **Taylor Swift**, **Norah Jones**, and **Adele** as inspirations for her musicianship; she said "[Taylor Swift] has done for pop and country what I hope to do for jazz. She has managed to unite people across the world which is one of my main goals as a musician."
- Laufey style and inspiration description: https://www.musicmetricsvault.com/artists/laufey/7gW0r5CkdEUMm42w9XpyZO
- Interview with GRAMMY.com: https://www.grammy.com/news/icelandic-jazz-singer-laufey-interview-new-album-bewitched
    - "I grew up playing cello; my twin sister plays violin on it as well. There are a lot of classical influences that I dug into — a lot of **Ravel** and **Mendelssohn** and Ravel and **Dvorak**, which is really fun to hide in there. Because if you know, you know — and if you don't, it's just a fun, new treat."
    - "I love **Astrud Gilberto** a lot; there's this [1967] album of hers called Beach Samba that I was really, really inspired by, specifically in "From the Start.""
- Interview with When the Horn Blows: https://whenthehornblows.com/content/2021/4/28/in-conversation-with-laufey
    - "So many!! I’ve been super into **Ravel** and **Scriabin** recently, the colors that exist within their music are just so beautiful. I also love **Shostakovich**, **Rachmaninoff** and **Chopin**. I actually recently wrote a song entirely based off of a Chopin piano piece. I think that I can hear a little bit of jazz in all of these composers."

In [34]:
def get_top_n_artists(url, n = 25):
    """
    Return the top artists of a certain genre on Spotify as listed from musicmetricsvault.com

    Parameters:
    url (str): The url of the page to be scraped.
    n (int): The maximum number of entries to return (default 25).

    Returns:
    list: The text of the first n matched page elements, in page order. An entry
    is an empty string when its element contains no non-whitespace text.

    Raises:
    Exception: If the page request does not return status code 200.
    """

    # Send an HTTP request to the URL
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Error: Received status code {response.status_code}")

    # Parse the response content using lxml
    tree = html.fromstring(response.content)

    # Define the XPath to target the elements
    elements_xpath = '//td[position()=2]//div[contains(@class, "ml-4")]//div'

    # Only the first n matches are ever returned, so only those are processed
    elements = tree.xpath(elements_xpath)[:n]

    artists = []

    for element in elements:
        # './/text()' already yields every text node beneath the element, link text
        # included, so no separate look at nested <a> tags is needed
        text_nodes = element.xpath('.//text()')
        artists.append(' '.join(text.strip() for text in text_nodes if text.strip()))

    return artists

## **The Artists in Question**

In [63]:
# Scrape the top-10 artist list for each genre: Classical, Jazz, Gen z singer-songwriter, Pop, Indie Pop
_GENRE_PAGE = "https://www.musicmetricsvault.com/genres/{}"

top_10_classical = get_top_n_artists(_GENRE_PAGE.format("classical/213"), 10) # Need to Do
top_10_jazz = get_top_n_artists(_GENRE_PAGE.format("jazz/339"), 10)
top_10_gen_z = get_top_n_artists(_GENRE_PAGE.format("gen-z-singer-songwriter/610"), 10)
top_10_pop = get_top_n_artists(_GENRE_PAGE.format("pop/3"), 10)
top_10_indie_pop = get_top_n_artists(_GENRE_PAGE.format("indie-pop/217"), 10)

In [36]:
# Laufey herself plus the artists cited as her influences in the Context notes.
# Dvorak, Astrud Gilberto, Scriabin, Shostakovich were not extracted due to Spotify rate and time limitations
# NOTE: 'Norah Jones' was previously misspelled 'Nora Jones'; the name feeds both the
# Spotify search and the output file names, so files from earlier runs are named nora_jones_*.json
laufey_influences = ['Laufey', 'Frédéric Chopin', 'Ella Fitzgerald', 'Billie Holiday',
                     'Chet Baker', 'Taylor Swift', 'Norah Jones', 'Adele', 'Ravel',
                     'Mendelssohn', 'Rachmaninoff']

In [37]:
# top_10_pop was not extracted due to Spotify rate and time limitations
artist_lists = [top_10_classical, top_10_jazz, top_10_gen_z, top_10_indie_pop]

# Flatten the per-genre lists into a single list, keeping the genre order above
artists = [name for genre_artists in artist_lists for name in genre_artists]

In [38]:
# Create all the json files and store them locally

# Genre artists: album details first, then the artist info
for artist in artists:
    for build_json in (create_artist_albums_json, create_artist_infos_json):
        build_json(artist)

# Laufey and her influences: audio features, track info, then the artist info
for laufey_influence in laufey_influences:
    for build_json in (
        create_artist_audio_features5_json,
        create_artist_track_info5_json,
        create_artist_infos_json,
    ):
        build_json(laufey_influence)