<a href="https://colab.research.google.com/github/EitanBakirov/Economics-Data-Science/blob/main/Get_Top_20_Artists_and_Songs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook scrapes the top 20 Spotify artists from a website and then uses the Spotify Web API to retrieve each artist's profile data and top songs.

Top 20 Spotify Artists: https://musicpiechart.com/spotify-top-artists <br>
Spotify API: https://developer.spotify.com/documentation/web-api

Imports:

In [1]:
# Importing necessary libraries
import base64
import os
import re
import time
from getpass import getpass

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Import the files module from google.colab to handle file uploads and downloads
from google.colab import files

Getting all the artists from the website:

In [2]:
def _strip_rank_prefix(heading_text):
    """Remove the leading rank marker (e.g. ``"7. "`` or ``"12. "``) from a card heading."""
    text = heading_text.strip()
    rank = re.match(r'#?\d+[.)]?\s+', text)
    if rank:
        # Variable-width strip: works for one- and two-digit ranks alike
        return text[rank.end():].strip()
    # No recognisable rank marker: fall back to the original fixed-width cut
    return text[3:].strip()


def fetch_top_artists(url, timeout=10):
    """Scrape the artist names listed on the top-artists web page.

    Args:
        url: Address of the page that lists the artists.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame with a single ``artist`` column, one row per artist card
        whose heading could be read. The column is present even when no card
        is found.

    Raises:
        requests.HTTPError: If the page answers with an error status.
    """
    # Fetch the webpage; fail loudly rather than parse an error page
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML content
    soup = BeautifulSoup(response.content, 'html.parser')

    # Get all the artist cards
    artist_cards = soup.find_all('div', class_='flex flex-wrap sm:flex-nowrap gap-4 bg-gray-50 rounded-xl')

    artists = []

    # Extract artist names
    for card in artist_cards:
        heading = card.find('h2', class_='my-0')
        if heading is None:
            # A card without the expected heading would otherwise raise AttributeError
            continue
        artist = _strip_rank_prefix(heading.text)
        if artist:
            artists.append({"artist": artist})

    # Explicit column so downstream code can rely on it even for an empty page
    return pd.DataFrame(artists, columns=["artist"])

Getting the access token:

In [3]:
def get_spotify_access_token(client_id, client_secret, timeout=10):
    """Request an access token from Spotify with the client-credentials grant.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.
        timeout: Seconds to wait for the token endpoint before giving up, so
            a stalled connection cannot hang the notebook.

    Returns:
        The access token string, or ``None`` when the request fails, the
        endpoint answers with a non-200 status, or the response cannot be read
        (a message is printed in each of those cases).
    """
    # Spotify API token endpoint
    auth_url = 'https://accounts.spotify.com/api/token'

    # Base64 encode "client_id:client_secret" for the Basic Authorization header
    credentials = f"{client_id}:{client_secret}".encode('ascii')
    auth_header = base64.b64encode(credentials).decode('ascii')

    try:
        # Make a POST request to get the access token
        auth_response = requests.post(
            auth_url,
            data={'grant_type': 'client_credentials'},
            headers={'Authorization': 'Basic ' + auth_header},
            timeout=timeout,
        )

        if auth_response.status_code != 200:
            # Print error message if authentication request failed
            print(f"Authentication error: {auth_response.status_code}, {auth_response.text}")
            return None

        # Extract the access token from the JSON response
        return auth_response.json()['access_token']
    except Exception as e:
        # Contract of this function: never raise from the request, report and return None
        print(f"Exception occurred during token retrieval: {e}")
        return None

In [4]:
def get_artist_id(artist_name, access_token, timeout=10):
    """Look up the Spotify ID of the first search hit for an artist name.

    Args:
        artist_name: Name to search for.
        access_token: Bearer token for the Spotify Web API.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The ``id`` of the first artist in the search results, or ``None`` when
        the search returns no artist or the API answers with a non-200 status
        (a message is printed in both cases).
    """
    search_url = "https://api.spotify.com/v1/search"
    # Pass the query through ``params`` so names containing '&', '#', '+' etc.
    # are percent-encoded instead of corrupting the query string
    params = {"q": artist_name, "type": "artist"}
    headers = {
        "Authorization": f"Bearer {access_token}"
    }
    response = requests.get(search_url, headers=headers, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Error searching for {artist_name}: {response.status_code}")
        return None

    data = response.json()
    items = (data.get('artists') or {}).get('items')
    if not items:
        print(f"No artist found for: {artist_name}")
        return None

    # Assuming we take the first artist found
    return items[0]['id']

Look up each artist's Spotify ID using the Spotify API:

In [5]:
def add_artist_ids_to_dataframe(artists_df, access_token):
    """Attach a Spotify ``artist_id`` column to the artists table.

    Each value in the ``artist`` column is resolved with ``get_artist_id``;
    whatever that call returns for a row is stored in the new column.

    Args:
        artists_df: DataFrame with an ``artist`` column of names.
        access_token: Bearer token forwarded to ``get_artist_id``.

    Returns:
        The same DataFrame object, modified in place with the added column.
    """
    # Resolve every row first, then assign the finished list as one column
    artists_df['artist_id'] = [
        get_artist_id(entry['artist'], access_token)
        for _, entry in artists_df.iterrows()
    ]
    return artists_df

In [6]:
def get_top_tracks(artist_id, access_token, country='US', timeout=10):
    """Fetch an artist's top tracks from the Spotify Web API.

    Args:
        artist_id: Spotify ID of the artist.
        access_token: Bearer token for the Spotify Web API.
        country: Country code sent as the ``country`` query parameter;
            defaults to ``'US'``, the value this notebook always used.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The ``tracks`` list from the JSON response, or ``None`` when the API
        answers with a non-200 status (the status and body are printed).
    """
    # Spotify API endpoint to get an artist's top tracks
    top_tracks_url = f'https://api.spotify.com/v1/artists/{artist_id}/top-tracks'

    # Headers with Authorization Bearer token
    headers = {
        'Authorization': 'Bearer ' + access_token
    }

    # NOTE(review): the current Spotify reference appears to name this query
    # parameter `market`; `country` is kept because it is what the notebook
    # has been sending -- confirm against the API reference.
    params = {
        'country': country
    }

    # Make the GET request to Spotify API endpoint
    response = requests.get(top_tracks_url, headers=headers, params=params, timeout=timeout)

    if response.status_code != 200:
        print(f"Error: {response.status_code}, {response.text}")
        return None

    return response.json()['tracks']

In [7]:
def fetch_artist_data(top_artist_ids, access_token, delay_seconds=2, timeout=10):
    """Collect profile data and top-track names for a sequence of artist IDs.

    Args:
        top_artist_ids: Iterable of Spotify artist IDs. Missing entries
            (``None``/NaN) are skipped without contacting the API.
        access_token: Bearer token for the Spotify Web API.
        delay_seconds: Pause after each artist request to avoid rate
            limiting; defaults to the 2 seconds this notebook always used.
        timeout: Seconds to wait for each artist request before giving up.

    Returns:
        A DataFrame with the columns ``Artist Name``, ``Followers``,
        ``Genres``, ``Popularity`` and ``Top Tracks``. Artists whose request
        fails or whose top-track lookup returns nothing are left out. The
        columns are present even when no artist could be fetched.
    """
    columns = ['Artist Name', 'Followers', 'Genres', 'Popularity', 'Top Tracks']
    # Same header for every request, so build it once
    headers = {
        'Authorization': 'Bearer ' + access_token
    }
    artists_data = []

    for artist_id in top_artist_ids:
        if pd.isna(artist_id):
            # No ID was resolved for this artist; requesting ".../artists/None"
            # would only waste a call and a delay
            continue

        # Get artist information
        artist_url = f'https://api.spotify.com/v1/artists/{artist_id}'
        response = requests.get(artist_url, headers=headers, timeout=timeout)

        if response.status_code == 200:
            artist_data = response.json()
            # Get top tracks for the artist
            top_tracks_data = get_top_tracks(artist_id, access_token)

            if top_tracks_data:
                # Append artist and tracks data to list
                artists_data.append({
                    'Artist Name': artist_data['name'],
                    'Followers': artist_data['followers']['total'],
                    'Genres': artist_data['genres'],
                    'Popularity': artist_data['popularity'],
                    'Top Tracks': [track['name'] for track in top_tracks_data]
                })
        else:
            # Report the failure instead of dropping the artist silently
            print(f"Error fetching artist {artist_id}: {response.status_code}")

        # Introduce a small delay to avoid rate limiting
        if delay_seconds > 0:
            time.sleep(delay_seconds)

    # Explicit columns keep the frame usable (e.g. sortable) when it is empty
    return pd.DataFrame(artists_data, columns=columns)

Combine all the functions into a single unified function:

In [8]:
def fetch_all_artist_data(url, client_id, client_secret):
    """Run the whole pipeline: scrape artists, resolve IDs, fetch Spotify data.

    Args:
        url: Address of the page that lists the top artists.
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.

    Returns:
        A DataFrame of artist data sorted by ``Popularity`` in descending
        order; the unsorted (empty) DataFrame when no artist data could be
        retrieved; or ``None`` when no access token could be obtained.
    """
    # Fetch top artists
    artists_df = fetch_top_artists(url)

    # Get Spotify access token
    access_token = get_spotify_access_token(client_id, client_secret)
    if not access_token:
        print("Failed to retrieve Access Token.")
        return None

    # Add artist IDs to DataFrame
    artists_df = add_artist_ids_to_dataframe(artists_df, access_token)

    # Artists the search could not resolve have no ID; do not query them
    resolved_ids = artists_df['artist_id'].dropna()

    # Fetch artist data
    artist_data_df = fetch_artist_data(resolved_ids, access_token)

    # Sorting a frame without a 'Popularity' column would raise KeyError
    if artist_data_df.empty or 'Popularity' not in artist_data_df.columns:
        print("No artist data could be retrieved.")
        return artist_data_df

    # Sort by 'Popularity' column in descending order
    return artist_data_df.sort_values(by='Popularity', ascending=False)

In [9]:
# Page that lists the top Spotify artists
url = 'https://musicpiechart.com/spotify-top-artists'

# Never hard-code API credentials in a notebook: read them from the environment
# (SPOTIFY_CLIENT_ID / SPOTIFY_CLIENT_SECRET) and fall back to a hidden prompt.
# NOTE: any secret that was ever committed to a repository should be revoked
# and regenerated in the Spotify developer dashboard.
client_id = os.environ.get('SPOTIFY_CLIENT_ID') or getpass('Spotify client ID: ')
client_secret = os.environ.get('SPOTIFY_CLIENT_SECRET') or getpass('Spotify client secret: ')

artist_data_df = fetch_all_artist_data(url, client_id, client_secret)
artist_data_df

Unnamed: 0,Artist Name,Followers,Genres,Popularity,Top Tracks
0,Taylor Swift,113723396,[pop],100,"[Fortnight (feat. Post Malone), Cruel Summer, ..."
1,Billie Eilish,95436354,"[art pop, pop]",94,"[BIRDS OF A FEATHER, LUNCH, CHIHIRO, WILDFLOWE..."
4,Drake,89864323,"[canadian hip hop, canadian pop, hip hop, pop ...",94,"[One Dance, Push Ups, Family Matters, Rich Bab..."
5,The Weeknd,85065701,"[canadian contemporary r&b, canadian pop, pop]",93,"[One Of The Girls (with JENNIE, Lily Rose Depp..."
3,Kendrick Lamar,30094573,"[conscious hip hop, hip hop, rap, west coast rap]",92,"[Not Like Us, Like That, euphoria, All The Sta..."
7,Travis Scott,30134768,"[rap, slap house]",91,"[FE!N (feat. Playboi Carti), Type Shit, gooseb..."
6,Kanye West,25714530,"[chicago rap, hip hop, rap]",90,"[CARNIVAL, Heartless, Flashing Lights, Father ..."
11,Ariana Grande,97112674,[pop],90,"[we can't be friends (wait for your love), the..."
2,Lana Del Rey,36156029,"[art pop, pop]",89,"[Summertime Sadness, Young And Beautiful, Say ..."
19,Olivia Rodrigo,36938336,[pop],87,"[vampire, obsessed, traitor, deja vu, drivers ..."


Downloading the DataFrame to our computer as a CSV file:

In [None]:
# Save to CSV and hand the file to the browser through Colab's download helper
csv_file_path = "artists_data.csv"
if artist_data_df is None:
    # fetch_all_artist_data returns None when no access token could be
    # obtained; calling .to_csv on it would raise AttributeError
    print("No artist data available; nothing to download.")
else:
    artist_data_df.to_csv(csv_file_path, index=False)
    files.download(csv_file_path)
    print(f"The file '{csv_file_path}' has been downloaded.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

The file 'artists_data.csv' has been downloaded.
