In [15]:
import pandas as pd
import sys
import numpy as np
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data
import requests
import datetime
import base64
import re
from collections import Counter
import psycopg2
import tqdm
from bs4 import BeautifulSoup

# 1. Dataset of Global Artists

Dataset scraped from the HTML page https://kworb.net/spotify/artists.html

In [110]:
def get_html(url, timeout=30):
    """
    Fetch a web page and parse it into a BeautifulSoup object.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up; forwarded unchanged
        to ``requests.get``. Defaults to 30.

    Returns
    -------
    bs4.BeautifulSoup
        The response body parsed with the ``html.parser`` backend.

    Raises
    ------
    requests.exceptions.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.exceptions.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    # Without an explicit timeout requests waits forever, which would freeze
    # the notebook kernel on a stalled connection.
    r = requests.get(url, timeout=timeout)

    # Fail loudly on an error response instead of silently parsing an error page.
    r.raise_for_status()

    # Decode the body as UTF-8 when building r.text.
    r.encoding = 'utf-8'

    # Read the html text with Beautiful soup
    return BeautifulSoup(r.text, 'html.parser')

In [111]:
def get_dataset():
    """
    Scrape the global Spotify artists table from
    https://kworb.net/spotify/artists.html.

    Returns
    -------
    pandas.DataFrame
        One row per table row that has exactly three cells, with the columns
        ``position``, ``artist`` and ``streams`` holding the cell texts.
    """
    # Download and parse the page
    page = get_html("https://kworb.net/spotify/artists.html")

    # The first <table> on the page is the one listing the artists
    artist_table = page.find('table')
    all_rows = artist_table.find_all("tr")

    records = []
    for row_tag in tqdm.tqdm(all_rows):
        cells = [cell.text for cell in row_tag.find_all('td')]
        # Rows that do not carry exactly three <td> cells are skipped
        if len(cells) != 3:
            continue
        records.append(cells)

    # Build the dataframe in one go from the collected rows
    return pd.DataFrame(records, columns=["position", "artist", "streams"])

In [112]:
# Scrape the kworb.net artists table into a DataFrame (triggers a network request)
df = get_dataset()

100%|██████████| 10001/10001 [00:00<00:00, 73784.13it/s]


In [113]:
# Preview the first five rows of the scraped table
df.head(5)

Unnamed: 0,position,artist,streams
0,1,Drake,19730111322
1,2,Post Malone,14381513844
2,3,Ed Sheeran,14038325763
3,4,J Balvin,13123142801
4,5,Justin Bieber,11891559063


# 2. Top Artists in Spain

Artists who have a song among the top 3000 songs in Spain

In [115]:
def get_dataset():
    """
    Download the artists listed on
    https://kworb.net/spotify/country/es_weekly_totals.html (Spain weekly totals).

    Every ``<tr>`` with CSS class ``d1`` or ``d0`` is collected, and the text of
    the first link in each row is taken as the artist name.

    NOTE(review): this reuses the name ``get_dataset`` of the global-artists
    scraper defined earlier in the notebook, so running this cell replaces
    that function. Consider giving it a distinct name once callers can be
    updated.

    Returns
    -------
    pandas.DataFrame
        A single ``artist`` column; names from ``d1`` rows come first, followed
        by those from ``d0`` rows. Duplicates are not removed.
    """

    soup = get_html("https://kworb.net/spotify/country/es_weekly_totals.html")

    # The data rows carry either the "d1" or the "d0" CSS class
    table_d1 = soup.find_all('tr', {"class": "d1"})
    table_d0 = soup.find_all('tr', {"class": "d0"})

    # Merge both result sets into a single list of rows (d1 rows first)
    table = list(table_d1) + list(table_d0)

    # The first <a href=...> that appears is the name of the artist
    artists = []
    for tr in tqdm.tqdm(table):
        link = tr.find("a")
        # A row without any link has no artist name to extract; skip it
        # rather than failing on None.
        if link is None:
            continue
        artists.append(link.text)

    df_art = pd.DataFrame(artists, columns=["artist"])

    # The original built the frame but never handed it back to the caller
    return df_art

100%|██████████| 3615/3615 [00:00<00:00, 45658.22it/s]
