# Dataset Creation

For the sake of notation, Main_Artists are the artists found on the kworb website, whereas Other_Artists are artists that appear on Spotify but do not appear on that website.

In [11]:
import pandas as pd
import sys
import numpy as np
import os
import requests
import datetime
import base64
import re
from collections import Counter
import psycopg2
import tqdm
from bs4 import BeautifulSoup

In [12]:
# We create a utils file with a class to store the basic commands and client ID for the Spotify API
from aux_utils import spotify_utils

#### Spotify API connection

In [27]:
# Instantiate the helper class imported from aux_utils (per the import cell, it stores
# the basic commands and the client ID for the Spotify API)
sp_ut = spotify_utils()

# Use the connect method to connect with the credentials; `sp` is the object on which
# the API calls in the following cells are made
sp = sp_ut.connect()

In [28]:
# Quick check of the connection with a free-text search; the response is a nested dict keyed by 'tracks'
sp.search("Ramin Djawadi")

{'tracks': {'href': 'https://api.spotify.com/v1/search?query=Ramin+Djawadi&type=track&offset=0&limit=10',
  'items': [{'album': {'album_type': 'album',
     'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/1hCkSJcXREhrodeIHQdav8'},
       'href': 'https://api.spotify.com/v1/artists/1hCkSJcXREhrodeIHQdav8',
       'id': '1hCkSJcXREhrodeIHQdav8',
       'name': 'Ramin Djawadi',
       'type': 'artist',
       'uri': 'spotify:artist:1hCkSJcXREhrodeIHQdav8'}],
     'available_markets': ['CA', 'US'],
     'external_urls': {'spotify': 'https://open.spotify.com/album/2Ia3TzAjzWw0Tdspy9Fwgg'},
     'href': 'https://api.spotify.com/v1/albums/2Ia3TzAjzWw0Tdspy9Fwgg',
     'id': '2Ia3TzAjzWw0Tdspy9Fwgg',
     'images': [{'height': 640,
       'url': 'https://i.scdn.co/image/ab67616d0000b273e4f028aed6760c0c7f1eb0ca',
       'width': 640},
      {'height': 300,
       'url': 'https://i.scdn.co/image/ab67616d00001e02e4f028aed6760c0c7f1eb0ca',
       'width': 300},
      {'h

In [8]:
# Fetch the artists related to a given Spotify artist ID; the response is a dict with an 'artists' list
sp.artist_related_artists("6XYRpcgPIK9OejoVzA7PbC")

{'artists': [{'external_urls': {'spotify': 'https://open.spotify.com/artist/3eAm5IYwnH7uTX5EBT9sbi'},
   'followers': {'href': None, 'total': 19885},
   'genres': ['cantautor',
    'catalan folk',
    'indie catala',
    'rock catala',
    'rumba',
    'rumba catalana'],
   'href': 'https://api.spotify.com/v1/artists/3eAm5IYwnH7uTX5EBT9sbi',
   'id': '3eAm5IYwnH7uTX5EBT9sbi',
   'images': [{'height': 640,
     'url': 'https://i.scdn.co/image/eca25266e9888a095df408c60d35029c5fac083e',
     'width': 640},
    {'height': 320,
     'url': 'https://i.scdn.co/image/08102a8884ff955a7488a9d7a5cbc084d65dfd7b',
     'width': 320},
    {'height': 160,
     'url': 'https://i.scdn.co/image/3cb51dd489be53c7ef41605b72e4ecf66d28d722',
     'width': 160}],
   'name': 'Gertrudis',
   'popularity': 47,
   'type': 'artist',
   'uri': 'spotify:artist:3eAm5IYwnH7uTX5EBT9sbi'},
  {'external_urls': {'spotify': 'https://open.spotify.com/artist/0OVfimlbekXaMrP8IoCzHJ'},
   'followers': {'href': None, 'total': 5

# 1. Dataset of Global Artists (Main Artists)

Dataset downloaded from the html page: https://kworb.net/spotify/artists.html

In [15]:
def get_html(url, timeout=30):
    """
    Download a web page and parse it with Beautiful Soup.

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    bs4.BeautifulSoup
        The parsed document, built with the 'html.parser' backend.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not answer within `timeout` seconds.
    """
    # Without an explicit timeout, requests waits indefinitely on an unresponsive server
    response = requests.get(url, timeout=timeout)

    # Fail loudly on an error response instead of silently parsing an error page
    response.raise_for_status()

    # Decode the body as UTF-8 regardless of what the response headers announce
    response.encoding = 'utf-8'

    # Read the html text with Beautiful Soup
    return BeautifulSoup(response.text, 'html.parser')

In [16]:
def get_dataset():
    """
    Scrape the table of global Spotify artists published at
    https://kworb.net/spotify/artists.html.

    Returns a DataFrame with the columns "position", "artist" and "streams",
    holding the cell texts of every table row that has exactly three <td> cells.
    """
    page = get_html("https://kworb.net/spotify/artists.html")

    # The first <table> of the page is the ranking of artists sorted by streams
    ranking_table = page.find('table')
    ranking_rows = ranking_table.find_all("tr")

    # Keep the text of the cells for the rows made of exactly three <td> cells;
    # any other row is ignored
    records = []
    for ranking_row in tqdm.tqdm(ranking_rows):
        cell_texts = [cell.text for cell in ranking_row.find_all('td')]
        if len(cell_texts) == 3:
            records.append(cell_texts)

    # One DataFrame row per kept table row
    return pd.DataFrame(records, columns=["position", "artist", "streams"])

In [17]:
# Scrape the global ranking into a DataFrame with the columns position, artist and streams
df = get_dataset()

100%|██████████| 10001/10001 [00:00<00:00, 69219.63it/s]


In [18]:
# Preview the first five rows of the scraped ranking
df.head(5)

Unnamed: 0,position,artist,streams
0,1,Drake,19730111322
1,2,Post Malone,14381513844
2,3,Ed Sheeran,14038325763
3,4,J Balvin,13123142801
4,5,Justin Bieber,11891559063


In [19]:
# Start the final list of artists with the lower-cased names of the global ranking
final_list_artists = df["artist"].str.lower().tolist()

In [20]:
# Others: names that do not appear on the kworb page (artist names are appended to this list further down)
other_list_artists = []

# 2. Top Artists in Spain (Main Artists)

Artists that have a famous song among the top 3000 songs of Spain

In [21]:
def get_dataset_spain():
    """
    Scrape the artist names from kworb's Spanish weekly totals page
    (https://kworb.net/spotify/country/es_weekly_totals.html).

    Returns
    -------
    pandas.DataFrame
        A single-column frame ("artist") with one entry per table row that
        contains a link. Rows with class "d1" come first, followed by the rows
        with class "d0". The same artist can appear in several rows.
    """
    soup = get_html("https://kworb.net/spotify/country/es_weekly_totals.html")

    # The data rows carry the class "d1" or "d0"; gather both groups into one list
    rows = [
        *soup.find_all('tr', {"class": "d1"}),
        *soup.find_all('tr', {"class": "d0"}),
    ]

    # The first <a href=...> that appears in a row is the name of the artist
    artists = []
    for row in tqdm.tqdm(rows):
        link = row.find("a")
        # find() returns None when the row has no link; skip such rows instead of
        # failing with AttributeError on `None.text`
        if link is None:
            continue
        artists.append(link.text)

    return pd.DataFrame(artists, columns=["artist"])
    

In [22]:
# Scrape the artist names from the Spanish weekly totals page into a one-column DataFrame
df_spain = get_dataset_spain()

100%|██████████| 3615/3615 [00:00<00:00, 46775.47it/s]


We will compare the artists from the Global charts with those from the Top Spanish charts in order to avoid adding the same artist twice. Only the artists that are in the Spanish list but not in the Global list will be added to the final list of artists.

In [23]:
# Avoid adding repeated artists: compare both sources on their lower-cased names
set_global = set(df.artist.str.lower())
set_spain = set(df_spain.artist.str.lower())

# Artists present in both rankings (not used in the cells shown here)
inters = set_global & set_spain

# Artists found only in the Spanish ranking. Sorted because the iteration order of a
# set of strings can differ between interpreter runs, which would otherwise change
# the order of the final list of artists from one kernel restart to the next.
except_art = sorted(set_spain - set_global)

In [24]:
# Number of artists that appear in the Spanish list but not in the global one
len(except_art)

120

In [25]:
# Final list of artists: add the Spain-only artists. The membership test keeps this
# cell idempotent, so running it again does not append the same artists a second time.
for art in except_art:
    if art not in final_list_artists:
        final_list_artists.append(art)

# 3. Also known Spanish artists (Other artists)

Artists that are not on kworb.net will be looked up with the Spotify API, so we will store them separately:

In [26]:
# Other Spanish artists, stored lower-cased like the rest of the artist lists
lista_otras_esp = ["Mägo de Oz", "La Casa Azul", "Amaral", "M-Clan"]
lista_otras_esp = [xx.lower() for xx in lista_otras_esp]

# Add them to the list of other artists. The membership test keeps this cell
# idempotent, so running it again does not append the same artists a second time.
for art in lista_otras_esp:
    if art not in other_list_artists:
        other_list_artists.append(art)

## 3.1 Top Artists in Catalonia (Other Artists) 

These artists do not appear on kworb.net, so we will get them via the Spotify API:

In [177]:
# Catalan artists, stored lower-cased in the order they are written here
lista_cat = [
    name.lower()
    for name in (
        "Txarango",
        "Oques Grasses",
        "La Pegatina",
        "Buhos",
        "Doctor Prats",
        "Els Catarres",
        "Els Amics De Les Arts",
        "Gertrudis",
        "Gossos",
        "Sopa de Cabra",
        "Lax'n'Busto",
        "Vuit",
        "Suu",
        "Teràpia de Shock",
        "Els Pets",
        "Manel",
        "Blaumut",
        "Lluis Llach",
        "Sau",
        "Obrint Pas",
        "Clara Olóndriz",
        "Stay Homas",
        "Joan Dausà",
        "Joan Manuel Serrat",
        "Cesk Freixas",
        "Roba Estesa",
    )
]

In [183]:
# Add the Catalan artists to the list of other artists. The membership test keeps this
# cell idempotent, so running it again does not append the same artists a second time.
for art in lista_cat:
    if art not in other_list_artists:
        other_list_artists.append(art)

# 4. Spotify API

We will search for the artist_id and the tracks of the artists in *other_list_artists*, which are popular artists that were not found in the kworb dataset that we will use:

In [184]:
# Show the artists that will be looked up through the Spotify API
other_list_artists

['mägo de oz',
 'la casa azul',
 'amaral',
 'm-clan',
 'txarango',
 'oques grasses',
 'la pegatina',
 'buhos',
 'doctor prats',
 'els catarres',
 'els amics de les arts',
 'gertrudis',
 'gossos',
 'sopa de cabra',
 "lax'n'busto",
 'vuit',
 'suu',
 'teràpia de shock',
 'els pets',
 'manel',
 'blaumut',
 'lluis llach',
 'sau',
 'obrint pas',
 'clara olóndriz',
 'stay homas',
 'joan dausà',
 'joan manuel serrat',
 'cesk freixas',
 'roba estesa']

In [180]:
# NOTE(review): the saved output of this cell (8) comes from an earlier execution (In [180]),
# before lista_cat was appended (In [183]); it does not match the 30 entries displayed by the
# previous cell. Re-run the notebook from a fresh kernel to refresh it.
len(other_list_artists)

8