# Master Artist Creation

For the sake of notation, Main_Artists are the ones found in the kworb website, whereas Other_Artists are other artists that appear on Spotify but that do not appear in that webiste.

In [368]:
import pandas as pd
import sys
import numpy as np
import os
import requests
import datetime
import base64
import json
import re
from collections import Counter
import psycopg2
import tqdm
from bs4 import BeautifulSoup
from importlib import reload

In [369]:
# We create a utils file with a class to store the basic commands and client ID for the Spotify API
import aux_utils
#reload(aux_utils)

In [370]:
# We have created a list of functions in db_utils.py
import db_utils
#reload(db_utils)

#### Spotify API connection

In [371]:
# Instantiate the class
spotify_utils = aux_utils.spotify_utils
sp_ut = spotify_utils()

# Use the method connect to connect with the credentials
sp = sp_ut.connect()

# 1. Dataset of Global Artists (Main Artists)

Dataset downloaded from the html page: https://kworb.net/spotify/artists.html

In [10]:
URL_GLOBAL = "https://kworb.net/spotify/artists.html"

In [9]:
def get_html(url):
    """
    Searches a url and retrieves an html and returns the object soup from bs4
    """
    # Requests the html of this page
    r = requests.get(url)
    r.encoding = 'utf-8'
    
    # Read the html text with Beautiful soup
    soup = BeautifulSoup(r.text, 'html.parser')
    
    return soup

In [42]:
# Get the rows
soup = get_html("https://kworb.net/spotify/artists.html")

# Find the table where there are the artists sorted by streams
table = soup.find('table')

# Get the rows
table_rows = table.find_all("tr")

# For each row, append to a list the three columns of that table in a sublist
list_of_artists = []

for tr in tqdm.tqdm(table_rows):
    
    # Parse HTML row
    td = tr.find_all('td') 
    row = [tr.text for tr in td] # get the row 
    
    
    if len(row)==3: # if it is not the header
        
        # Get artists name
        a_name = row[1] # second column of the table
        
        # Get artist artist_id
        a_id = tr.find("a").get("href").split("/")[1].split(".")[0] # split artist/<ID>.html
        
        # Append to a list of list 
        list_of_artists.append([a_id, a_name])

100%|██████████| 10001/10001 [00:00<00:00, 29841.26it/s]


In [43]:
df_global_artist = pd.DataFrame(list_of_artists, columns=["artist_id", "artist_name"])

In [44]:
df_global_artist.head(5)

Unnamed: 0,artist_id,artist_name
0,3TVXtAsR1Inumwj472S9r4,Drake
1,246dkjvS1zLTtiykXe5h60,Post Malone
2,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran
3,1vyhD5VmyZ7KMfW5gqLgo5,J Balvin
4,1uNFoZAHBGtllmzznpCI3s,Justin Bieber


# 2. Top Artists in Spain (Main Artists)

Artists that have a famous song among the top 3000 songs of Spain

In [68]:
URL_SPAIN = "https://kworb.net/spotify/country/es_weekly_totals.html"

soup = get_html(URL_SPAIN)

# Find the table where there are the artists sorted by streams
table_d1 = soup.find_all('tr', {"class": "d1"})
table_d0 = soup.find_all('tr', {"class": "d0"})

# Append all the results into a single table
table = []
for ss in table_d1:
    table.append(ss)
for ss in table_d0:
    table.append(ss)

# The first <a href=...> that appears is the name of the artist
list_spanish_artists = []

In [69]:
for tr in tqdm.tqdm(table):
    
    # Name 
    get_link = tr.find("a")
    a_name = get_link.text
    
    # Artist ID 
    a_id = get_link.get("href").split("/")[2].split(".")[0] # split artist/<ID>.html
    
    # Append to the list
    list_spanish_artists.append([a_id, a_name])

100%|██████████| 3615/3615 [00:00<00:00, 42499.00it/s]


In [70]:
df_spanish_artists = pd.DataFrame(list_spanish_artists, columns=["artist_id", "artist_name"])

In [357]:
df_spanish_artists.head()

Unnamed: 0,artist_id,artist_name
0,5H1nN1SzW0qNeUEZvuXjAj,Danny Ocean
1,6vPXtLmNd3mW0dd1Rb9eQ9,Don Patricio
2,2R21vXR83lH98kGeO99Y66,Anuel Aa
3,4VMYDCV2IEDYJArk749S6m,Daddy Yankee
4,790FomKkXshlbRYZFtlgla,Karol G


We will make the intersection of artists from the Global charts and the Top Spanish charts in order to avoid replicating the same artist twice. We will add those artists that are in Spanish list but are not in Global to the list of artists.

In [75]:
# Merge both dataframes and keep only the unique artist_id
frames = [df_global_artist, df_spanish_artists]
df_main = pd.concat(frames)
df_main.drop_duplicates(inplace=True)

In [359]:
df_main.shape

(10121, 3)

In [77]:
# Check that all artist_id are uniques
len(set(df_main.artist_id))

10121

# 3. Also known artists (Other artists)

In [78]:
other_list_artists = []

## 3.1 Spanish Artists

Songs that we not in the kworb.net, will be investigated with the Spotify API. So we will store them separately:

In [79]:
# Otras Españolas
lista_otras_esp = ["Mägo de Oz", "La Casa Azul", "Amaral", "M-Clan"]
lista_otras_esp = [xx.lower() for xx in lista_otras_esp]

# Final list of artists
for art in lista_otras_esp:
    other_list_artists.append(art)

## 3.2 Catalan Artists

These artists do not appear in the kworb.net so we will get them via Spotify API:

In [308]:
lista_cat = [
    "Oques Grasses",
    "La Pegatina",
    "Buhos",
    "Doctor Prats",
    "Els Catarres",
    "Els Amics De Les Arts",
    "Gertrudis",
    "Gossos",
    "Sopa de Cabra",
    "Lax'n'Busto",
    "Vuit",
    "Suu",
    "Teràpia de Shock",
    "Els Pets",
    "Manel",
    "Blaumut",
    "Lluis Llach",
    "Sau",
    "Obrint Pas",
    "Clara Olóndriz",
    "Stay Homas",
    "Joan Dausà",
    "Joan Manuel Serrat",
    "Cesk Freixas",
    "Roba Estesa"
    
]

lista_cat = [xx.lower() for xx in lista_cat]

In [309]:
# Final list of artists
for art in lista_cat:
    other_list_artists.append(art)

# 4. Spotify API

We will search for the artist_id and the tracks of the *other_list_artists* which were artists popular but not found in the dataset that we will exploit:

In [310]:
other_list_artists_found = []

# Do a for loop to search these artists in Spotify
for oth_art in tqdm.tqdm_notebook(other_list_artists):

    # Do the query to Spotify API
    dict_art = sp.search(oth_art)
    
    # Get all the artists that the Spotify API retrieves as answer to the query
    artists_found = []
    artist_id_found = []
    if dict_art.get("tracks"):
        tracks = dict_art["tracks"]
        if tracks.get("items"):
            items = tracks["items"]
            for item in items:
                if item.get("artists"):
                    artists = item["artists"]
                    for art in artists:
                        if art.get("name") and art.get("id"):
                            artists_found.append(art["name"])
                            artist_id_found.append(art["id"])
                            
    # For each artists, compare if the name is the same or very similar to the queried one: oth_art
    for art_found, art_id_found in zip(artists_found, artist_id_found):
        
        # String similarity using Levenshtein distance between the artist found in the API and the queried artist
        string_similarity = sp_ut.levenshtein(art_found.lower(), oth_art.lower())
        
        # Check if the found artist is written very similar
        if string_similarity < 0.1:
            
            # Append the matching artist and id to the list and stop searching for more artists
            other_list_artists_found.append([art_id_found, art_found])
            break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=55.0), HTML(value='')))




In [311]:
# Creates a dataframe to store the artists found
df_other = pd.DataFrame(other_list_artists_found, columns = ["artist_id", "artist_name"])

In [312]:
df_other.head()

Unnamed: 0,artist_id,artist_name
0,5ZNxiPcbKgaNcBrERMpqeu,Mägo de Oz
1,2Ynst7DZrEJnlaMM41ZCxd,La Casa Azul
2,4OkeTQCk0fvX6VBYpOOxDi,Amaral
3,4oVqMVHC03xnYJ7fyb1dh6,M-Clan
4,6XYRpcgPIK9OejoVzA7PbC,Txarango


# 5. Join all dataframes

Join the df_main and the df_other adding the column "is_main" to indicate if the artists has been retrieved from the kworb dataset (is_main = 1) or the Spotify API search (is_main = 0):

In [364]:
df_main["is_main"] = 1
df_other["is_main"] = 0

frames = [df_main, df_other]

df = pd.concat(frames)
df.drop_duplicates(inplace=True)

In [365]:
df.shape

(10147, 3)

# 6. Upload to Database: Spotify (PostgreSQL)

In the terminal, put "psql_spotify" to access PostgreSQL

### Get the maximum string size

For each column, see the maximum size to guide the creation of the table in PostgreSQL

In [321]:
# connect to PostGreSQL
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [322]:
# Prepare values of the query
values = []
for i, row in df.iterrows():

    # The boolean type in Postgresql can be a string of 0 or 1
    boolean = '1'
    if row["is_main"] == 0:
        boolean = '0'
        
    # Get the strings of the two other fields
    artist_id = row["artist_id"]
    artist_name = row["artist_name"]
    
    # Avoid problems with single quotes
    artist_name = artist_name.replace("'", " ")
    
    # Append to the list of tuples: values
    values.append((artist_id, artist_name, boolean))

In [323]:
query = cursor.executemany("""insert into master_artist VALUES (%s, %s, %s) ON CONFLICT DO NOTHING; """ , values)
conn.commit()

## 6.2 Check contents

In [361]:
ss = db_utils.select_table(cursor, direct_query = """ SELECT COUNT(*) FROM master_artist""")

In [362]:
ss

Unnamed: 0,0
0,10146


## Tests 

In [374]:
sp_ut.save_as_json(sp.search("Ramin Djawadi"), "../data/RaminDjawadi.json")

In [375]:
sp_ut.save_as_json(sp.artist_related_artists("6XYRpcgPIK9OejoVzA7PbC"), "../data/SimilarArtists.json")