# 03 Rel Artist Track

Put the relationship between a song and the artists they feature the song

In [12]:
import pandas as pd
import sys
import numpy as np
import os
import requests
import datetime
import base64
import json
import re
from collections import Counter
import psycopg2
import tqdm
from bs4 import BeautifulSoup
from importlib import reload
pd.options.mode.chained_assignment = None  # default='warn'
import sys

In [13]:
# We create a utils file with a class to store the basic commands and client ID for the Spotify API
import aux_utils
#reload(aux_utils)

In [14]:
# We have created a list of functions in db_utils.py
import db_utils
#reload(db_utils)

### Spotify API Connection

In [15]:
# Instantiate the class
spotify_utils = aux_utils.spotify_utils
sp_ut = spotify_utils()

# Use the method connect to connect with the credentials
sp = sp_ut.connect()

### Database connection 

In [16]:
conn = psycopg2.connect("dbname=spotify")
cursor= conn.cursor()

In [17]:
# Extrack all the artists
df_artist = db_utils.select_table(cursor, table="master_artist", sel_cols=["artist_id", "is_main"])

# Extrack all the tracks
df_tracks = db_utils.select_table(cursor, table="master_track", sel_cols=["track_id", "streams"])

In [18]:
# Set of artists MAIN
mask = df_artist["is_main"]
df_main_artists = df_artist[mask] #main artists in the kworb dataset
set_artists_main = set(df_main_artists["artist_id"])

# Set of artists OTHER (from our list)
df_other_artists = df_artist[~mask]
set_artists_other = set(df_other_artists["artist_id"])

In [19]:
# Set of tracks MAIN
mask = df_tracks["streams"] > 0
df_tracks_main = df_tracks[mask]
set_tracks_main = set(df_tracks_main["track_id"])

# Set of artists OTHER
df_tracks_other = df_tracks[~mask]
set_tracks_other = set(df_tracks_other["track_id"])

# 1. MAIN Artists-Tracks: featuring artists

In [20]:
def art_url(artist_id):
    return f"https://kworb.net/spotify/artist/{artist_id}.html"

def get_html(url):
    """
    Searches a url and retrieves an html and returns the object soup from bs4
    """
    # Requests the html of this page
    r = requests.get(url)
    r.encoding = 'utf-8'
    
    # Read the html text with Beautiful soup
    soup = BeautifulSoup(r.text, 'html.parser')
    
    return soup

def get_table_artist_tracks(artist_id):
    """
    Given an aritst_id it looks for the kworb web page for that artists and a table of its songs is parsed
    and converted to a dataframe
    """
    
    # Get the html
    soup = get_html(art_url(artist_id))

    # Find the table where there are the artists sorted by streams
    table = soup.find('table')

    # Convert to dataframe
    df = pd.read_html(str(table))[0]
    df['Peak Date'] = df['Peak Date'].apply(lambda x: pd.to_datetime(str(x), format='%Y%m%d'))
    df["Peak Date"] = df["Peak Date"].astype(str) # Convert it to string to push it into PostGRESQL as date
    
    # Impose Track column to be a string
    df["Track"] = df["Track"].astype(str)
    df["Track"] = df["Track"].str.lower()
    return table, df

In [21]:
# Dictionary to store for each artist, each song, the featuring artists
dict_main = {}

In [22]:
for i, row in tqdm.tqdm_notebook(df_main_artists.iterrows()):
    
    # Get the artist ID
    artist_id = row.artist_id
    
    # Create a dictionary for that artist
    dict_main[artist_id] = {}
    
    # Get the <tracks of that artist
    try:
        table, df = get_table_artist_tracks(artist_id)
    except:
        continue
    
    # For every row
    for tr in table.findAll("tr"):
        
       # Get the cells 
        rows_cells = tr.findAll("td")
        
        # Flag if song is in the master_track
        in_master_track = True # as default is true
        
        # For each cell
        for each_cell in rows_cells:
            
            # Check where the hrefs for the track and feat artists appear
            get_all_links = each_cell.findAll("a")
            
            # Loop through the links and differentiate if it is a link of the track id or a link of the artist feat
            for href_element in get_all_links:
                
                #Href link
                link = href_element["href"]
                
                # Text in the link (if the link is for track, it is the track_name, if not, is the artist_name)
                txt = href_element.text.lower()
                txt = txt.replace("'", " ")
                
                # If the link is for a cell to inform the track
                if '/track/' in link: 

                    # Track ID
                    track_id  = link.split("/")[-1].split(".")[0] 
                    
                    # Check if the track_ID is in our database
                    if track_id in set_tracks_main:
                        
                        # Add this track to the dictionary
                        dict_main[artist_id][track_id] = []
                        
                        
                    else: # if the track is NOT in our database
                        
                        in_master_track = False # change the Flag
                        break

                elif link: # there is a link but it is not the track link, hence, it is the feat artist link

                    # If the link is for the feat. artist
                    feat_artist = link.split(".")[0] 
                    
                    # If the artist is in the set_artist:
                    if feat_artist in set_artists_main or feat_artist in set_artists_other:
                        dict_main[artist_id][track_id].append(feat_artist)
            
            # If the flag is False, it means this song is not in our dataset
            if in_master_track is False:
                break

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """Entry point for launching an IPython kernel.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




ImportError: html5lib not found, please install it

In [35]:
# Create the values list
values = []
for artist_id in tqdm.tqdm_notebook(dict_main):
    for track_id in dict_main[artist_id]:
        # Add that artist-track in values
        values.append((artist_id, track_id))
        
        # Add all the featuring artists as artists for that track too
        for feat_art_id in dict_main[artist_id][track_id]:
            values.append((feat_art_id, track_id))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=10010.0), HTML(value='')))




In [37]:
query = cursor.executemany("""insert into REL_ARTIST_TRACK VALUES (%s, %s) ON CONFLICT DO NOTHING; """ , values)
conn.commit()

# 2. OTHER Artists-Tracks: featuring artists

In [49]:
values = []

for i,row in tqdm.tqdm(df_other_artists.iterrows()):
    artist_id = row["artist_id"]
    
    # Query the artist in Spotify
    lz_uri = f'spotify:artist:{artist_id}'
    results = sp.artist_top_tracks(lz_uri)
    
    # Parse the results

    list_tracks = results["tracks"]
    for track in list_tracks:
        
        # Track ID
        track_id = track["id"]
        if track_id in set_tracks_other:
        
            # Get all artists of that song
            for art in track["artists"]:
                artist_id = art["id"]
                if artist_id in set_artists_main or artist_id in set_artists_other:
                    values.append((artist_id, track_id))

25it [00:01, 18.78it/s]


In [58]:
# Avoid duplicates
values = set(values)
values = list(values)

In [62]:
query = cursor.executemany("""insert into REL_ARTIST_TRACK VALUES (%s, %s) ON CONFLICT DO NOTHING; """ , values)
conn.commit()