# Dev 3: Pipeline calling the Spotify API

This notebook implements the following steps:

- Spotipy credentials to connect to the Spotify API
- An API call to retrieve an artist
- A string-similarity check between the query and the artist names returned by the API


In [2]:
import base64
import datetime
import os
import re
import sys
from collections import Counter

import numpy as np
import pandas as pd
import psycopg2
import requests
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data

### Dataframe

In [3]:
# Explicit dtypes handed to pd.read_csv: the text columns stay as object,
# the two Last.fm counters are floats and the ambiguity flag is a bool.
types_cols = dict.fromkeys(
    [
        "mbid",
        "artist_mb",
        "artist_lastfm",
        "country_mb",
        "country_lastfm",
        "tags_mb",
        "tags_lastfm",
    ],
    object,
)
types_cols.update(
    listeners_lastfm=np.float64,
    scrobbles_lastfm=np.float64,
    ambiguous_artist=bool,
)

In [2]:
# Load the artists dataset, applying the explicit column dtypes declared in types_cols
df = pd.read_csv("../data/artists.csv", dtype=types_cols)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Order the artists from most to fewest Last.fm listeners
df = df.sort_values(by="listeners_lastfm", ascending=False)

In [4]:
# Preview the first rows of the sorted frame
df.head()

Unnamed: 0,mbid,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist
0,cc197bad-dc9c-440d-a5b5-d52ba2e14234,Coldplay,Coldplay,United Kingdom,United Kingdom,rock; pop; alternative rock; british; uk; brit...,rock; alternative; britpop; alternative rock; ...,5381567.0,360111850.0,False
1,a74b1b7f-71a5-4011-9441-d0b5e4122711,Radiohead,Radiohead,United Kingdom,United Kingdom,rock; electronic; alternative rock; british; g...,alternative; alternative rock; rock; indie; el...,4732528.0,499548797.0,False
2,8bfac288-ccc5-448d-9573-c33ea2aa5c30,Red Hot Chili Peppers,Red Hot Chili Peppers,United States,United States,rock; alternative rock; 80s; 90s; rap; metal; ...,rock; alternative rock; alternative; Funk Rock...,4620835.0,293784041.0,False
3,73e5e69d-3554-40d8-8516-00cb38737a1c,Rihanna,Rihanna,United States,Barbados; United States,pop; dance; hip hop; reggae; contemporary r b;...,pop; rnb; female vocalists; dance; Hip-Hop; Ri...,4558193.0,199248986.0,False
4,b95ce3ff-3d05-4e87-9e01-c97b66af13d4,Eminem,Eminem,United States,United States,turkish; rap; american; hip-hop; hip hop; hiph...,rap; Hip-Hop; Eminem; hip hop; pop; american; ...,4517997.0,199507511.0,False


In [5]:
# NOTE(review): .loc slices by index label, not by position, and df has already been
# re-ordered by listeners above — confirm this selects the rows intended
# (df.iloc[:1000] would take the first 1000 rows of the sorted frame).
# df_sample is reassigned further down in the notebook.
df_sample = df.loc[1:1000]

In [20]:
# Inspect the row whose index label is 738
df.loc[738]

mbid                             4e045c96-538b-46ed-8ea8-7cae20b56574
artist_mb                                                   The Vines
artist_lastfm                                               The Vines
country_mb                                                  Australia
country_lastfm                                                    NaN
tags_mb             rock; alternative rock; australian; indie rock...
tags_lastfm         rock; indie; indie rock; alternative; alternat...
listeners_lastfm                                               949278
scrobbles_lastfm                                          1.48536e+07
ambiguous_artist                                                False
Name: 738, dtype: object

In [24]:
# Case-insensitive lookup of one artist through the artist_mb column
df[df["artist_mb"].str.lower() == "you me at six"]

Unnamed: 0,mbid,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist
1281,fb4fc5a6-cbe1-45ff-954c-ef45faceb54d,You Me at Six,You Me at Six,United Kingdom,,,pop punk; rock; seen live; pop rock; powerpop,648189.0,28275653.0,False


In [66]:
# Case-insensitive lookup of one artist through the artist_mb column
# (the same name can match several rows when it is stored with different casing)
df[df["artist_mb"].str.lower() == "rosalía"]

Unnamed: 0,mbid,artist_mb,artist_lastfm,country_mb,country_lastfm,tags_mb,tags_lastfm,listeners_lastfm,scrobbles_lastfm,ambiguous_artist
11882,ac0ee862-a6ca-4d39-a7a4-d8460534ba30,Rosalía,Rosalía,Spain,Spain; United Kingdom,,Flamenco; pop; spanish; spain; Flamenco Nuevo;...,91555.0,3762940.0,False
1461688,25f3abd9-63b5-471a-bd25-feb9672dfa11,ROSALÍA,,Spain,,flamenco; flamenco pop,,,,False


In [25]:
# Working sample: only the artists with more than 200000 Last.fm listeners
df_sample = df.loc[df["listeners_lastfm"].gt(200000)]

In [8]:
def get_max_str_column(df_in, col):
    """
    Find the longest string stored in a DataFrame column.

    Non-string values (for example NaN for missing data) are ignored. When
    several strings share the maximum length, the one that appears first in
    the column is returned, so the result is reproducible between runs.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame that contains the column to inspect.
    col : str
        Name of the column to inspect.

    Returns
    -------
    tuple of (int, str)
        Length of the longest string and the string itself; ``(0, "")`` when
        the column holds no non-empty string.

    Example
    -------
    get_max_str_column(df_sample, "country_mb")
    """
    max_l = 0
    element = ""
    # Walk the column in its own order: the strict ">" keeps the first of any
    # equally long strings, whereas iterating a set would pick one arbitrarily.
    for cc in df_in[col]:
        if isinstance(cc, str) and len(cc) > max_l:
            max_l = len(cc)
            element = cc
    return max_l, element

In [31]:
# One entry per sampled artist: the ';'-separated Last.fm tag string split into a list
tags = df_sample["tags_lastfm"].str.split(";").tolist()

In [58]:
# Flatten the per-artist tag lists into one list of normalised (stripped, lower-cased) tags.
# Artists without tags are not lists, so the isinstance check skips them without relying
# on NaN object identity (`is not np.nan`), which only works when the missing value
# happens to be the np.nan singleton itself.
all_tags = [
    xx.strip().lower()
    for tt in tags
    if isinstance(tt, list)
    for xx in tt
]

In [59]:
# Distinct tags
set_tags = set(all_tags)

In [60]:
# Number of distinct tags
len(set_tags)

22410

In [61]:
# Number of occurrences of each tag in all_tags
dict_counter = Counter(all_tags)

In [62]:
# Tag -> count mapping ordered from the most to the least frequent tag
dict_counter_sort = dict(dict_counter.most_common())

In [63]:
# Show only the 50 most frequent tags: dumping the whole mapping floods the output
# with one line per distinct tag.
dict(list(dict_counter_sort.items())[:50])

{'seen live': 3923,
 'rock': 3194,
 'alternative': 2858,
 'pop': 2856,
 'indie': 2430,
 'american': 2232,
 'electronic': 2151,
 'male vocalists': 1856,
 'usa': 1790,
 '00s': 1665,
 'alternative rock': 1556,
 'singer-songwriter': 1513,
 'indie rock': 1374,
 'dance': 1341,
 'electronica': 1338,
 'female vocalists': 1309,
 '90s': 1274,
 'chillout': 1217,
 'experimental': 1178,
 'soul': 1097,
 'indie pop': 1063,
 'hip-hop': 1037,
 'folk': 1035,
 'british': 1003,
 '80s': 996,
 'all': 968,
 'classic rock': 955,
 'jazz': 936,
 'pop rock': 892,
 'metal': 892,
 'punk': 885,
 'favorites': 882,
 'acoustic': 868,
 'electro': 841,
 'hip hop': 808,
 'female vocalist': 801,
 'uk': 798,
 'ambient': 785,
 'hard rock': 768,
 'rap': 768,
 'funk': 757,
 'chill': 732,
 'rnb': 712,
 'english': 700,
 'female': 698,
 'instrumental': 664,
 'love': 655,
 'emo': 630,
 'house': 607,
 '70s': 606,
 'psychedelic': 599,
 'blues': 597,
 'new wave': 580,
 'mellow': 575,
 'punk rock': 559,
 'downtempo': 557,
 'oldies': 

### Database

In [100]:
# Connect to PostgreSQL.
# Connection settings are taken from the PGDATABASE / PGUSER / PGPASSWORD environment
# variables when they are set and fall back to the local development defaults, so real
# credentials do not have to be written into the notebook.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "spotify"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "pg"),
)
cursor = conn.cursor()

In [112]:
# Check current tables: list every table that exists in the 'public' schema
s = "SELECT table_schema, table_name FROM information_schema.tables where table_schema = 'public'"
cursor.execute(s)
# Each fetched row is a (table_schema, table_name) tuple
list_tables = cursor.fetchall()

In [113]:
# Display the tables found in the 'public' schema
list_tables

[('public', 'artist_song')]

In [114]:
# Create the project tables.
# "if not exists" makes the cell safe to re-run and independent of how many tables are
# already present (the previous `len(list_tables) == 1` guard still attempted to create
# artist_song when that was the one table that already existed).
# Every primary-key constraint has its own name: PostgreSQL backs a primary key with an
# index named after the constraint, and index names must be unique within a schema, so
# reusing "artist_id_pkey" for a second table makes that CREATE TABLE fail.

# MASTER_ARTIST
name_Table = "public.master_artist"
sqlCreateTable_MasterArtist = (
    "create table if not exists " + name_Table
    + " (artist_id varchar(25), artist_name varchar(50), listeners INT,"
    + " CONSTRAINT artist_id_pkey PRIMARY KEY (artist_id));"
)
cursor.execute(sqlCreateTable_MasterArtist)

# ARTIST_SONG
name_Table = "public.artist_song"
cols = (
    " (artist_id varchar(25), song_name varchar(100), song_id varchar(25),"
    " popularity SMALLINT, duration INT ,"
    " CONSTRAINT artist_song_id_pkey PRIMARY KEY (song_id, artist_id));"
)
sqlCreateTable_ArtistSong = "create table if not exists " + name_Table + cols
cursor.execute(sqlCreateTable_ArtistSong)

# COUNTRY_ARTIST
name_Table = "public.country_artist"
sqlCreateTable_CountryArtist = (
    "create table if not exists " + name_Table
    + " (artist_id varchar(25), country varchar(50), listeners INT,"
    + " CONSTRAINT country_artist_pkey PRIMARY KEY (artist_id));"
)
cursor.execute(sqlCreateTable_CountryArtist)

# Commit changes
conn.commit()

In [17]:
def levenshtein(s1, s2):
    """
    Normalised Levenshtein (edit) distance between two strings.

    The edit distance is the minimum number of single-character insertions,
    deletions and substitutions needed to turn one string into the other. It
    is divided by the length of the longer string, so the result always lies
    between 0.0 (identical strings) and 1.0 (nothing in common). The
    comparison is case-sensitive.

    Parameters
    ----------
    s1, s2 : str
        Strings to compare; the order does not matter.

    Returns
    -------
    float
        Distance in [0.0, 1.0]; the closer to 0, the more similar the strings.
    """
    # Make s1 the longer string: it drives the outer loop and the normalisation
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    # Two empty strings are identical
    if len(s1) == 0:
        return 0.0

    # Against an empty string every character has to be deleted, which is the
    # maximum normalised distance (previously the raw length was returned here)
    if len(s2) == 0:
        return 1.0

    # Row-by-row dynamic programming: previous_row[j] is the distance between
    # the prefix of s1 handled so far and the first j characters of s2
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1] / float(len(s1))

In [18]:
# Spotify API credentials are read from the environment instead of being stored in the
# notebook. SPOTIPY_CLIENT_ID / SPOTIPY_CLIENT_SECRET are the variable names spotipy
# itself falls back to when no explicit value is passed.
# NOTE(review): the key pair that used to be hard-coded here has been exposed in the
# file history and should be rotated in the Spotify developer dashboard.
client_id = os.environ.get("SPOTIPY_CLIENT_ID")
client_secret = os.environ.get("SPOTIPY_CLIENT_SECRET")

In [19]:
# Accumulators filled by later cells:
# dict_artists maps an artist id to a list of song-property dicts,
# master_artist maps an artist id to the artist name.
dict_artists = {}
master_artist = {}

In [20]:
# Build the Spotify client from the client id / secret defined above
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) #spotify object to access API

### Get Artist ID

In [102]:
# Artist names to look up on Spotify
artist_list_query = ["Sia", "Macklemore", "Avicii"]
# Filled by the search loop: queried name -> {"uri", "artist_id", "name_artist"}
dict_artist_metadata = {}

In [103]:
# Largest normalised Levenshtein distance at which a Spotify artist name is still
# accepted as the queried artist
MAX_NAME_DISTANCE = 0.1
# Queried names of this length or longer are skipped (avoids giant artist names)
MAX_QUERY_LENGTH = 50

for name_artist_query in artist_list_query:

    # Skip over-long names and artists that were already resolved *before* calling
    # the API, so no request is spent on them
    if len(name_artist_query) >= MAX_QUERY_LENGTH:
        continue
    if name_artist_query in dict_artist_metadata:
        continue

    # Search query: the artist name wrapped in braces
    result = sp.search('{' + name_artist_query + '}')

    # A track can credit several artists: walk every artist of every returned track
    # and keep the first one whose name is close enough to the queried name
    matched_artist = next(
        (
            var_art
            for track in result["tracks"]["items"]
            for var_art in track["artists"]
            if levenshtein(name_artist_query, var_art["name"]) < MAX_NAME_DISTANCE
        ),
        None,
    )

    # Only artists that were actually found get an entry; an empty placeholder would
    # make any later access to "artist_id" fail with a KeyError
    if matched_artist is not None:
        dict_artist_metadata[name_artist_query] = {
            "uri": matched_artist["uri"],
            "artist_id": matched_artist["id"],
            "name_artist": matched_artist["name"],
        }

            

In [104]:
# Display the metadata collected for the queried artists
dict_artist_metadata

{'Sia': {'uri': 'spotify:artist:5WUlDfRSoLAfcVSX1WnrxN',
  'artist_id': '5WUlDfRSoLAfcVSX1WnrxN',
  'name_artist': 'Sia'},
 'Macklemore': {'uri': 'spotify:artist:3JhNCzhSMTxs9WLGJJxWOY',
  'artist_id': '3JhNCzhSMTxs9WLGJJxWOY',
  'name_artist': 'Macklemore'},
 'Avicii': {'uri': 'spotify:artist:1vCWHaC5f2uS3yhpwWbIA6',
  'artist_id': '1vCWHaC5f2uS3yhpwWbIA6',
  'name_artist': 'Avicii'}}

### Master Table - Artist

In [105]:
# Map artist id -> artist name for every queried artist that was resolved.
# Entries without an "artist_id" (queries for which no close-enough artist was found)
# are skipped instead of raising a KeyError.
master_artist = {
    metadata["artist_id"]: metadata["name_artist"]
    for metadata in dict_artist_metadata.values()
    if "artist_id" in metadata
}

In [106]:
def insert_into_master(dict_input):
    """
    Build the SQL statement that inserts artists into the master_artist table.

    Parameters
    ----------
    dict_input : dict
        Mapping of artist id -> artist name; one row is generated per item.

    Returns
    -------
    str
        A single multi-row ``insert ... ON CONFLICT DO NOTHING;`` statement,
        ready to be passed to ``cursor.execute``.

    Raises
    ------
    ValueError
        If ``dict_input`` is empty: an INSERT without rows is not valid SQL.
    """
    if not dict_input:
        raise ValueError("insert_into_master needs at least one artist to insert")

    def _quote(value):
        # Double embedded single quotes so names such as "Guns N' Roses" stay inside
        # the SQL string literal instead of terminating it.
        # NOTE(review): this assumes PostgreSQL's default standard_conforming_strings=on;
        # passing the values as query parameters to cursor.execute would be safer still.
        return "'" + str(value).replace("'", "''") + "'"

    query_insert = "insert into master_artist (artist_id, artist_name) VALUES "
    values_str = ",".join(
        f"({_quote(k)},{_quote(v)})" for k, v in dict_input.items()
    )
    return query_insert + values_str + " ON CONFLICT DO NOTHING;"

In [107]:
# Build the INSERT statement for the collected artists, run it and persist the change
query_master = insert_into_master(master_artist)
cursor.execute(query_master)
conn.commit()

### Top Tracks

In [26]:
# Fetch the top tracks of one artist from Spotify.
# NOTE(review): `uri` is not assigned in any cell visible above — presumably it is left
# over from an earlier kernel session; define it explicitly (e.g. from
# dict_artist_metadata[<name>]["uri"]) so the notebook survives Restart & Run All.
top_tracks = sp.artist_top_tracks(uri)

### Top 10 Songs for Artist Table

In [27]:
#Dictionary for each artist
dict_artists[artist_id] = []

# For each track
if "tracks" in top_tracks:
    if len(top_tracks["tracks"]):
        for track in top_tracks["tracks"]:
            
            dict_track = dict()
            # SONG - Properties - Initializer
            dict_track["name"] = ""
            dict_track["duration"]  = 0
            dict_track["popularity"] = 0
            dict_track["song_id"] = ""
            
            # Check the number of artists featuring the song
            # Add to their dictionary the features of the song
            # for them too
            
            # Create a set for the feat. artists
            set_featuring_artists = set()
            
            if "artists" in track:
                
                # Loop through all artists of a song
                for art_dict in track["artists"]:
                    
                    if "name" in art_dict and "id" in art_dict:
                        
                        # Get the id of that artist 
                        id_feat_artist = art_dict["id"]
                        name_feat_artist = art_dict["name"]
                        
                        #If the artist is different from the query one add it
                        # to both the dict_artist and the master_artist
                        if id_feat_artist != artist_id:
                            
                            # Add that feat. artist to the set
                            set_featuring_artists.add(id_feat_artist)
                            
                            # Add that feat. artist to the master of artists
                            master_artist[id_feat_artist] = name_feat_artist
                    
            # SONG - Properties - Track
            # Song name
            if "name" in track:
                dict_track["name"] = track["name"]
            
            # Duration
            if "duration_ms" in track:
                dict_track["duration"]  = track["duration_ms"]
            
            #Popularity
            if "popularity" in track:
                dict_track["popularity"] = track["popularity"]
                
            # Song ID
            if "id" in track:
                dict_track["song_id"] = track["id"]
                
            # Write the dictionary of song properties for the main artist
            dict_artists[artist_id].append(dict_track)
            
            # Write the dictionary of song properties for each of the feat. artists
            list_featuring_artists = list(set_featuring_artists)
            for i_id_feat_artist in list_featuring_artists:
                
                # If that artist is not yet in the dictionary of artists, add it:
                if i_id_feat_artist not in dict_artists:
                    dict_artists[i_id_feat_artist] = []
                dict_artists[i_id_feat_artist].append(dict_track)

In [28]:
# Display the songs collected per artist id
dict_artists

{'1Cs0zKBU1kc0i8ypK3B9ai': [{'name': 'Titanium (feat. Sia)',
   'duration': 245040,
   'popularity': 66,
   'song_id': '0lHAMNU8RGiIObScrsRgmP'},
  {'name': 'Instagram',
   'duration': 184558,
   'popularity': 78,
   'song_id': '0U6bQIAh6MCGo1xjbIIx2S'},
  {'name': 'Say My Name',
   'duration': 198946,
   'popularity': 66,
   'song_id': '3MoV1UsAJmz64LHqyiRMp0'},
  {'name': "Better When You're Gone",
   'duration': 192402,
   'popularity': 74,
   'song_id': '4Tvw0lweq9l2JPQKFbpbBQ'},
  {'name': 'Conversations in the Dark - John Legend vs. David Guetta',
   'duration': 210236,
   'popularity': 74,
   'song_id': '20d27F17AZOxTJOEHAVK2Y'},
  {'name': 'Flames',
   'duration': 195000,
   'popularity': 54,
   'song_id': '3tTYxjYYcXFqFXA0AZ1Fav'},
  {'name': 'Sexy Bitch (feat. Akon)',
   'duration': 195853,
   'popularity': 61,
   'song_id': '0uXO2GrNiIb1xHT9LUdxZE'},
  {'name': 'Hey Mama (feat. Nicki Minaj, Bebe Rexha & Afrojack)',
   'duration': 192560,
   'popularity': 61,
   'song_id': '5

In [29]:
# Display the artist id -> artist name mapping, including the featuring artists
master_artist

{'1Cs0zKBU1kc0i8ypK3B9ai': 'David Guetta',
 '73jBynjsVtofjRpdpRAJGk': 'Dimitri Vegas & Like Mike',
 '4VMYDCV2IEDYJArk749S6m': 'Daddy Yankee',
 '3wtMPMvPtiFylbnNXF6CAj': 'Afro Bros',
 '1GDbiv3spRmZ1XdM1jQbT7': 'Natti Natasha',
 '2HkAI0YrEcgoR8QdaURqhO': 'Dimitri Vegas',
 '4pwXiI7Z5ZStkgKowZyoKi': 'Like Mike',
 '64M6ah0SkkRsnPGtGiRAbb': 'Bebe Rexha',
 '1vyhD5VmyZ7KMfW5gqLgo5': 'J Balvin',
 '4mHAu7NX2UNsnGXjviBD9e': 'Brooks',
 '00TKPo9MxwZ0j4ooveIxWZ': 'Loote',
 '5y2Xq6xcjJb2jVM54GHK3t': 'John Legend',
 '5WUlDfRSoLAfcVSX1WnrxN': 'Sia',
 '0z4gvV4rjIZ9wHck67ucSV': 'Akon',
 '4D75GcNG95ebPtNvoNVXhz': 'Afrojack',
 '0hCNtLu0JehylgoiP8L4Gh': 'Nicki Minaj',
 '60d24wfXkVzDSfLS6hyCjZ': 'Martin Garrix',
 '5KKpBU5eC2tJDzf0wmlRp2': 'RAYE'}

## Upload dict_artists and master_artist to PostgreSQL

In [31]:
# Scratch check: str.join with an empty separator concatenates the tuple's elements
a = ("1","2")
''.join(a)

'12'

In [None]:
-- Table: public.artists_1
-- NOTE(review): this cell holds SQL, not Python; run it with psql / a SQL client or
-- pass it to cursor.execute rather than executing the cell directly.

-- DROP TABLE public.artists_1;

CREATE TABLE public.artists_1
(
    -- The quoted "char" type is PostgreSQL's single-byte internal type and cannot hold
    -- an artist id or name; the varchar sizes mirror public.master_artist.
    artist_id varchar(25) NOT NULL,
    artist_name varchar(50),
    CONSTRAINT artists_1_pkey PRIMARY KEY (artist_id)
)

TABLESPACE pg_default;

ALTER TABLE public.artists_1
    OWNER to postgres;