# Navigating and Scraping Spotify with Selenium for Total Monthly Listeners 

Import the required packages and set up the chromedriver

In [1]:
import pickle
import re
import time, os

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Point Selenium at the chromedriver binary and start a Chrome session.
# NOTE(review): this is an absolute, machine-specific path; adjust it before
# running the notebook on another machine.
chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver
# `driver` is the browser handle used by the scraping cells further down.
driver = webdriver.Chrome(chromedriver)

First, I get a list of the artists from the top 1050 collected records dataframe I pickled

In [None]:
# Switch the working directory so the relative pickle paths used below resolve.
# NOTE(review): absolute, user-specific path; change it when running elsewhere.
os.chdir('/Users/beth/Documents/Metis/metis_project_2/metis_project2')

In [3]:
# Load the previously pickled top-1050 collected records dataframe.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('top_1050_collected_records.pickle', 'rb') as open_file:
    top_1050_collected_records = pickle.load(open_file)

A copy of the original artists name to match back to the top 1050 collected records dataframe later

In [None]:
# Untouched artist names, kept so the Spotify results can be matched back
# to top_1050_collected_records later.
top_artists_orig_names = list(
    top_1050_collected_records['artists'].unique()
)

A second copy that I can edit in case names need to be changed to match Spotify

In [None]:
# Working copy of the unique artist names; this list is edited below to
# match Spotify's spelling.
top_artists = list(
    top_1050_collected_records['artists'].unique()
)

There are some obvious issues with the artist names, some have parentheses with numbers after them. 

In [None]:
# Some artist names carry a trailing number in parentheses (for example
# "Name (2)"); remove that suffix so the name can be searched on Spotify.
# The pattern is a raw string so that `\(` and `\)` reach the regex engine
# as literal-parenthesis escapes instead of being treated as (invalid)
# Python string escapes, which newer Python versions warn about.
top_artists = [re.sub(r' \([0-9]*\)', '', name) for name in top_artists]

Create a spotify monthly listeners list to populate

In [None]:
# Filled by the scraping loop below: one entry per artist, in top_artists order.
spotify_monthly_listeners = []

The cell below loops over all of the artists in the top_artists list, uses Selenium to search for each of them on Spotify, clicks the link to their Spotify page, and then adds their monthly listeners to the spotify_monthly_listeners list.

There were issues, however, because MANY artist names in Discogs are slightly different from the names in Spotify. This is not an issue for the search step, but it is for the link-clicking step. Selenium finds elements based on class, id, or link text, but the artist links on Spotify's search page had no class or id. The partial link text match offered by Selenium was not useful in this case, as it led to more incorrect song or album links being clicked.

There were other issues when artists had self-titled albums, with Selenium opening the album link instead of the artist link. 

I was not sure how to avoid these issues, and am looking into it, but in the meantime I just updated artist names in the artist list and restarted the loop at its last position every time Selenium stalled for not finding a link.

In [2]:
# Open Spotify's web search page in the Selenium-controlled browser.
driver.get('https://open.spotify.com/search/')

In [3]:
# Locate the search input through its data-testid attribute.
search_bar = driver.find_element_by_xpath("//input[@data-testid='search-input']")

In [4]:
# Empty the search box, then type a sample artist name to try out the search.
search_bar.clear()
search_bar.send_keys("Queen")

In [5]:
# Parse the currently rendered page so its links can be inspected.
soup = BeautifulSoup(driver.page_source, 'html.parser')

In [10]:
# Print every anchor that points at an artist page and mentions "Queen".
# Anchors without an href attribute make `link.get('href')` return None, and
# `'artist' in None` raises TypeError, so fall back to an empty string.
for link in soup.find_all('a'):
    href = link.get('href') or ''
    if 'artist' in href and 'Queen' in link.text:
        print(link)

<a class="f7ebc3d96230ee12a84a9b0b4b81bb8f-scss" dir="auto" draggable="false" href="/artist/1dfeR4HaWDbWqFHLkxsg1d" title="Queen"><div as="div" class="_45331a50e3963ecc26575a06f1fd5292-scss _3957b7dd066dbbba6a311b40a427c59f-scss">Queen</div></a>
<a class="f7ebc3d96230ee12a84a9b0b4b81bb8f-scss" dir="auto" draggable="false" href="/artist/3nViOFa3kZW8OMSNOzwr98" title="Queen Naija"><div as="div" class="_45331a50e3963ecc26575a06f1fd5292-scss _3957b7dd066dbbba6a311b40a427c59f-scss">Queen Naija</div></a>
<a class="f7ebc3d96230ee12a84a9b0b4b81bb8f-scss" dir="auto" draggable="false" href="/artist/4pejUc4iciQfgdX6OKulQn" title="Queens of the Stone Age"><div as="div" class="_45331a50e3963ecc26575a06f1fd5292-scss _3957b7dd066dbbba6a311b40a427c59f-scss">Queens of the Stone Age</div></a>
<a class="f7ebc3d96230ee12a84a9b0b4b81bb8f-scss" dir="auto" draggable="false" href="/artist/65TI81cGOUJx4OXM4VHTND" title="Queendome Come"><div as="div" class="_45331a50e3963ecc26575a06f1fd5292-scss _3957b7dd066dbb

In [None]:
# Clear the search box; the empty string passed to send_keys adds no text.
search_bar.clear()
search_bar.send_keys("")

In [None]:
# Scrape the monthly-listener count for every artist in top_artists.
#
# For each artist: type the name into Spotify's search box, click the link
# whose text equals the name, read the listener count from the artist page,
# append it to spotify_monthly_listeners, then go back to the search page.

#open the spotify search page using chromedriver
driver.get('https://open.spotify.com/search/')

#loop over all the top_artists
for i in top_artists:

    time.sleep(2)

    #locate the Spotify search bar
    search_bar = driver.find_element_by_xpath("//input[@data-testid='search-input']")

    #clear the default text in the search bar
    search_bar.clear()

    #input the artist name in the search bar, the page updates automatically as you type
    search_bar.send_keys(i)

    time.sleep(2)

    try:
        #try to find a link containing the artist name as text
        artist_link = driver.find_element_by_link_text(i)

        #if the sleep here was too short the link was unclickable
        time.sleep(5)

        #click the link
        artist_link.click()

        time.sleep(2)

        #collect the html for this page with beautifulsoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        try:
            #append the monthly listener number to the spotify_monthly_listeners list, using its class to identify it
            #regex used to find the number from this text
            spotify_monthly_listeners.append(re.search('[0-9,]*',soup.find('span', class_="_85aaee9fc23ca61102952862a10b544c-scss").text).group())

            time.sleep(2)

        except AttributeError:
            #soup.find returned None, so there is no monthly listener number on this page
            #(usually because the link led to a self-titled album page): record np.nan.
            #The original misspelled the list name here, which raised a NameError that
            #the outer handler swallowed, so np.nan was never actually appended.
            spotify_monthly_listeners.append(np.nan)

        #return to the Spotify search page
        driver.back()

    #if the link can not be found - this causes the chrome driver to go white and alert me to update an artist name
    #so that I can restart the loop from the last position and find an artist link
    #`except Exception` (not a bare except) so that interrupting the kernel still stops the loop
    except Exception:

        driver.back()


driver.close()

There were more times that I needed to update artist names than I would have liked. These are all the times that I had to correct an artist name from Discogs to match the Spotify spelling:

In [None]:
# Manual corrections that change artist names in top_artists to the spelling
# used on Spotify.

# Corrections located by the current spelling in the list. list.index raises
# ValueError when the name is not present (for example if this cell is re-run
# after the names have already been replaced).
renames_by_value = [
    ('Rolling Stones', 'The Rolling Stones'),
    ('Neil Young & Crazy Horse', 'Neil Young'),
    ('Bob Seger & The Silver Bullet Band', 'Bob Seger'),
    ('Godspeed You Black Emperor!', "Godspeed You! Black Emperor"),
    ('Simon And Garfunkel', "Simon & Garfunkel"),
    ('The Jimi Hendrix Experience', 'Jimi Hendrix'),
]
for discogs_name, spotify_name in renames_by_value:
    top_artists[top_artists.index(discogs_name)] = spotify_name

# Corrections located by position in top_artists (position -> Spotify name).
renames_by_position = {
    360: "CHIC",
    354: 'CAN',
    352: "The Mamas & The Papas",
    346: "New Order",
    344: 'Daryl Hall & John Oates',
    # The original wrote this one to spotify_df.iloc[332, 0], but spotify_df
    # does not exist yet at this point in the notebook (NameError on a fresh
    # run). The rename is applied to top_artists like its neighbours instead.
    332: "Sisters of Mercy",
    324: "The Go-Go's",
    309: 'John Mellencamp',
    296: "Run-D.M.C.",
    295: 'M/A/R/R/S',
    288: 'Adam & The Ants',
    278: "Boards of Canada",
    267: "Stevie Ray Vaughan",
    247: "The Jesus and Mary Chain",
    245: "Steve Miller Band",
    239: "DEVO",
    229: "Herb Alpert & The Tijuana Brass",
    228: "Patti Smith",
    226: "Rodgers & Hammerstein",
    223: "The Sugarhill Gang",
    222: "John Lennon",
    214: 'Snoop Dogg',
    213: "Bob Marley & The Wailers",
    199: "Rainbow",
    191: 'Bad Company',
    190: "The Specials",
    172: "Elvis Costello & The Attractions",
    167: 'Bruce Springsteen',
    162: "The Stooges",
    157: "N.W.A.",
    145: "Paul & Linda McCartney",
    141: "Neil Young",
    138: 'Tom Petty and the Heartbreakers',
    134: "Jimi Hendrix",
    124: "Air",
    118: "Allman Brothers Band",
    120: 'Huey Lewis & The News',
    116: "Jimi Hendrix",
    117: "Derek & The Dominos",
    90: "Jean-Michel Jarre",
    76: 'David Bowie',
    12: 'Prince',
}
for position, spotify_name in renames_by_position.items():
    top_artists[position] = spotify_name

In [552]:
# Pair each (corrected) artist name with the listener count scraped for it.
# Both lists must have the same length or pandas raises a ValueError.
spotify_df = pd.DataFrame(
    {
        'top_artists_spotify_name': top_artists,
        'spotify_monthly_listeners': spotify_monthly_listeners,
    }
)

After looking at the final dataframe, spotify monthly listeners was np.nan for all artists with self-titled albums, or with albums by other artists of the same name. Some values were really low, and this was because another artist with the same name as the one from Discogs was accessed (like Lauryn Hill vs. Ms. Lauryn Hill).

I had to manually correct these cases.

In [633]:
#pd.set_option('display.max_rows', 500)
#spotify_df

Unnamed: 0,top_artists_spotify_name,spotify_monthly_listeners,top_artists_orig_name
0,Pink Floyd,13643471,Pink Floyd
1,The Beatles,22982890,The Beatles
2,Michael Jackson,23635338,Michael Jackson
3,Led Zeppelin,14946310,Led Zeppelin
4,Fleetwood Mac,18598676,Fleetwood Mac
5,Dire Straits,11271675,Dire Straits
6,David Bowie,14064233,David Bowie
7,Miles Davis,2003331,Miles Davis
8,AC/DC,19922477,AC/DC
9,Eagles,14003141,Eagles


In [None]:
# Manual fixes for rows where the scrape produced a missing or wrong value.
# Each entry is (row position, corrected Spotify name or None to keep the
# current name, monthly listeners as a comma-formatted string). Entries are
# applied in order, so a later entry for the same row wins.
NAME_COL = 0       # position of 'top_artists_spotify_name' in spotify_df
LISTENERS_COL = 1  # position of 'spotify_monthly_listeners' in spotify_df

manual_corrections = [
    (357, "Lil' Louis", '72,174'),
    (354, None, '689,779'),
    (350, None, '2,723,145'),
    (347, None, '8,890,515'),
    (339, None, '5,235,964'),
    (338, None, '1,002,086'),
    (334, None, '12,379,977'),
    (332, "Sisters of Mercy", '491,796'),
    (321, None, '831,864'),
    (311, None, '2,280,677'),
    (302, "Pretenders", '4,351,729'),
    (298, None, '1,168,528'),
    (296, None, '3,622,251'),
    (280, "John Mellencamp", '4,553,261'),
    (279, None, '335,440'),
    # Superseded by the later "Chicago" entry for the same row (265).
    (265, "Chicago Transit Authority", '1,178'),
    (264, "The Offspring", '9,120,871'),
    (256, "Ms. Lauryn Hill", '4,470,005'),
    (255, None, '3,253,977'),
    (246, "Grateful Dead", '2,979,305'),
    (233, None, '3,335,836'),
    (232, None, '7,006,906'),
    (224, None, '11,545,469'),
    (213, None, '13,960,826'),
    (184, None, '2,602,586'),
    (180, None, '4,351,729'),
    (178, 'Paul McCartney', '9,221,287'),
    (174, None, '20,254,501'),
    (171, None, '661,405'),
    (165, None, '1,690,573'),
    (119, None, '488,620'),
    (110, "KISS", '8,632,236'),
    (105, None, '15,082,896'),
    (98, "TOTO", '12,537,464'),
    (96, None, '4,918,032'),
    (91, None, '9,691,840'),
    (87, None, '7,351,870'),
    (82, None, '4,567,427'),
    (22, "Simon & Garfunkel", '8,839,283'),
    (45, None, '23,907,107'),
    (57, None, '5,576,557'),
    # Was written as '40,38542' (misplaced separators); same number once
    # the commas are stripped.
    (64, "Wings", '4,038,542'),
    # NOTE(review): Spotify appears to spell this "Yusuf / Cat Stevens";
    # left unchanged in case later code matches on this exact string.
    (77, 'Yusef / Cat Stevens', '6,719,900'),
    (265, "Chicago", '6,398,167'),
    (145, "Paul McCartney", '9,221,287'),
    (61, "John Lennon", '7,589,240'),
]

for row, spotify_name, listeners in manual_corrections:
    if spotify_name is not None:
        spotify_df.iloc[row, NAME_COL] = spotify_name
    spotify_df.iloc[row, LISTENERS_COL] = listeners


In [None]:
# Remove the thousands separators from the listener strings.
spotify_df['spotify_monthly_listeners'] = spotify_df['spotify_monthly_listeners'].str.replace(',','')

In [None]:
# Convert the listener column to integers; this raises if any value is
# still missing (NaN) or is not a valid integer string.
spotify_df['spotify_monthly_listeners'] = spotify_df['spotify_monthly_listeners'].astype('int')

In [636]:
# Save the corrected dataframe to disk (path is relative to the working directory).
with open('spotify_df_final.pickle', 'wb') as write_file:
    pickle.dump(spotify_df, write_file)