# Notebook 2: Scraping Spotify with Selenium for Total Monthly Listeners 

Import the required packages and set up the chromedriver

In [1]:
from bs4 import BeautifulSoup
import requests
import time, os
import pickle
import numpy as np
import pandas as pd
import re

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

# Path to the chromedriver executable. Set the CHROMEDRIVER_PATH environment variable
# to point somewhere else without editing the notebook; the original location is the default.
chromedriver = os.environ.get("CHROMEDRIVER_PATH", "/Applications/chromedriver")
os.environ["webdriver.chrome.driver"] = chromedriver
# Start a Chrome session driven by that executable; `driver` is reused by the scraping loop below.
driver = webdriver.Chrome(chromedriver)

First, I get a list of the artists from the top 1050 collected records dataframe I pickled

In [2]:
# Switch the working directory to the project folder so the relative pickle path opened below resolves.
# NOTE(review): this absolute path is machine-specific — adjust it before running elsewhere.
os.chdir('/Users/beth/Documents/Metis/metis_project_2/metis_project2')

In [3]:
# Read the pickled top-1050 collected records back from the working directory.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you created yourself.
with open('top_1050_collected_records.pickle', mode='rb') as records_file:
    top_1050_collected_records = pickle.load(records_file)

A copy of the original artist names, used to match back to the top 1050 collected records dataframe later. I only took unique names to avoid duplicate searches.

In [4]:
# Unique artist names exactly as they appear in the records; kept unmodified.
top_artists_orig_names = list(
    top_1050_collected_records['artists'].unique()
)

A copy of names to edit in case I need to edit any for spotify

In [5]:
# Second, independent list of the same unique artist names; this is the copy that gets edited.
top_artists = list(
    top_1050_collected_records['artists'].unique()
)

There are some obvious issues with the artist names, some have parentheses with numbers after them that need to be removed

In [6]:
# Remove a trailing-style numeric suffix — a space followed by digits in parentheses,
# e.g. ' (2)' — from every artist name.
# The pattern is a raw string because '\(' is not a valid string escape and newer
# Python versions emit a warning for it; the compiled regex is unchanged.
top_artists = [re.sub(r' \([0-9]*\)', '', name) for name in top_artists]

Create a spotify monthly listeners list to populate

In [7]:
# Accumulators filled by the scraping loop: the scraped monthly-listener values and
# the artist names for which a value was collected.
spotify_monthly_listeners, artist_check = [], []

The plan is to loop over all of the artists in the top_artists list, use Selenium to search for them on Spotify, open the link to their Spotify page, and add their monthly listeners to the spotify_monthly_listeners list.

There were issues because many artist names from Discogs are slightly different from the names used on Spotify. This is not an issue for the search step, but it is for the link-finding step. The links for the artists have no consistent class or id from search page to search page.

There were also issues when artists had self-titled albums, and I needed to avoid finding the album link instead of the artist link. 

To get around these issues I created a list of possible artist names that converts the Discogs artist name to many other possible and partial names by removing the various 'and' symbols (+, &, and, And) and following text, adding or removing 'The', changing capitalization, and looking only for the first or last word of the artist name. I also only searched for links with 'artist' in the address, to avoid 'album' links.

Example of possible artist name variations and partial artist names that would work:

In [33]:
artist = 'Bruce Springsteen & the E Street Band'

# Every spelling the link text is allowed to contain, built from the one Discogs name.
possible_artist_names = [
    artist.lower(),                                   # all lower case
    artist.lower(),                                   # (listed twice, as in the scraping loop)
    artist.strip(),                                   # surrounding whitespace removed
    'The ' + artist,                                  # with a leading 'The'
    re.sub('The ', '', artist),                       # without any 'The '
    re.sub(r' and.*| And.*| \+.*| &.*', '', artist),  # text before the first and / And / + / &
    artist.split(' ')[0],                             # first word only
    artist.split(' ')[-1],                            # last word only
]

possible_artist_names

['bruce springsteen & the e street band',
 'bruce springsteen & the e street band',
 'Bruce Springsteen & the E Street Band',
 'The Bruce Springsteen & the E Street Band',
 'Bruce Springsteen & the E Street Band',
 'Bruce Springsteen',
 'Bruce',
 'Band']

Loop to collect total spotify monthly listeners:

In [12]:
# Open the Spotify search page using chromedriver.
driver.get('https://open.spotify.com/search/')

# Class of the <span> on an artist page that holds the monthly listener text.
# NOTE(review): the name looks auto-generated, so it may change when Spotify updates
# its site — confirm it still matches before a long run.
listeners_span_class = "_85aaee9fc23ca61102952862a10b544c-scss"

# Loop over all the top_artists. Exactly one value is appended to
# spotify_monthly_listeners per artist (np.nan when nothing could be collected),
# so the list stays aligned with top_artists. artist_check only receives the
# artists for which a listener count was actually found.
for artist in top_artists:

    time.sleep(2)

    # Locate the Spotify search bar.
    # NOTE(review): find_element_by_xpath is the Selenium 3 API; newer Selenium
    # releases use find_element(By.XPATH, ...) instead.
    search_bar = driver.find_element_by_xpath("//input[@data-testid='search-input']")

    # Clear whatever text is currently in the search bar.
    search_bar.clear()

    # Input the artist name in the search bar; the page updates automatically as you type.
    search_bar.send_keys(artist)

    time.sleep(2)

    monthly_listeners = np.nan   # placeholder kept if anything below fails
    opened_artist_page = False   # only navigate back if we actually left the search page

    try:
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # Variations of the artist name that the link text is allowed to contain.
        possible_artist_names = [artist.lower(), artist.strip(), 'The ' + artist,
                                 re.sub('The ', '', artist),
                                 re.sub(r' and.*| And.*| \+.*| &.*', '', artist),
                                 artist.split(' ')[0], artist.split(' ')[-1]]
        # An empty candidate would be a substring of every link text, so drop it.
        possible_artist_names = [name for name in possible_artist_names if name]

        # Take the first link whose address contains 'artist' and whose text contains
        # one of the name variations; it is the most likely to be the actual artist link.
        artist_link = ''
        for link in soup.find_all('a'):
            href = link.get('href') or ''   # anchors without an href return None
            if 'artist' in href and any(name in link.text for name in possible_artist_names):
                artist_link = 'https://open.spotify.com' + href
                break

        if not artist_link:
            raise LookupError('no matching artist link on the search results page')

        # Go to the link. Selenium is needed to collect the dynamic page html.
        driver.get(artist_link)
        opened_artist_page = True

        time.sleep(2)

        # Collect the html for this page with BeautifulSoup.
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        # The span is missing when the link led somewhere without a monthly listener
        # number (usually a self-titled album page); np.nan is kept in that case.
        listeners_span = soup.find('span', class_=listeners_span_class)
        listeners_match = None
        if listeners_span is not None:
            # Require at least one digit so an empty match is never recorded.
            listeners_match = re.search('[0-9][0-9,]*', listeners_span.text)

        if listeners_match:
            monthly_listeners = listeners_match.group()
            artist_check.append(artist)

    # Exception (not a bare except) so that interrupting the kernel still stops the loop.
    except Exception as error:
        print('Could not collect monthly listeners for {!r}: {}'.format(artist, error))

    spotify_monthly_listeners.append(monthly_listeners)

    # Return to the Spotify search page.
    if opened_artist_page:
        driver.back()

# End the browser session and the chromedriver process.
driver.quit()

spotify_monthly_listeners

['13,655,133',
 '22,962,171',
 '23,801,865',
 '14,948,178',
 '19,230,751',
 '11,310,818',
 '14,065,679',
 '2,009,273',
 '20,027,871',
 '14,058,196',
 '8,102,636',
 '18,322,366',
 '9,020,524',
 '12,571,624',
 '15,635,629',
 '11,591,775',
 '6,071,866',
 '8,732,122',
 '32,907,019',
 '5,915,635',
 '8,990,357',
 '15,993,723',
 '8,846,958',
 '8,846,958',
 '13,686,592',
 '7,899,964',
 '19,661,947',
 '3,690,827',
 '1,306,505',
 '3,641,151',
 '13,820,103',
 '754,694',
 '5,905,103',
 '8,846,958',
 '2,916,850',
 '9,252',
 '19,661,947',
 '9,026,242',
 '6,128,363',
 '7,065,096',
 '14,151,292',
 '1,542,026',
 '7,812,039',
 '3,058,169',
 '6,135,788',
 '24,494,593',
 '1,705,684',
 '13,372,756',
 '3,819,967',
 '6,684,579',
 '5,063,180',
 '3,786,392',
 '7,614,795',
 '15,065,961',
 '8,901,913',
 '9,807,525',
 '4,154,468',
 '5,576,356',
 '4,520,040',
 '11,911,938',
 '3,895,071',
 '7,614,795',
 '6,210,210',
 '7,977,532',
 '4,039,343',
 '9,264,689',
 '8,508,448',
 '14,182,216',
 '5,988,729',
 '14,531,176',


Create a dataframe with discogs artist name, spotify artists name and monthly listeners

In [None]:
# Build a two-column dataframe from the cleaned artist-name list and the
# monthly-listener list (the two lists must have the same length).
spotify_df = pd.DataFrame(
    {
        'top_artists_spotify_name': top_artists,
        'spotify_monthly_listeners': spotify_monthly_listeners,
    }
)

In [None]:
# Drop the thousands separators from the listener strings so they can be converted to numbers.
spotify_df['spotify_monthly_listeners'] = (
    spotify_df['spotify_monthly_listeners'].str.replace(',', '')
)

In [None]:
# Convert the comma-free listener strings to numbers.
# pd.to_numeric is used instead of astype('int') because the scraping loop is meant to
# record np.nan for artists without a listener count, and astype('int') raises on
# missing values. The column is int64 when every value is present and float64 when
# any value is missing.
spotify_df['spotify_monthly_listeners'] = pd.to_numeric(spotify_df['spotify_monthly_listeners'])

Pickle the file. Some manual changes were made to correct obvious errors (lower than expected values - usually the wrong artist link of an artist with a similar name)

In [636]:
#with open('spotify_df_final.pickle', 'wb') as write_file:
#    pickle.dump(spotify_df, write_file)