In [15]:
import pandas as pd
import numpy as np
import re
import bs4
import time

# selenium
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

### SETTINGS

In [210]:
base_url = 'https://finance.yahoo.com/quote/<TICK>/sustainability'
# URL template of a stock's sustainability page; <TICK> is a placeholder that is replaced with the desired ticker

screen_url = 'https://finance.yahoo.com/screener/unsaved/63641ef9-e86c-42c9-b66e-0e9481591369?offset=0&count=100'
# URL of the stock screener, starting at offset=0 with count=100 results per page.
# The screener contains a filter on ESG rating (rating > -1) so that we know ESG data is available for these stocks.

read_or_scrape_names = 'scrape'
# 'scrape': scrape the tickers from the screener (takes about 2 minutes)
# 'read': read the tickers from a previously scraped .csv file

### SEARCH FUNCTIONS

In [299]:
def get_risk_ratings(soup):
    
    '''
    Reads the ESG Risk Rating, its percentile, and the E, S, and G subratings from a BeautifulSoup-object.

    Parameters:
        soup: parsed page; only its find() and findAll() methods are used.

    Returns a dict with the keys 'total_esg_score', 'total_esg_score_percentile',
    'environmental_risk_score', 'social_risk_score' and 'governance_risk_score'.
    Each value is the text found on the page, or the string 'NA' when the
    corresponding element is missing.
    '''
    
    # Only "element not found" failures count as missing data: a failed lookup
    # yields None (AttributeError on .get_text()) or a too-short list (IndexError).
    # Anything else, including KeyboardInterrupt, is allowed to propagate.
    not_found = (AttributeError, IndexError)
    
    # get total esg score
    try:
        total_esg_score = soup.find("div", {"class":'Fz(36px) Fw(600) D(ib) Mend(5px)'}).get_text()
    except not_found:
        total_esg_score = 'NA'
     
    # get percentile of total esg score (first run of digits in the label text)
    try:
        pctile_text = soup.find("span", {"class":"Bdstarts(s) Bdstartw(0.5px) Pstart(10px) Bdc($seperatorColor) Fz(12px) smartphone_Bd(n) Fw(500)"}).get_text()
        total_esg_score_pctile = re.findall(r'\d+', pctile_text)[0]
    except not_found:    
        total_esg_score_pctile = 'NA'
        
    # the three subscores share one class; look them up once and read them by
    # position: environment, social, governance
    try:
        sub_divs = soup.findAll("div", {"class":'D(ib) Fz(23px) smartphone_Fz(22px) Fw(600)'})
    except not_found:
        sub_divs = []
    
    sub_scores = []
    for position in range(3):
        try:
            sub_scores.append(sub_divs[position].get_text())
        except not_found:
            sub_scores.append('NA')
    env_risk_score, soc_risk_score, gov_risk_score = sub_scores

    
    return {'total_esg_score':total_esg_score, 'total_esg_score_percentile':total_esg_score_pctile, 'environmental_risk_score':env_risk_score, 'social_risk_score':soc_risk_score, 'governance_risk_score':gov_risk_score}

def get_controversy_levels(soup):
    
    '''
    Reads the Sustainalytics Controversy Level and the date of latest update from a BeautifulSoup-object.

    Parameters:
        soup: parsed page; only its find() method is used.

    Returns a dict with the keys 'controversy_level' and 'last_update'.
    Each value is the text found on the page, or the string 'NA' when the
    corresponding element (or the 'updated on ' marker) is missing.
    '''
    
    # A failed lookup yields None (AttributeError on .get_text()) and a label
    # without the 'updated on ' marker yields a one-element split (IndexError).
    # Other exceptions, including KeyboardInterrupt, are allowed to propagate.
    not_found = (AttributeError, IndexError)
        
    # get controversy level
    try:
        controv_level = soup.find("div", {"class":"D(ib) Fz(36px) Fw(500)"}).get_text()
    except not_found:
        controv_level = 'NA'
    
    # get last update date: the text that follows 'updated on ' in the footer label
    try:
        update_text = soup.find("div", {"class":"Mt(20px) Mb(15px) smartphone_Px(20px) smartphone_Mt(50px) smartphone_Mb(0px)! Fz(12px) C($tertiaryColor)"}).get_text()
        last_update = update_text.split('updated on ')[1]
    except not_found:
        last_update = 'NA'
    
    return {'controversy_level':controv_level, 'last_update':last_update}

def get_product_involvement(soup):
    
    '''
    Reads product involvement information from a BeautifulSoup-object.

    Parameters:
        soup: parsed page; its findAll() method is used to locate the table rows,
              and each row's findAll('span') to read its cells.

    Returns a dict that maps 'flag_<product name in lower case, spaces as underscores>'
    to 1 ('Yes'), 0 ('No') or NaN (any other or missing answer). Rows without any
    span are skipped, so the dict is empty when no usable row is found.
    '''
    
    # answer text -> numeric flag; everything else becomes NaN
    answer_codes = {'Yes': 1, 'No': 0}
    
    out = {}
    
    rows = soup.findAll("tr", {"class":"Lh(40px) Bdbs(s) Bdts(s) Bdbw(1px) Bdtw(1px) Bdbc($seperatorColor) Bdtc($seperatorColor)"})
    for row in rows:
        # first span holds the product name, second span the Yes/No answer
        spans = row.findAll('span')
        if len(spans) == 0:
            # no product name to key on; skip instead of failing on spans[1]
            continue
        
        product = 'flag_' + spans[0].get_text().lower().replace(' ','_')
        
        # a row that lacks the answer span is recorded as unknown (NaN)
        answer = spans[1].get_text() if len(spans) > 1 else None
        
        # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0
        out[product] = answer_codes.get(answer, np.nan)
    
    return out

def get_names(soup):
    
    '''
    Reads all stock tickers and associated company names from one page of the
    Yahoo Finance stock screener, given as a BeautifulSoup-object.

    Parameters:
        soup: parsed screener page; its findAll() method is used to locate the
              result rows, and each row's find() to read name and ticker.

    Returns a list of [company name, ticker] lists, one per result row. A name or
    ticker whose cell is missing is reported as the string 'NA'.

    Raises ValueError when the page contains no result rows at all.
    '''
    
    # result rows come in two variants that differ only in their background class
    row_classes = [
        "simpTblRow Bgc($hoverBgColor):h BdB Bdbc($seperatorColor) Bdbc($tableBorderBlue):h H(32px) Bgc($lv2BgColor)",
        "simpTblRow Bgc($hoverBgColor):h BdB Bdbc($seperatorColor) Bdbc($tableBorderBlue):h H(32px) Bgc($lv1BgColor)",
    ]
    names = soup.findAll("tr", {"class":row_classes})
    
    if len(names) == 0:
        raise ValueError('No valid tickers detected in HTML source.')
    
    out = []
    for name in names:
        # find() gives None for a missing cell, so .get_text() raises AttributeError;
        # only that case is treated as missing data, other errors propagate
        try:
            stock_name = name.find("td", {"aria-label":"Name"}).get_text()
        except AttributeError:
            stock_name = 'NA'

        try:
            stock_tick = name.find("a", {"class":"Fw(600) C($linkColor)"}).get_text()
        except AttributeError:
            stock_tick = 'NA'

        out.append([stock_name, stock_tick])
        
    return out

### SCRAPE STOCK UNIVERSE

In [376]:
# Build `names`, a DataFrame with the columns 'CompanyName' and 'Ticker', either by
# reading a previously saved .csv or by paging through the screener at screen_url.
if read_or_scrape_names == 'read':
    
    # read previously scraped tickers (this cell does not write that file itself)
    names = pd.read_csv('yahoo_finance_sustainalytics_tickers.csv')

elif read_or_scrape_names == 'scrape':
    
    # results per screener page; matches count=100 in screen_url
    page_size = 100
    
    # open page 
    screener = webdriver.Chrome(ChromeDriverManager().install())
    try:
        screener.get(screen_url)
        
        time.sleep(5) # allow manual consent to cookies
        
        # get html
        html = screener.execute_script("return document.body.innerHTML;")
        soup = bs4.BeautifulSoup(html, 'lxml')
        
        # get number of results from the "... of N results" label
        n3 = int(soup.find("span", {"class":"Mstart(15px) Fw(500) Fz(s)"}).get_text().split(' of ')[1].split(' results')[0])          
        
        # store
        outs = []
        outs.extend(get_names(soup))
        
        # number of pages after the first one, e.g. 1539 results -> 15 more pages
        nrep = max((n3 - 1) // page_size, 0)
        
        for n in range(1, nrep + 1):
        
            n1 = n * page_size
            n2 = min((n + 1) * page_size, n3)
            
            # url of new page to load; derived from the unmodified screen_url so the
            # setting is not overwritten and the cell can be re-run from page one
            page_url = re.sub(r'offset=\d+', f'offset={n1}', screen_url)
            
            # load new page
            screener.get(page_url)
            html = screener.execute_script("return document.body.innerHTML;")
            soup = bs4.BeautifulSoup(html, 'lxml')
            
            # read names
            outs.extend(get_names(soup))
            
            # print info
            print(f'Scraping {n1+1} - {n2} of {n3} results...', end='\r')
            
            time.sleep(2)
    finally:
        # always close the browser, also when a page fails to parse
        screener.quit()
        
    names = pd.DataFrame(outs, columns=['CompanyName', 'Ticker'])

else:
    # fail early instead of leaving `names` undefined for the cells below
    raise ValueError(f"read_or_scrape_names must be 'read' or 'scrape', got {read_or_scrape_names!r}")

Scraping 1501 - 1539 of 1539 results...


### OPEN STOCK DASHBOARD

In [375]:
# Visit the sustainability page of every stock in `names` and collect its ESG data
# in `results`: a dict that maps each column in `keys` to a list with one entry per stock.
keys = ['comp_ticker', 'comp_name', 'total_esg_score', 'total_esg_score_percentile', 'environmental_risk_score', 'social_risk_score', 'governance_risk_score', 'controversy_level', 'last_update', 'flag_alcoholic_beverages', 'flag_adult_entertainment', 'flag_gambling', 'flag_tobacco_products', 'flag_animal_testing', 'flag_fur_and_specialty_leather', 'flag_controversial_weapons', 'flag_small_arms', 'flag_catholic_values', 'flag_gmo', 'flag_military_contracting', 'flag_pesticides', 'flag_thermal_coal', 'flag_palm_oil']
results = {key: [] for key in keys}

page_open = False
driver = None
try:
    for i, row in names.iterrows():
        comp_name = row['CompanyName']
        comp_tick = row['Ticker']
        
        # first time opening
        if not page_open:
            driver = webdriver.Chrome(ChromeDriverManager().install())
            driver.get(base_url.replace('<TICK>',comp_tick))
            time.sleep(5) # consent to cookies
            page_open = True
        else:
            driver.get(base_url.replace('<TICK>',comp_tick))
            
        # get html
        html = driver.execute_script("return document.body.innerHTML;")
        soup = bs4.BeautifulSoup(html, 'lxml')
        
        # get various ESG data, merged into one record for this stock
        record = {'comp_ticker': comp_tick, 'comp_name': comp_name}
        record.update(get_risk_ratings(soup))
        record.update(get_controversy_levels(soup))
        record.update(get_product_involvement(soup))
        
        # append data: exactly one value per column for every stock, so all lists
        # keep the same length. Columns the page did not provide are filled with
        # NaN; record entries that are not listed in `keys` are not stored.
        for key in keys:
            results[key].append(record.get(key, np.nan))
        
        print(f'Reading {i+1} of {len(names)}: {comp_tick} ({comp_name})...', end='\r')
finally:
    # always close the browser, also when the loop is interrupted or fails
    if driver is not None:
        driver.quit()

Reading 1539 of 1539: ISSRF (iShares VI Public Limited Company - iShares Edge MSCI EM Minumum Volatility UCITS ETF)


### SAVE

In [366]:
# Turn the collected lists into a table, convert the 'NA' placeholder strings to
# real missing values, and drop stocks for which total ESG score, social risk score
# and controversy level are all missing.
# np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0.
res = (
    pd.DataFrame.from_dict(results)
    .replace('NA', np.nan)
    .dropna(subset=['total_esg_score', 'social_risk_score', 'controversy_level'], how='all')
)
res.to_csv('yahoo_finance_sustainalytics.csv', index=False)