# Fortune 500 Web scrape (Selenium and BeautifulSoup)

## 0. Import modules

In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select

import requests
import time
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import html5lib

In [2]:
import os.path

## Helper functions

In [3]:
def ceoFounder():
    """Click the filter label bound to the ``ceofounder`` input, then pause.

    Operates on the module-level ``driver``; a page containing that label
    must already be loaded.
    """
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
    # Selenium releases no longer provide.
    driver.find_element(By.XPATH, "//label[@for='ceofounder']").click()
    # Fixed 15 s pause after the click — presumably to let the table reload
    # with the filter applied (TODO confirm; an explicit wait would be faster).
    time.sleep(15)

In [4]:
def femaleCEO():
    """Click the filter label bound to the ``ceowoman`` input.

    Operates on the module-level ``driver``; a page containing that label
    must already be loaded. Unlike ``ceoFounder`` this does not pause after
    the click.
    """
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
    # Selenium releases no longer provide.
    driver.find_element(By.XPATH, "//label[@for='ceowoman']").click()

In [5]:
def pageTurner():
    """Click the ``div`` whose class attribute is exactly ``-next``.

    Operates on the module-level ``driver``. No wait is performed before or
    after the click.
    """
    # find_element(By.XPATH, ...) replaces find_element_by_xpath, which newer
    # Selenium releases no longer provide.
    driver.find_element(By.XPATH, "//div[@class='-next']").click()

## 1. Scrape Initial data

In [6]:
# Location of the local ChromeDriver executable (machine-specific; adjust before running).
path = "/Users/hongbinlin/Downloads/chromedriver"
# Start the Chrome session. The helper functions and web_scraping() all use this
# module-level `driver`.
# NOTE(review): the driver path is passed positionally; recent Selenium 4 releases
# expect a Service object instead — confirm against the installed Selenium version.
driver = webdriver.Chrome(path)

In [7]:
def _parse_table_rows(page_source, year):
    """Return one list of cell texts, with ``year`` appended, per populated table row."""
    soup = BeautifulSoup(page_source, 'html.parser')
    parsed_rows = []
    for row in soup.find_all('div', class_='rt-tr-group'):
        cells = row.find_all('div', class_='searchResults__cellContent--3WEWj')
        if not cells:
            # Row groups without any content cell (e.g. placeholder rows) carry
            # no data; skipping them avoids junk rows and an IndexError below.
            continue
        row_list = [cell.text for cell in cells]
        row_list.append(year)

        # For 2017, the order of Change in rank (1000) and Employees is different
        if year == 2017:
            row_list[8], row_list[9] = row_list[9], row_list[8]

        parsed_rows.append(row_list)
    return parsed_rows


def web_scraping(func=0, years=None):
    """Scrape the Fortune 500 search table for several years into a DataFrame.

    Uses the module-level ``driver``. For every year the search page is
    loaded, the optional filter is applied, the table is switched to 100 rows
    per page and every page of the table is parsed.

    Parameters
    ----------
    func : callable or 0, optional
        Filter to apply after the page has loaded, e.g. ``femaleCEO`` or
        ``ceoFounder``. The default ``0`` applies no filter. Any callable is
        invoked without arguments.
    years : iterable of int, optional
        Years to scrape. Defaults to 2017-2020 inclusive.

    Returns
    -------
    pandas.DataFrame
        One row per table row, with the scraped cell texts followed by the
        year, labelled with the column names defined below.
    """
    columns = ['Rank', 'Name', 'Revenue ($M)', 'Revenue % change', 'Profit ($M)',
               'Profit % change', 'Assets ($M)', 'Market Value ($M)',
               'Change in rank (1000)', 'Employees', 'Change in rank (500)',
               'Year']
    if years is None:
        years = range(2017, 2021)
    data_list = []

    for year in years:
        # Filtered runs never collect 2017 (presumably the 2017 page lacks these
        # filters — TODO confirm), so skip it before paying for a page load.
        if year == 2017 and func != 0:
            continue

        URL = 'https://fortune.com/fortune500/{}/search'.format(year)
        driver.get(URL)

        # Wait until the results table is present
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CLASS_NAME, "ReactTable")))

        # Apply the filter, if one was requested
        if callable(func):
            func()

        # Set the number of rows in each page
        select = Select(driver.find_element(By.XPATH, "//select[@aria-label='rows per page']"))
        select.select_by_value('100')

        # Locate the number of pages
        # NOTE(review): nothing waits for the table to re-render after the
        # page size changes or after "next" is clicked, so the page count and
        # page source may be read before the update lands — verify on the live site.
        pages = int(driver.find_element(
            By.XPATH, "//span[@class='-pageInfo']/span[@class='-totalPages']").text)

        # Iterate the pages to scrape the values
        for page in range(1, pages + 1):
            data_list.extend(_parse_table_rows(driver.page_source, year))

            # Turn the page unless this was the last one
            if page < pages:
                next_page = WebDriverWait(driver, 3).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, "-next")))
                next_page.click()

    return pd.DataFrame(data_list, columns=columns)



In [9]:
# Run the scraper three times: once unfiltered (func=0 is the "no filter" default),
# then once each with the female-CEO and founder-CEO filters.
df_all = web_scraping(func=0)
df_female = web_scraping(func=femaleCEO)
df_founder = web_scraping(func=ceoFounder)

In [10]:
# Back up the three datasets so the scraped frames stay untouched
df_copy = df_all.copy()
df_founder_copy = df_founder.copy()
df_female_copy = df_female.copy()

# (Rank, Year) pairs identifying each company in the founder and female datasets
founder_list = list(zip(df_founder_copy['Rank'], df_founder_copy['Year']))
female_list = list(zip(df_female_copy['Rank'], df_female_copy['Year']))

# Sets make each membership test O(1) instead of a scan of the whole list
founder_keys = set(founder_list)
female_keys = set(female_list)

# Flag every row of the overall dataset: 1 when its (Rank, Year) pair also
# appears in the filtered dataset, 0 otherwise
founder_check = []
female_check = []
for check in zip(df_copy['Rank'], df_copy['Year']):
    founder_check.append(1 if check in founder_keys else 0)
    female_check.append(1 if check in female_keys else 0)

df_copy['Female CEO'] = female_check
df_copy['Founder CEO'] = founder_check
df_copy.to_csv('Fortune500.csv', index=False)

## Scraping more detail for 2020 only (another dataset)

In [12]:
def initialise_header(rows,financials):
    """Create (or overwrite) 'Fortune500-2.csv' and write its header line.

    Parameters
    ----------
    rows : iterable
        Parsed info rows; the text of the first ``div`` in each row is used
        as a column heading.
    financials : iterable
        Parsed financial rows; the text of the first ``div`` in each row is
        used as a column heading.

    Commas are removed from every heading so a heading can never split into
    two CSV columns. Each heading is followed by a comma (including the last
    one), matching the layout of the data lines written by scrape_page_data.
    """
    headings = [row.find_all('div')[0].text for row in rows]
    headings.extend(financial.find('div').text for financial in financials)

    # Build the full line before opening the file, so a parsing error cannot
    # leave a truncated header behind.
    line = ''.join(heading.replace(',', '') + ',' for heading in headings)
    with open('Fortune500-2.csv', 'w') as r:
        r.write(line)
        r.write('\n')

In [13]:
def scrape_page_data(rows,financials):
    """Append one company's data as a single line to 'Fortune500-2.csv'.

    Parameters
    ----------
    rows : iterable
        Parsed info rows. The value is the text of the last ``div`` in the
        row, except for the row headed 'Website', where the text of the
        row's link is used (falling back to the last ``div`` when the row
        has no link).
    financials : iterable
        Parsed financial rows. The value is the text of the
        ``dataTable__value--2wIAD`` cell; an empty field is written when that
        cell is missing so later columns stay aligned.

    Commas are removed from every value, and each value is followed by a
    comma (including the last one).
    """
    fields = []
    for row in rows:
        data = row.find_all('div')
        heading = data[0].text
        link = row.a if heading == 'Website' else None
        col = link.text if link is not None else data[-1].text
        fields.append(col.replace(',', ''))

    for financial in financials:
        cell = financial.find('div', class_='dataTable__value--2wIAD')
        value = cell.text if cell is not None else ''
        fields.append(value.replace(',', ''))

    # Write the row in one go, so a parsing error above cannot leave a
    # half-written line in the file.
    line = ''.join(field + ',' for field in fields)
    with open('Fortune500-2.csv', 'a') as r:
        r.write(line)
        r.write('\n')

In [14]:
def store_failed_scrapes(failed_scrapes):
    """Append the names of companies that could not be scraped to
    'Fortune500-2-failed.csv', one name per line.

    The file is created if it does not exist yet; existing content is kept.

    This function should not be needed with the current method, which takes
    each company's URL from the 'next' pagination link instead of guessing it
    from the company name. Guessing failed for names such as:
    AT&T (the URL uses ATT), Amazon.com (the URL uses Amazon) and
    Tapestry (the URL uses Coach).

    Parameters
    ----------
    failed_scrapes : iterable of str
        Company names to record.
    """
    # Append mode already creates a missing file, so no existence check or
    # separate write-mode branch is required.
    with open('Fortune500-2-failed.csv', 'a') as r:
        r.writelines(company + '\n' for company in failed_scrapes)

In [15]:
# This method would be a lot faster if we stored the URLs in a list and then used
# BS4 to extract directly from the HTML, as opposed to using Selenium to render
# each page. Sadly, we need the page to render to get the next URL...
# However, this is still a better solution than just 'guessing' the URL from the
# company name: it makes the method a lot more robust to changes between years.

from selenium.common.exceptions import TimeoutException

path = "/Users/hongbinlin/Downloads/chromedriver"
driver = webdriver.Chrome(path)

try:
    # Kept for store_failed_scrapes(); nothing is appended to it by the
    # pagination-based method.
    failed_scrapes = []
    company = 'Walmart'
    webpage = "https://fortune.com/company/{}/fortune500/".format(company)

    # The header must be written exactly once, on the first page that actually
    # loads — not merely on the first loop iteration, which may time out.
    header_written = False

    # This range makes at most 1000 page loads, starting from the `company`
    # page (Walmart, i.e. 1st position) and following the pagination links.
    for i in range(1000):
        driver.get(webpage)

        # This is important to ensure that the webpage loads all of the data
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "info__wrapper--1CxpW")))
        except TimeoutException:
            # `webpage` is unchanged, so the next iteration retries the same URL.
            continue

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        information = soup.find('div', class_='info__wrapper--1CxpW')
        rows = information.find_all('div', class_='info__row--7f9lE')
        financials = soup.find_all('div', class_='dataTable__row--3ws_o')

        if not header_written:
            initialise_header(rows, financials)
            header_written = True

        scrape_page_data(rows, financials)

        # Follow the last link in the pagination block to the next company;
        # stop cleanly when the block or its links are missing.
        pagination = soup.find('div', class_='companySinglePagination__paginationWrapper--2m5Dj')
        urls = pagination.find_all('a', href=True) if pagination is not None else []
        if not urls:
            break
        next_page = urls[-1]['href']
        webpage = next_page

finally:
    # Always release the browser, even when scraping fails part-way.
    driver.quit()