# Fortune 500 Web scrape (Selenium and BeautifulSoup)

## 0. Import modules

In [None]:
from selenium import webdriver
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select

import requests
import time
import pandas as pd
from bs4 import BeautifulSoup
import html5lib

In [None]:
import os.path

## 1. Scrape Initial data

In [None]:
def create_headers(soup):
    """Write the results-table column titles as the header row of Fortune500.csv.

    Finds the ``ReactTable`` container in ``soup``, collects the text of every
    column-title element, prints the list of titles and (re)creates
    ``Fortune500.csv`` containing a single comma-separated header line.

    Args:
        soup: Parsed page (BeautifulSoup object) holding the rendered table.
    """
    table = soup.find('div', class_="ReactTable")

    # This class name is shared by every column heading in the table.
    theaders = table.find_all('div', class_='searchResults__columnTitle--1Brf4')
    headers = [header.text for header in theaders]
    print(headers)

    with open('Fortune500.csv', 'w') as r:
        # Commas inside a title would split it into extra CSV columns, so they
        # are dropped. Joining places a separator strictly *between* titles,
        # which stays correct when two titles share the same text or when the
        # last title originally contained a comma.
        r.write(','.join(col.replace(',', '') for col in headers))
        r.write('\n')

In [None]:
def scrape_data(soup):
    """Append one CSV line per table row of the parsed page to Fortune500.csv.

    Finds the ``ReactTable`` container in ``soup``, walks every row group in
    its body and writes the text of each cell, comma-separated, as one line.

    Args:
        soup: Parsed page (BeautifulSoup object) holding the rendered table.
    """
    table = soup.find('div', class_="ReactTable")
    tbody = table.find('div', class_="rt-tbody")
    rows = tbody.find_all('div', class_='rt-tr-group')

    with open('Fortune500.csv', 'a') as r:
        for row in rows:
            # This class name is shared by every data cell in the table.
            cols = row.find_all('div', class_='searchResults__cellContent--3WEWj')
            # Commas inside a value would create extra CSV columns, so they are
            # dropped. Joining avoids a trailing separator, keeping each row at
            # the same number of fields as the cells it was built from.
            r.write(','.join(col.text.replace(',', '') for col in cols))
            r.write('\n')

In [None]:
# Location of the local chromedriver binary; machine-specific, adjust before running.
path = "/Users/hongbinlin/Downloads/chromedriver"
# NOTE(review): passing the driver path positionally presumably depends on the
# installed Selenium version — confirm against the version in use.
driver = webdriver.Chrome(path)

# Fortune 500 (2020) search page; the next cell waits for its table to render.
webpage = "https://fortune.com/fortune500/2020/search/"
driver.get(webpage)

In [None]:
# WebDriverWait.until signals an expired wait with TimeoutException, so it has
# to be handled alongside NoSuchElementException.
from selenium.common.exceptions import TimeoutException

try:
    # Block until the table body exists so the page source is fully rendered.
    table = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CLASS_NAME, "rt-tbody")))
    print(table)
    # Show 100 rows per page. find_element(By.XPATH, ...) is used because the
    # find_element_by_xpath shortcut was removed in Selenium 4.3.
    select = Select(driver.find_element(
        By.XPATH,
        '//*[@id="content"]/div[2]/div[2]/div/div[2]/div/div[2]/span[2]/select'))
    select.select_by_value('100')

    page = 1
    while page <= 10:
        print("Scraping Page: {}".format(page))

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')

        # The header row is written once, from the first page only.
        if page == 1:
            create_headers(soup)
        scrape_data(soup)

        # Advance only while another page is still to be scraped; after the
        # final page there is nothing left to load.
        if page < 10:
            show_more = WebDriverWait(driver, 3).until(
                EC.element_to_be_clickable((By.CLASS_NAME, "-next")))
            show_more.click()
        page += 1

    print("Pages scraped: {}".format(page-1))
    print('Complete')

except NoSuchElementException:
    print("Table not found. Closing application.")

except TimeoutException:
    # Raised when the table or the next-page control never became available.
    print("Timed out waiting for the page to render. Closing application.")

finally:
    # Always release the browser, whether the scrape succeeded or not.
    driver.quit()

In [None]:
# Load the scraped table. index_col=False stops pandas from using the first
# column as the index — presumably needed because the data rows written by
# scrape_data end with a trailing comma that the header row lacks.
df = pd.read_csv('Fortune500.csv', index_col=False)

In [None]:
# Preview the last rows of the loaded table as a quick sanity check.
df.tail()

## 2. Get additional data columns from company specific website on Fortune

In [None]:
def initialise_header(rows, financials):
    """Create Fortune500-2.csv and write its header line.

    The header is made of the label of every profile row (text of the row's
    first ``div``) followed by the label of every financial row (text of its
    first ``div``, with commas removed). Each label is followed by a comma,
    and the line ends with a newline.

    Args:
        rows: Profile row elements of a company page.
        financials: Financial data-table row elements of a company page.
    """
    with open('Fortune500-2.csv', 'w') as out:
        for row in rows:
            label = row.find_all('div')[0].text
            out.write(label + ',')

        for financial in financials:
            # Commas inside a label would create extra CSV columns.
            label = financial.find('div').text.replace(',', '')
            out.write(label + ',')

        out.write('\n')

In [None]:
def scrape_page_data(rows, financials):
    """Append one company's profile and financial values to Fortune500-2.csv.

    Every value has its commas removed and is followed by a comma; the line
    is terminated with a newline.

    Args:
        rows: Profile row elements of a company page.
        financials: Financial data-table row elements of a company page.
    """
    with open('Fortune500-2.csv', 'a') as out:
        for row in rows:
            cells = row.find_all('div')
            label = cells[0].text
            # The website row takes its value from the row's link; every other
            # row takes it from the last div.
            text = row.a.text if label == 'Website' else cells[-1].text
            out.write(text.replace(',', '') + ',')

        for financial in financials:
            cell = financial.find('div', class_='dataTable__value--2wIAD')
            out.write(cell.text.replace(',', '') + ',')

        out.write('\n')

In [None]:
def store_failed_scrapes(failed_scrapes):
    """Append each failed company name, one per line, to Fortune500-2-failed.csv.

    This should not be needed with the current approach, which takes each
    company URL from the 'next' pagination link instead of guessing it from
    the company name (AT&T is just ATT, Amazon.com is just Amazon, Tapestry
    is actually Coach).

    Args:
        failed_scrapes: Iterable of company names (strings) to record.
    """
    # Append mode creates the file when it does not exist yet, so a single
    # open covers both the first call and every later one.
    with open('Fortune500-2-failed.csv', 'a') as out:
        for name in failed_scrapes:
            out.write(name)
            out.write('\n')

In [None]:
# WebDriverWait.until signals an expired wait with TimeoutException.
from selenium.common.exceptions import TimeoutException

# This would be a lot faster if the URLs were stored in a list and BS4 were
# used to extract directly from the HTML, instead of rendering each page with
# Selenium. Sadly, the page has to render before the next URL is available.
# Following the 'next' link is still better than 'guessing' the URL from the
# company name: it keeps the method robust to changes between years.

path = "/Users/hongbinlin/Downloads/chromedriver"
driver = webdriver.Chrome(path)

try:
    failed_scrapes = []
    company = 'Walmart'
    webpage = "https://fortune.com/company/{}/fortune500/".format(company)
    # Tracks whether the CSV header exists yet, so it is still written when
    # the very first page load fails and a later attempt succeeds.
    header_written = False

    # The range sets how many page loads are attempted (max = 1000), starting
    # from the company above, i.e. the 1st position.
    for _ in range(10):
        driver.get(webpage)

        # Wait for the info panel so the page source contains all of the data.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CLASS_NAME, "info__wrapper--1CxpW")))
        except TimeoutException:
            # The next URL is only known from a rendered page, so the same
            # URL is tried again on the following iteration.
            continue

        page_source = driver.page_source
        soup = BeautifulSoup(page_source, 'html.parser')
        information = soup.find('div', class_='info__wrapper--1CxpW')
        rows = information.find_all('div', class_='info__row--7f9lE')
        financials = soup.find_all('div', class_='dataTable__row--3ws_o')

        if not header_written:
            initialise_header(rows, financials)
            header_written = True

        scrape_page_data(rows, financials)

        # The last link inside the pagination wrapper is used as the next page.
        pagination = soup.find('div', class_='companySinglePagination__paginationWrapper--2m5Dj')
        urls = pagination.find_all('a', href=True) if pagination is not None else []
        if not urls:
            print("No pagination link found. Stopping.")
            break
        webpage = urls[-1]['href']

finally:
    # Always release the browser, whether the scrape succeeded or not.
    driver.quit()
    # store_failed_scrapes(failed_scrapes) is intentionally not called:
    # nothing is appended to failed_scrapes with the pagination approach.

## 3. Data cleaning

In [None]:
# Names of the columns that the cleaning loop below converts from text to float.
dollar_cols = ['Revenues ($M)',
               'Profits ($M)', 
               'Assets ($M)',
               'Market Value — as of March 31 2020 ($M)']

In [None]:
# Convert each dollar column from text to float, skipping columns that are
# missing from df or cannot be converted.
for col in dollar_cols:
    try:
        # Strip the currency sign literally; without regex=False a '$' pattern
        # can be interpreted as the end-of-string anchor.
        cleaned = df[col].str.replace('$', '', regex=False)
        # A cell that is exactly '-' is presumably the placeholder for a
        # missing figure and becomes 0. Only whole-cell matches are replaced,
        # so the minus sign of a negative amount is preserved.
        cleaned = cleaned.replace('-', '0')
        # Assign only after the conversion succeeded, so a failing column is
        # left untouched rather than half-cleaned.
        df[col] = cleaned.astype('float')
        print(col)
    except (KeyError, AttributeError, ValueError):
        # KeyError: column absent; AttributeError: column is not text;
        # ValueError: a value could not be parsed as a number.
        continue