# Scraping Wikipedia with Selenium

## Imports

In [8]:
import os
from datetime import date
from urllib.parse import quote_plus

import pandas as pd
from dotenv import dotenv_values
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm.auto import tqdm
from webdriver_manager.chrome import ChromeDriverManager

ModuleNotFoundError: No module named 'webdriver_manager'

In [12]:
# Resolve the chromedriver binary path via webdriver_manager, then start
# Chrome through a Service bound to that path.
webdriver_path = ChromeDriverManager().install()
chrome_service = Service(webdriver_path)
driver = webdriver.Chrome(service=chrome_service)


NameError: name 'ChromeDriverManager' is not defined

## Connections and Settings

In [None]:
# MongoDB
# Parse the .env file once instead of once per interpolated value.
_env = dotenv_values('.env')
# Username and password are percent-escaped so that reserved characters
# (e.g. "@", ":", "/") in the credentials cannot corrupt the URI.
MONGODB_CONNECTION_STRING = (
    f"mongodb+srv://{quote_plus(_env['MONGODB_USERNAME'])}:{quote_plus(_env['MONGODB_PASSWORD'])}"
    f"@{_env['MONGODB_CLUSTER']}.mongodb.net/?retryWrites=true&w=majority"
)
client = MongoClient(MONGODB_CONNECTION_STRING)
db = client['glassdoor']

# Selenium
# NOTE: this starts a new Chrome session and rebinds the module-level `driver`
# used by the scraping functions below.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

## Scraping Utils

In [None]:
def wikipedia_table_title_to_csv_file_name(title):
    """Build a CSV file name from a Wikipedia section title.

    Everything from the first "[edit]" marker onward is discarded, today's
    date (ISO format) is appended, and all spaces are replaced with
    underscores.
    """
    heading, _, _ = title.partition("[edit]")
    return f"{heading} {date.today()}.csv".replace(" ", "_")

TODO: Add the same function with [Scrapy](https://scrapy.org/) and with [BeautifulSoup](https://medium.com/analytics-vidhya/web-scraping-a-wikipedia-table-into-a-dataframe-c52617e1f451).

In [None]:
# Selenium
def scrape_tables_by_url(url):
    """Save the "wikitable" tables found on a page as CSV files under data/.

    Loads `url` in the module-level Selenium `driver`, collects the tables
    whose class contains "wikitable" and the <h2> elements matched as their
    preceding siblings, pairs them in order with `zip`, and writes each table
    to `data/<title>_<today>.csv`.

    Args:
        url: Address of the page to scrape.
    """
    driver.get(url)
    table_titles = driver.find_elements(By.XPATH, '//table[contains(@class, "wikitable")]//preceding-sibling::h2')
    tables = driver.find_elements(By.XPATH, '//table[contains(@class, "wikitable")]')

    # Create the output *directory* once. The previous code called
    # os.makedirs() on the CSV path itself, which created a directory named
    # like the file and made the subsequent to_csv() fail.
    os.makedirs("data", exist_ok=True)

    for table, title in zip(tables, table_titles):
        df = pd.read_html(table.get_attribute('outerHTML'), encoding="utf-8")[0]
        csv_path = f"data/{wikipedia_table_title_to_csv_file_name(title.text)}"
        df.to_csv(csv_path, index=False, encoding="utf-8")

In [None]:
# Selenium
def scrape_companies_table_by_url(url, replacements=None):
    """Scrape the first "wikitable" on a page and attach a company name to each row.

    The table is read into a DataFrame and extended with the href of the links
    found in each row's second cell ("Wikipedia Company Page URL"). For every
    row, the "Company Name" is resolved as follows:

    * If the row's "Security" value is a key of `replacements`, the mapped
      value is used: a string yields one output row, a list yields one output
      row per entry, and any other value (e.g. None) drops the row.
    * Otherwise the linked company page is opened and the text of its
      "firstHeading" <h1>, up to the first "(", is used.

    Args:
        url: Address of the page holding the companies table.
        replacements: Optional mapping from "Security" values to a name, a
            list of names, or another value meaning "skip this row".

    Returns:
        A DataFrame de-duplicated on "Company Name", sorted by that column,
        with a fresh integer index.
    """
    first_table_xpath = '//table[contains(@class, "wikitable")][1]'

    driver.get(url)
    table = driver.find_element(By.XPATH, first_table_xpath)

    df = pd.read_html(table.get_attribute('outerHTML'), encoding="utf-8")[0]
    company_links = driver.find_elements(By.XPATH, first_table_xpath + '/tbody/tr/td[2]/a')
    df["Wikipedia Company Page URL"] = [link.get_attribute("href") for link in company_links]
    df["Company Name"] = ""
    new_df = pd.DataFrame(columns=df.columns)

    def append_row(row_label, name):
        # Copy the source row to the end of new_df, then stamp the resolved name on it.
        new_df.loc[len(new_df)] = df.loc[row_label]
        new_df.loc[len(new_df) - 1, "Company Name"] = name

    for i in tqdm(range(len(df))):
        security = df.loc[i, "Security"]

        if not replacements or security not in replacements:
            # No manual override: take the name from the company's own page title.
            driver.get(df.loc[i, "Wikipedia Company Page URL"])
            page_title = WebDriverWait(driver, 30).until(
                EC.presence_of_element_located((By.XPATH, '//h1[contains(@id, "firstHeading")]'))
            )
            append_row(i, page_title.text.split("(")[0].strip())
            continue

        override = replacements[security]
        if isinstance(override, list):
            for name in override:
                append_row(i, name)
        elif isinstance(override, str):
            append_row(i, override)
        # Any other override value (e.g. None) intentionally adds no row.

    deduplicated = new_df.drop_duplicates("Company Name")
    return deduplicated.sort_values("Company Name").reset_index(drop=True)

## DB Utils

In [9]:
def insert_companies_data_to_db(df):
    """Insert every row of `df` as a document into the "companies" collection.

    Returns whatever `insert_many` returns for the inserted records.
    """
    collection = db["companies"]
    records = df.to_dict('records')
    return collection.insert_many(records)

## Run

In [10]:
def get_companies_table_and_save_to_db(url, replacements=None):
    """Scrape the companies table at `url` and write it to a CSV file.

    The result is saved as "s&p500_components_2020.csv" without the index.
    The database insertion step is currently disabled, so nothing is returned.
    """
    scrape_companies_table_by_url(url, replacements=replacements).to_csv(
        "s&p500_components_2020.csv", index=False
    )
    # Disabled DB step, kept for reference:
    # ids = insert_companies_data_to_db(df)
    # return ids

In [11]:
# Live Wikipedia list of S&P 500 components (kept for reference, not used)
# URL = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"

# Pinned revision of the Wikipedia list of S&P 500 components as of 1.1.2020
URL = "https://en.wikipedia.org/w/index.php?title=List_of_S%26P_500_companies&oldid=933578501"
# Manual overrides keyed by the table's "Security" value.
# A string replaces the scraped company name; None drops the row from the
# result (see scrape_companies_table_by_url).
# NOTE(review): the names presumably match the companies' review-site
# listings - confirm against the consumer of the CSV.
REPLACEMENT = {
  "Alliance Data Systems": "Bread Financial",
  "Alphabet Inc Class A": None, # holding company with few to no reviews
  "Alphabet Inc Class C": None, # holding company with few to no reviews
  "American Water Works Company Inc": "American Water",
  "Apache Corporation": "Apache",
  "Booking Holdings Inc": None, # holding company with few to no reviews
  "Charter Communications": "Spectrum",
  "Cisco Systems": "Cisco Systems",
  "Citizens Financial Group": "Citizens",
  "CMS Energy": "Consumers Energy",
  "Cognizant Technology Solutions": "Cognizant Technology Solutions",
  "Concho Resources": "Concho",
  "D. R. Horton": "D.R. Horton",
  "E*Trade": None,  # company no longer exists
  "Edison Int'l": "Southern California Edison",
  "Fortive Corp": None, # holding company with few to no reviews
  "Fortune Brands Home & Security": None, # holding company with few to no reviews
  "Grainger (W.W.) Inc.": "Grainger",
  "HollyFrontier Corp": "HollyFrontier Corporation",
  "Howmet Aerospace": "Arconic",
  "J. B. Hunt Transport Services": "J.B. Hunt Transport",
  "JPMorgan Chase": None, # no reviews in 2017-2019
  "Kellogg's": "Kellogg Company",
  "Kohl's Corp.": "Kohl's",
  "Loews Corp.": "Loews Hotels & Co",
  "Lowe's Cos.": "Lowe's Home Improvement",
  "National Oilwell Varco Inc.": "National Oilwell Varco",
  "ViacomCBS": "Paramount",
  "Pinnacle West Capital": None, # holding company with few to no reviews
  "PNC Financial Services": "PNC Financial Services Group",
  "PPG Industries": "PPG",
  "Prudential Financial": "Prudential",
  "Truist Financial": None, # no reviews in 2017-2019
  "Under Armour Class A": "Under Armour",
  "Under Armour Class C": "Under Armour",
  "United Airlines Holdings": "United Airlines",
  "Waste Management Inc.": "Waste Management Inc.",
  "Williams Cos.": "Williams",
  "Xylem Inc.": "Xylem",
}
# Scrape the pinned revision with the overrides above and write the CSV.
get_companies_table_and_save_to_db(URL, replacements=REPLACEMENT)


NameError: name 'driver' is not defined