# Scraping Glassdoor

## Imports

In [1]:
import re
import os
import pandas as pd
from glob import glob
from time import sleep
from datetime import date
from tqdm.auto import tqdm
from selenium import webdriver
from pymongo import MongoClient
from dotenv import dotenv_values
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.common.exceptions import WebDriverException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import StaleElementReferenceException

## Constants

In [2]:
# Selenium
WAIT_TIME = 10  # seconds; used both for sleep() pauses and as the WebDriverWait timeout
REPEAT_TIMES = 5  # maximum number of attempts made by the page-loading retry loops

# Glassdoor
# Maps the first CSS class of a star-rating element to the number of stars (1-5).
# NOTE(review): these look like generated class names - presumably they change
# whenever Glassdoor redeploys its front end; verify before a new scraping run.
STARS_CLASSES_DICT = {
    'css-xd4dom': 1,
    'css-18v8tui': 2,
    'css-vl2edp': 3,
    'css-1nuumx7': 4,
    'css-s88v13': 5
}

# Maps the second CSS class of the icon shown next to "Recommend",
# "CEO Approval" and "Business Outlook" to the level stored for that review.
V_X_DICT = {
    'css-hcqxoa': 'high',
    'css-1h93d4v': 'middle',
    'css-1kiw93k': 'low',
    'css-10xv9lv': None
}

## Connections and Settings

In [3]:
# Selenium
# Module-level Chrome session used as a global by the scraping functions in
# this notebook; ChromeDriverManager().install() supplies the chromedriver path.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))




[WDM] - Current google-chrome version is 103.0.5060
[WDM] - Get LATEST chromedriver version for 103.0.5060 google-chrome
[WDM] - About to download new driver from https://chromedriver.storage.googleapis.com/103.0.5060.134/chromedriver_win32.zip
[WDM] - Driver has been saved in cache [C:\Users\pc_office\.wdm\drivers\chromedriver\win32\103.0.5060.134]


## Scraping Utils

TODO: Add VPN or proxy support so the scraper can [change its IP address](https://python.plainenglish.io/3-ways-to-change-your-ip-address-with-selenium-and-python-d3a48a92214e).

### Login

In [4]:
def login_to_glassdoor():
    """Log the shared ``driver`` session into Glassdoor unless it already looks logged in.

    The function first waits up to ``WAIT_TIME`` seconds for the search box
    (an ``<input>`` whose id contains ``sc.keyword``) on the current page; if
    it is present the session is treated as usable and nothing else happens.
    Otherwise the login page is opened and the credentials stored in ``.env``
    under ``GLASSDOOR_USERNAME`` / ``GLASSDOOR_PASSWORD`` are submitted.

    Raises:
        KeyError: if either credential is missing from ``.env``.
        NoSuchElementException: if a login form element cannot be found.
    """
    try:
        WebDriverWait(driver, WAIT_TIME).until(
            EC.presence_of_element_located((By.XPATH, '//input[contains(@id, "sc.keyword")]'))
        )
        return
    except TimeoutException:
        # Search box not found: fall through and perform a fresh login.
        pass

    # Parse .env once and fail before touching the browser if a key is missing.
    credentials = dotenv_values('.env')
    username = credentials['GLASSDOOR_USERNAME']
    password = credentials['GLASSDOOR_PASSWORD']

    url = "https://www.glassdoor.com/profile/login_input.htm"
    driver.maximize_window()
    driver.get(url)
    sleep(WAIT_TIME)  # give the login form time to render

    username_input = driver.find_element(By.XPATH, '//input[contains(@id, "inlineUserEmail")]')
    username_input.send_keys(username)
    sleep(1)
    password_input = driver.find_element(By.XPATH, '//input[contains(@id, "inlineUserPassword")]')
    password_input.send_keys(password)
    sleep(1)
    login_submit_button = driver.find_element(By.XPATH, '//button[contains(@name, "submit")]')
    login_submit_button.click()
    sleep(WAIT_TIME)  # wait for the post-login redirect to finish

### Company Metadata

In [5]:
def fetch_company_metadata(company_name: str):
    """Search Glassdoor for ``company_name`` and scrape the first result tile.

    Args:
        company_name: Name used as the search keyword; it is percent-encoded
            before being placed in the query string.

    Returns:
        A dict with the keys "Company Name", "Industry",
        "Headquarters Location", "Number of Employees", "Number of Reviews",
        "Number of Salaries", "Number of Jobs", "Glassdoor Company Title",
        "Glassdoor Overall Rating", "Glassdoor Company Page URL" and
        "Glassdoor Company Logo URL". Industry, employee count and logo URL
        are empty strings when the tile does not show them.

    Raises:
        TimeoutException: if no company tile appears within 30 seconds.
        NoSuchElementException: if a mandatory part of the tile is missing.
    """
    # Local import keeps this cell runnable on its own.
    from urllib.parse import quote

    tile_xpath = '//a[contains(@data-test, "company-tile")]'

    def tile_element(relative_xpath: str):
        # find_element returns the first match, i.e. the first result tile.
        return driver.find_element(By.XPATH, tile_xpath + relative_xpath)

    # quote(..., safe="") escapes every reserved character, not only & ' , and space.
    driver.get(f'https://www.glassdoor.com/Search/results.htm?keyword={quote(company_name, safe="")}')
    company_tile_link = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.XPATH, tile_xpath))
    )
    company_glassdoor_page_url = company_tile_link.get_attribute("href")

    try:
        company_glassdoor_logo_url = tile_element('/div[1]/img').get_attribute("src")
    except NoSuchElementException:
        # Not every tile has a logo image.
        company_glassdoor_logo_url = ""

    # Only the first whitespace-separated token of the rating text is kept.
    company_glassdoor_overall_rating = tile_element('/div[1]//strong').text.split(" ")[0].strip()
    company_glassdoor_title = tile_element('/div[2]/h3').text.strip()

    # The sub-title holds up to two lines: industry and employee count.
    sub_title_lines = tile_element('/div[2]/div[1]').text.split("\n")
    company_industry = ""
    company_number_of_employees = ""
    if len(sub_title_lines) == 2:
        company_industry = sub_title_lines[0].strip()
        company_number_of_employees = sub_title_lines[1].strip()
    elif len(sub_title_lines) == 1:
        only_line = sub_title_lines[0].strip()
        # A single line is one of the two fields; tell them apart by keyword.
        if "Employees" in only_line:
            company_number_of_employees = only_line
        else:
            company_industry = only_line
    else:
        print("different length of sub title at ", company_name)
        print(sub_title_lines)

    company_headquarters_location = tile_element('/div[2]/div[2]').text.strip()
    company_reviews = tile_element('/div[2]/div[4]//span[1]').text.strip() + " Reviews"
    company_salaries = tile_element('/div[2]/div[4]//span[2]').text.strip() + " Salaries"
    company_jobs = tile_element('/div[2]/div[4]//span[3]').text.strip() + " Jobs"

    return {
        "Company Name": company_name,
        "Industry": company_industry,
        "Headquarters Location": company_headquarters_location,
        "Number of Employees": company_number_of_employees,
        "Number of Reviews": company_reviews,
        "Number of Salaries": company_salaries,
        "Number of Jobs": company_jobs,
        "Glassdoor Company Title": company_glassdoor_title,
        "Glassdoor Overall Rating": company_glassdoor_overall_rating,
        "Glassdoor Company Page URL": company_glassdoor_page_url,
        "Glassdoor Company Logo URL": company_glassdoor_logo_url,
    }

In [6]:
def scrape_glassdoor_codes_by_companies_csv(csv_path: str, metadata_path: str = None):
    """Scrape the Glassdoor search tile of every company listed in ``csv_path``.

    Args:
        csv_path: CSV with a "Company Name" column.
        metadata_path: Optional CSV of previously scraped rows; companies
            already present in its "Company Name" column are not fetched again.

    The collected rows are written to
    ``data/SP500_glassdoor_metadata_<today>.csv``: sorted by company name on
    success, or as-is right before an exception is re-raised.
    """
    def snapshot_path():
        # Evaluated at write time so the file name carries the current date.
        return f"data/SP500_glassdoor_metadata_{str(date.today())}.csv"

    login_to_glassdoor()
    companies = pd.read_csv(csv_path, encoding="utf-8")["Company Name"]

    if not metadata_path:
        metadata_df = pd.DataFrame(columns=["Company Name", "Industry", "Headquarters Location", "Number of Employees", "Number of Reviews", "Number of Salaries", "Number of Jobs", "Glassdoor Company Title", "Glassdoor Overall Rating", "Glassdoor Company Page URL", "Glassdoor Company Logo URL"])
    else:
        metadata_df = pd.read_csv(metadata_path)

    for company in tqdm(companies):
        try:
            already_scraped = metadata_path and company in metadata_df["Company Name"].values
            if not already_scraped:
                metadata_df.loc[len(metadata_df)] = fetch_company_metadata(company)
        except Exception as error:
            # Keep what has been collected so far before propagating.
            metadata_df.to_csv(snapshot_path(), index=False)
            raise error
        sleep(1)

    metadata_df.sort_values("Company Name").to_csv(snapshot_path(), index=False)

### Reviews Metadata

In [7]:
def get_glassdoor_reviews_url_by_company_and_page_number(company_url: str, page_number: int = 1):
    """Build the reviews-page URL for a company from its Glassdoor overview URL.

    Args:
        company_url: URL containing ``Working-at-<name>-EI_IE<id>.11``.
        page_number: 1-based page of the review list; pages after the first
            get a ``_P<page_number>`` suffix.

    Returns:
        The reviews URL with the query string
        ``sort.sortType=RD&sort.ascending=true&filter.iso3Language=eng``.

    Raises:
        ValueError: if the company name or id cannot be found in ``company_url``.
    """
    # Raw strings: "\." inside a normal string literal is an invalid escape sequence.
    name_match = re.search(r'Working-at-(.*)-EI', company_url)
    id_match = re.search(r'-EI_IE(.*)\.11', company_url)
    if name_match is None or id_match is None:
        raise ValueError(f"Unrecognized Glassdoor company page URL: {company_url!r}")

    company_name_in_url = name_match.group(1)
    company_id_in_url = id_match.group(1)
    page_suffix = f"_P{page_number}" if page_number > 1 else ""
    return (
        f"https://www.glassdoor.com/Reviews/{company_name_in_url}-Reviews-E{company_id_in_url}{page_suffix}.htm"
        "?sort.sortType=RD&sort.ascending=true&filter.iso3Language=eng"
    )

In [8]:
def get_number_of_reviews(company_url: str):
    """Return the review count shown in the pagination footer of a company's reviews page.

    The first reviews page is loaded up to ``REPEAT_TIMES`` times; a
    ``WebDriverException`` on any attempt but the last is printed and retried.

    Args:
        company_url: Glassdoor overview URL of the company.

    Returns:
        The number that precedes the word "Reviews" in the footer, as an int.

    Raises:
        WebDriverException: if the last attempt also fails.
        ValueError: if the footer text contains no "<number> Reviews" part,
            or that part is not an integer.
    """
    company_reviews_url = get_glassdoor_reviews_url_by_company_and_page_number(company_url)

    for attempt in range(1, REPEAT_TIMES + 1):
        try:
            driver.get(company_reviews_url)
            sleep(WAIT_TIME)
            pagination_footer = WebDriverWait(driver, WAIT_TIME).until(
                EC.presence_of_element_located((By.XPATH, '//div[contains(@data-test, "pagination-footer-text")]'))
            )
            footer_text = pagination_footer.text
        except (WebDriverException, StaleElementReferenceException) as e:
            if attempt < REPEAT_TIMES:
                print("Error Message:", e.msg)
                print("Repeat:", attempt)
                continue
            raise

        # Raw string: "\s" in a normal string literal is an invalid escape sequence.
        count_match = re.search(r' ([^\s]*) Reviews', footer_text)
        if count_match is None:
            raise ValueError(f"Could not find a review count in pagination footer: {footer_text!r}")
        # The count may contain thousands separators, e.g. "1,234".
        return int(count_match.group(1).replace(",", "").strip())

    raise RuntimeError("REPEAT_TIMES must be at least 1")

In [9]:
def get_start_review_number_for_year_in_page(company_url: str, page: int, year: int):
    """Locate the first review written in ``year`` on one page of a company's reviews.

    The page is loaded up to ``REPEAT_TIMES`` times; the year of every review
    is parsed from its ``authorJobTitle`` span.

    Returns:
        * the 1-based position of the first review whose year equals ``year``;
        * ``-1`` if no review matches but the page holds both earlier and later years;
        * ``0`` if no review matches and only later years are present;
        * ``11`` otherwise (only earlier years, or no reviews found).
    """
    page_url = get_glassdoor_reviews_url_by_company_and_page_number(company_url, page)

    for attempt in range(1, REPEAT_TIMES + 1):
        try:
            driver.get(page_url)
            sleep(WAIT_TIME)
            # Presumably the fb-root element is used as a "page finished loading" marker.
            WebDriverWait(driver, WAIT_TIME).until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class, " fb_reset") and contains(@id, "fb-root")]')))
            date_spans = driver.find_elements(By.XPATH, '//ol[contains(@class, "empReviews")]/li//span[contains(@class, "authorJobTitle")]')
            years = [int(re.search(', ([0-9]*) -', span.text).group(1)) for span in date_spans]
            break
        except (WebDriverException, StaleElementReferenceException) as error:
            if attempt >= REPEAT_TIMES:
                raise error
            print("Error Message:", error.msg)
            print("Repeat:", attempt)

    matching_positions = [position for position, y in enumerate(years) if y == year]
    if matching_positions:
        return matching_positions[0] + 1

    has_earlier = any(y < year for y in years)
    has_later = any(y > year for y in years)
    if has_earlier and has_later:
        return -1
    if has_later:
        return 0
    return 11

In [10]:
def get_end_review_number_for_year_in_page(company_url: str, page: int, year: int):
    """Locate the last review written in ``year`` on one page of a company's reviews.

    The page is loaded up to ``REPEAT_TIMES`` times; the year of every review
    is parsed from its ``authorJobTitle`` span.

    Returns:
        * the 1-based position of the last review whose year equals ``year``;
        * ``-1`` if no review matches but the page holds both earlier and later years;
        * ``0`` if no review matches and only later years are present;
        * ``11`` otherwise (only earlier years, or no reviews found).
    """
    page_url = get_glassdoor_reviews_url_by_company_and_page_number(company_url, page)

    for attempt in range(1, REPEAT_TIMES + 1):
        try:
            driver.get(page_url)
            sleep(WAIT_TIME)
            # Presumably the fb-root element is used as a "page finished loading" marker.
            WebDriverWait(driver, WAIT_TIME).until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class, " fb_reset") and contains(@id, "fb-root")]')))
            date_spans = driver.find_elements(By.XPATH, '//ol[contains(@class, "empReviews")]/li//span[contains(@class, "authorJobTitle")]')
            years = [int(re.search(', ([0-9]*) -', span.text).group(1)) for span in date_spans]
            break
        except (WebDriverException, StaleElementReferenceException) as error:
            if attempt >= REPEAT_TIMES:
                raise error
            print("Error Message:", error.msg)
            print("Repeat:", attempt)

    matching_positions = [position for position, y in enumerate(years) if y == year]
    if matching_positions:
        return matching_positions[-1] + 1

    has_earlier = any(y < year for y in years)
    has_later = any(y > year for y in years)
    if has_earlier and has_later:
        return -1
    if has_later:
        return 0
    return 11

In [11]:
def get_start_review_page_number_for_year_by_company_url(company_url: str, year: int, sp: int = None, ep: int = None):
    """Search a company's review pages for the first review written in ``year``.

    Each iteration probes the left, right and middle page of the current
    window with ``get_start_review_number_for_year_in_page`` and narrows the
    window according to the value returned for the middle page.

    Args:
        company_url: Glassdoor overview URL of the company.
        year: Year whose first review is wanted.
        sp: First page of the search window; a falsy value means page 1.
        ep: Last page of the search window; a falsy value means the last page.

    Returns:
        ``{"start_page": page, "start_index": index}`` where ``index`` is the
        value reported for that page, or an empty dict if the window is
        exhausted without a decision.
    """
    reviews_per_page = 10

    login_to_glassdoor()
    number_of_reviews = get_number_of_reviews(company_url)
    # Ceiling division: a partially filled last page is still a page. Floor
    # division skipped it and produced zero pages for fewer than 10 reviews.
    number_of_pages = (number_of_reviews + reviews_per_page - 1) // reviews_per_page
    if not sp:
        sp = 1
    if not ep:
        ep = number_of_pages
    lp, mp, rp = sp, ((ep - sp) // 2) + sp, ep
    start_dict = {}
    while lp <= rp:
        # Left page: anything other than 11 settles the search on this page.
        li = get_start_review_number_for_year_in_page(company_url, lp, year)
        if li != 11:
            start_dict["start_page"], start_dict["start_index"] = lp, li
            break

        # Right page: anything other than 0 or 1 settles the search on this page.
        ri = get_start_review_number_for_year_in_page(company_url, rp, year)
        if ri != 0 and ri != 1:
            start_dict["start_page"], start_dict["start_index"] = rp, ri
            break

        # Middle page: positions 2-10 or -1 settle the search on this page.
        mi = get_start_review_number_for_year_in_page(company_url, mp, year)
        if (2 <= mi and mi <= 10) or mi == -1:
            start_dict["start_page"], start_dict["start_index"] = mp, mi
            break
        elif mi == 0:
            # Middle page holds only later years: continue left of it.
            lp = lp + 1
            rp = mp - 1
        elif mi == 1:
            # The match is the first review on the middle page; the year may
            # begin on an earlier page, so the middle page stays in the window.
            lp = lp + 1
            rp = mp
        else:  # mi == 11
            # Middle page holds only earlier years: continue right of it.
            lp = mp + 1
            if ri == 0:
                rp = rp - 1
        mp = (lp + rp) // 2
    return start_dict

In [12]:
def get_end_review_page_number_for_year_by_company_url(company_url: str, year: int, sp: int = None, ep: int = None):
    """Search a company's review pages for the last review written in ``year``.

    Each iteration probes the left, right and middle page of the current
    window with ``get_end_review_number_for_year_in_page`` and narrows the
    window according to the value returned for the middle page.

    Args:
        company_url: Glassdoor overview URL of the company.
        year: Year whose last review is wanted.
        sp: First page of the search window; a falsy value means page 1.
        ep: Last page of the search window; a falsy value means the last page.

    Returns:
        ``{"end_page": page, "end_index": index}`` where ``index`` is the
        value reported for that page, or an empty dict if the window is
        exhausted without a decision.
    """
    reviews_per_page = 10

    login_to_glassdoor()
    number_of_reviews = get_number_of_reviews(company_url)
    # Ceiling division: a partially filled last page is still a page. Floor
    # division skipped it and produced zero pages for fewer than 10 reviews.
    number_of_pages = (number_of_reviews + reviews_per_page - 1) // reviews_per_page
    if not sp:
        sp = 1
    if not ep:
        ep = number_of_pages
    lp, mp, rp = sp, ((ep - sp) // 2) + sp, ep
    end_dict = {}
    while lp <= rp:
        # Left page: anything other than 10 or 11 settles the search on this page.
        li = get_end_review_number_for_year_in_page(company_url, lp, year)
        if li != 10 and li != 11:
            end_dict["end_page"], end_dict["end_index"] = lp, li
            break

        # Right page: anything other than 0 settles the search on this page.
        ri = get_end_review_number_for_year_in_page(company_url, rp, year)
        if ri != 0:
            end_dict["end_page"], end_dict["end_index"] = rp, ri
            break

        # Middle page: positions 1-9 or -1 settle the search on this page.
        mi = get_end_review_number_for_year_in_page(company_url, mp, year)
        if (1 <= mi and mi <= 9) or mi == -1:
            end_dict["end_page"], end_dict["end_index"] = mp, mi
            break
        elif mi == 10:
            # The match is the last review on the middle page; the year may
            # continue on a later page, so the middle page stays in the window.
            lp = mp
            rp = rp - 1
        elif mi == 11:
            # Middle page holds only earlier years: continue right of it.
            lp = mp + 1
            rp = rp - 1
        else:  # mi == 0
            # Middle page holds only later years: continue left of it.
            if li == 11:
                lp = lp + 1
            rp = mp - 1
        mp = (lp + rp) // 2
    return end_dict

In [13]:
def scrap_glassdoor_start_and_end_review_numbers_by_range(csv_path: str, start_year: int, end_year: int, metadata_path: str = None):
    """Record, per company, where the reviews of ``start_year``..``end_year`` begin and end.

    Args:
        csv_path: CSV with "Company Name" and "Glassdoor Company Page URL" columns.
        start_year: Year whose first review marks the start of the time frame.
        end_year: Year whose last review marks the end of the time frame.
        metadata_path: Optional CSV of previously computed rows; companies
            already listed in its "Company Name" column are skipped.

    The table is written to ``data/SP500_glassdoor_metadata_<today>.csv``
    after every new row, before any exception (including KeyboardInterrupt)
    is re-raised, and once more, sorted by company name, at the end.
    """
    def save(frame):
        # The path is rebuilt on every call so it carries the current date.
        frame.to_csv(f"data/SP500_glassdoor_metadata_{str(date.today())}.csv", index=False)

    login_to_glassdoor()
    companies_df = pd.read_csv(csv_path, encoding="utf-8")
    if not metadata_path:
        metadata_df = pd.DataFrame(columns=["Company Name", "Total Number of Reviews", "Start Year", "Page of First Review", "Index of First Review", "End Year", "Page of Last Review", "Index of Last Review", "Number of Reviews in Time Frame"])
    else:
        metadata_df = pd.read_csv(metadata_path)

    names = companies_df["Company Name"]
    urls = companies_df["Glassdoor Company Page URL"]
    for url, name in zip(tqdm(urls), names):
        try:
            if metadata_path and name in metadata_df["Company Name"].values:
                continue

            total_number_of_reviews = get_number_of_reviews(url)
            start_dict = get_start_review_page_number_for_year_by_company_url(url, start_year)
            print(start_dict)
            end_dict = get_end_review_page_number_for_year_by_company_url(url, end_year)
            print(end_dict)

            first_page, first_index = start_dict["start_page"], start_dict["start_index"]
            last_page, last_index = end_dict["end_page"], end_dict["end_index"]
            metadata_df.loc[len(metadata_df)] = {
                "Company Name": name,
                "Total Number of Reviews": total_number_of_reviews,
                "Start Year": start_year,
                "Page of First Review": first_page,
                "Index of First Review": first_index,
                "End Year": end_year,
                "Page of Last Review": last_page,
                "Index of Last Review": last_index,
                # Ten reviews per page; both end points are counted.
                "Number of Reviews in Time Frame": last_page*10 + last_index - first_page*10 - first_index + 1
            }
            save(metadata_df)
        except (KeyboardInterrupt, Exception) as error:
            save(metadata_df)
            raise error

    save(metadata_df.sort_values("Company Name"))

### Reviews

In [15]:
def _review_xpath(review_number: int, suffix: str) -> str:
    """Return the XPath of ``suffix`` inside the ``review_number``-th review card (1-based)."""
    return (
        f'//ol[contains(@class, "empReviews")]/li[{review_number}]'
        f'//div[contains(@class, "gdReview")]{suffix}'
    )


def _scrape_single_review(review_number: int, company_name: str) -> dict:
    """Read every field of one review card on the currently loaded page into a dict."""

    def element(suffix):
        return driver.find_element(By.XPATH, _review_xpath(review_number, suffix))

    def sub_score(label):
        # A sub-score is recorded only when exactly one matching star element exists.
        divs = driver.find_elements(By.XPATH, _review_xpath(review_number, f'/div[1]/div/div/aside/div/div/ul/li/div[contains(text(), "{label}")]/following-sibling::div'))
        if len(divs) != 1:
            return None
        return STARS_CLASSES_DICT[divs[0].get_attribute('class').split(" ")[0]]

    def level(suffix):
        # The level is encoded in the second CSS class of the icon span.
        return V_X_DICT[element(suffix).get_attribute('class').split(" ")[1]]

    review = {"Company Name": company_name}
    review["Total Score"] = STARS_CLASSES_DICT[element('/div[1]/div/div/div/div').get_attribute('class').split(" ")[0]]
    review["Work/Life Balance Score"] = sub_score("Work/Life Balance")
    # Bug fix: this value used to be read from the Work/Life Balance element.
    review["Culture & Values Score"] = sub_score("Culture & Values")
    review["Diversity & Inclusion Score"] = sub_score("Diversity & Inclusion")
    review["Career Opportunities Score"] = sub_score("Career Opportunities")
    review["Compensation and Benefits Score"] = sub_score("Compensation and Benefits")
    review["Senior Management Score"] = sub_score("Senior Management")

    # "<status>, <length>": the length part is optional.
    employment_parts = element('/div[1]/div/span').text.strip().split(",")
    review["Current/Former Employee"] = employment_parts[0].strip()
    review["Employment Length"] = employment_parts[1].strip() if len(employment_parts) == 2 else None

    review["Review Title"] = element('/div[2]/div/div[1]/h2').text.strip()

    # "<date> - <job>": the job is kept only when there is exactly one "-" and text after it.
    date_and_job_parts = element('/div[2]/div/div[1]/span').text.strip().split("-")
    review["Review Date"] = date_and_job_parts[0].strip()
    has_job = len(date_and_job_parts) == 2 and len(date_and_job_parts[1].strip()) > 0
    review["Job Description"] = date_and_job_parts[1].strip() if has_job else None

    review["Recommend Level"] = level('/div[2]/div/div[1]/div/div/div/span[contains(text(), "Recommend")]/preceding-sibling::span')
    review["CEO Approval Level"] = level('/div[2]/div/div[1]/div/div/div/span[contains(text(), "CEO Approval")]/preceding-sibling::span')
    review["Business Outlook Level"] = level('/div[2]/div[1]/div/div/div/div/span[contains(text(), "Business Outlook")]/preceding-sibling::span')

    review["Pros"] = element('/div[2]/div/div[2]//span[contains(@data-test, "pros")]').text.strip()
    review["Cons"] = element('/div[2]/div/div[2]//span[contains(@data-test, "cons")]').text.strip()

    # The first word is a number, or starts a sentence containing "Be" when nobody voted yet.
    helpful_first_word = element('/div[2]/div/div[2]/div[contains(@class, "common__EiReviewDetailsStyle__socialHelpfulcontainer pt-std")]').text.strip().split(" ")[0]
    review["Helpful Rate"] = int(helpful_first_word) if 'Be' not in helpful_first_word else 0
    return review


def scrape_glassdoor_reviews_in_page(page_url: str, company_name: str, start_index: int = 1, end_index: int = 10):
    """Scrape reviews ``start_index``..``end_index`` (1-based, inclusive) from one reviews page.

    The page is loaded up to ``REPEAT_TIMES`` times; rows collected during a
    failed attempt are discarded before the next one.

    Args:
        page_url: URL of the reviews page to load.
        company_name: Value stored in the "Company Name" column of every row.
        start_index: Position of the first review to read.
        end_index: Position of the last review to read.

    Returns:
        A DataFrame with one row per scraped review.

    Raises:
        WebDriverException: if the last attempt also fails.
        KeyError: if an element carries a CSS class missing from
            ``STARS_CLASSES_DICT`` / ``V_X_DICT`` (not retried).
    """
    columns = ["Company Name", "Total Score", "Work/Life Balance Score", "Culture & Values Score", "Diversity & Inclusion Score", "Career Opportunities Score", "Compensation and Benefits Score", "Senior Management Score", "Current/Former Employee", "Employment Length", "Review Title", "Review Date", "Job Description", "Recommend Level", "CEO Approval Level", "Business Outlook Level", "Pros", "Cons", "Helpful Rate"]
    reviews_df = pd.DataFrame(columns=columns)

    for attempt in range(1, REPEAT_TIMES + 1):
        # Start from an empty frame so a retry never duplicates rows.
        reviews_df = pd.DataFrame(columns=columns)
        try:
            driver.get(page_url)
            sleep(WAIT_TIME)
            # Presumably the fb-root element is used as a "page finished loading" marker.
            WebDriverWait(driver, WAIT_TIME).until(EC.presence_of_element_located((By.XPATH, '//div[contains(@class, " fb_reset") and contains(@id, "fb-root")]')))
            for i in range(start_index, end_index + 1):
                reviews_df.loc[len(reviews_df)] = _scrape_single_review(i, company_name)
            break
        except (WebDriverException, StaleElementReferenceException, NoSuchElementException, TimeoutException) as e:
            if attempt < REPEAT_TIMES:
                print("Error Message:", e.msg)
                print("Repeat:", attempt)
            else:
                raise

    return reviews_df

In [16]:
def scrape_glassdoor_reviews(csv_path: str, metadata_path: str):
    """Scrape the reviews of every company in ``metadata_path`` into ``data/reviews/<name>.csv``.

    Args:
        csv_path: CSV with "Company Name" and "Glassdoor Company Page URL" columns.
        metadata_path: CSV with "Company Name", "Page of First Review",
            "Index of First Review", "Page of Last Review" and
            "Index of Last Review" columns.

    An existing per-company CSV is treated as partial progress: scraping
    resumes right after the reviews it already holds. Progress is written to
    disk on every tenth page (starting with the first) and after the last page.
    """
    reviews_per_page = 10
    columns = ["Company Name", "Total Score", "Work/Life Balance Score", "Culture & Values Score", "Diversity & Inclusion Score", "Career Opportunities Score", "Compensation and Benefits Score", "Senior Management Score", "Current/Former Employee", "Employment Length", "Review Title", "Review Date", "Job Description", "Recommend Level", "CEO Approval Level", "Business Outlook Level", "Pros", "Cons", "Helpful Rate"]

    login_to_glassdoor()
    df = pd.read_csv(csv_path)
    metadata_df = pd.read_csv(metadata_path)

    # Look URLs up by company name. Zipping the two files positionally paired
    # names with the wrong URLs whenever their row order differed.
    url_by_company = dict(zip(df["Company Name"], df["Glassdoor Company Page URL"]))

    for name in tqdm(metadata_df["Company Name"]):
        url = url_by_company.get(name)
        if url is None:
            print("No Glassdoor URL found for", name, "- skipping")
            continue

        temp_reviews_dfs = []
        review_csv_path = f"data/reviews/{name}.csv"
        if os.path.exists(review_csv_path):
            reviews_df = pd.read_csv(review_csv_path)
        else:
            reviews_df = pd.DataFrame(columns=columns)

        company_row = metadata_df[metadata_df["Company Name"] == name].iloc[0]
        first_page = int(company_row["Page of First Review"])
        first_index = int(company_row["Index of First Review"])
        end_page = int(company_row["Page of Last Review"])
        end_index = int(company_row["Index of Last Review"])

        # 0-based position of the next review to fetch: the first review of the
        # time frame plus however many reviews are already stored on disk.
        next_position = (first_page - 1) * reviews_per_page + (first_index - 1) + len(reviews_df)
        start_page = next_position // reviews_per_page + 1
        start_index = next_position % reviews_per_page + 1

        for i, page in enumerate(range(start_page, end_page + 1)):
            reviews_page_url = get_glassdoor_reviews_url_by_company_and_page_number(url, page)
            # Only the first and last page of the range are read partially.
            si = start_index if page == start_page else 1
            ei = end_index if page == end_page else reviews_per_page
            temp_reviews_dfs.append(scrape_glassdoor_reviews_in_page(reviews_page_url, name, start_index=si, end_index=ei))
            if i % 10 == 0:
                # Periodic checkpoint so an interruption loses at most ten pages.
                reviews_df = pd.concat([reviews_df] + temp_reviews_dfs)
                reviews_df.to_csv(review_csv_path, index=False)
                temp_reviews_dfs = []

        reviews_df = pd.concat([reviews_df] + temp_reviews_dfs)
        reviews_df.to_csv(review_csv_path, index=False)

### Clean Data

In [45]:
def clean_reviews(old_dir: str, new_dir: str, years: list = None):
    """Keep only the reviews written in ``years`` for every CSV in ``old_dir``.

    Args:
        old_dir: Directory holding one ``<company>.csv`` per company, each
            with a "Review Date" column whose text ends in ", <year>".
        new_dir: Directory that receives the filtered CSVs under the same
            file names; it is created if it does not exist.
        years: Years to keep; defaults to 2017, 2018 and 2019. Values are
            compared as strings, so ints and strings are both accepted.

    A summary with the columns "Company Name" and "Number of Reviews" is
    written to ``SP500_2020_reviews_metadata.csv`` in the working directory.
    """
    if years is None:
        years = ['2017', '2018', '2019']
    wanted_years = [str(y) for y in years]

    os.makedirs(new_dir, exist_ok=True)

    summary_rows = []
    for f in tqdm(glob(os.path.join(old_dir, "*.csv"))):
        # basename/splitext work with both "/" and "\\" separators; splitting
        # on "\\" alone left the directory in the name on non-Windows systems.
        company_name = os.path.splitext(os.path.basename(f))[0].strip()
        company_df = pd.read_csv(f)
        # astype(str) keeps a missing date from crashing the split; such rows
        # simply fail the year test and are dropped.
        review_years = company_df['Review Date'].astype(str).str.rsplit(',', n=1).str[-1].str.strip()
        company_df = company_df.loc[review_years.isin(wanted_years)].reset_index(drop=True)
        company_df.to_csv(os.path.join(new_dir, f"{company_name}.csv"), index=False)
        summary_rows.append({'Company Name': company_name, 'Number of Reviews': len(company_df)})

    summary_df = pd.DataFrame(summary_rows, columns=["Company Name", "Number of Reviews"])
    summary_df.to_csv('SP500_2020_reviews_metadata.csv', index=False)

## DB Utils

## Run

In [17]:
# Inputs for the company-metadata scrape; the call itself is left commented out.
CSV_PATH = "data/SP500_2020_components.csv"
META_DATA_PATH = "data/SP500_glassdoor_metadata_2022-06-27.csv"
# scrape_glassdoor_codes_by_companies_csv(CSV_PATH)

In [18]:
# Inputs for locating the first/last review of 2017-2019 per company;
# the call itself is left commented out.
CSV_PATH = "data/SP500_2020_glassdoor_metadata.csv"
META_DATA_PATH = "data/SP500_glassdoor_metadata_2022-06-27.csv"
# scrap_glassdoor_start_and_end_review_numbers_by_range(CSV_PATH, 2017, 2019, META_DATA_PATH)

In [19]:
# NOTE(review): check_reviews_metadata is not defined in the visible part of
# this notebook - confirm it exists before uncommenting the call.
CSV_PATH = "SP500_2020_glassdoor_metadata_part.csv"
META_DATA_PATH = "SP500_2020_reviews_glassdoor_metadata_part.csv"
# check_reviews_metadata(CSV_PATH, META_DATA_PATH)

In [None]:
# Inputs for the full review scrape; the call itself is left commented out.
CSV_PATH = "data/SP500_2020_glassdoor_metadata.csv"
META_DATA_PATH = "data/SP500_2020_reviews_glassdoor_metadata.csv"
# scrape_glassdoor_reviews(CSV_PATH, META_DATA_PATH)

In [46]:
# Source and destination directories for clean_reviews; the call itself is
# left commented out.
OLD_DIR = "data/reviews_raw"
NEW_DIR = "data/reviews_clean"
# clean_reviews(OLD_DIR, NEW_DIR)

  0%|          | 0/492 [00:00<?, ?it/s]