In [None]:

# Setup
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd
from page_objects.login_page import LoginPage
from page_objects.companies_page import CompaniesPage
from page_objects.nav import Nav
from page_objects.company_details_page import CompanyDetailPage
from page_objects.utils import get_text_from_elem
import random
import os
from datetime import date
import logging
from functools import partial

# Browser configuration for the crawl session.
options = Options()
# NOTE(review): Chrome documents this switch as window-size=<width>,<height>; the
# "1200x600" spelling may be silently ignored. Left untouched because changing the
# effective window size could alter the page layout the page objects rely on — confirm.
options.add_argument("window-size=1200x600")

driver = webdriver.Chrome(options=options)

driver.get("https://www.glassdoor.sg")

# Output directory
COMPANY_REVIEWS_DATA = './data'

# logging.basicConfig(filename=...) opens the log file immediately, so the output
# directory has to exist first; on a fresh checkout it would otherwise raise
# FileNotFoundError.
os.makedirs(COMPANY_REVIEWS_DATA, exist_ok=True)

logging.basicConfig(filename=f'{COMPANY_REVIEWS_DATA}/crawl_reviews.log', encoding='utf-8', level=logging.INFO)

# Init page objects (all of them share the single Selenium driver created above)
login_page = LoginPage(driver)
nav = Nav(driver)
companies_page = CompaniesPage(driver)
companies_details_page = CompanyDetailPage(driver)

In [None]:
# Login
# Delegates entirely to the LoginPage page object; how credentials are obtained and
# submitted is defined in page_objects.login_page and is not visible in this notebook.
login_page.login()

In [None]:
# Route to 'Companies' page
nav.route_to_companies()

# Filter by location and job function in 'Companies' page
# NOTE(review): the two location arguments look like a search term followed by the
# suggestion to select — confirm against CompaniesPage.filter_by_location.
companies_page.filter_by_location('Singapore', 'Singapore (Singapore)')
companies_page.filter_by_job_function(['Engineering', 'Information Technology'])

In [None]:
# Name of the CSV (inside COMPANY_REVIEWS_DATA) that accumulates reviews for all companies.
csv_filename = 'company_reviews.csv'

def crawl_reviews():
    """Scrape the reviews of the company detail page the driver is currently on.

    Switches to the 'Reviews' tab, applies the location and job-function filter
    options, then lets ``companies_details_page.iterate_pages`` call a collector
    for every review element on pages 1-20. Whatever was collected is appended to
    ``{COMPANY_REVIEWS_DATA}/{csv_filename}``; the header row is written only when
    that file does not exist yet.

    The CSV is written in a ``finally`` clause, so reviews gathered before a
    failure are still persisted.

    Raises:
        Any exception raised by ``iterate_pages`` is re-raised after the partial
        results have been written, so the caller's own handler can deal with it.
    """
    companies_details_page.change_tab('Reviews')
    time.sleep(random.randint(1, 2))
    companies_details_page.toggle_filter()
    time.sleep(random.randint(1, 2))
    # NOTE(review): the option ids below are presumably the Singapore location and the
    # job-function entry matching the listing filter — confirm against the page markup.
    companies_details_page.filter_by_location('option_N\\\\,217')
    time.sleep(random.randint(2, 3))
    companies_details_page.filter_by_job_fn('option_1007')
    time.sleep(random.randint(2, 3))

    # Rows are gathered in a plain list and turned into a DataFrame once at the end,
    # instead of re-concatenating a DataFrame for every review.
    records = []
    company_name = companies_details_page.get_emp_name()

    def save_company_reviews(elem, idx):
        """Extract one review element into a record; ``idx`` is unused but part of the callback signature."""
        records.append({
            'company_name': company_name,
            'date_posted': get_text_from_elem(elem, 'span.review-details__review-details-module__reviewDate'),
            'rating': get_text_from_elem(elem, 'span.review-details__review-details-module__overallRating'),
            'review_title': get_text_from_elem(elem, 'h2[data-test="review-details-title"]'),
            'job_position': get_text_from_elem(elem, 'span.review-details__review-details-module__employee'),
            'job_location': get_text_from_elem(elem, 'span.review-details__review-details-module__location'),
            'pro_review': get_text_from_elem(elem, 'span[data-test="review-text-pros"]'),
            'con_review': get_text_from_elem(elem, 'span[data-test="review-text-cons"]'),
        })

    try:
        # Errors changing page propagate to the companies_page.iterate_pages handler.
        companies_details_page.iterate_pages(
            save_company_reviews,
            start=1,
            end=20,
            suppress_page_error=True
        )
    finally:
        if records:
            os.makedirs(COMPANY_REVIEWS_DATA, exist_ok=True)
            # Check and write the SAME path: previously the existence check looked in
            # COMPANY_REVIEWS_DATA while the write went to the working directory, so the
            # file was rewritten (header included) for every company.
            csv_path = os.path.join(COMPANY_REVIEWS_DATA, csv_filename)
            already_exists = os.path.isfile(csv_path)
            # The index column is still written, keeping the column layout unchanged.
            pd.DataFrame(records).to_csv(
                csv_path,
                mode='a' if already_exists else 'w',
                header=not already_exists,
            )

In [None]:
def open_company_details(elem, idx):
    """Open one company from the listing, crawl its reviews, and return to the listing.

    Clicks ``elem``, switches the driver to the second window handle, runs
    ``crawl_reviews()`` there, then closes that window and switches back to the
    first handle.

    Args:
        elem: Clickable listing element for the company.
        idx: Position supplied by the page iterator; not used here.

    Raises:
        IndexError: if the click did not produce a second window handle.
        Any exception from ``crawl_reviews()`` — re-raised only after the detail
        window has been closed and focus restored to the listing.
    """
    elem.click()
    time.sleep(random.randint(1, 3))
    windows = driver.window_handles
    # assumes the click opened the company page as the second handle — TODO confirm
    driver.switch_to.window(windows[1])
    try:
        crawl_reviews()
    finally:
        # Always close the detail window and refocus the listing. Without this, a
        # failed crawl left the driver on the detail tab and every later company
        # was processed against the wrong window while tabs piled up.
        driver.close()
        driver.switch_to.window(windows[0])
    time.sleep(random.randint(1, 3))

In [None]:
# Kick off the crawl: hand open_company_details to the companies listing iterator.
# NOTE(review): presumably iterate_pages invokes the callback once per listed company
# across listing pages 1-100 and, with suppress_page_error=True, keeps going after a
# per-page failure — confirm in CompaniesPage.iterate_pages.
logging.info('Started crawling company reviews.')
companies_page.iterate_pages(
    open_company_details,
    start=1,
    end=100,
    suppress_page_error=True
)