In [None]:
import json
import os
import random
import re
import traceback
from datetime import datetime
from time import sleep
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

In [None]:
def save_results(output_file, header, row):
    """Append a single result row to a CSV file.

    The column names are emitted only when the target file does not exist
    yet, so repeated calls build up one CSV with exactly one header line.

    Args:
        output_file: Path of the CSV file to append to.
        header: Column names to write when the file is being created.
        row: Sequence of values making up one CSV line.
    """
    # Decide up front whether this call is the one that creates the file;
    # pandas accepts either a list of column names or False for `header`.
    file_is_new = not os.path.isfile(output_file)
    header_arg = header if file_is_new else False

    # A one-row frame is the simplest way to let pandas handle CSV quoting.
    single_row = pd.DataFrame([row])
    single_row.to_csv(output_file, mode='a', index=False,
                      header=header_arg, encoding='utf-8-sig')


def create_session():
    """Create a ``requests`` session preloaded with the scraper's headers.

    Returns:
        requests.Session: A new session whose default headers include a
        Firefox ``User-Agent`` string and ``X-Requested-With: XMLHttpRequest``.
    """
    session = requests.Session()
    # Set each default header individually on the session's header mapping.
    session.headers['User-Agent'] = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:75.0) Gecko/20100101 Firefox/75.0'
    )
    session.headers["X-Requested-With"] = "XMLHttpRequest"
    return session


def get_search_page(session, base_url, company):
    """Resolve the search-results URL for a company keyword.

    Requests ``base_url`` with the keyword filled into both of its ``{}``
    placeholders, then pulls the ``untranslatedUrl`` value out of the inline
    script that mentions ``window.gdGlobals``.

    Args:
        session: Object with a ``get(url)`` method whose result exposes
            ``.text`` (e.g. a ``requests.Session``).
        base_url: URL template containing two ``{}`` placeholders for the
            search keyword.
        company: Search keyword. It is percent-encoded before insertion.

    Returns:
        str: The URL stored in the page's ``untranslatedUrl`` field.

    Raises:
        IndexError: If the page has no script mentioning
            ``window.gdGlobals``.
    """
    # Percent-encode the keyword so characters such as '&', '#' or '+' stay
    # part of the query value instead of being parsed as URL syntax (a raw
    # '&' would otherwise cut the keyword short). Spaces become %20.
    keyword = quote(str(company), safe='')
    resp = session.get(base_url.format(keyword, keyword))
    soup = bs(resp.text, 'html5lib')
    # `string=` is the current name of the filter formerly called `text=`.
    scripts = soup.find_all('script', string=re.compile('window.gdGlobals'))
    if not scripts:
        # Same exception type as indexing an empty list, but with a message
        # that says what was actually missing.
        raise IndexError(
            f"No 'window.gdGlobals' script found in search page for {company!r}")
    str_json = scripts[0].get_text()
    # Take the text after the last "untranslatedUrl' : " marker up to the
    # next '}', then drop the surrounding quote characters.
    search_url = str_json.split(
        "untranslatedUrl' : ")[-1].split('}')[0].strip()[1:-1]
    print(f"Search url found: {search_url}")
    return search_url


def get_english_review_count(session, link):
    """Fetch a reviews page and read its English and total review counts.

    Args:
        session: Object with a ``get(url)`` method whose result exposes
            ``.text`` (e.g. a ``requests.Session``).
        link: URL of the reviews page to fetch.

    Returns:
        tuple: ``(english_reviews, total_reviews)``.

        ``english_reviews`` is the first whitespace-separated token inside
        the last parenthesised group of the zero-results button label, when
        that label mentions "english"; otherwise ``''``.

        ``total_reviews`` is the stripped text of the ``num h2`` span inside
        the ``data-label="Reviews"`` link, ``''`` when that text is ``'--'``,
        and ``None`` when the link or the span is missing.
    """
    resp = session.get(link)
    soup = bs(resp.text, 'html5lib')

    english_reviews = ''
    # The container's class name carries a generated suffix, so match on the
    # stable prefix only.
    zero_results = soup.find(
        'div',
        class_=lambda x: x and x.startswith(
            'zeroResults__ZeroResultsStyles__zeroResults'))
    language_button = zero_results.find('button') if zero_results is not None else None
    if language_button is not None:
        label = language_button.text.strip()
        if 'english' in label.lower():
            # Guard against an empty "(...)" group, which would otherwise
            # raise IndexError on the [0] lookup.
            tokens = label.split('(')[-1].split(')')[0].split()
            english_reviews = tokens[0] if tokens else ''

    # The Reviews link itself may be absent; treat that the same way as a
    # missing counter span instead of failing on None.find(...).
    reviews_tab = soup.find('a', {'data-label': 'Reviews'})
    counter = reviews_tab.find('span', class_='num h2') if reviews_tab is not None else None
    total_reviews = counter.text.strip() if counter is not None else None
    if total_reviews == '--':
        total_reviews = ''
    return english_reviews, total_reviews


def get_all_ratings(session, company_id):
    """Fetch the per-category ratings for one employer.

    Args:
        session: Object with a ``get(url)`` method whose result exposes
            ``.json()`` (e.g. a ``requests.Session``).
        company_id: Employer identifier inserted into the ratings URL.

    Returns:
        tuple: ``(overall, comp_and_benefits, culture_and_values,
        career_opportunities, work_life, senior_management)``. Each item is
        the entry's ``value`` when its ``hasRating`` flag is truthy, else
        ``None``; categories absent from the response are ``None`` too.
    """
    endpoint = 'https://www.glassdoor.com/api/employer/{}-rating.htm'.format(company_id)
    payload = session.get(endpoint).json()

    # Rating types in the order they appear in the returned tuple.
    rating_types = (
        'overallRating',
        'compAndBenefits',
        'cultureAndValues',
        'careerOpportunities',
        'workLife',
        'seniorManagement',
    )
    collected = [None] * len(rating_types)
    for entry in payload['ratings']:
        entry_type = entry['type']
        if entry_type not in rating_types:
            continue
        # A later entry of the same type overwrites an earlier one.
        slot = rating_types.index(entry_type)
        collected[slot] = entry['value'] if entry['hasRating'] else None
    return tuple(collected)


def _save_company_row(session, header, company_org, company_name,
                      company_id, reviews_href):
    """Gather ratings and review counts for one company and append the CSV row."""
    company_reviews_link = 'https://www.glassdoor.com' + reviews_href
    english_reviews, total_reviews = get_english_review_count(
        session, company_reviews_link)
    ratings = get_all_ratings(session, company_id)
    data = [company_org, company_name, *ratings, english_reviews, total_reviews]
    print(data)
    # `output_file` is a module-level name defined in the notebook's driver cell.
    save_results(output_file, header, data)


def get_companies_list(session, search_url, company, company_alias, company_org):
    """Find the company on a search/overview page and save its stats row.

    Fetches ``search_url`` and handles the two page shapes it can lead to:

    * a result list, from which the first entry whose displayed name
      contains ``company`` (case-insensitive) is used;
    * a company overview page (URL contains ``'Overview'``), which is used
      directly.

    For the chosen company the ratings and review counts are collected and
    appended as one row to the CSV at the module-level ``output_file`` path,
    which must be defined before this function is called.

    Args:
        session: Object with a ``get(url)`` method whose result exposes
            ``.text`` (e.g. a ``requests.Session``).
        search_url: URL returned by the search step.
        company: Cleaned company name used for matching result names.
        company_alias: Accepted for interface compatibility; not used here.
        company_org: Original company name, written to the first CSV column.

    Raises:
        AttributeError, TypeError, KeyError: When an expected element or
            attribute is missing from the fetched page. These are left to
            propagate to the caller.
    """
    resp = session.get(search_url)
    soup = bs(resp.text, 'lxml')
    header = ['Company Name (Original)', 'Company Name (Found)',
              'Overall Rating', 'Comp & Benefits Rating', 'Culture & Values Rating',
              'Career Opportunities Rating', 'Work/Life Balance Rating',
              'Senior Management Rating', 'English Reviews', 'Total Reviews']
    # We get 2 types of results when searching for companies.
    # The first type gives us a list to choose from.
    # The second type takes us directly to the company's homepage.
    # The second occurs very rarely, like in the case of United Airlines.
    if 'Overview' not in search_url:
        search_results = soup.find('div', id='SearchResults')
        all_companies = search_results.find_all(
            'div', class_='single-company-result')
        for each_company in all_companies:
            company_name = each_company.find('h2').find('a').text.strip()
            if company.lower() not in company_name.lower():
                continue
            # Only the matching result's attributes are read, so a
            # non-matching entry with unusual markup cannot abort the search.
            _save_company_row(
                session, header, company_org, company_name,
                each_company['data-emp-id'],
                each_company.find('a', class_='reviews')['href'])
            break
        else:
            # No result name contained the searched company; say so instead
            # of finishing silently with nothing written.
            print(f"No matching company found for {company_org}")
    else:
        company_name = soup.find(
            'span', id='DivisionsDropdownComponent').text.strip()
        company_id = soup.find('div', id='EmpBasicInfo')['data-emp-id']
        reviews_href = soup.find('a', class_='reviews')['href']
        _save_company_row(session, header, company_org, company_name,
                          company_id, reviews_href)

In [None]:
# Destination CSV; get_companies_list reads this module-level name when it
# saves each row.
output_file = '/content/glassdoor_stats_inc5000.csv'

# Load the input sheet and keep column 2 (unpacked below as the industry)
# followed by column 0 (unpacked below as the company name).
df = pd.read_csv('/content/Inc v2.csv')
companies = df.iloc[:, [2, 0]].to_numpy()

# Search-form URL template; both placeholders receive the search keyword.
base_url = 'https://www.glassdoor.com/Reviews/company-reviews.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword={}&sc.keyword={}&locT=&locId=&jobType='
session = create_session()

for index, (industry, company) in enumerate(companies):
    try:
        # Keep the untouched name for the output row, then drop any
        # parenthesised suffix before searching.
        company_org = company
        company = company.split('(')[0].strip()
        # Airlines are searched with the industry appended to the name.
        company_alias = (company + ' ' + industry) if industry == 'Airlines' else company
        print(f"\nWorking on {index} / {company_alias}")
        # The results URL is generated dynamically by the site, so it has to
        # be looked up from the search page rather than built directly.
        search_url = get_search_page(session, base_url, company_alias)
        # Pick the matching company and append its stats to the CSV.
        get_companies_list(session, search_url, company,
                           company_alias, company_org)
    except Exception:
        # Report the failure, then continue with a brand-new session.
        print(traceback.format_exc())
        session = create_session()
    # Pause a random 2-5 seconds between companies.
    sleep(random.randint(2, 5))


In [None]:
# Colab helper cell: `google.colab` is imported here, separately from the
# notebook's main import cell (presumably because it only exists on Colab
# runtimes -- confirm before moving it).
from google.colab import files
# Hand the results CSV to Colab's download helper. The path is the same
# literal assigned to `output_file` in the scraping cell; keep the two in sync.
files.download('/content/glassdoor_stats_inc5000.csv')