In [None]:
from selenium.common.exceptions import (NoSuchElementException, ElementClickInterceptedException,
WebDriverException, StaleElementReferenceException)
from selenium import webdriver

import time

import pandas as pd
import numpy as np

In [None]:
# Filesystem location of the geckodriver executable handed to get_jobs().
path = "/usr/bin/geckodriver"

In [None]:
# (output column, XPath of the element whose text fills it), in output column order.
_JOB_FIELD_XPATHS = (
    ("Company", './/div[@class = "css-87uc0g e1tk4kwz1"]'),
    ("Location", './/div[@class = "css-56kyx5 e1tk4kwz5"]'),
    ("Title", './/div[@class = "css-1vg6q84 e1tk4kwz4"]'),
    ("Rating", './/span[@class = "css-1m5m32b e1tk4kwz2"]'),
    ("Salary", './/span[@class = "css-56kyx5 css-16kxj2j e1wijj242"]'),
    ("Description", './/div[@class = "jobDescriptionContent desc"]'),
    ("Founded", './/div[@class = "infoEntity"]//label[text() = "Founded"]//following-sibling::*'),
    ("Size", './/div[@class = "infoEntity"]//label[text() = "Size"]//following-sibling::*'),
    ("Industry", './/div[@class = "infoEntity"]//label[text() = "Industry"]//following-sibling::*'),
    ("Sector", './/div[@class = "infoEntity"]//label[text() = "Sector"]//following-sibling::*'),
    ("Type", './/div[@class = "infoEntity"]//label[text() = "Type"]//following-sibling::*'),
    ("Revenue", './/div[@class = "infoEntity"]//label[text() = "Revenue"]//following-sibling::*'),
)


def _build_search_url(keyword):
    '''Return the Glassdoor job-search URL for ``keyword``, URL-encoding the keyword.'''
    from urllib.parse import quote_plus

    return 'https://www.glassdoor.com/Job/jobs.htm?sc.keyword=' + quote_plus(keyword)


def _dismiss_overlays(driver):
    '''Best-effort click on the "selected" element and then on the close icon, ignoring failures.'''
    # Presumably this provokes and then closes Glassdoor's sign-up prompt -- TODO confirm.
    try:
        driver.find_element_by_class_name("selected").click()
    except (ElementClickInterceptedException, NoSuchElementException,
            StaleElementReferenceException):
        pass

    time.sleep(.1)

    try:
        driver.find_element_by_id("prefix__icon-close-1").click()
    except (NoSuchElementException, StaleElementReferenceException):
        pass


def _scrape_job_details(driver, max_attempts=5):
    '''Read every field of the currently open posting; return a dict, or None if all attempts fail.'''
    for _ in range(max_attempts):
        record = {}
        try:
            for field, xpath in _JOB_FIELD_XPATHS:
                try:
                    record[field] = driver.find_element_by_xpath(xpath).text
                except NoSuchElementException:
                    # The posting simply does not show this field.
                    record[field] = np.nan
            return record
        except WebDriverException:
            # E.g. the element went stale while the detail pane was re-rendering:
            # wait and re-read the whole record. Only Selenium errors are retried,
            # so KeyboardInterrupt (kernel interrupt) still stops the scrape.
            time.sleep(1)
    return None


def get_jobs(keyword, num_jobs, path, sleep_time):

    '''
    Gathers jobs as a dataframe, scraped from Glassdoor

    Opens a Firefox session, searches Glassdoor for ``keyword`` and clicks through
    the listed postings page by page until ``num_jobs`` records were collected or
    there is no "next" page link. The browser is always closed before returning
    or raising.

    NOTE(review): this uses the Selenium 3 API (``executable_path=``,
    ``find_element_by_*``); newer Selenium 4 releases removed it -- confirm the
    pinned Selenium version before upgrading.

    Parameters
    ----------
    keyword : str
        Search term; it is URL-encoded before being placed in the query string.
    num_jobs : int
        Target number of postings to collect.
    path : str
        Path of the geckodriver executable.
    sleep_time : float
        Seconds to wait at the start of every result page.

    Returns
    -------
    pandas.DataFrame
        One row per scraped posting with the columns Company, Location, Title,
        Rating, Salary, Description, Founded, Size, Industry, Sector, Type and
        Revenue. Fields missing from a posting are NaN. The columns are present
        even when no posting was collected.
    '''

    options = webdriver.FirefoxOptions()
    driver = webdriver.Firefox(executable_path=path, options=options)
    jobs = []

    try:
        driver.set_window_size(1120, 1000)
        driver.get(_build_search_url(keyword))

        while len(jobs) < num_jobs:

            time.sleep(sleep_time)
            _dismiss_overlays(driver)

            for job_button in driver.find_elements_by_class_name("jl"):

                if len(jobs) != 0:
                    print("Progress: {}/{}".format(len(jobs), num_jobs))
                if len(jobs) >= num_jobs:
                    break

                try:
                    job_button.click()
                except StaleElementReferenceException:
                    # The posting could not be opened, so the detail pane still shows
                    # another job: skip it instead of recording the wrong data.
                    continue
                time.sleep(1.5)

                record = _scrape_job_details(driver)
                if record is None:
                    print("Skipping a job posting whose details could not be read.")
                    continue
                jobs.append(record)

            if len(jobs) >= num_jobs:
                # Target reached: no need to load another result page.
                break

            try:
                driver.find_element_by_xpath('.//li[@class="next"]//a').click()
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break

            try:
                driver.find_element_by_xpath("//button[contains(text(), 'Retry your search')]").click()
            except NoSuchElementException:
                pass
    finally:
        # Always release the browser, including when scraping raises.
        driver.quit()

    return pd.DataFrame(jobs, columns=[field for field, _ in _JOB_FIELD_XPATHS])

In [None]:
# Ask how many postings to collect, then time the whole scrape.
number_of_jobs = int(input("Enter the number of jobs to scrape: "))

start = time.time()
df = get_jobs(keyword='data scientist', num_jobs=number_of_jobs, path=path, sleep_time=1)
end = time.time()

# Reports the requested count, not the number of rows actually scraped.
print("Scraping {} jobs took {} seconds".format(number_of_jobs, round(end - start, 2)))

In [None]:
# Persist the scraped postings; index=False keeps the row index out of the file.
df.to_csv(
    'glassdoor data scientist salary.csv',
    index=False,
)