In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, WebDriverException, StaleElementReferenceException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

In [2]:
# Filesystem location of the chromedriver executable; it is handed to get_jobs()
# further down. Change it to wherever chromedriver lives on your machine.
path = "/usr/bin/chromedriver"

In [9]:
#Value stored for any field that could not be read from the page.
_NOT_FOUND = -1

#How many times the job detail pane is read before the listing is skipped.
_MAX_DETAIL_ATTEMPTS = 5

#(output dictionary key, label text on the "Company" tab), in output column order.
_COMPANY_FIELDS = (
    ("Size", "Size"),
    ("Founded", "Founded"),
    ("Type of ownership", "Type"),
    ("Industry", "Industry"),
    ("Sector", "Sector"),
    ("Revenue", "Revenue"),
)


def _text_or_not_found(driver, xpath):
    '''Returns the text of the first element matching xpath, or -1 if it cannot be read.'''
    try:
        return driver.find_element(By.XPATH, xpath).text
    except WebDriverException:
        #NoSuchElementException and StaleElementReferenceException are both
        #subclasses of WebDriverException, so one clause covers all of them.
        return _NOT_FOUND


def _read_job_details(driver):
    '''Reads the mandatory fields of the currently open job; returns None if they never load.'''
    for attempt in range(_MAX_DETAIL_ATTEMPTS):
        try:
            return {
                "Company Name": driver.find_element(By.XPATH, './/div[@class="employerName"]').text,
                "Location": driver.find_element(By.XPATH, './/div[@class="location"]').text,
                "Job Title": driver.find_element(By.XPATH, './/div[contains(@class, "title")]').text,
                "Job Description": driver.find_element(By.XPATH, './/div[@class="jobDescriptionContent desc"]').text,
            }
        except WebDriverException:
            #The pane may still be rendering: wait and retry, but only a bounded
            #number of times so a broken listing cannot hang the whole scrape.
            if attempt + 1 < _MAX_DETAIL_ATTEMPTS:
                time.sleep(5)
    return None


def _read_company_overview(driver):
    '''Opens the "Company" tab of the open job and reads its fields; unreadable fields are -1.'''
    try:
        driver.find_element(By.XPATH, './/div[@class="tab" and @data-tab-type="overview"]').click()
    except WebDriverException:
        #Rarely, some job postings do not have the "Company" tab.
        return {key: _NOT_FOUND for key, _ in _COMPANY_FIELDS}

    return {
        key: _text_or_not_found(
            driver,
            './/div[@class="infoEntity"]//label[text()="' + label + '"]//following-sibling::*',
        )
        for key, label in _COMPANY_FIELDS
    }


def get_jobs(keyword, num_jobs, verbose, path, sleep_time):

    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    Parameters:
        keyword: search term, concatenated into the Glassdoor search URL.
        num_jobs: number of job records to collect before stopping.
        verbose: when truthy, every collected field is printed for debugging.
        path: location of the chromedriver executable.
        sleep_time: seconds to sleep at the start of every result page.

    Returns:
        A pandas DataFrame with one row per collected job and the columns
        "Job Title", "Salary Estimate", "Job Description", "Rating",
        "Company Name", "Location", "Size", "Founded", "Type of ownership",
        "Industry", "Sector" and "Revenue". Optional fields that could not be
        read hold -1. Fewer than num_jobs rows are returned when no
        "next page" link is found.

    The browser is always closed before returning, including when an
    exception propagates to the caller.
    '''

    #Initializing the webdriver
    options = webdriver.ChromeOptions()

    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')

    #NOTE(review): executable_path is deprecated in Selenium 4 and reportedly
    #removed in later 4.x releases - confirm against the installed version.
    driver = webdriver.Chrome(executable_path=path, options=options)

    jobs = []

    try:
        driver.set_window_size(1120, 1000)

        url = 'https://www.glassdoor.com/Job/jobs.htm?&sc.keyword=" ' + keyword + ' "&suggestCount=0&suggestChosen=false&clickSource=searchBtn&jobType=&context=Jobs&dropdown=0'
        driver.get(url)

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            #Let the page load. Change this number based on your internet speed.
            time.sleep(sleep_time)

            #Test for the "Sign Up" prompt and get rid of it.
            try:
                driver.find_element(By.CLASS_NAME, "selected").click()
            except (NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException):
                pass

            time.sleep(.1)

            try:
                driver.find_element(By.ID, "prefix__icon-close-1").click()  #clicking to the X.
            except (NoSuchElementException, StaleElementReferenceException):
                pass

            #Going through each job in this page.
            #"jl" stands for Job Listing. These are the buttons we're going to click.
            for job_button in driver.find_elements(By.CLASS_NAME, "jl"):

                if len(jobs) != 0:
                    print("Progress: {}/{}".format(len(jobs), num_jobs))
                if len(jobs) >= num_jobs:
                    break

                try:
                    job_button.click()
                except StaleElementReferenceException:
                    #The listing vanished from the DOM; the detail pane still shows
                    #the previous job, so reading it now would record a duplicate.
                    continue
                time.sleep(1.5)

                details = _read_job_details(driver)
                if details is None:
                    print("Skipping a job listing: details did not load after {} attempts.".format(_MAX_DETAIL_ATTEMPTS))
                    continue

                #Optional fields: you need a "not found" value. It's important.
                salary_estimate = _text_or_not_found(driver, './/span[@class="css-1uyte9r css-hca4ks e1wijj242"]')
                rating = _text_or_not_found(driver, './/span[@class="rating"]')

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(details["Job Title"]))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(details["Job Description"][:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(details["Company Name"]))
                    print("Location: {}".format(details["Location"]))

                #Going to the Company tab...
                company = _read_company_overview(driver)

                if verbose:
                    print("Size: {}".format(company["Size"]))
                    print("Founded: {}".format(company["Founded"]))
                    print("Type of Ownership: {}".format(company["Type of ownership"]))
                    print("Industry: {}".format(company["Industry"]))
                    print("Sector: {}".format(company["Sector"]))
                    print("Revenue: {}".format(company["Revenue"]))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                #Add the job to jobs. Key order fixes the DataFrame column order.
                jobs.append({"Job Title" : details["Job Title"],
                             "Salary Estimate" : salary_estimate,
                             "Job Description" : details["Job Description"],
                             "Rating" : rating,
                             "Company Name" : details["Company Name"],
                             "Location" : details["Location"],
                             "Size" : company["Size"],
                             "Founded" : company["Founded"],
                             "Type of ownership" : company["Type of ownership"],
                             "Industry" : company["Industry"],
                             "Sector" : company["Sector"],
                             "Revenue" : company["Revenue"]})

            #Target reached: do not load another page, and do not report a
            #premature termination just because the last page has no "next" link.
            if len(jobs) >= num_jobs:
                break

            #Clicking on the "next page" button
            try:
                driver.find_element(By.XPATH, './/li[@class="next"]//a').click()
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
    finally:
        #Always release the browser process, even when scraping fails midway.
        driver.quit()

    return pd.DataFrame(jobs)  #Converts the list of job dictionaries into a pandas DataFrame.

In [13]:
# Ask how many job records to collect; int() raises ValueError on non-numeric input.
number_of_jobs = int(input("Enter the number of jobs to scrape: "))

start = time.time()

# Search term 'data scientist', verbose printing off, one-second pause per result page.
df = get_jobs('data scientist',number_of_jobs,False,path,1)

end = time.time()

# Report the number of rows actually collected: get_jobs() can stop early when it
# finds no "next page" link, so the requested count may overstate the result.
print("Scraping {} jobs took {} seconds".format(len(df),round((end - start),2)))

Enter the number of jobs to scrape: 2500
Progress: 1/2500
Progress: 2/2500
Progress: 3/2500
Progress: 4/2500
Progress: 5/2500
Progress: 6/2500
Progress: 7/2500
Progress: 8/2500
Progress: 9/2500
Progress: 10/2500
Progress: 11/2500
Progress: 12/2500
Progress: 13/2500
Progress: 14/2500
Progress: 15/2500
Progress: 16/2500
Progress: 17/2500
Progress: 18/2500
Progress: 19/2500
Progress: 20/2500
Progress: 21/2500
Progress: 22/2500
Progress: 23/2500
Progress: 24/2500
Progress: 25/2500
Progress: 26/2500
Progress: 27/2500
Progress: 28/2500
Progress: 29/2500
Progress: 30/2500
Progress: 31/2500
Progress: 32/2500
Progress: 33/2500
Progress: 34/2500
Progress: 35/2500
Progress: 36/2500
Progress: 37/2500
Progress: 38/2500
Progress: 39/2500
Progress: 40/2500
Progress: 41/2500
Progress: 42/2500
Progress: 43/2500
Progress: 44/2500
Progress: 45/2500
Progress: 46/2500
Progress: 47/2500
Progress: 48/2500
Progress: 49/2500
Progress: 50/2500
Progress: 51/2500
Progress: 52/2500
Progress: 53/2500
Progress: 54/2

Progress: 436/2500
Progress: 437/2500
Progress: 438/2500
Progress: 439/2500
Progress: 440/2500
Progress: 441/2500
Progress: 442/2500
Progress: 443/2500
Progress: 444/2500
Progress: 445/2500
Progress: 446/2500
Progress: 447/2500
Progress: 448/2500
Progress: 449/2500
Progress: 450/2500
Progress: 451/2500
Progress: 452/2500
Progress: 453/2500
Progress: 454/2500
Progress: 455/2500
Progress: 456/2500
Progress: 457/2500
Progress: 458/2500
Progress: 459/2500
Progress: 460/2500
Progress: 461/2500
Progress: 462/2500
Progress: 463/2500
Progress: 464/2500
Progress: 465/2500
Progress: 466/2500
Progress: 467/2500
Progress: 468/2500
Progress: 469/2500
Progress: 470/2500
Progress: 471/2500
Progress: 472/2500
Progress: 473/2500
Progress: 474/2500
Progress: 475/2500
Progress: 476/2500
Progress: 477/2500
Progress: 478/2500
Progress: 479/2500
Progress: 480/2500
Progress: 481/2500
Progress: 482/2500
Progress: 483/2500
Progress: 484/2500
Progress: 485/2500
Progress: 486/2500
Progress: 487/2500
Progress: 48

Progress: 868/2500
Progress: 869/2500
Progress: 870/2500
Progress: 871/2500
Progress: 872/2500
Progress: 873/2500
Progress: 874/2500
Progress: 875/2500
Progress: 876/2500
Progress: 877/2500
Progress: 878/2500
Progress: 879/2500
Progress: 880/2500
Progress: 881/2500
Progress: 882/2500
Progress: 883/2500
Progress: 884/2500
Progress: 885/2500
Progress: 886/2500
Progress: 887/2500
Progress: 888/2500
Progress: 889/2500
Progress: 890/2500
Progress: 891/2500
Progress: 892/2500
Progress: 893/2500
Progress: 894/2500
Progress: 895/2500
Progress: 896/2500
Progress: 897/2500
Progress: 898/2500
Progress: 899/2500
Progress: 900/2500
Progress: 901/2500
Progress: 902/2500
Progress: 903/2500
Progress: 904/2500
Progress: 905/2500
Progress: 906/2500
Progress: 907/2500
Progress: 908/2500
Progress: 909/2500
Progress: 910/2500
Progress: 911/2500
Progress: 912/2500
Progress: 913/2500
Progress: 914/2500
Progress: 915/2500
Progress: 916/2500
Progress: 917/2500
Progress: 918/2500
Progress: 919/2500
Progress: 92

Progress: 1285/2500
Progress: 1286/2500
Progress: 1287/2500
Progress: 1288/2500
Progress: 1289/2500
Progress: 1290/2500
Progress: 1291/2500
Progress: 1292/2500
Progress: 1293/2500
Progress: 1294/2500
Progress: 1295/2500
Progress: 1296/2500
Progress: 1297/2500
Progress: 1298/2500
Progress: 1299/2500
Progress: 1300/2500
Progress: 1301/2500
Progress: 1302/2500
Progress: 1303/2500
Progress: 1304/2500
Progress: 1305/2500
Progress: 1306/2500
Progress: 1307/2500
Progress: 1308/2500
Progress: 1309/2500
Progress: 1310/2500
Progress: 1311/2500
Progress: 1312/2500
Progress: 1313/2500
Progress: 1314/2500
Progress: 1315/2500
Progress: 1316/2500
Progress: 1317/2500
Progress: 1318/2500
Progress: 1319/2500
Progress: 1320/2500
Progress: 1321/2500
Progress: 1322/2500
Progress: 1323/2500
Progress: 1324/2500
Progress: 1325/2500
Progress: 1326/2500
Progress: 1327/2500
Progress: 1328/2500
Progress: 1329/2500
Progress: 1330/2500
Progress: 1331/2500
Progress: 1332/2500
Progress: 1333/2500
Progress: 1334/2500


Progress: 1695/2500
Progress: 1696/2500
Progress: 1697/2500
Progress: 1698/2500
Progress: 1699/2500
Progress: 1700/2500
Progress: 1701/2500
Progress: 1702/2500
Progress: 1703/2500
Progress: 1704/2500
Progress: 1705/2500
Progress: 1706/2500
Progress: 1707/2500
Progress: 1708/2500
Progress: 1709/2500
Progress: 1710/2500
Progress: 1711/2500
Progress: 1712/2500
Progress: 1713/2500
Progress: 1714/2500
Progress: 1715/2500
Progress: 1716/2500
Progress: 1717/2500
Progress: 1718/2500
Progress: 1719/2500
Progress: 1720/2500
Progress: 1721/2500
Progress: 1722/2500
Progress: 1723/2500
Progress: 1724/2500
Progress: 1725/2500
Progress: 1726/2500
Progress: 1727/2500
Progress: 1728/2500
Progress: 1729/2500
Progress: 1730/2500
Progress: 1731/2500
Progress: 1732/2500
Progress: 1733/2500
Progress: 1734/2500
Progress: 1735/2500
Progress: 1736/2500
Progress: 1737/2500
Progress: 1738/2500
Progress: 1739/2500
Progress: 1740/2500
Progress: 1741/2500
Progress: 1742/2500
Progress: 1743/2500
Progress: 1744/2500


Progress: 2105/2500
Progress: 2106/2500
Progress: 2107/2500
Progress: 2108/2500
Progress: 2109/2500
Progress: 2110/2500
Progress: 2111/2500
Progress: 2112/2500
Progress: 2113/2500
Progress: 2114/2500
Progress: 2115/2500
Progress: 2116/2500
Progress: 2117/2500
Progress: 2118/2500
Progress: 2119/2500
Progress: 2120/2500
Progress: 2121/2500
Progress: 2122/2500
Progress: 2123/2500
Progress: 2124/2500
Progress: 2125/2500
Progress: 2126/2500
Progress: 2127/2500
Progress: 2128/2500
Progress: 2129/2500
Progress: 2130/2500
Progress: 2131/2500
Progress: 2132/2500
Progress: 2133/2500
Progress: 2134/2500
Progress: 2135/2500
Progress: 2136/2500
Progress: 2137/2500
Progress: 2138/2500
Progress: 2139/2500
Progress: 2140/2500
Progress: 2141/2500
Progress: 2142/2500
Progress: 2143/2500
Progress: 2144/2500
Progress: 2145/2500
Progress: 2146/2500
Progress: 2147/2500
Progress: 2148/2500
Progress: 2149/2500
Progress: 2150/2500
Progress: 2151/2500
Progress: 2152/2500
Progress: 2153/2500
Progress: 2154/2500


In [15]:
# Persist the scraped rows to CSV in the working directory, without the index column.
df.to_csv("glassdoor data scientist salary.csv", index=False)