In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import math

# Selenium lookup failures that get_jobs treats as "field not available":
# when one of these is raised while reading a job field, the field is stored as NaN.
exps = (NoSuchElementException, StaleElementReferenceException)

In [2]:
# Common XPath prefix of the header area in the job-detail pane (right-hand column).
_JOB_HEADER_XPATH = '//*[@id="JDCol"]/div/article/div/div[1]/div/div/div[1]/div[3]/div[1]'

# One label/value row inside the "Company Overview" section of the job-detail pane.
_OVERVIEW_ROW_XPATH = ('.//div[@id="EmpBasicInfo"]//div//div[@class="d-flex flex-wrap"]'
                       '//div[@class="d-flex justify-content-start css-rmzuhb e1pvx6aw0"]')


def _build_search_url(keyword, page=1):
    """Return the Glassdoor search URL for `keyword` on results page `page` (1-based)."""
    # Join every word of the keyword, so one-word and 3+-word titles work too.
    slug = '-'.join(keyword.split())
    if not slug:
        raise ValueError("keyword must contain at least one word")
    # The original URL hard-coded "KO0,12", and 12 == len('data-analyst').
    # Presumably the second number is the keyword's end offset in the slug, so it is
    # derived here; for 'data analyst' the resulting URL is identical to the old one.
    page_part = '' if page <= 1 else '_IP' + str(page)
    return ('https://www.glassdoor.com/Job/' + slug + '-jobs-SRCH_KO0,' + str(len(slug))
            + page_part + '.htm?includeNoSalaryJobs=true&sortBy=date_desc')


def _dismiss_signup_modal(driver):
    """Click the sign-up pop-up's close button if it is present; return True when clicked."""
    try:
        driver.find_element_by_css_selector('[alt="Close"]').click()  # clicking the X
    except NoSuchElementException:
        return False
    print("clicked the X button")
    return True


def _text_or_nan(driver, xpath):
    """Return the text of the first element matching `xpath`, or NaN if it cannot be read."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except exps:
        return np.nan


def _read_company_overview(driver):
    """Collect the Company Overview rows of the open job as a {label: value} dict."""
    overview = {}
    rows = driver.find_elements_by_xpath(_OVERVIEW_ROW_XPATH)
    if rows:
        # The implicit wait is a driver-wide setting: once raised here it stays at 5s
        # for later lookups until get_jobs resets it to 1s on the next results page.
        driver.implicitly_wait(5)
    for row in rows:
        try:
            # key/value pairs, e.g. 'Size' -> '1001 to 5000 Employees'
            label = row.find_element_by_xpath('.//span[@class="css-1taruhi e1pvx6aw1"]').text
            value = row.find_element_by_xpath('.//span[@class="css-i9gxme e1pvx6aw2"]').text
        except StaleElementReferenceException:
            print("##################StaleElementReferenceException##################")
            continue
        except NoSuchElementException:
            # Row without the expected label/value spans: skip it, best effort.
            continue
        # Store the pair only once both halves were read, so a failure in the middle
        # of a row can never shift the labels of the following rows.
        overview[label] = value
    return overview


def get_jobs(keyword, pages, verbose=False, driver_path=None):

    """Method to collect job listings from Glassdoor.

        Args:
            keyword (string): the name of the job title for which you want to collect jobs
            pages (int): the number of pages you want to collect(around 30 jobs/page)
            verbose (bool): when True, print every collected field of every job
            driver_path (string): path to the chromedriver executable; when None the
                original hard-coded Windows location is used

        Returns:
            pandas.DataFrame: a dataframe with all the jobs and their details

        Raises:
            ValueError: if keyword contains no words

    """

    # Build (and thereby validate) the first URL before a browser is launched.
    _build_search_url(keyword, 1)

    #give path to chromedriver
    if driver_path is None:
        # Default kept for backward compatibility. The author's macOS location was
        # '/Users/mahmoudhamra/Dropbox/GitHub Projects/da_salary_proj/scraping-glassdoor-selenium-master/chromedriver'
        driver_path = 'C:/Users/mahmo/Dropbox/GitHub Projects/da_salary_proj/chromedriver'

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(executable_path=driver_path, options=options)

    records = []
    try:
        driver.set_window_size(1120, 1000)

        #loop through pages
        for page in range(1, pages + 1):
            driver.get(_build_search_url(keyword, page))
            driver.implicitly_wait(1)

            #collect job listings in the page (30 jobs/page)
            jobs = driver.find_elements_by_xpath('.//*[@id="MainCol"]/div[1]/ul/li')

            for job in jobs:

                print("---------------------------")

                # click on job; an overlay (e.g. the sign-up window) can intercept the
                # click, so try to close it and click once more before giving up
                try:
                    job.click()
                except (ElementClickInterceptedException, StaleElementReferenceException):
                    _dismiss_signup_modal(driver)
                    try:
                        job.click()
                    except (ElementClickInterceptedException, StaleElementReferenceException):
                        # Skipping is required: reading the pane now would record the
                        # previously opened job a second time.
                        print("JOB BUTTON NOT CLICKABLE - SKIPPED")
                        print("---------------------------")
                        continue
                print("JOB BUTTON CLICKED")

                #if sign up window pops up close it
                _dismiss_signup_modal(driver)

                # These XPaths start with '//' and so address the whole document;
                # they are queried through the driver rather than the clicked card so
                # that a re-rendered (stale) card does not turn every field into NaN.
                job_title = _text_or_nan(driver, _JOB_HEADER_XPATH + '/div[2]')
                rating = _text_or_nan(driver, _JOB_HEADER_XPATH + '/div[1]/span')
                salary_estimate = _text_or_nan(driver, _JOB_HEADER_XPATH + '/div[4]/span')
                location = _text_or_nan(driver, _JOB_HEADER_XPATH + '/div[3]')

                # take values from the Company Overview section
                comp_info = _read_company_overview(driver)
                size = comp_info.get('Size', "Unknown")
                founded = comp_info.get('Founded', np.nan)
                type_of_ownership = comp_info.get('Type', np.nan)
                industry = comp_info.get('Industry', np.nan)
                sector = comp_info.get('Sector', np.nan)
                revenue = comp_info.get('Revenue', np.nan)

                if verbose:
                    print('job_title:' , job_title)
                    print('salary_estimate:', salary_estimate)
                    print('location:', location)
                    print('rating:', rating)
                    print('size:', size)
                    print('founded:', founded)
                    print('type_of_ownership:', type_of_ownership)
                    print('industry:', industry)
                    print('sector:', sector)
                    print('revenue:', revenue)

                print("---------------------------")

                #append job details to the list of records
                records.append({
                    "job_title": job_title,
                    "salary_estimate": salary_estimate,
                    "location": location,
                    "rating" : rating,
                    "size" : size,
                    "founded" : founded,
                    "type_of_ownership" : type_of_ownership,
                    "industry" : industry,
                    "sector" : sector,
                    "revenue" : revenue})
    finally:
        # Always shut the browser down, on success and on error alike.
        driver.quit()

    return pd.DataFrame(records)  #This line converts the list of dicts into a pandas DataFrame
        
    

In [3]:
# Scrape one results page for 'data analyst'; verbose=True prints every collected field per job.
df = get_jobs('data analyst', 1, verbose= True)

---------------------------
JOB BUTTON CLICKED
clicked the X button
job_title: Healthcare Data Analyst
salary_estimate: nan
location: New York, NY
rating: 3.4
size: 5001 to 10000 Employees
founded: 1993
type_of_ownership: Nonprofit Organization
industry: Insurance Carriers
sector: Insurance
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Master Data Reporting Analyst
salary_estimate: $51K - $108K (Glassdoor est.)
location: Wilton, CT
rating: 4.1
size: 10000+ Employees
founded: 1984
type_of_ownership: Company - Public
industry: Electrical & Electronic Manufacturing
sector: Manufacturing
revenue: $10+ billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: CRA Data Analyst
salary_estimate: $43K - $78K (Glassdoor est.)
location: Miami Lakes, FL
rating: 3.1
size: 1001 to 5000 Employees
founded: 2009
type_of_ownership: Company - Public
industry: Banks & Credit Unions
sector: F

JOB BUTTON CLICKED
job_title: Digital Engagement Data Analyst/Admin
salary_estimate: $50K - $59K (Glassdoor est.)
location: Ogden, UT
rating: 3.8
size: 1001 to 5000 Employees
founded: 1939
type_of_ownership: Nonprofit Organization
industry: Investment Banking & Asset Management
sector: Finance
revenue: $100 to $500 million (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: $46K - $75K (Glassdoor est.)
location: Dallas, TX
rating: nan
size: Unknown
founded: nan
type_of_ownership: nan
industry: nan
sector: nan
revenue: nan
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Metrics & Data Analytics Analyst
salary_estimate: nan
location: Washington, DC
rating: 3.8
size: 10000+ Employees
founded: 1996
type_of_ownership: Subsidiary or Business Segment
industry: IT Services
sector: Information Technology
revenue: $10+ billion (USD)
---------------------------
---------------------------
J

In [4]:
# Spot-check 10 randomly drawn rows (no random_state is set, so the selection differs per run).
df.sample(10)

Unnamed: 0,job_title,salary_estimate,location,rating,size,founded,type_of_ownership,industry,sector,revenue
19,Test Analyst - ETL/Data Warehouse/BI Reporting...,$65K - $76K (Glassdoor est.),"Chicago, IL",3.4,10000+ Employees,1981,Company - Public,IT Services,Information Technology,$10+ billion (USD)
15,Data Analyst,,"Louisville, KY",4.1,10000+ Employees,1966,Company - Public,IT Services,Information Technology,$1 to $2 billion (USD)
21,Data Analyst & SQL,,"Richardson, TX",3.4,10000+ Employees,1981,Company - Public,IT Services,Information Technology,$10+ billion (USD)
27,Metrics & Data Analytics Analyst,,"Washington, DC",3.8,10000+ Employees,1996,Subsidiary or Business Segment,IT Services,Information Technology,$10+ billion (USD)
4,Data Analyst,,"Sugar Land, TX",3.4,1001 to 5000 Employees,1992,Company - Private,Consulting,Business Services,$100 to $500 million (USD)
11,Data Analyst,,"Fishers, IN",3.7,1001 to 5000 Employees,2002,Company - Private,Consulting,Business Services,$500 million to $1 billion (USD)
6,Business and Data Intelligence Analyst,,"Omaha, NE",3.8,1001 to 5000 Employees,2010,Company - Private,Health Care Services & Hospitals,Health Care,$50 to $100 million (USD)
25,Digital Engagement Data Analyst/Admin,$50K - $59K (Glassdoor est.),"Ogden, UT",3.8,1001 to 5000 Employees,1939,Nonprofit Organization,Investment Banking & Asset Management,Finance,$100 to $500 million (USD)
14,Financial Data Analyst,$65K - $79K (Employer est.),"Pleasant Prairie, WI",3.6,5001 to 10000 Employees,1980,Company - Private,Industrial Manufacturing,Manufacturing,Unknown / Non-Applicable
2,CRA Data Analyst,$43K - $78K (Glassdoor est.),"Miami Lakes, FL",3.1,1001 to 5000 Employees,2009,Company - Public,Banks & Credit Unions,Finance,$10+ billion (USD)


In [5]:
# Column overview: non-null counts and dtypes of the scraped table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   job_title          30 non-null     object
 1   salary_estimate    16 non-null     object
 2   location           30 non-null     object
 3   rating             26 non-null     object
 4   size               30 non-null     object
 5   founded            25 non-null     object
 6   type_of_ownership  26 non-null     object
 7   industry           25 non-null     object
 8   sector             25 non-null     object
 9   revenue            26 non-null     object
dtypes: object(10)
memory usage: 2.5+ KB


In [6]:
# Persist the scraped table as CSV text in a file named 'jobs5' (no extension), without the index column.
df.to_csv('jobs5', index=False)