In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import math

# Selenium lookup failures that get_jobs treats as "field not available":
# when one of these is raised while reading a job field, the field is set to NaN.
exps = (NoSuchElementException, StaleElementReferenceException)

In [2]:
def _build_search_url(keyword, page):
    """Return the Glassdoor search URL for ``keyword`` on the 1-based result ``page``.

    Raises:
        ValueError: if ``keyword`` contains no words.
    """
    words = keyword.split()
    if not words:
        raise ValueError("keyword must contain at least one word")

    slug = '-'.join(words)
    # NOTE(review): the "KO0,<n>" token appears to be the character span of the
    # keyword inside the slug (the original hard-coded 12 == len('data-analyst'));
    # confirm against Glassdoor URLs for other keywords.
    page_suffix = '' if page <= 1 else '_IP' + str(page)
    return ('https://www.glassdoor.com/Job/' + slug + '-jobs-SRCH_KO0,' + str(len(slug))
            + page_suffix + '.htm?includeNoSalaryJobs=true&sortBy=date_desc')


def _text_or_nan(driver, xpath):
    """Return the text of the first element matching ``xpath``, or NaN if the lookup fails."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except exps:
        return np.nan


def _dismiss_signup_modal(driver):
    """Click the sign-up pop-up's close button if it is present."""
    try:
        driver.find_element_by_css_selector('[alt="Close"]').click()  # clicking the X
        print("clicked the X button")
    except NoSuchElementException:
        pass


def _click_listing(driver, job):
    """Click a listing, retrying once after closing an overlay; return True on success."""
    try:
        job.click()
        return True
    except ElementClickInterceptedException:
        # Something (typically the sign-up pop-up) is covering the listing.
        _dismiss_signup_modal(driver)
    except StaleElementReferenceException:
        return False

    try:
        job.click()
        return True
    except (ElementClickInterceptedException, StaleElementReferenceException):
        return False


def _read_company_overview(driver):
    """Return the Company Overview panel of the open job as a {label: value} dict."""
    overview = {}
    rows = driver.find_elements_by_xpath('.//div[@id="EmpBasicInfo"]//div//div[@class="d-flex flex-wrap"]//div[@class="d-flex justify-content-start css-rmzuhb e1pvx6aw0"]')
    if rows:
        # Same timeout the original applied while reading the rows. It stays in
        # effect for later lookups until get_jobs resets it on the next page.
        driver.implicitly_wait(5)

    for row in rows:
        try:
            # Read label and value together so a failure on either one cannot
            # leave an unpaired entry that shifts every following pair.
            label = row.find_element_by_xpath('.//span[@class="css-1taruhi e1pvx6aw1"]').text
            value = row.find_element_by_xpath('.//span[@class="css-i9gxme e1pvx6aw2"]').text
        except StaleElementReferenceException:
            print("##################StaleElementReferenceException##################")
            continue
        except NoSuchElementException:
            # Row without the expected label/value spans: skip it instead of aborting the scrape.
            continue
        overview[label] = value

    return overview


def get_jobs(keyword, pages, verbose=False, driver_path=None):

    """Method to collect job listings from Glassdoor.

        Args:
            keyword (string): the name of the job title for which you want to collect jobs;
                every whitespace-separated word is used to build the search URL
            pages (int): the number of pages you want to collect(around 30 jobs/page)
            verbose (bool): when True, print every collected field of each job
            driver_path (string): optional path to the chromedriver executable; when omitted
                the path that was hard-coded in the original notebook is used

        Returns:
            pandas.dataframe: a dataframe with all the jobs and their details

        Raises:
            ValueError: if keyword contains no words

        Note:
            The browser is always closed before returning, also when scraping fails.
            The code uses the Selenium 3 style API (``find_element_by_*`` and
            ``executable_path=``), which later Selenium 4 releases removed.

    """

    # Validate the keyword before a browser is launched.
    _build_search_url(keyword, 1)

    options = webdriver.ChromeOptions()

    #give path to chromedriver
    if driver_path is None:
        driver_path = 'C:/Users/mahmo/Dropbox/GitHub Projects/da_salary_proj/chromedriver'
    driver = webdriver.Chrome(executable_path=driver_path, options=options)

    # XPath of the header box in the job-details pane; each field is a child of it.
    # The paths are absolute, so they are evaluated from the driver rather than from
    # the (possibly stale) listing element.
    header = '//*[@id="JDCol"]/div/article/div/div[1]/div/div/div[1]/div[3]/div[1]'

    records = []
    try:
        driver.set_window_size(1120, 1000)

        #loop through pages
        for page in range(1, pages + 1):
            driver.get(_build_search_url(keyword, page))
            driver.implicitly_wait(1)

            #collect job listings in the page (30 jobs/page)
            jobs = driver.find_elements_by_xpath('.//*[@id="MainCol"]/div[1]/ul/li')

            for job in jobs:

                print("---------------------------")

                # click on job
                if not _click_listing(driver, job):
                    print("JOB CLICK FAILED - skipping listing")
                    print("---------------------------")
                    continue
                print("JOB BUTTON CLICKED")

                #if sign up window pops up close it
                _dismiss_signup_modal(driver)

                job_title = _text_or_nan(driver, header + '/div[2]')
                rating = _text_or_nan(driver, header + '/div[1]/span')
                salary_estimate = _text_or_nan(driver, header + '/div[4]/span')
                location = _text_or_nan(driver, header + '/div[3]')

                # key/value pairs ex: 'Size' -> '1001 to 5000 Employees'
                comp_info = _read_company_overview(driver)
                size = comp_info.get('Size', "Unknown")
                founded = comp_info.get('Founded', np.nan)
                type_of_ownership = comp_info.get('Type', np.nan)
                industry = comp_info.get('Industry', np.nan)
                sector = comp_info.get('Sector', np.nan)
                revenue = comp_info.get('Revenue', np.nan)

                if verbose:
                    print('job_title:' , job_title)
                    print('salary_estimate:', salary_estimate)
                    print('location:', location)
                    print('rating:', rating)
                    print('size:', size)
                    print('founded:', founded)
                    print('type_of_ownership:', type_of_ownership)
                    print('industry:', industry)
                    print('sector:', sector)
                    print('revenue:', revenue)

                print("---------------------------")

                #append job details to the list of records
                records.append({
                    "job_title": job_title,
                    "salary_estimate": salary_estimate,
                    "location": location,
                    "rating" : rating,
                    "size" : size,
                    "founded" : founded,
                    "type_of_ownership" : type_of_ownership,
                    "industry" : industry,
                    "sector" : sector,
                    "revenue" : revenue})
    finally:
        # Always release the browser process, even if a page failed half-way.
        driver.quit()

    return pd.DataFrame(records)  #This line converts the list of dictionaries into a pandas DataFrame
        
    

In [3]:
# Scrape three result pages of "data analyst" listings, printing each record as it is collected.
df = get_jobs(keyword="data analyst", pages=3, verbose=True)

---------------------------
JOB BUTTON CLICKED
clicked the X button
job_title: Data Analyst
salary_estimate: $71K - $117K (Glassdoor est.)
location: Fremont, CA
rating: 3.8
size: 501 to 1000 Employees
founded: 2008
type_of_ownership: Company - Private
industry: IT Services
sector: Information Technology
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Wealth Management Reporting and Data Analyst
salary_estimate: $89K - $129K (Glassdoor est.)
location: Sheldonville, MA
rating: 3.6
size: 10000+ Employees
founded: 1828
type_of_ownership: Company - Public
industry: Banks & Credit Unions
sector: Finance
revenue: $5 to $10 billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: NAM Card AML Oversight Data Analytics Lead Analyst
salary_estimate: $63K - $106K (Glassdoor est.)
location: Urbandale, IA
rating: 3.9
size: 10000+ Employees
founded: 1812
type_of_ownership: Company - Publ

JOB BUTTON CLICKED
job_title: Quality Metrics and Data Analyst
salary_estimate: $35K - $49K (Glassdoor est.)
location: Liverpool, NY
rating: 3.5
size: 501 to 1000 Employees
founded: nan
type_of_ownership: Government
industry: Health Care Services & Hospitals
sector: Health Care
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: nan
location: Washington State
rating: 3.7
size: 201 to 500 Employees
founded: 2010
type_of_ownership: Company - Private
industry: IT Services
sector: Information Technology
revenue: $25 to $50 million (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Product Analyst
salary_estimate: nan
location: Stevens Point, WI
rating: 4.3
size: 1001 to 5000 Employees
founded: 1904
type_of_ownership: Company - Private
industry: Insurance Carriers
sector: Insurance
revenue: $2 to $5 billion (USD)
---------------------------
--------

JOB BUTTON CLICKED
job_title: Accessibility Data Analyst (auditor)
salary_estimate: nan
location: Chicago, IL
rating: 4.4
size: 1001 to 5000 Employees
founded: 1986
type_of_ownership: Company - Private
industry: Staffing & Outsourcing
sector: Business Services
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: nan
location: Bowie, MD
rating: nan
size: Unknown
founded: nan
type_of_ownership: nan
industry: nan
sector: nan
revenue: nan
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: nan
location: Springdale, AR
rating: 3.7
size: 10000+ Employees
founded: 1948
type_of_ownership: Company - Public
industry: Staffing & Outsourcing
sector: Business Services
revenue: $2 to $5 billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Visualization Analyst
salary_estimate: $34K - $65K (

JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: $33K - $57K (Glassdoor est.)
location: Evansville, IN
rating: 4.5
size: 1001 to 5000 Employees
founded: 2005
type_of_ownership: Company - Private
industry: Cable, Internet & Telephone Providers
sector: Telecommunications
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst, Advanced Analytics
salary_estimate: $47K - $84K (Glassdoor est.)
location: Richmond, VA
rating: 3.8
size: 5001 to 10000 Employees
founded: 1919
type_of_ownership: Company - Public
industry: Consumer Products Manufacturing
sector: Manufacturing
revenue: $10+ billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Marketing Data Analyst – Remote
salary_estimate: nan
location: Columbus, OH
rating: 4.7
size: 1 to 50 Employees
founded: nan
type_of_ownership: Company - Private
industry: nan
sector: nan
revenue: Unknown / Non-Applicable
-----

In [4]:
# Spot-check ten randomly chosen rows of the scraped table.
df.sample(n=10)

Unnamed: 0,job_title,salary_estimate,location,rating,size,founded,type_of_ownership,industry,sector,revenue
86,Clinical Informatics Analyst- UAB Data Analyti...,$53K - $70K (Glassdoor est.),"Birmingham, AL",3.8,10000+ Employees,1945.0,Hospital,Health Care Services & Hospitals,Health Care,Unknown / Non-Applicable
34,Data Analyst,$47K - $84K (Glassdoor est.),"Brooklyn, NY",4.4,51 to 200 Employees,1999.0,Company - Private,"Gift, Novelty & Souvenir Stores",Retail,$10 to $25 million (USD)
23,Quality Metrics and Data Analyst,$35K - $49K (Glassdoor est.),"Liverpool, NY",3.5,501 to 1000 Employees,,Government,Health Care Services & Hospitals,Health Care,Unknown / Non-Applicable
55,Data Analyst,,"Atlanta, GA",4.1,10000+ Employees,1966.0,Company - Public,IT Services,Information Technology,$1 to $2 billion (USD)
88,Operations Data Analyst,$32K - $57K (Glassdoor est.),"Rancho Cucamonga, CA",3.2,1001 to 5000 Employees,1976.0,Company - Private,Logistics & Supply Chain,Transportation & Logistics,Unknown / Non-Applicable
33,"Data Analyst I (3rd Shift, Wednesday-Saturday ...",,Colorado,3.9,1 to 50 Employees,1997.0,Company - Private,,,Unknown / Non-Applicable
76,Data Analyst,$74K - $91K (Glassdoor est.),"Winchester, VA",3.8,1001 to 5000 Employees,1940.0,Company - Private,Architectural & Engineering Services,Business Services,$1 to $2 billion (USD)
79,Pharma Data Architect / System Analyst,,"Princeton, NJ",,Unknown,,,,,
2,NAM Card AML Oversight Data Analytics Lead Ana...,$63K - $106K (Glassdoor est.),"Urbandale, IA",3.9,10000+ Employees,1812.0,Company - Public,Investment Banking & Asset Management,Finance,$10+ billion (USD)
28,Data Privacy Analyst,$47K - $53K (Glassdoor est.),"Quincy, MA",3.3,51 to 200 Employees,2010.0,Company - Private,Biotech & Pharmaceuticals,Biotech & Pharmaceuticals,Unknown / Non-Applicable


In [5]:
# Summarise column dtypes and non-null counts to see which fields are often missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   job_title          90 non-null     object
 1   salary_estimate    53 non-null     object
 2   location           90 non-null     object
 3   rating             83 non-null     object
 4   size               90 non-null     object
 5   founded            73 non-null     object
 6   type_of_ownership  84 non-null     object
 7   industry           79 non-null     object
 8   sector             79 non-null     object
 9   revenue            84 non-null     object
dtypes: object(10)
memory usage: 7.2+ KB


In [7]:
# Frequency of each ownership type among the scraped listings.
df["type_of_ownership"].value_counts()

Company - Private                 37
Company - Public                  29
Subsidiary or Business Segment     8
Nonprofit Organization             5
Hospital                           2
Contract                           2
Government                         1
Name: type_of_ownership, dtype: int64

In [15]:
# Number of distinct ownership types (missing values are not counted).
df["type_of_ownership"].nunique()

7

In [8]:
# Frequency of each industry among the scraped listings.
df["industry"].value_counts()

Staffing & Outsourcing                     15
IT Services                                 8
Health Care Services & Hospitals            6
Insurance Carriers                          6
Aerospace & Defense                         5
Banks & Credit Unions                       5
Consulting                                  3
Computer Hardware & Software                2
Biotech & Pharmaceuticals                   2
Lending                                     2
Investment Banking & Asset Management       2
Research & Development                      2
Express Delivery Services                   2
General Merchandise & Superstores           1
Home Centers & Hardware Stores              1
Internet                                    1
Architectural & Engineering Services        1
Advertising & Marketing                     1
Transportation Management                   1
Accounting                                  1
Food & Beverage Manufacturing               1
Logistics & Supply Chain          

In [14]:
# Number of distinct industries (missing values are not counted).
df["industry"].nunique()

32

In [13]:
# Number of distinct job locations.
df["location"].nunique()

73

In [6]:
# Persist the scraped listings as CSV text, without the index column.
# NOTE(review): the file name has no ".csv" extension — confirm downstream readers expect 'jobs3'.
df.to_csv('jobs3', index=False)