In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, StaleElementReferenceException
from selenium import webdriver
import time
import pandas as pd
import numpy as np
import math

# Selenium lookup failures that get_jobs() treats as "this field is not
# available for this listing" (the value is recorded as NaN instead of aborting).
exps = (NoSuchElementException, StaleElementReferenceException)

In [2]:
# Fallback chromedriver location used when get_jobs() is called without
# `driver_path` (the Windows path this notebook was originally run with).
# The macOS location previously kept next to it was:
# '/Users/mahmoudhamra/Dropbox/GitHub Projects/da_salary_proj/scraping-glassdoor-selenium-master/chromedriver'
_DEFAULT_CHROMEDRIVER = 'C:/Users/mahmo/Dropbox/GitHub Projects/da_salary_proj/chromedriver'

# XPath of every listing card in the results column of a search page.
_JOB_CARDS_XPATH = './/*[@id="MainCol"]/div[1]/ul/li'

# Absolute XPaths of the header fields in the job-details pane ("JDCol").
_DETAILS_HEADER = '//*[@id="JDCol"]/div/article/div/div[1]/div/div/div[1]/div[3]/div[1]'
_DETAIL_XPATHS = {
    "job_title": _DETAILS_HEADER + '/div[2]',
    "salary_estimate": _DETAILS_HEADER + '/div[4]/span',
    "location": _DETAILS_HEADER + '/div[3]',
    "rating": _DETAILS_HEADER + '/div[1]/span',
}

# Rows of the "Company Overview" section and the label/value spans inside a row.
_OVERVIEW_ROWS_XPATH = './/div[@id="EmpBasicInfo"]//div//div[@class="d-flex flex-wrap"]//div[@class="d-flex justify-content-start css-rmzuhb e1pvx6aw0"]'
_OVERVIEW_LABEL_XPATH = './/span[@class="css-1taruhi e1pvx6aw1"]'
_OVERVIEW_VALUE_XPATH = './/span[@class="css-i9gxme e1pvx6aw2"]'


def _search_url(slug, page):
    """Return the Glassdoor search URL for result page `page` (1-based) of `slug`."""
    # `KO0,<n>` is taken to be the character span of the keyword inside the
    # slug; the original URL hard-coded 12, which equals len('data-analyst').
    suffix = '' if page == 1 else '_IP' + str(page)
    return ('https://www.glassdoor.com/Job/' + slug + '-jobs-SRCH_KO0,' + str(len(slug))
            + suffix + '.htm?includeNoSalaryJobs=true&sortBy=date_desc')


def _close_signup_modal(driver):
    """Click the sign-up pop-up's X button if the pop-up is present."""
    try:
        driver.find_element_by_css_selector('[alt="Close"]').click()  #clicking to the X.
        print("clicked the X button")
    except NoSuchElementException:
        pass


def _click_job(driver, job):
    """Click a listing card; return False if it could not be clicked."""
    try:
        job.click()
        return True
    except ElementClickInterceptedException:
        # Something (typically the sign-up pop-up) covers the card:
        # dismiss it and retry once before giving up on this listing.
        _close_signup_modal(driver)
        try:
            job.click()
            return True
        except (ElementClickInterceptedException, StaleElementReferenceException):
            return False
    except StaleElementReferenceException:
        return False


def _text_or_nan(driver, xpath):
    """Return the text of the element at `xpath`, or NaN if it cannot be read."""
    try:
        return driver.find_element_by_xpath(xpath).text
    except exps:
        return np.nan


def _read_company_overview(driver):
    """Return the Company Overview section as a {label: value} dict."""
    rows = driver.find_elements_by_xpath(_OVERVIEW_ROWS_XPATH)
    if rows:
        # Set once per listing instead of once per row (it is a driver-wide setting).
        driver.implicitly_wait(5)

    overview = {}
    for row in rows:
        try:
            #taking key value pairs ex: 'size', 1001 to 5000 Employees
            label = row.find_element_by_xpath(_OVERVIEW_LABEL_XPATH).text
            value = row.find_element_by_xpath(_OVERVIEW_VALUE_XPATH).text
        except StaleElementReferenceException:
            print("##################StaleElementReferenceException")
            continue
        except NoSuchElementException:
            # Row without a label/value pair: nothing to record.
            continue
        # Store the pair only when BOTH halves were read, so a failure on the
        # value can never shift later labels onto the wrong values.
        overview[label] = value
    return overview


def get_jobs(keyword, pages, verbose=False, driver_path=None):

    """Method to collect job listings from Glassdoor.

        Opens a Chrome window, walks through `pages` result pages for `keyword`,
        clicks every listing and reads its header fields and Company Overview.
        The browser is always closed before returning (or raising).

        Uses the Selenium 3 style API (`find_element(s)_by_*`, `executable_path`),
        like the rest of this notebook.

        Args:
            keyword (string): the name of the job title for which you want to collect jobs;
                any number of whitespace-separated words
            pages (int): the number of pages you want to collect(around 30 jobs/page)
            verbose (bool): if True, print every collected field of every job
            driver_path (string): path to the chromedriver executable; defaults to
                the path this notebook was originally run with

        Returns:
            pandas.dataframe: a dataframe with all the jobs and their details
                (one row per listing; unavailable fields are NaN, except `size`
                which is "Unknown")

        Raises:
            ValueError: if `keyword` contains no non-blank word

        """

    # Validate before launching a browser so a bad call costs nothing.
    slug = '-'.join(keyword.split())
    if not slug:
        raise ValueError("keyword must contain at least one non-blank word")

    if driver_path is None:
        driver_path = _DEFAULT_CHROMEDRIVER

    # Company Overview label -> (output column, value used when the label is absent)
    company_fields = (
        ("size", "Size", "Unknown"),
        ("founded", "Founded", np.nan),
        ("type_of_ownership", "Type", np.nan),
        ("industry", "Industry", np.nan),
        ("sector", "Sector", np.nan),
        ("revenue", "Revenue", np.nan),
    )

    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(executable_path=driver_path, options=options)

    records = []
    try:
        driver.set_window_size(1120, 1000)

        for page in range(1, pages + 1):
            # Every page is built from the requested keyword (pages after the
            # first used to be hard-wired to the "data-analyst" search).
            driver.get(_search_url(slug, page))
            driver.implicitly_wait(1)

            #collect job listings in the page (30 jobs/page)
            jobs = driver.find_elements_by_xpath(_JOB_CARDS_XPATH)

            for job in jobs:

                print("---------------------------")

                # click on job
                if not _click_job(driver, job):
                    # One unclickable card must not abort the whole scrape.
                    print("JOB SKIPPED (could not be clicked)")
                    print("---------------------------")
                    continue
                print("JOB BUTTON CLICKED")

                #if sign up window pops up close it
                _close_signup_modal(driver)

                # The detail XPaths are absolute (they start with '//'), so they
                # are resolved against the document, not against the clicked card.
                record = {column: _text_or_nan(driver, xpath)
                          for column, xpath in _DETAIL_XPATHS.items()}

                comp_info = _read_company_overview(driver)
                for column, label, missing in company_fields:
                    record[column] = comp_info.get(label, missing)

                if verbose:
                    for column, value in record.items():
                        print(column + ':', value)

                print("---------------------------")

                #add job details to the list of records
                records.append(record)
    finally:
        # Always release the browser process, even if scraping failed midway.
        driver.quit()

    return pd.DataFrame(records)  #This line converts the list of dictionaries into a pandas DataFrame
        
    

In [None]:
# Scrape 30 result pages of "data analyst" listings; verbose=True prints every
# collected field of every job as it is read.
df = get_jobs("data analyst", 30, verbose= True)

---------------------------
JOB BUTTON CLICKED
clicked the X button
job_title: Data Analyst Internship
salary_estimate: nan
location: Houston, TX
rating: 4.0
size: 10000+ Employees
founded: 1969
type_of_ownership: Company - Public
industry: Investment Banking & Asset Management
sector: Finance
revenue: $5 to $10 billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: nan
location: Boston, MA
rating: 4.4
size: 51 to 200 Employees
founded: 2007
type_of_ownership: Company - Private
industry: Staffing & Outsourcing
sector: Business Services
revenue: $25 to $50 million (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Senior Data and Reporting Analyst
salary_estimate: nan
location: Hartford, CT
rating: nan
size: Unknown
founded: nan
type_of_ownership: nan
industry: nan
sector: nan
revenue: nan
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title

JOB BUTTON CLICKED
job_title: Data Operations Analyst
salary_estimate: $39K - $57K (Glassdoor est.)
location: Jacksonville, FL
rating: 4.0
size: 10000+ Employees
founded: 1969
type_of_ownership: Company - Public
industry: Investment Banking & Asset Management
sector: Finance
revenue: $5 to $10 billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data and Operational Analyst
salary_estimate: nan
location: South Carolina
rating: nan
size: Unknown
founded: nan
type_of_ownership: nan
industry: nan
sector: nan
revenue: nan
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Senior Data & Reporting Analyst - BMR
salary_estimate: nan
location: Des Moines, IA
rating: 3.6
size: 501 to 1000 Employees
founded: nan
type_of_ownership: Company - Private
industry: Staffing & Outsourcing
sector: Business Services
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
j

JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: nan
location: Bowie, MD
rating: nan
size: Unknown
founded: nan
type_of_ownership: nan
industry: nan
sector: nan
revenue: nan
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: nan
location: Springdale, AR
rating: 3.7
size: 10000+ Employees
founded: 1948
type_of_ownership: Company - Public
industry: Staffing & Outsourcing
sector: Business Services
revenue: $2 to $5 billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Visualization Analyst
salary_estimate: $34K - $65K (Glassdoor est.)
location: Vienna, VA
rating: 4.2
size: 51 to 200 Employees
founded: 2012
type_of_ownership: Company - Public
industry: Consulting
sector: Business Services
revenue: $1 to $5 million (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Quality Assurance Analyst
salary_estimate: $51K - $94K (

JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: nan
location: Weston, FL
rating: 4.1
size: 501 to 1000 Employees
founded: nan
type_of_ownership: Company - Public
industry: nan
sector: nan
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: $33K - $57K (Glassdoor est.)
location: Evansville, IN
rating: 4.5
size: 1001 to 5000 Employees
founded: 2005
type_of_ownership: Company - Private
industry: Cable, Internet & Telephone Providers
sector: Telecommunications
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst, Advanced Analytics
salary_estimate: $47K - $84K (Glassdoor est.)
location: Richmond, VA
rating: 3.8
size: 5001 to 10000 Employees
founded: 1919
type_of_ownership: Company - Public
industry: Consumer Products Manufacturing
sector: Manufacturing
revenue: $10+ billion (USD)
-----------------------

JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: nan
location: Southfield, MI
rating: 3.5
size: 5001 to 10000 Employees
founded: nan
type_of_ownership: Company - Private
industry: Consulting
sector: Business Services
revenue: $100 to $500 million (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Quantitative Data Analyst
salary_estimate: $60K - $63K (Glassdoor est.)
location: Cambridge, MA
rating: 2.3
size: 51 to 200 Employees
founded: 2005
type_of_ownership: Company - Private
industry: Internet
sector: Information Technology
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Research Analyst - Data Science Division
salary_estimate: $66K - $132K (Glassdoor est.)
location: Arlington, VA
rating: 3.6
size: 501 to 1000 Employees
founded: 1942
type_of_ownership: Nonprofit Organization
industry: Aerospace & Defense
sector: Aerospace & Defense
revenue: $100 to $500 million

JOB BUTTON CLICKED
job_title: SQL Data analyst + Support Exp.
salary_estimate: nan
location: Redwood City, CA
rating: 4.1
size: 1 to 50 Employees
founded: nan
type_of_ownership: Company - Private
industry: Staffing & Outsourcing
sector: Business Services
revenue: $10 to $25 million (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Quality Analyst
salary_estimate: $45K - $82K (Glassdoor est.)
location: Phoenix, AZ
rating: 3.6
size: 1001 to 5000 Employees
founded: 1986
type_of_ownership: Company - Private
industry: Consulting
sector: Business Services
revenue: $100 to $500 million (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Healthcare Data Analyst
salary_estimate: nan
location: Birmingham, AL
rating: 2.7
size: 1 to 50 Employees
founded: nan
type_of_ownership: Company - Private
industry: Staffing & Outsourcing
sector: Business Services
revenue: Unknown / Non-Applicable
--------------------------

JOB BUTTON CLICKED
job_title: Market Intelligence and Data Analytics Analyst II
salary_estimate: $48K - $82K (Glassdoor est.)
location: Lake Mary, FL
rating: 3.3
size: 10000+ Employees
founded: 1935
type_of_ownership: Company - Public
industry: Wholesale
sector: Business Services
revenue: $10+ billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Analyst
salary_estimate: $37K - $67K (Glassdoor est.)
location: New York, NY
rating: 4.1
size: 51 to 200 Employees
founded: 2015
type_of_ownership: Company - Private
industry: Health Care Services & Hospitals
sector: Health Care
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Senior Product Data Analyst
salary_estimate: nan
location: San Francisco, CA
rating: nan
size: Unknown
founded: nan
type_of_ownership: nan
industry: nan
sector: nan
revenue: nan
---------------------------
---------------------------
JOB BUTTON CLICKE

JOB BUTTON CLICKED
job_title: Senior Data Analyst
salary_estimate: nan
location: Tulsa, OK
rating: 3.5
size: 10000+ Employees
founded: 1933
type_of_ownership: Company - Public
industry: Transportation Management
sector: Transportation & Logistics
revenue: $5 to $10 billion (USD)
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Quality Assurance Analyst - Data Warehouse ETL
salary_estimate: nan
location: Pleasant Prairie, WI
rating: 3.6
size: 5001 to 10000 Employees
founded: 1980
type_of_ownership: Company - Private
industry: Industrial Manufacturing
sector: Manufacturing
revenue: Unknown / Non-Applicable
---------------------------
---------------------------
JOB BUTTON CLICKED
job_title: Data Engineering Analyst
salary_estimate: $67K - $131K (Glassdoor est.)
location: Waltham, MA
rating: 3.4
size: 1001 to 5000 Employees
founded: 1841
type_of_ownership: Company - Public
industry: IT Services
sector: Information Technology
revenue: $1 to $2 billion (

In [None]:
# Spot-check 10 randomly chosen rows of the scraped data.
df.sample(10)

In [None]:
# Summarise the scraped DataFrame (columns, dtypes and non-null counts).
df.info()

In [None]:
# Write the scraped jobs as CSV, without the index column, to a file named
# 'jobs3' (no extension) at a path relative to the working directory.
df.to_csv('jobs3', index=False)