In [1]:

from selenium import webdriver
import sys
import time
#sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import pandas as pd


### setup
Antes de ejecutar el notebook, descarga el ChromeDriver acorde a tu máquina:
https://sites.google.com/chromium.org/driver/downloads


### helper functions

In [2]:
def init_scrapper(keyword="data science", headless = False ):
    """Start a Chrome session and open the Glassdoor job search for `keyword`.

    Args:
        keyword: Free-text job search term. It is URL-encoded before being
            placed in the query string, so spaces and characters such as
            ``&`` or ``#`` cannot break the other query parameters.
        headless: When True, pass the ``headless`` argument to Chrome so no
            browser window is shown.

    Returns:
        The Chrome webdriver instance after requesting the search URL.
        The caller owns the driver and is responsible for closing it.
    """
    # Local import so this function is self-contained regardless of the imports cell.
    from urllib.parse import quote_plus

    # Initializing the webdriver
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('headless')

    # webdriver_manager resolves the chromedriver binary path for the Service.
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    encoded_keyword = quote_plus(keyword)
    url = (
        "https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false&clickSource=searchBtn&typedKeyword="
        + encoded_keyword
        + "&sc.keyword="
        + encoded_keyword
        + "&locT=&locId=&jobType="
    )
    try:
        driver.get(url)
    except Exception:
        # Do not leave a browser running if the first navigation fails.
        driver.quit()
        raise

    return driver


def get_companyname_rating(company_name):
    """Split the scraped company label into ``(name, rating)``.

    The label is split on newlines. With two or more lines, the first two
    are returned as name and rating (both strings). With a single line, the
    original label is returned together with ``-1`` as the "no rating" marker.
    """
    pieces = company_name.split("\n")
    if len(pieces) <= 1:
        # only the company name is present
        return company_name, -1
    # name on the first line, rating on the second
    name, rating = pieces[:2]
    return name, rating


def get_company_overview_data(Company_Overview):
    """Map the scraped company-overview elements onto named fields.

    Args:
        Company_Overview: Indexable sequence of objects exposing a ``.text``
            attribute (the elements found in the company overview panel).

    Returns:
        A dict with the keys ``size``, ``founded``, ``type``, ``industry``,
        ``sector`` and ``revenue``, in that order. Each value is the text of
        the element at the same position, or ``-1`` when that position cannot
        be read (a "<field> not found" line is printed in that case).

    NOTE: values are assigned purely by position. If the page omits one of
    the fields, the remaining values shift into earlier keys (e.g. the
    company type ends up under "founded").
    """
    field_names = ("size", "founded", "type", "industry", "sector", "revenue")
    overview = {}

    for position, field in enumerate(field_names):
        try:
            overview[field] = Company_Overview[position].text
        except Exception:
            # Best effort: a missing or unreadable element becomes -1.
            # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
            print(f"{field} not found")
            overview[field] = -1
    return overview


def response_to_dataframe(list_of_dicts, filename=None):
    """Build a DataFrame from the scraped records and optionally export it.

    Args:
        list_of_dicts: Records to load; anything accepted by ``pd.DataFrame``.
        filename: Base name (without extension) of the CSV to write in the
            current directory. When omitted, ``None`` or empty, nothing is
            written.

    Returns:
        The DataFrame built from ``list_of_dicts``.
    """
    df = pd.DataFrame(list_of_dicts)
    if filename:
        df.to_csv(f'./{filename}.csv')
    return df



In [3]:
# Smoke test: open the Glassdoor search page for the keyword, then release the browser.
keyword = "data science"
driver = init_scrapper(keyword)
# Keep the handle and quit explicitly; discarding the return value leaves
# a Chrome session running with no way to close it from code.
driver.quit()




### SCRAPER

In [3]:
def _find_optional_text(driver, by, value, default="N/A"):
    """Return the text of the first element matching the locator, or `default` if none exists."""
    try:
        return driver.find_element(by, value).text
    except NoSuchElementException:
        # find_element raises instead of returning a falsy value when nothing matches.
        return default


def _read_job_details(driver, slp_time, max_attempts=3):
    """Read the fields of the currently opened job, retrying while the detail pane loads.

    Returns the job dict, or None if the mandatory fields could not be read
    after `max_attempts` tries.
    """
    for attempt in range(1, max_attempts + 1):
        try:
            raw_company_name = driver.find_element(By.CLASS_NAME, 'e1tk4kwz5').text
            # sometimes the company name comes with the rating; get_companyname_rating splits them
            company_name, rating = get_companyname_rating(raw_company_name)
            location = driver.find_element(By.CLASS_NAME, 'e1tk4kwz1').text
            job_title = driver.find_element(By.CLASS_NAME, 'e1tk4kwz2').text
            job_description = driver.find_element(By.XPATH, './/div[@class="jobDescriptionContent desc"]').text
        except Exception:
            print("job info error, trying again...")
            if attempt < max_attempts:
                time.sleep(slp_time/2)
            continue

        # The salary is optional: fall back to "N/A" instead of failing the whole job.
        salary_estimate = _find_optional_text(driver, By.CLASS_NAME, 'e2u4hf18')
        print("job info collected successfully :)")
        return {"Job Title" : job_title,
                "Company Name" : company_name,
                "Location" : location,
                "Salary Estimate" : salary_estimate,
                "Rating" : rating,
                "Job Description" : job_description,
               }
    return None


def get_jobs(keyword, num_jobs, verbose, slp_time, failsafe = False):
    """Gather job postings scraped from Glassdoor into a DataFrame.

    Args:
        keyword: Search term passed to ``init_scrapper``.
        num_jobs: Target number of postings; scraping stops once reached or
            when there is no "next page" button.
        verbose: When True, print each collected record.
        slp_time: Base wait in seconds. The initial page load waits twice this
            value, each job click waits this value, and retries and page
            changes wait half of it. Tune it to your connection speed.
        failsafe: When True, rewrite ``jobs_df.csv`` after every collected job
            so partial results survive a crash.

    Returns:
        A DataFrame with one row per collected job: the job fields plus the
        company overview fields.

    Postings whose mandatory fields cannot be read after a few attempts are
    skipped rather than recorded with the previous posting's values. The
    browser is always shut down, even when scraping raises.
    """
    driver = init_scrapper(keyword)
    jobs = []
    try:
        #Let the page load. Change this number based on your internet speed.
        time.sleep(2*slp_time)
        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            #Going through each job in this page
            #react-job-listing for Job Listing. These are the buttons we're going to click.
            job_buttons = driver.find_elements(By.CLASS_NAME, "react-job-listing")
            print(f"size job buttons {len(job_buttons)}")
            for job_button in job_buttons:

                print(f"Progress:  {len(jobs)}/{ num_jobs}")
                if len(jobs) >= num_jobs:
                    break
                job_button.click()
                time.sleep(slp_time)
                # Test for the "Sign Up" prompt and get rid of it.
                # Only checked while nothing has been collected yet.
                if len(jobs) == 0:
                    try:
                        driver.find_element(By.XPATH, '//*[@id="JAModal"]/div/div[2]/span').click()
                        print("modal cerrado con exito")
                    except NoSuchElementException:
                        print("no hay modal xd")

                job_data_dict = _read_job_details(driver, slp_time)
                if job_data_dict is None:
                    # Never append a row built from missing or stale values.
                    print("job info unavailable, skipping this job")
                    continue

                #Printing for debugging
                if verbose:
                    print(job_data_dict)

                # find_elements returns an empty list when nothing matches, in which
                # case get_company_overview_data fills every field with -1.
                Company_Overview = driver.find_elements(By.CLASS_NAME,'e1pvx6aw0')
                company_overview_dict = get_company_overview_data(Company_Overview)

                if verbose:
                    print(company_overview_dict)
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                job_data_dict.update(company_overview_dict)

                #add job to jobs
                jobs.append(job_data_dict)
                if failsafe: response_to_dataframe(jobs,"jobs_df")

            #Clicking on the "next page" button
            try:
                driver.find_element(By.CLASS_NAME,'nextButton').click()
                time.sleep(slp_time/2)
            except NoSuchElementException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
    finally:
        # quit() ends the whole browser session, not just the current window.
        driver.quit()

    return pd.DataFrame(jobs)  #This line converts the list of dicts into a pandas DataFrame.

In [4]:
# If macOS refuses to open chromedriver, see:
# https://stackoverflow.com/questions/60362018/macos-catalinav-10-15-3-error-chromedriver-cannot-be-opened-because-the-de
keyword = "data science"   # search term
num_jobs = 30              # how many postings to collect
slp_time = 6               # base wait in seconds between page interactions
verbose = True             # print every collected record
failsafe = True            # rewrite jobs_df.csv after each collected job

df = get_jobs(
    keyword=keyword,
    num_jobs=num_jobs,
    verbose=verbose,
    slp_time=slp_time,
    failsafe=failsafe,
)
df


size job buttons 30
Progress:  0/30
modal cerrado con exito
job info collected successfully :)
{'Job Title': 'Junior Data Scientist', 'Company Name': 'Talentheed Inc', 'Location': 'Remote', 'Salary Estimate': '$100,000 /yr (est.)', 'Rating': -1, 'Job Description': 'Responsibilities:\nWork with stakeholders to determine how to use business data for valuable business solutions.\nSearch for ways to get new data sources and assess their accuracy.\nBrowse and analyze enterprise databases to simplify and improve product development, marketing techniques, and business processes.\nCreate custom data models and algorithms.\nUse predictive models to improve customer experience, ad targeting, revenue generation, and more.\nDevelop the organization’s test model quality and A/B testing framework.\nCoordinate with various technical/functional teams to implement models and monitor results.'}
size not found
founded not found
type not found
industry not found
sector not found
revenue not found
{'size'

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,size,founded,type,industry,sector,revenue
0,Junior Data Scientist,Talentheed Inc,Remote,"$100,000 /yr (est.)",-1.0,Responsibilities:\nWork with stakeholders to d...,-1,-1,-1,-1,-1,-1
1,Mgr Data Science,South West,"Dallas, TX","$103,267 /yr (est.)",4.2,Department: Business Transformation\n\nOur Com...,10000+ Employees,1967,Company - Public,"Airlines, Airports & Air Transportation",Transportation & Logistics,$10+ billion (USD)
2,Junior Data Analyst,Human Exponent Inc.,Remote,"$103,267 /yr (est.)",5.0,We are seeking a highly motivated analyst to j...,1 to 50 Employees,Company - Private,Unknown / Non-Applicable,-1,-1,-1
3,Gcp data engineer,Infinity Quest,Remote,$100.00 /hr (est.),4.3,"Hello ,\nGood day,\nInfinity Quest is seeking ...",201 to 500 Employees,2006,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable
4,Data Scientist - Personalization,Audible,"Newark, NJ","$85,503 /yr (est.)",4.0,Job summary\nGood storytelling starts with gre...,1001 to 5000 Employees,1995,Subsidiary or Business Segment,Film Production,Media & Communication,Unknown / Non-Applicable
5,Jr. Data Analyst,PepsiCo,"Chicago, IL","$57,332 /yr (est.)",3.9,"This position can be located in Plano, Texas -...",10000+ Employees,1965,Company - Public,Food & Beverage Manufacturing,Manufacturing,$10+ billion (USD)
6,Data Scientist,IV IT Solutions,Remote,$82.50 /hr (est.),5.0,"Hello,\nHope you are doing great!\nThis is jam...",1 to 50 Employees,Company - Private,Unknown / Non-Applicable,-1,-1,-1
7,Data Science Assistant,United Nations,"New York, NY","$88,585 /yr (est.)",4.0,This position is located in the Analytics Sect...,10000+ Employees,1945,Government,National Agencies,Government & Public Administration,Unknown / Non-Applicable
8,Data Annotator,Skit.ai,"New York, NY","$46,817 /yr (est.)",4.5,About us\nSkit (previously known as Vernacular...,51 to 200 Employees,2016,Company - Private,Enterprise Software & Network Solutions,Information Technology,$1 to $5 million (USD)
9,Data Science and Signal Engineer I,Medtronic,"Northridge, CA","$46,817 /yr (est.)",4.1,Careers that Change Lives\nEngineers create ou...,10000+ Employees,1949,Company - Public,Health Care Products Manufacturing,Manufacturing,$10+ billion (USD)


In [30]:
# Export the scraped jobs to ./Glassdoor_jobs1.csv; the returned frame is the cell output.
response_to_dataframe(list_of_dicts=df, filename="Glassdoor_jobs1")

Unnamed: 0,Job Title,Company Name,Location,Salary Estimate,Rating,Job Description,size,founded,type,industry,sector,revenue
0,Jr. Data Scientist,Net2Aspire,Remote,"$72,500 /yr (est.)",-1.0, Apply Statistical and Machine Learning metho...,-1,-1,-1,-1,-1,-1
1,Data Scientist - Intermediate,Envision,Remote,"$72,500 /yr (est.)",1.0,Basic Qualifications:\n1. Master's Degree in S...,Unknown,Company - Private,Unknown / Non-Applicable,-1,-1,-1
2,Data Analyst,Hulu,"Santa Monica, CA","$75,266 /yr (est.)",3.9,We are seeking a Data Analyst who will focus o...,1001 to 5000 Employees,2007,Company - Private,Broadcast Media,Media & Communication,$1 to $2 billion (USD)
3,Data Scientist,Mint-Technologies,"Richmond, VA",$75.00 /hr (est.),-1.0,*will work ONSITE 2-3 days each week\nThe data...,1 to 50 Employees,Company - Private,Unknown / Non-Applicable,-1,-1,-1
4,Data Engineer,Vedainfo,Remote,$55.00 /hr (est.),4.1,Data Engineer\nLocation: This position is remo...,201 to 500 Employees,Company - Private,Information Technology Support Services,Information Technology,$1 to $5 million (USD),-1
5,Data Scientist,SIL International,Remote,"$70,000 /yr (est.)",4.3,Job Description Summary:\nThe data scientist w...,51 to 200 Employees,Nonprofit Organization,Culture & Entertainment,"Arts, Entertainment & Recreation",Unknown / Non-Applicable,-1
6,Data Science Consultant,Accenture,"San Diego, CA","$105,150 /yr (est.)",4.1,"We are:\nApplied Intelligence, the people who ...",10000+ Employees,1989,Company - Public,Business Consulting,Management & Consulting,$10+ billion (USD)
7,Data Engineer,WALTLabs.io,Remote,"$110,000 /yr (est.)",-1.0,WALTLabs.io is looking for an experienced and ...,-1,-1,-1,-1,-1,-1
8,Data Science Engineer,Moorecroft Systems,Remote,$52.50 /hr (est.),-1.0,Senior Data Science Engineer\nMoorecroft is lo...,51 to 200 Employees,Company - Public,Unknown / Non-Applicable,-1,-1,-1
9,Data Scientist,Geocomp,Remote,$55.00 /hr (est.),3.9,Geocomp (a Sercel company) is looking for an e...,51 to 200 Employees,1983,Company - Private,Architectural & Engineering Services,"Construction, Repair & Maintenance Services",$10 to $25 million (USD)


In [31]:
# Save the results as jobs.csv, relative to the current working directory.
output_csv = 'jobs.csv'
df.to_csv(output_csv)