In [1]:
import time

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
    StaleElementReferenceException,
)
from selenium.webdriver.common.by import By

In [24]:
def _text_or_default(driver, by, selector, default=-1):
    '''Return the text of the first element matching `selector`, or `default` when no element matches.'''
    try:
        return driver.find_element(by, selector).text
    except NoSuchElementException:
        return default


def get_jobs(keyword, num_jobs, verbose):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    Parameters
    ----------
    keyword : str
        Search term inserted into the Glassdoor search URL.
    num_jobs : int
        Target number of job postings to collect.
    verbose : bool
        When true, every scraped field is printed as it is collected.

    Returns
    -------
    pandas.DataFrame
        One row per collected posting with the columns "Job Title",
        "Salary Estimate", "Job Description", "Rating", "Company Name",
        "Location", "Size", "Founded", "Type of ownership", "Industry",
        "Sector" and "Revenue". Optional fields that are absent from the
        page are recorded as -1.

    Notes
    -----
    The browser session is always shut down, even when scraping raises.
    A posting whose mandatory details cannot be read after a bounded number
    of attempts is skipped (with a printed notice) instead of being retried
    forever.
    '''

    # Bounded retry for the detail pane. An unbounded retry with a bare
    # ``except`` can spin forever and also swallows KeyboardInterrupt.
    max_detail_attempts = 10
    detail_retry_pause = 1  # seconds between attempts

    # XPath for one labelled entry of the company overview section.
    overview_xpath = './/span[@class="css-1taruhi e1pvx6aw1"][text()="{}"]//following-sibling::*'

    #Initializing the webdriver
    options = webdriver.ChromeOptions()

    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    #options.add_argument('headless')

    #Change the path to where chromedriver is in your home folder.
    # NOTE(review): recent Selenium 4 releases no longer accept `executable_path`;
    # confirm the installed version before upgrading.
    driver = webdriver.Chrome(executable_path="C:/Users/Ali/webdriver/chromedriver/chromedriver.exe", options=options)
    jobs = []

    try:
        driver.set_window_size(1920,1080)

        # NOTE(review): the "KO14,28" suffix is hard-coded; it looks like it may encode
        # character offsets of the keyword "data scientist" -- confirm it is valid for
        # other keywords.
        url = "https://www.glassdoor.com/Job/united-states-"+keyword+"-jobs-SRCH_IL.0,13_IN1_KO14,28.htm"
        driver.get(url)

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            #Let the page load. Change this number based on your internet speed.
            time.sleep(3)

            #Test for the "Sign Up" prompt
            try:
                driver.find_element(By.CLASS_NAME, "selected").click()
            except (NoSuchElementException, ElementClickInterceptedException):
                # Either nothing is selected yet or the prompt covers it; both are
                # handled by the dismissal attempt below.
                pass

            time.sleep(0.1)

            #Finding the X button of the signup Prompt and clicking it
            try:
                driver.find_element(By.XPATH, './/button[@class="e1jbctw80 ei0fd8p1 css-1n14mz9 e1q8sty40"]').click()
            except NoSuchElementException:
                pass

            #Find Every job list in the page
            job_buttons = driver.find_elements(By.CSS_SELECTOR, '[data-test="jobListing"]')

            for job_button in job_buttons:
                if len(jobs) % 10 == 0:
                    print("Progress: {}".format("" + str(len(jobs)) + "/" + str(num_jobs)))

                if len(jobs) >= num_jobs:
                    break

                job_button.click()  #Click on the Job list
                time.sleep(1)

                # Mandatory fields: retry while the detail pane is still rendering.
                for _attempt in range(max_detail_attempts):
                    try:
                        company_name = driver.find_element(By.CSS_SELECTOR, '[data-test="employerName"]').text
                        location = driver.find_element(By.CSS_SELECTOR, '[data-test="location"]').text
                        job_title = driver.find_element(By.CSS_SELECTOR, '[class="css-1vg6q84 e1tk4kwz4"]').text
                        # Expand the description before reading it.
                        driver.find_element(By.XPATH, './/div[@class="css-t3xrds e856ufb4"]').click()
                        job_description = driver.find_element(By.CSS_SELECTOR, '[class="jobDescriptionContent desc"]').text
                        break
                    except (NoSuchElementException,
                            ElementClickInterceptedException,
                            StaleElementReferenceException):
                        time.sleep(detail_retry_pause)
                else:
                    # Every attempt failed: skip this posting rather than hang.
                    print("Skipping a job listing: details unavailable after {} attempts.".format(max_detail_attempts))
                    continue

                # Optional fields: -1 is the "not found" marker.
                salary_estimate = _text_or_default(driver, By.XPATH, './/span[@class="css-1xe2xww e1wijj242"]')
                rating = _text_or_default(driver, By.CSS_SELECTOR, '[data-test="detailRating"]')

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(job_description[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))

                size = _text_or_default(driver, By.XPATH, overview_xpath.format("Size"))
                founded = _text_or_default(driver, By.XPATH, overview_xpath.format("Founded"))
                type_of_ownership = _text_or_default(driver, By.XPATH, overview_xpath.format("Type"))
                industry = _text_or_default(driver, By.XPATH, overview_xpath.format("Industry"))
                sector = _text_or_default(driver, By.XPATH, overview_xpath.format("Sector"))
                revenue = _text_or_default(driver, By.XPATH, overview_xpath.format("Revenue"))

                # For Debugging
                if verbose:
                    print("Size: {}".format(size))
                    print("Founded: {}".format(founded))
                    print("Type of Ownership: {}".format(type_of_ownership))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("Revenue: {}".format(revenue))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                #add job to jobs as list of dicts
                jobs.append({"Job Title" : job_title,
                "Salary Estimate" : salary_estimate,
                "Job Description" : job_description,
                "Rating" : rating,
                "Company Name" : company_name,
                "Location" : location,
                "Size" : size,
                "Founded" : founded,
                "Type of ownership" : type_of_ownership,
                "Industry" : industry,
                "Sector" : sector,
                "Revenue" : revenue})

            #Clicking on the "next page" button
            try:
                driver.find_element(By.XPATH, './/button[@class="nextButton job-search-opoz2d e13qs2072"]').click()
            except NoSuchElementException:
                print(f"Scraping terminated before reaching target number of jobs. Needed {num_jobs}, got {len(jobs)}.")
                break
    finally:
        # quit() ends the whole session; running it in `finally` prevents a leaked
        # browser/driver process when scraping raises part-way through.
        driver.quit()

    return pd.DataFrame(jobs)  #This line converts the list of dicts object into a pandas DataFrame.


In [25]:
# Launches a Chrome session and scrapes 10 "data scientist" postings;
# the trailing expression displays the resulting frame.
ds_df = get_jobs(keyword="data scientist", num_jobs=10, verbose=False)
ds_df

Progress: 0/10
Progress: 10/10


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Scientist,Employer Provided Salary:$85K - $205K,"At PayPal (NASDAQ: PYPL), we believe that ever...",3.9,PayPal\n3.9,"New York, NY",10000+ Employees,1998,Company - Public,Internet & Web Services,Information Technology,$10+ billion (USD)
1,Data Scientist,$89K - $124K (Glassdoor est.),Do you have a passion for artificial intellige...,4.0,Deloitte\n4.0,"Horsham, PA",10000+ Employees,1850,Company - Private,Accounting & Tax,Financial Services,$10+ billion (USD)
2,Data Scientist,-1,"Who are we?\nBuyers Edge Platform (""BEP"") was ...",4.3,"Buyers Edge Platform, LLC\n4.3",Remote,Unknown,-1,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable
3,Data Scientist/Statistician I,Employer Provided Salary:$75K,"Veteran Engineering and Technology, LLC (VET) ...",4.4,Veteran Engineering and Technology\n4.4,"Washington, DC",1 to 50 Employees,2003,Contract,National Agencies,Government & Public Administration,$1 to $5 million (USD)
4,Data Scientist I,$97K - $126K (Glassdoor est.),Our Purpose\nWe work to connect and power an i...,4.3,Mastercard\n4.3,"Salt Lake City, UT",10000+ Employees,1966,Company - Public,Financial Transaction Processing,Financial Services,Unknown / Non-Applicable
5,Data Science Code reviewer,-1,"At TripleTen, we're building an international ...",-1.0,TripleTen,Idaho,51 to 200 Employees,-1,Company - Private,-1,-1,Unknown / Non-Applicable
6,Data Scientist,-1,Company Overview\nIdea Evolver specializes in ...,4.1,Idea Evolver\n4.1,Remote,1 to 50 Employees,2011,Company - Private,Advertising & Public Relations,Media & Communication,Unknown / Non-Applicable
7,Associate Data & Decision Scientist,Employer Provided Salary:$93K - $103K,Department: Network Planning\n\nOur Company Pr...,4.1,Southwest Airlines\n4.1,"Dallas, TX",10000+ Employees,1967,Company - Public,"Airlines, Airports & Air Transportation",Transportation & Logistics,$10+ billion (USD)
8,Junior Data Scientist/Data Modeler,-1,Overview:\nLMI is seeking a skilled Data Scien...,4.3,Logistics Management Institute\n4.3,Remote,1001 to 5000 Employees,1961,Company - Private,Business Consulting,Management & Consulting,$100 to $500 million (USD)
9,Data Scientist,-1,Company Description\n\nDropbox is a special pl...,4.5,Dropbox\n4.5,Arizona,1001 to 5000 Employees,2007,Company - Private,Computer Hardware Development,Information Technology,Unknown / Non-Applicable


In [16]:
# Write the data-scientist scrape to CSV.
# NOTE(review): this path starts at the parent directory (..) while the other
# export cells in this notebook write under the current directory (.) -- confirm
# which location is intended.
ds_df.to_csv(path_or_buf=r'..\Data\Raw\raw_data_ds.csv')

In [None]:
# Launches a Chrome session and scrapes up to 1000 "Machine Learning" postings;
# the trailing expression displays the resulting frame.
ml_df = get_jobs(keyword="Machine Learning", num_jobs=1000, verbose=False)
ml_df

In [None]:
# Write the machine-learning scrape to CSV (needs ml_df from the scraping cell above).
ml_df.to_csv(path_or_buf=r'.\Data\Raw\raw_data_ml.csv')

In [None]:
# Launches a Chrome session and scrapes up to 1000 "Data Analyst" postings;
# the trailing expression displays the resulting frame.
da_df = get_jobs(keyword="Data Analyst", num_jobs=1000, verbose=False)
da_df

In [None]:
# Write the data-analyst scrape to CSV (needs da_df from the scraping cell above).
da_df.to_csv(path_or_buf=r'.\Data\Raw\raw_data_da.csv')

In [26]:
# Launches a Chrome session and scrapes up to 1000 "Data Engineer" postings;
# the trailing expression displays the resulting frame.
de_df = get_jobs(keyword="Data Engineer", num_jobs=1000, verbose=False)
de_df

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=116.0.5845.97)


In [28]:
# Write the data-engineer scrape to CSV. de_df only exists if the scraping cell
# above ran to completion; otherwise this line raises NameError.
de_df.to_csv(path_or_buf=r'.\Data\Raw\raw_data_de.csv')

NameError: name 'de_df' is not defined