In [2]:
import time
from pathlib import Path
from urllib.parse import quote

import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException

In [3]:
def _text_or_default(find, selector, default=-1):
    '''Return the text of the element that ``find(selector)`` locates.

    ``find`` is one of the driver's ``find_element_by_*`` methods. When it
    raises NoSuchElementException the element is treated as absent and
    ``default`` is returned, so every scraped row has a value in every column.
    '''
    try:
        return find(selector).text
    except NoSuchElementException:
        return default


def _overview_value(driver, label):
    '''Return the text that follows the ``label`` span ("Size", "Founded", ...), or -1 if absent.'''
    xpath = './/span[@class="css-1taruhi e1pvx6aw1"][text()="' + label + '"]//following-sibling::*'
    return _text_or_default(driver.find_element_by_xpath, xpath)


def _dismiss_signup_prompt(driver):
    '''Best-effort attempt to get past the "Sign Up" prompt; does nothing if it is not shown.'''
    #Test for the "Sign Up" prompt by clicking the currently selected element.
    #A missing element or an intercepted click is expected here and is not an error.
    try:
        driver.find_element_by_class_name("selected").click()
    except (NoSuchElementException, ElementClickInterceptedException):
        pass

    time.sleep(0.1)

    #Find the X button of the signup prompt and click it.
    try:
        driver.find_element_by_xpath('.//button[@class="e1jbctw80 ei0fd8p1 css-1n14mz9 e1q8sty40"]').click()
    except (NoSuchElementException, ElementClickInterceptedException):
        pass


def _click_past_prompt(driver, element):
    '''Click ``element``; if the click is intercepted, dismiss the signup prompt and retry once.

    Returns True when a click went through and False when it was intercepted twice.
    '''
    try:
        element.click()
        return True
    except ElementClickInterceptedException:
        _dismiss_signup_prompt(driver)

    try:
        element.click()
        return True
    except ElementClickInterceptedException:
        return False


def _read_required_fields(driver):
    '''Read the four fields every row must have; raises if any of them is not on the page yet.'''
    return {
        "Company Name": driver.find_element_by_css_selector('[data-test="employerName"]').text,
        "Location": driver.find_element_by_css_selector('[data-test="location"]').text,
        "Job Title": driver.find_element_by_css_selector('[class="css-1vg6q84 e1tk4kwz4"]').text,
        "Job Description": driver.find_element_by_css_selector('[class="jobDescriptionContent desc"]').text,
    }


def get_jobs(keyword, num_jobs, verbose,
             driver_path="C:/Users/Ali/webdriver/chromedriver/chromedriver.exe",
             headless=False, page_load_wait=3, max_detail_attempts=10):

    '''Gathers jobs as a dataframe, scraped from Glassdoor

    Args:
        keyword: Search term in plain text. It is percent-encoded before being
            placed in the URL, so spaces, "&", "#" and "+" are safe. Do not
            pass a term that is already URL-encoded.
        num_jobs: Stop once this many jobs have been collected.
        verbose: When truthy, print every field of every job while scraping.
        driver_path: Location of the chromedriver executable.
        headless: When True, scrape without opening a visible Chrome window.
        page_load_wait: Seconds to wait for each result page to load. Change
            this number based on your internet speed.
        max_detail_attempts: How many times (0.5 s apart) to try reading the
            details of a listing before skipping it.

    Returns:
        A pandas DataFrame with one row per job. Optional fields that were not
        found on the page hold -1. The frame has fewer than ``num_jobs`` rows
        when the search results run out first.

    The Chrome session is always closed before returning or raising.
    '''

    #Initializing the webdriver
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('headless')

    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    jobs = []

    try:
        driver.set_window_size(1920, 1080)

        #Encode the keyword so characters such as "&" or "#" cannot break the query string.
        encoded_keyword = quote(keyword, safe="")
        url = ("https://www.glassdoor.com/Job/jobs.htm?suggestCount=0&suggestChosen=false"
               "&clickSource=searchBtn&typedKeyword=" + encoded_keyword
               + "&sc.keyword=" + encoded_keyword + "&locT=&locId=&jobType=")
        driver.get(url)

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            #Let the page load.
            time.sleep(page_load_wait)

            _dismiss_signup_prompt(driver)

            #Find every job listing in the page
            job_buttons = driver.find_elements_by_css_selector('[data-test="jobListing"]')

            for job_button in job_buttons:

                if len(jobs) >= num_jobs:
                    break
                print(f"Progress: {len(jobs)}/{num_jobs}")

                #Click on the listing to open its detail panel.
                if not _click_past_prompt(driver, job_button):
                    print("Skipping a listing that could not be opened.")
                    continue
                time.sleep(1)

                #The detail panel may not be rendered yet, so retry a bounded number
                #of times instead of spinning forever on a selector that never matches.
                #Catching Exception (not a bare except) keeps the retry best-effort
                #while still letting KeyboardInterrupt stop the scrape.
                details = None
                last_error = None
                for _ in range(max(1, max_detail_attempts)):
                    try:
                        details = _read_required_fields(driver)
                        break
                    except Exception as error:
                        last_error = error
                        time.sleep(0.5)

                if details is None:
                    print(f"Skipping a listing whose details did not load ({last_error!r}).")
                    continue

                #Optional fields fall back to -1, the "not found" value.
                salary_estimate = _text_or_default(driver.find_element_by_xpath,
                                                   './/span[@class="css-1xe2xww e1wijj242"]')
                rating = _text_or_default(driver.find_element_by_css_selector,
                                          '[data-test="detailRating"]')

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(details["Job Title"]))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(details["Job Description"][:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(details["Company Name"]))
                    print("Location: {}".format(details["Location"]))

                size = _overview_value(driver, "Size")
                founded = _overview_value(driver, "Founded")
                type_of_ownership = _overview_value(driver, "Type")
                industry = _overview_value(driver, "Industry")
                sector = _overview_value(driver, "Sector")
                revenue = _overview_value(driver, "Revenue")

                # For Debugging
                if verbose:
                    print("Size: {}".format(size))
                    print("Founded: {}".format(founded))
                    print("Type of Ownership: {}".format(type_of_ownership))
                    print("Industry: {}".format(industry))
                    print("Sector: {}".format(sector))
                    print("Revenue: {}".format(revenue))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                #add job to jobs as list of dicts (key order fixes the column order)
                jobs.append({"Job Title" : details["Job Title"],
                             "Salary Estimate" : salary_estimate,
                             "Job Description" : details["Job Description"],
                             "Rating" : rating,
                             "Company Name" : details["Company Name"],
                             "Location" : details["Location"],
                             "Size" : size,
                             "Founded" : founded,
                             "Type of ownership" : type_of_ownership,
                             "Industry" : industry,
                             "Sector" : sector,
                             "Revenue" : revenue})

            #Target reached: no need to load another page (or to report a failure).
            if len(jobs) >= num_jobs:
                break

            #Clicking on the "next page" button
            try:
                next_button = driver.find_element_by_xpath('.//button[@class="nextButton job-search-opoz2d e13qs2072"]')
            except NoSuchElementException:
                next_button = None

            #A disabled button would accept the click without changing page
            #(presumably the last page of results), so treat it like a missing one.
            if (next_button is None
                    or not next_button.is_enabled()
                    or not _click_past_prompt(driver, next_button)):
                print(f"Scraping terminated before reaching target number of jobs. Needed {num_jobs}, got {len(jobs)}.")
                break
    finally:
        #Always close the browser, otherwise every call leaves a Chrome process behind.
        driver.quit()

    return pd.DataFrame(jobs)  #This line converts the list of dicts object into a pandas DataFrame.

In [None]:
# Launch a Chrome session and scrape up to 1000 "data scientist" postings.
df = get_jobs(keyword="data scientist", num_jobs=1000, verbose=False)
df

In [None]:
# Save the raw scrape (index column included) to Data/Raw/raw_data0.csv.
# pathlib keeps the path portable outside Windows, and the folder is created
# first so the write cannot fail on a missing directory.
raw_data_dir = Path("Data") / "Raw"
raw_data_dir.mkdir(parents=True, exist_ok=True)
df.to_csv(raw_data_dir / "raw_data0.csv")

In [None]:
# Launch a Chrome session and scrape up to 1000 "Machine Learning" postings.
df1 = get_jobs(keyword="Machine Learning", num_jobs=1000, verbose=False)
df1

In [None]:
# Save the raw scrape (index column included) to Data/Raw/raw_data1.csv.
# pathlib keeps the path portable outside Windows, and the folder is created
# first so the write cannot fail on a missing directory.
raw_data_dir = Path("Data") / "Raw"
raw_data_dir.mkdir(parents=True, exist_ok=True)
df1.to_csv(raw_data_dir / "raw_data1.csv")

In [None]:
# Launch a Chrome session and scrape up to 1000 "Data Analyst" postings.
df2 = get_jobs(keyword="Data Analyst", num_jobs=1000, verbose=False)
df2

In [None]:
# Save the raw scrape (index column included) to Data/Raw/raw_data2.csv.
# pathlib keeps the path portable outside Windows, and the folder is created
# first so the write cannot fail on a missing directory.
raw_data_dir = Path("Data") / "Raw"
raw_data_dir.mkdir(parents=True, exist_ok=True)
df2.to_csv(raw_data_dir / "raw_data2.csv")

In [7]:
# Launch a Chrome session and scrape up to 1000 "Data Engineer" postings.
df3 = get_jobs(keyword="Data Engineer", num_jobs=1000, verbose=False)
df3

Progress: 0/1000
Progress: 1/1000
Progress: 2/1000
Progress: 3/1000
Progress: 4/1000
Progress: 5/1000
Progress: 6/1000
Progress: 7/1000
Progress: 8/1000
Progress: 9/1000
Progress: 10/1000
Progress: 11/1000
Progress: 12/1000
Progress: 13/1000
Progress: 14/1000
Progress: 15/1000
Progress: 16/1000
Progress: 17/1000
Progress: 18/1000
Progress: 19/1000
Progress: 20/1000
Progress: 21/1000
Progress: 22/1000
Progress: 23/1000
Progress: 24/1000
Progress: 25/1000
Progress: 26/1000
Progress: 27/1000
Progress: 28/1000
Progress: 29/1000
Progress: 30/1000
Progress: 31/1000
Progress: 32/1000
Progress: 33/1000
Progress: 34/1000
Progress: 35/1000
Progress: 36/1000
Progress: 37/1000
Progress: 38/1000
Progress: 39/1000
Progress: 40/1000
Progress: 41/1000
Progress: 42/1000
Progress: 43/1000
Progress: 44/1000
Progress: 45/1000
Progress: 46/1000
Progress: 47/1000
Progress: 48/1000
Progress: 49/1000
Progress: 50/1000
Progress: 51/1000
Progress: 52/1000
Progress: 53/1000
Progress: 54/1000
Progress: 55/1000
Pr

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Engineer,$74K - $104K (Glassdoor est.),"Work Arrangement: This is an onsite role, requ...",4.5,Visionary Homes\n4.5,"North Logan, UT",51 to 200 Employees,2004,Company - Private,Construction,"Construction, Repair & Maintenance Services",Unknown / Non-Applicable
1,Big Data Engineer,Employer Provided Salary:$50.00 - $60.00 Per Hour,Job Description:\nExperience working with Hado...,-1,Reuben Cooley Inc.,"Phoenix, AZ",Unknown,-1,Company - Public,-1,-1,Unknown / Non-Applicable
2,Data Engineer,Employer Provided Salary:$65.00 - $70.00 Per Hour,Job title: Data Engineer ( USC GC )\nLocation:...,-1,Kommforcesolutions,"Dallas, TX",-1,-1,-1,-1,-1,-1
3,Data Engineer,Employer Provided Salary:$40.00 - $45.00 Per Hour,"Location: Foster City, CA (Onsite DAY ONE)\nMi...",5.0,APPIC Solutions LLC\n5.0,"Foster City, CA",1 to 50 Employees,2017,Company - Private,Software Development,Information Technology,$1 to $5 million (USD)
4,AWS Data Engineer,Employer Provided Salary:$68.00 - $72.00 Per Hour,"Skills: Python, Snowflake, AWS, Data Bricks\nM...",3.9,Apolis\n3.9,"El Segundo, CA",501 to 1000 Employees,1996,Company - Private,Information Technology Support Services,Information Technology,$25 to $100 million (USD)
...,...,...,...,...,...,...,...,...,...,...,...,...
995,Data Analytics Engineer,$99K - $134K (Glassdoor est.),"KAYAK, part of Booking Holdings (NASDAQ: BKNG)...",4.7,KAYAK\n4.7,"Concord, MA",1001 to 5000 Employees,2004,Subsidiary or Business Segment,Internet & Web Services,Information Technology,$100 to $500 million (USD)
996,Data Engineer - Remote,Employer Provided Salary:$77K - $116K,Our work matters. We help people get the medic...,3.2,Prime Therapeutics\n3.2,Remote,5001 to 10000 Employees,1998,Company - Private,Insurance Carriers,Insurance,$10+ billion (USD)
997,Senior Data Engineer,-1,A little about Rupa: The future is personalize...,4.6,Rupa Health\n4.6,Remote,1 to 50 Employees,-1,Company - Public,-1,-1,Unknown / Non-Applicable
998,Data Engineer,-1,What are we building?\nHard Rock Digital is a ...,3.3,Hard Rock Digital\n3.3,Remote,51 to 200 Employees,2020,Company - Private,Gambling,"Arts, Entertainment & Recreation",Unknown / Non-Applicable


In [8]:
# Save the raw scrape (index column included) to Data/Raw/raw_data3.csv.
# pathlib keeps the path portable outside Windows, and the folder is created
# first so the write cannot fail on a missing directory.
raw_data_dir = Path("Data") / "Raw"
raw_data_dir.mkdir(parents=True, exist_ok=True)
df3.to_csv(raw_data_dir / "raw_data3.csv")