In [1]:
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium import webdriver
import time
import pandas as pd

In [2]:
def _text_or_default(finder, selector, default=-1):
    '''Return the text of the element located by finder(selector), or default when no such element exists.'''
    try:
        return finder(selector).text
    except NoSuchElementException:
        return default


def _read_required_fields(driver, max_retries):
    '''Read company name, location, title and description of the currently opened listing.

    The lookup is retried because it can fail right after a listing is clicked
    (presumably while the detail pane is still loading).

    Returns a (company_name, location, job_title, job_description) tuple, or
    None when all attempts failed. At least one attempt is always made.
    '''
    for _ in range(max(1, max_retries)):
        try:
            company_name = driver.find_element_by_css_selector('[data-test="employerName"]').text
            location = driver.find_element_by_css_selector('[data-test="location"]').text
            job_title = driver.find_element_by_css_selector('[class="css-1vg6q84 e1tk4kwz4"]').text
            # NOTE(review): presumably the control that expands the full description -- confirm.
            driver.find_element_by_xpath('.//div[@class="css-t3xrds e856ufb4"]').click()
            job_description = driver.find_element_by_css_selector('[class="jobDescriptionContent desc"]').text
            return company_name, location, job_title, job_description
        except Exception:
            # Exception instead of a bare except, so a kernel interrupt still stops the scrape.
            time.sleep(0.5)
    return None


def get_jobs(url_specific, num_jobs, verbose,
             driver_path="C:/Users/Ali/webdriver/chromedriver/chromedriver.exe",
             max_retries=10):
    '''Gathers jobs as a dataframe, scraped from Glassdoor.

    Parameters
    ----------
    url_specific : str
        URL of the Glassdoor search-result page to start from.
    num_jobs : int
        Number of listings to collect. Fewer are returned when the result
        pages run out first.
    verbose : bool
        When true, every collected field is printed for debugging.
    driver_path : str, optional
        Location of the chromedriver executable.
    max_retries : int, optional
        How many times the required fields of one listing are looked up before
        that listing is skipped.

    Returns
    -------
    pandas.DataFrame
        One row per listing. Fields that are not found on the page are -1.

    The browser is always shut down, even when scraping raises.
    '''

    #Initializing the webdriver
    options = webdriver.ChromeOptions()

    #Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')

    driver = webdriver.Chrome(executable_path=driver_path, options=options)
    jobs = []

    #Company-overview columns and the label each one carries on the page.
    overview_labels = (("Size", "Size"),
                       ("Founded", "Founded"),
                       ("Type of ownership", "Type"),
                       ("Industry", "Industry"),
                       ("Sector", "Sector"),
                       ("Revenue", "Revenue"))
    overview_xpath = './/span[@class="css-1taruhi e1pvx6aw1"][text()="{}"]//following-sibling::*'

    try:
        driver.set_window_size(1920, 1080)
        driver.get(url_specific)

        while len(jobs) < num_jobs:  #If true, should be still looking for new jobs.

            #Let the page load. Change this number based on your internet speed.
            time.sleep(3)

            #Test for the "Sign Up" prompt. Neither a blocked click nor a missing element is fatal.
            try:
                driver.find_element_by_class_name("selected").click()
            except (ElementClickInterceptedException, NoSuchElementException):
                pass

            time.sleep(0.1)

            #Finding the X button of the signup Prompt and clicking it
            try:
                driver.find_element_by_xpath('.//button[@class="e1jbctw80 ei0fd8p1 css-1n14mz9 e1q8sty40"]').click()
            except NoSuchElementException:
                pass

            #Find Every job list in the page
            job_buttons = driver.find_elements_by_css_selector('[data-test="jobListing"]')

            for job_button in job_buttons:
                if len(jobs) % 10 == 0:
                    print(f"Progress: {len(jobs)}/{num_jobs}")

                if len(jobs) >= num_jobs:
                    break

                job_button.click()  #Click on the Job list
                time.sleep(1)

                required = _read_required_fields(driver, max_retries)
                if required is None:
                    #Give up on this listing instead of retrying forever.
                    print(f"Skipping a listing: required fields not found after {max_retries} attempts.")
                    continue
                company_name, location, job_title, job_description = required

                #Optional fields fall back to -1 as the "not found" value.
                salary_estimate = _text_or_default(driver.find_element_by_xpath,
                                                   './/span[@class="css-1xe2xww e1wijj242"]')
                rating = _text_or_default(driver.find_element_by_css_selector,
                                          '[data-test="detailRating"]')
                overview = {column: _text_or_default(driver.find_element_by_xpath,
                                                     overview_xpath.format(label))
                            for column, label in overview_labels}

                #Printing for debugging
                if verbose:
                    print("Job Title: {}".format(job_title))
                    print("Salary Estimate: {}".format(salary_estimate))
                    print("Job Description: {}".format(job_description[:500]))
                    print("Rating: {}".format(rating))
                    print("Company Name: {}".format(company_name))
                    print("Location: {}".format(location))
                    print("Size: {}".format(overview["Size"]))
                    print("Founded: {}".format(overview["Founded"]))
                    print("Type of Ownership: {}".format(overview["Type of ownership"]))
                    print("Industry: {}".format(overview["Industry"]))
                    print("Sector: {}".format(overview["Sector"]))
                    print("Revenue: {}".format(overview["Revenue"]))
                    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

                #add job to jobs as list of dicts
                job = {"Job Title" : job_title,
                       "Salary Estimate" : salary_estimate,
                       "Job Description" : job_description,
                       "Rating" : rating,
                       "Company Name" : company_name,
                       "Location" : location}
                job.update(overview)
                jobs.append(job)

            #Target reached: no need to load another page.
            if len(jobs) >= num_jobs:
                break

            #Clicking on the "next page" button
            try:
                next_button = driver.find_element_by_xpath('.//button[@class="nextButton job-search-opoz2d e13qs2072"]')
            except NoSuchElementException:
                print(f"Scraping terminated before reaching target number of jobs. Needed {num_jobs}, got {len(jobs)}.")
                break

            #A disabled button cannot advance; clicking it would re-scrape this same page.
            if not next_button.is_enabled():
                print(f"Scraping terminated before reaching target number of jobs. Needed {num_jobs}, got {len(jobs)}.")
                break
            next_button.click()
    finally:
        #quit() ends the whole browser session, not just the current window.
        driver.quit()

    return pd.DataFrame(jobs)  #This line converts the list of dicts object into a pandas DataFrame.


In [4]:
# Scrape up to 1000 "data scientist" listings; this launches a new Chrome window.
ds_url = "https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14.htm"
ds_df = get_jobs(url_specific=ds_url, num_jobs=1000, verbose=False)
ds_df

Progress: 0/1000
Progress: 10/1000
Progress: 20/1000
Progress: 30/1000
Progress: 40/1000
Progress: 50/1000
Progress: 60/1000
Progress: 70/1000
Progress: 80/1000
Progress: 90/1000
Progress: 100/1000
Progress: 110/1000
Progress: 120/1000
Progress: 130/1000
Progress: 140/1000
Progress: 150/1000
Progress: 160/1000
Progress: 170/1000
Progress: 180/1000
Progress: 190/1000
Progress: 200/1000
Progress: 210/1000
Progress: 220/1000
Progress: 230/1000
Progress: 240/1000
Progress: 250/1000
Progress: 260/1000
Progress: 270/1000
Progress: 280/1000
Progress: 290/1000
Progress: 300/1000
Progress: 310/1000
Progress: 320/1000
Progress: 330/1000
Progress: 340/1000
Progress: 350/1000
Progress: 360/1000
Progress: 370/1000
Progress: 380/1000
Progress: 390/1000
Progress: 400/1000
Progress: 410/1000
Progress: 420/1000
Progress: 430/1000
Progress: 440/1000
Progress: 450/1000
Progress: 460/1000
Progress: 470/1000
Progress: 480/1000
Progress: 490/1000
Progress: 500/1000
Progress: 510/1000
Progress: 520/1000
Prog

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Scientist,Employer Provided Salary:$85K - $205K,"At PayPal (NASDAQ: PYPL), we believe that ever...",3.9,PayPal\n3.9,"New York, NY",10000+ Employees,1998,Company - Public,Internet & Web Services,Information Technology,$10+ billion (USD)
1,Junior Data Scientist/Data Modeler,-1,Overview:\nLMI is seeking a skilled Data Scien...,4.3,Logistics Management Institute\n4.3,Remote,1001 to 5000 Employees,1961,Company - Private,Business Consulting,Management & Consulting,$100 to $500 million (USD)
2,Sr Tableau Data Analyst,Employer Provided Salary:$50.00 - $60.00 Per Hour,Performs complex (journey-level) data analysis...,4.4,Dutech\n4.4,"Austin, TX",51 to 200 Employees,-1,Government,-1,-1,Unknown / Non-Applicable
3,Data Scientist,-1,Company Overview\nIdea Evolver specializes in ...,4.1,Idea Evolver\n4.1,Remote,1 to 50 Employees,2011,Company - Private,Advertising & Public Relations,Media & Communication,Unknown / Non-Applicable
4,"Senior Data Scientist, Ops Data Platform",Employer Provided Salary:$160K - $247K,The NVIDIA Datacenter organization is seeking ...,4.6,NVIDIA\n4.6,"Santa Clara, CA",10000+ Employees,1993,Company - Public,Computer Hardware Development,Information Technology,$5 to $10 billion (USD)
...,...,...,...,...,...,...,...,...,...,...,...,...
995,"Senior Data Scientist, Ops Data Platform",Employer Provided Salary:$160K - $247K,The NVIDIA Datacenter organization is seeking ...,4.6,NVIDIA\n4.6,"Santa Clara, CA",10000+ Employees,1993,Company - Public,Computer Hardware Development,Information Technology,$5 to $10 billion (USD)
996,Data Scientist,Employer Provided Salary:$85K - $205K,"At PayPal (NASDAQ: PYPL), we believe that ever...",3.9,PayPal\n3.9,"New York, NY",10000+ Employees,1998,Company - Public,Internet & Web Services,Information Technology,$10+ billion (USD)
997,Staff Data Scientist,Employer Provided Salary:$185K - $215K,Staff Data Scientist\nWe’re looking for Data S...,3.6,Honor\n3.6,Remote,501 to 1000 Employees,2014,Company - Private,Health Care Services & Hospitals,Healthcare,Unknown / Non-Applicable
998,"Research Scientist, Machine Learning",Employer Provided Salary:$200K - $240K,"At TeachFX, we're using AI to show educators p...",-1,TeachFX,California,1 to 50 Employees,2017,Company - Private,Primary & Secondary Schools,Education,Unknown / Non-Applicable


In [6]:
# Forward slashes work on every OS and match the '../Data/Raw/' paths the glob cells read from.
# The index is written too; it comes back as the 'Unnamed: 0' column when the file is re-read.
ds_df.to_csv('../Data/Raw/raw_data_ds2.csv')

In [None]:
# Scrape up to 1000 "machine learning" listings (URL asks to exclude jobs without salary).
ml_url = 'https://www.glassdoor.com/Job/machine-learning-jobs-SRCH_KO0,16.htm?includeNoSalaryJobs=false'
ml_df = get_jobs(url_specific=ml_url, num_jobs=1000, verbose=False)
ml_df

In [None]:
# Forward slashes work on every OS and match the '../Data/Raw/' paths the glob cells read from.
ml_df.to_csv('../Data/Raw/raw_data_ml.csv')

In [None]:
# Scrape up to 1000 "data analyst" listings (URL asks to exclude jobs without salary).
da_url = "https://www.glassdoor.com/Job/data-analyst-jobs-SRCH_KO0,12.htm?includeNoSalaryJobs=false"
da_df = get_jobs(url_specific=da_url, num_jobs=1000, verbose=False)
da_df

In [9]:
# Scrape up to 450 "data engineer" listings, starting from the first result page.
de_url = "https://www.glassdoor.com/Job/data-engineer-jobs-SRCH_KO0,13.htm?includeNoSalaryJobs=false"
de_df = get_jobs(url_specific=de_url, num_jobs=450, verbose=False)
de_df

Progress: 0/450
Progress: 10/450
Progress: 20/450
Progress: 30/450
Progress: 40/450
Progress: 50/450
Progress: 60/450
Progress: 70/450
Progress: 80/450
Progress: 90/450
Progress: 100/450
Progress: 110/450
Progress: 120/450
Progress: 130/450
Progress: 140/450
Progress: 150/450
Progress: 160/450
Progress: 170/450
Progress: 180/450
Progress: 190/450
Progress: 200/450
Progress: 210/450
Progress: 220/450
Progress: 230/450
Progress: 240/450
Progress: 250/450
Progress: 260/450
Progress: 270/450
Progress: 280/450
Progress: 290/450
Progress: 300/450
Progress: 310/450
Progress: 320/450
Progress: 330/450
Progress: 340/450
Progress: 350/450
Progress: 360/450
Progress: 370/450
Progress: 380/450
Progress: 390/450
Progress: 400/450
Progress: 410/450
Progress: 420/450
Progress: 430/450
Progress: 440/450


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Engineer,Employer Provided Salary:$175K - $200K,At Expa we're dedicated to being the best plac...,3.7,Expa\n3.7,"Los Angeles, CA",1 to 50 Employees,2013,Company - Private,Business Consulting,Management & Consulting,Unknown / Non-Applicable
1,Sr. Data Engineer,Employer Provided Salary:$118K - $142K,Data Engineer\nEgen Solutions Inc offers compe...,-1,Egen Solutions Inc,"Naperville, IL",-1,-1,-1,-1,-1,-1
2,Data Engineer (Remote),-1,"At Bluesight, our mission is to create groundb...",4.1,Kitcheck\n4.1,Remote,51 to 200 Employees,2013,Company - Private,Computer Hardware Development,Information Technology,$25 to $100 million (USD)
3,Senior Lead Data Engineer,Employer Provided Salary:$75.00 Per Hour,"Hello,\nHope you are doing well!!!\nWe are con...",4.0,New York Technology Partners\n4.0,"Malvern, PA",51 to 200 Employees,1999,Company - Private,Computer Hardware Development,Information Technology,$25 to $100 million (USD)
4,Data Devops Engineer,-1,"Overview\nHealth by Design (HBD), a well-respe...",3.3,Medici\n3.3,Kentucky,1 to 50 Employees,2016,Company - Private,Software Development,Information Technology,Unknown / Non-Applicable
...,...,...,...,...,...,...,...,...,...,...,...,...
445,Data Engineer IV,Employer Provided Salary:$126K - $180K,"Your Opportunity\n\nAt Schwab, you’re empowere...",3.8,Charles Schwab\n3.8,"Westlake, TX",10000+ Employees,1973,Company - Public,Investment & Asset Management,Financial Services,$10+ billion (USD)
446,Data Engineer,Employer Provided Salary:$70.00 - $75.00 Per Hour,Data Engineer\nCandidate must have AWS (Redshi...,-1,PRISM IT LLC,"Austin, TX",-1,-1,-1,-1,-1,-1
447,Data Engineer,-1,Position Purpose\nWithin the Reporting and Ana...,2.8,Michigan Health Information Network\n2.8,Remote,201 to 500 Employees,2010,Nonprofit Organization,Information Technology Support Services,Information Technology,Unknown / Non-Applicable
448,USA - Infrastructure Data Engineer (AWS),$104K - $148K (Glassdoor est.),Job Title: Infrastructure Data Engineer (AWS)\...,-1,Avestacs,"San Jose, CA",51 to 200 Employees,-1,Company - Public,-1,-1,Unknown / Non-Applicable


In [10]:
# Scrape up to 550 more "data engineer" listings, this time starting at result page 16 (_IP16).
# Note: de_url is rebound here; the first-page URL from the earlier cell is no longer available.
de_url = "https://www.glassdoor.com/Job/data-engineer-jobs-SRCH_KO0,13_IP16.htm?includeNoSalaryJobs=false"
de_df1 = get_jobs(url_specific=de_url, num_jobs=550, verbose=False)
de_df1

Progress: 0/550
Progress: 10/550
Progress: 20/550
Progress: 30/550
Progress: 40/550
Progress: 50/550
Progress: 60/550
Progress: 70/550
Progress: 80/550
Progress: 90/550
Progress: 100/550
Progress: 110/550
Progress: 120/550
Progress: 130/550
Progress: 140/550
Progress: 150/550
Progress: 160/550
Progress: 170/550
Progress: 180/550
Progress: 190/550
Progress: 200/550
Progress: 210/550
Progress: 220/550
Progress: 230/550
Progress: 240/550
Progress: 250/550
Progress: 260/550
Progress: 270/550
Progress: 280/550
Progress: 290/550
Progress: 300/550
Progress: 310/550
Progress: 320/550
Progress: 330/550
Progress: 340/550
Progress: 350/550
Progress: 360/550
Progress: 370/550
Progress: 380/550
Progress: 390/550
Progress: 400/550
Progress: 410/550
Progress: 420/550
Progress: 430/550
Progress: 440/550
Progress: 450/550
Progress: 460/550
Progress: 470/550
Progress: 480/550
Progress: 490/550
Progress: 500/550
Progress: 510/550
Progress: 520/550
Progress: 530/550
Progress: 540/550
Progress: 550/550


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,"Data Engineer (AWS, Python)",Employer Provided Salary:$50.00 - $52.00 Per Hour,"Title: Data Engineer (AWS, Python)\nLocation: ...",5.0,DSMH LLC\n5.0,Remote,1 to 50 Employees,-1,Company - Private,-1,-1,$1 to $5 million (USD)
1,Data Analytics Engineer,Employer Provided Salary:$60.00 Per Hour,JD and Details\nCurrent location: Santa Clara ...,-1,Proits Hub LLC,Remote,1 to 50 Employees,-1,Company - Public,-1,-1,Unknown / Non-Applicable
2,Data Engineer (Remote),-1,"At Bluesight, our mission is to create groundb...",4.1,Kitcheck\n4.1,Remote,51 to 200 Employees,2013,Company - Private,Computer Hardware Development,Information Technology,$25 to $100 million (USD)
3,Junior Data Engineer,$73K - $107K (Glassdoor est.),Job Title - Junior Data Engineer\nLocation - D...,4.7,HYR Global Source Inc\n4.7,"Dallas, TX",51 to 200 Employees,2013,Company - Private,Information Technology Support Services,Information Technology,Unknown / Non-Applicable
4,Informatica Data Engineer,Employer Provided Salary:$50.00 Per Hour,Description\n(Candidates MUST HAVE: Informatic...,4.1,Abotts consulting\n4.1,Remote,51 to 200 Employees,2014,Company - Private,Business Consulting,Management & Consulting,$1 to $5 million (USD)
...,...,...,...,...,...,...,...,...,...,...,...,...
545,AI/ML Data Automation Engineer,-1,"At PayPal (NASDAQ: PYPL), we believe that ever...",3.9,PayPal\n3.9,"Scottsdale, AZ",10000+ Employees,1998,Company - Public,Internet & Web Services,Information Technology,$10+ billion (USD)
546,Data Engineer,Employer Provided Salary:$120K - $150K,"At Dynatron Software, we help automotive servi...",4.3,Dynatron Software\n4.3,Remote,51 to 200 Employees,1997,Company - Private,Enterprise Software & Network Solutions,Information Technology,$25 to $100 million (USD)
547,Data Engineer,Employer Provided Salary:$113K,Department: Technology\n\nOur Company Promise\...,4.1,Southwest Airlines\n4.1,"Dallas, TX",10000+ Employees,1967,Company - Public,"Airlines, Airports & Air Transportation",Transportation & Logistics,$10+ billion (USD)
548,Sr. Azure Data Engineer,Employer Provided Salary:$60.00 - $70.00 Per Hour,Role: Sr. Azure Data Engineer\nJob Location: R...,3.3,Iron Service Global Inc\n3.3,Remote,201 to 500 Employees,1987,Company - Private,Information Technology Support Services,Information Technology,$25 to $100 million (USD)


In [11]:
# Append the second data-engineer batch to the first and renumber the rows.
# Caution: de_df is overwritten, so running this cell twice appends de_df1 twice.
de_df = pd.concat((de_df, de_df1), ignore_index=True, sort=False)

In [12]:
# Forward slashes work on every OS and match the '../Data/Raw/' paths the glob cells read from.
de_df.to_csv('../Data/Raw/raw_data_de1.csv')

In [None]:
# Scrape 90 data-scientist listings from each of the start pages 25, 28 and 31.
# Every run opens its own Chrome window and is saved to its own CSV (scien<page>.csv).
# NOTE(review): the page suffix is lower-case "_ip" here but "_IP" in the data-engineer URL above -- confirm both are accepted.
for i in range(25, 34, 3):
    scien_url = f"https://www.glassdoor.com/Job/data-scientist-jobs-SRCH_KO0,14_ip{i}.htm"
    scien_df = get_jobs(scien_url, 90, False)
    # Forward slashes keep the path portable and consistent with the glob reads.
    scien_df.to_csv(f'../Data/Raw/scien{i}.csv')

In [10]:
import glob

# glob returns matches in arbitrary order; sorting makes the row order of the
# combined frame (and of the CSV written from it) reproducible.
csv_files = sorted(glob.glob('../Data/Raw/scien*'))
data_scientist_df = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
# Drop the saved index column that read_csv brings back as 'Unnamed: 0'.
data_scientist_df = data_scientist_df.drop(columns='Unnamed: 0')

In [12]:
# Forward slashes work on every OS and match the '../Data/Raw/' paths the glob cells read from.
data_scientist_df.to_csv('../Data/Raw/ds_last.csv')

In [None]:
# Scrape 90 data-engineer listings from each start page 10, 13, ..., 31.
# Every run opens its own Chrome window and is saved to its own CSV (engineer<page>.csv).
# The keyword range is KO0,13 to match the other data-engineer URLs in this notebook;
# KO0,14 is the value used for "data scientist" (the number equals the keyword length in every URL here).
# NOTE(review): the page suffix is lower-case "_ip" here but "_IP" in the earlier data-engineer URL -- confirm both are accepted.
for i in range(10, 34, 3):
    engineer_url = f"https://www.glassdoor.com/Job/data-engineer-jobs-SRCH_KO0,13_ip{i}.htm"
    engineer_df = get_jobs(engineer_url, 90, False)
    # Forward slashes keep the path portable and consistent with the glob reads.
    engineer_df.to_csv(f'../Data/Raw/engineer{i}.csv')

In [13]:
import glob

# glob returns matches in arbitrary order; sorting makes the row order of the
# combined frame (and of the CSV written from it) reproducible.
csv_files = sorted(glob.glob('../Data/Raw/engineer*'))
data_engineer_df = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
# Drop the saved index column that read_csv brings back as 'Unnamed: 0'.
data_engineer_df = data_engineer_df.drop(columns='Unnamed: 0')

In [14]:
# Forward slashes work on every OS and match the '../Data/Raw/' paths the glob cells read from.
data_engineer_df.to_csv('../Data/Raw/de_last.csv')

In [None]:
# How many data-engineer rows remain once exact duplicate rows are removed?
test = data_engineer_df.drop_duplicates()
test.shape[0]

242

In [None]:
# Scrape 90 machine-learning listings from each start page 10, 13, ..., 28.
# Every run opens its own Chrome window and is saved to its own CSV (mle<page>.csv).
# NOTE(review): the page suffix is lower-case "_ip" here but "_IP" in the earlier data-engineer URL -- confirm both are accepted.
for i in range(10, 30, 3):
    mle_url = f"https://www.glassdoor.com/Job/machine-learning-jobs-SRCH_KO0,16_ip{i}.htm"
    mle_df = get_jobs(mle_url, 90, False)
    # Forward slashes keep the path portable and consistent with the glob reads.
    mle_df.to_csv(f'../Data/Raw/mle{i}.csv')

In [4]:
import glob

# Match only the per-page files (mle<page>.csv). A plain 'mle*' also matches
# 'mle_last.csv', the combined file written from this frame, so re-running the
# cell would load every row twice.
# Sorting makes the row order reproducible (glob order is arbitrary).
csv_files = sorted(glob.glob('../Data/Raw/mle[0-9]*'))
mle_df = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
# Drop the saved index column that read_csv brings back as 'Unnamed: 0'.
mle_df = mle_df.drop(columns='Unnamed: 0')

In [5]:
# Forward slashes work on every OS and match the '../Data/Raw/' paths the glob cells read from.
mle_df.to_csv('../Data/Raw/mle_last.csv')

In [None]:
# How many machine-learning rows remain once exact duplicate rows are removed?
test = mle_df.drop_duplicates()
test.shape[0]

223

In [None]:
# Scrape 90 data-analyst listings starting at result page 28 and save them to analys28.csv.
# range(28, 30, 3) yields only 28, so this loop body runs exactly once.
# NOTE(review): the page suffix is lower-case "_ip" here but "_IP" in the earlier data-engineer URL -- confirm both are accepted.
for i in range(28, 30, 3):
    analys_url = f"https://www.glassdoor.com/Job/data-analyst-jobs-SRCH_KO0,12_ip{i}.htm"
    analys_df = get_jobs(analys_url, 90, False)
    # Forward slashes keep the path portable and consistent with the glob reads.
    analys_df.to_csv(f'../Data/Raw/analys{i}.csv')

In [7]:
import glob

# Match only the per-page files (analys<page>.csv). A plain 'analys*' also matches
# 'analys_last.csv', the combined file written from this frame, so re-running the
# cell would load every row twice.
# Sorting makes the row order reproducible (glob order is arbitrary).
csv_files = sorted(glob.glob('../Data/Raw/analys[0-9]*'))
data_analys_df = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
# Drop the saved index column that read_csv brings back as 'Unnamed: 0'.
data_analys_df = data_analys_df.drop(columns='Unnamed: 0')

In [8]:
# Forward slashes work on every OS and match the '../Data/Raw/' paths the glob cells read from.
data_analys_df.to_csv('../Data/Raw/analys_last.csv')

In [None]:
# How many data-analyst rows remain once exact duplicate rows are removed?
test = data_analys_df.drop_duplicates()
test.shape[0]

215

In [22]:
import glob

# Load every 'df_*' file from the raw-data folder into one frame.
# glob returns matches in arbitrary order; sorting makes the row order reproducible.
csv_files = sorted(glob.glob('../Data/Raw/df_*'))
raw_df1 = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
# Drop the saved index column that read_csv brings back as 'Unnamed: 0'.
raw_df1 = raw_df1.drop(columns='Unnamed: 0')

In [23]:
# How many rows of raw_df1 remain once exact duplicate rows are removed?
test = raw_df1.drop_duplicates()
test.shape[0]

825

In [24]:
import glob

# Load every 'raw_data*' file from the raw-data folder into one frame.
# glob returns matches in arbitrary order; sorting makes the row order reproducible.
csv_files = sorted(glob.glob('../Data/Raw/raw_data*'))
raw_df2 = pd.concat(map(pd.read_csv, csv_files), ignore_index=True)
# Drop the saved index column that read_csv brings back as 'Unnamed: 0'.
raw_df2 = raw_df2.drop(columns='Unnamed: 0')

In [25]:
# How many rows of raw_df2 remain once exact duplicate rows are removed?
test = raw_df2.drop_duplicates()
test.shape[0]

1022

In [27]:
# Stack both raw collections into a single frame with a fresh 0..n-1 index, then show its row count.
raw_df = pd.concat((raw_df1, raw_df2), ignore_index=True)
raw_df.shape[0]

11960

In [30]:
# Keep one copy of every distinct row and show how many are left.
raw_df_unique = raw_df.drop_duplicates()
raw_df_unique.shape[0]

1542

In [32]:
# Finished data scraping: 1542 unique jobs out of roughly 12k scraped rows.
# The heavy duplication may be a bug in the Glassdoor website, or the site may be detecting the scraper.
# Forward slashes work on every OS and match the '../Data/Raw/' paths the glob cells read from.
raw_df.to_csv('../Data/Raw/raw_df.csv')
raw_df_unique.to_csv('../Data/Raw/raw_df_unique.csv')