In [None]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.edge.service import Service
from selenium.webdriver.edge.options import Options
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException
import pandas as pd
import time

# Value stored for any optional field that cannot be found on the page.
_NOT_FOUND = -1

# How many times the mandatory card fields are looked up before a card is skipped.
_MAX_FIELD_RETRIES = 3

# DataFrame column names for the company-overview values, in the order the
# values are read from the job-details panel.
_OVERVIEW_FIELDS = ("Size", "Founded", "Type of ownership", "Industry", "Sector", "Revenue")


def _dismiss_popup(driver):
    """Click the close ("X") button of the overlay at the hard-coded XPath, if any."""
    try:
        driver.find_element(By.XPATH, "/html/body/div[11]/div[2]/div[2]/div[1]/div[1]/button").click()  # clicking to the X.
        print("clicked the cross")
    except (NoSuchElementException, ElementClickInterceptedException, ElementNotInteractableException):
        # No overlay (or it cannot be clicked right now): nothing to dismiss.
        pass


def _optional_text(element, class_name):
    """Return the text of the first descendant with ``class_name``, or ``_NOT_FOUND``."""
    try:
        return element.find_element(By.CLASS_NAME, class_name).text
    except NoSuchElementException:
        return _NOT_FOUND


def _company_overview(driver):
    """Read the company-overview values of the currently opened job.

    The values are matched to ``_OVERVIEW_FIELDS`` purely by position; fields
    for which the page offers no value are set to ``_NOT_FOUND``.
    """
    # NOTE(review): positional mapping is only correct when the page lists the
    # overview items in exactly this order with none missing in between.
    elements = driver.find_elements(By.XPATH, '//div[@class="JobDetails_overviewItemValue__xn8EF"]')
    values = [element.text for element in elements]
    # A negative multiplier yields an empty list, so extra values are harmless.
    values += [_NOT_FOUND] * (len(_OVERVIEW_FIELDS) - len(values))
    return dict(zip(_OVERVIEW_FIELDS, values))


def _scrape_card(driver, job_card, processed, verbose):
    """Scrape a single job card.

    Returns the job as a dict, or ``None`` when the card was already handled,
    could not be clicked, or lacks one of the mandatory fields. ``processed``
    (a set of job URLs) is updated in place.
    """
    _dismiss_popup(driver)

    try:
        job_url = job_card.find_element(By.CLASS_NAME, 'JobCard_jobTitle___7I6y').get_attribute('href')
    except NoSuchElementException:
        print("Job card has no title link")
        return None
    if job_url in processed:
        return None

    try:
        job_card.click()
    except ElementClickInterceptedException:
        print("Error clicking on job card")
        return None
    except ElementNotInteractableException:
        print("Job card not interactable")
        return None
    time.sleep(2)

    # Mandatory fields: retry a bounded number of times instead of looping
    # forever when the card simply does not contain one of them.
    for attempt in range(_MAX_FIELD_RETRIES):
        try:
            company_name = job_card.find_element(By.CLASS_NAME, 'EmployerProfile_compactEmployerName__LE242').text
            location = job_card.find_element(By.CLASS_NAME, 'JobCard_location__rCz3x').text
            job_title = job_card.find_element(By.CLASS_NAME, 'JobCard_jobTitle___7I6y').text
            job_description = job_card.find_element(By.CLASS_NAME, 'JobCard_jobDescriptionSnippet__yWW8q').text
            break
        except NoSuchElementException:
            if attempt + 1 < _MAX_FIELD_RETRIES:
                time.sleep(5)
    else:
        # Remember the URL so the card is not retried on every later pass.
        processed.add(job_url)
        print("Job card is missing required fields")
        return None
    processed.add(job_url)

    # Optional fields fall back to the "not found" value.
    salary_estimate = _optional_text(job_card, 'JobCard_salaryEstimate__arV5J')
    rating = _optional_text(job_card, 'EmployerProfile_ratingContainer__ul0Ef')

    # Printing for debugging
    if verbose:
        print("Job Title: {}".format(job_title))
        print("Salary Estimate: {}".format(salary_estimate))
        print("Job Description: {}".format(job_description[:500]))
        print("Rating: {}".format(rating))
        print("Company Name: {}".format(company_name))
        print("Location: {}".format(location))

    # Give the job-details panel time to load before reading the company data.
    time.sleep(4)
    overview = _company_overview(driver)

    if verbose:
        print("Size: {}".format(overview["Size"]))
        print("Founded: {}".format(overview["Founded"]))
        print("Type of Ownership: {}".format(overview["Type of ownership"]))
        print("Industry: {}".format(overview["Industry"]))
        print("Sector: {}".format(overview["Sector"]))
        print("Revenue: {}".format(overview["Revenue"]))
        print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")

    job = {
        "Job Title": job_title,
        "Salary Estimate": salary_estimate,
        "Job Description": job_description,
        "Rating": rating,
        "Company Name": company_name,
        "Location": location,
    }
    job.update(overview)
    job["URL"] = job_url
    return job


def get_jobs(keyword, num_jobs, verbose):
    """Gather jobs as a dataframe, scraped from Glassdoor.

    Opens the Glassdoor search page for ``keyword`` in Microsoft Edge, clicks
    through the job cards and keeps pressing "load more" until ``num_jobs``
    jobs were collected or no further results can be loaded.

    Args:
        keyword: Search term inserted into the Glassdoor search URL.
        num_jobs: Maximum number of jobs to collect. Values <= 0 return an
            empty DataFrame without starting a browser.
        verbose: When true, print every scraped field while scraping.

    Returns:
        A pandas DataFrame with one row per job and the columns "Job Title",
        "Salary Estimate", "Job Description", "Rating", "Company Name",
        "Location", "Size", "Founded", "Type of ownership", "Industry",
        "Sector", "Revenue" and "URL". Optional values that were not found
        are stored as -1.
    """
    if num_jobs <= 0:
        return pd.DataFrame([])

    # Imported here so the block does not depend on an edit of the file-level
    # import list; WebDriverException is the base class of Selenium's errors.
    from selenium.common.exceptions import WebDriverException

    # Initializing the webdriver
    options = Options()

    # Uncomment the line below if you'd like to scrape without a new Edge window every time.
    # options.add_argument('headless')

    # Change the path to where msedgedriver is located on your machine.
    driver_path = "msedgedriver.exe"
    driver = webdriver.Edge(service=Service(driver_path), options=options)

    jobs = []
    processed = set()  # URLs of the cards that were already handled
    try:
        driver.set_window_size(1120, 1000)
        # "KO0,<n>" in the URL marks the character span of the keyword.
        url = 'https://www.glassdoor.com/Job/' + keyword + '-jobs-SRCH_KO0,' + str(len(keyword)) + '.htm'
        driver.get(url)
        time.sleep(5)

        while len(jobs) < num_jobs:
            # Going through each job in this page
            try:
                job_cards = driver.find_elements(By.CLASS_NAME, 'JobCard_jobCardContainer___hKKI')
            except WebDriverException as error:
                # Retrying blindly would spin forever on a dead browser session.
                print("Could not read the job cards: {}".format(error))
                break
            print("Found job cards:", len(job_cards))
            print("Progress: {}/{}".format(len(jobs), num_jobs))

            for job_card in job_cards:
                if len(jobs) >= num_jobs:
                    break
                job = _scrape_card(driver, job_card, processed, verbose)
                if job is not None:
                    jobs.append(job)

            if len(jobs) >= num_jobs:
                break

            try:
                driver.find_element(By.XPATH, '//button[@data-test="load-more"]').click()
                time.sleep(3)
            except WebDriverException:
                print("Scraping terminated before reaching target number of jobs. Needed {}, got {}.".format(num_jobs, len(jobs)))
                break
    finally:
        # Always close the browser, also on early exit or unexpected errors.
        driver.quit()

    return pd.DataFrame(jobs)  # This line converts the list of job dicts into a pandas DataFrame.

# Example usage: ask for the search parameters, then run the scraper.
keyword = input("Enter the keyword you want to search")
num_jobs = int(input("Enter the number of jobs you want"))
# bool() of any non-empty string is True, so answers such as "no" or "False"
# would still switch verbose output on; accept explicit affirmatives instead.
answer = input("Do you want the output to be displayed simultaneously (y/n)")
verbose = answer.strip().lower() in {"y", "yes", "true", "1"}
df = get_jobs(keyword, num_jobs, verbose)


In [None]:
# Write the scraped jobs to 'Jobs.csv' (relative path). No index argument is
# passed, so pandas' default applies and the row index is written as well.
df.to_csv('Jobs.csv')