In [20]:
from dotenv import load_dotenv
import os
import psycopg2
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.webdriver.chrome.service import Service
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import pandas as pd
import numpy as np

# Load environment variables from .env
load_dotenv()


True

In [21]:
def get_jobs(keyword, num_jobs, verbose):
    '''Gathers jobs as a dataframe, scraped from Glassdoor'''

    # Initializing the webdriver
    service = Service(executable_path='./chromedriver.exe')
    options = webdriver.ChromeOptions()
    
    # Uncomment the line below if you'd like to scrape without a new Chrome window every time.
    # options.add_argument('headless')
    
    # Change the path to where chromedriver is in your home folder.
    driver = webdriver.Chrome(options=options, service=service)
    driver.set_window_size(1120, 1000)

    url = f'https://www.glassdoor.com/Job/jobs.htm?sc.keyword="{keyword}"&sc.locationSeoString=Riyadh+%28Saudi+Arabia%29&locId=3110290&locT=C'
    driver.get(url)
    jobs = []

    while len(jobs) < num_jobs:  # If true, should be still looking for new jobs.

        time.sleep(4)  # Let the page load

        # Test for the "Sign Up" prompt and get rid of it.
        try:
            driver.find_element(By.CLASS_NAME, "selected").click()
        except:
            pass

        try:
            driver.find_element(By.CLASS_NAME, "ModalStyle__xBtn___29PT9").click()  # Close the sign-up modal
        except:
            pass

        # Going through each job in this page
        job_cards = driver.find_elements(By.CLASS_NAME, "jobCard")  # Updated to match the current HTML structure
        
        for job_card in job_cards:  
            print(f"Progress: {len(jobs)}/{num_jobs}")
            if len(jobs) >= num_jobs:
                break

            try:
                job_title = job_card.find_element(By.CLASS_NAME, "JobCard_jobTitle___7I6y").text
                company_name = job_card.find_element(By.CLASS_NAME, "EmployerProfile_compactEmployerName__LE242").text
                location = job_card.find_element(By.CLASS_NAME, "JobCard_location__rCz3x").text
                job_description = job_card.find_element(By.CLASS_NAME, "JobCard_jobDescriptionSnippet__yWW8q").text
            except Exception as e:
                print(f"Failed to collect job data: {e}")
                continue

            try:
                salary_estimate = job_card.find_element(By.CLASS_NAME, "JobCard_salaryEstimate__arV5J").text
            except NoSuchElementException:
                salary_estimate = -1

            try:
                rating = job_card.find_element(By.CLASS_NAME, "EmployerProfile_ratingContainer__ul0Ef").text
            except NoSuchElementException:
                rating = -1

            # Printing for debugging
            if verbose:
                print(f"Job Title: {job_title}")
                print(f"Salary Estimate: {salary_estimate}")
                print(f"Job Description: {job_description[:500]}")
                print(f"Rating: {rating}")
                print(f"Company Name: {company_name}")
                print(f"Location: {location}")

            jobs.append({
                "Job Title": job_title,
                "Salary Estimate": salary_estimate,
                "Job Description": job_description,
                "Rating": rating,
                "Company Name": company_name,
                "Location": location
            })

        # Clicking on the "next page" button
        try:
            driver.find_element(By.XPATH, './/li[@class="next"]//a').click()
        except NoSuchElementException:
            print(f"Scraping terminated before reaching target number of jobs. Needed {num_jobs}, got {len(jobs)}.")
            break

    driver.quit()
    return pd.DataFrame(jobs)


In [22]:
df = get_jobs("data scientist", 20, False)
df


Progress: 0/20
Progress: 1/20
Progress: 2/20
Progress: 3/20
Progress: 4/20
Progress: 5/20
Progress: 6/20
Progress: 7/20
Progress: 8/20
Progress: 9/20
Scraping terminated before reaching target number of jobs. Needed 20, got 10.


Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location
0,Data Scientist - AI & Computer Vision Specialist,-1,We are looking for an innovative and experienc...,-1.0,Ines Partners,Riyadh
1,Senior Data Scientist,-1,The ideal candidate will have a strong focus o...,-1.0,Ines Partners,Riyadh
2,Data Scientist Lead,-1,Develop and maintain best practices for data s...,4.1,Giza Systems EG,Riyadh
3,Data Scientist,-1,Present information using data visualization t...,2.9,Arabic Computer Systems,Riyadh
4,Senior Data Scientist,-1,"Relevant experience as a data scientist, or in...",3.9,Mozn,Riyadh
5,Data Exchange and Data Science Specialist,-1,Implement data quality checks and data validat...,3.4,Valleysoft,Riyadh
6,Sales Director- Media Vertical,-1,Advanced Excel and analytical skills are essen...,3.3,ArabyAds,Riyadh
7,PowerBI engineer,-1, Background in data warehouse design (e.g. di...,2.9,Arabic Computer Systems,Riyadh
8,Sales Manager- Media Vertical,-1,Advanced Excel and analytical skills are essen...,3.3,ArabyAds,Riyadh
9,"AI/ML Sales Specialist, MENAT AGS Specialist Team",-1,You conduct compelling executive conversations...,3.7,AWS EMEA SARL (Saudi Arabia Branch),Riyadh


In [23]:

def connect_db():
    print(os.getenv('DATABASE_URL'))
    return psycopg2.connect(os.getenv('DATABASE_URL'))


def convert_numpy_types(series):
    return series.map(lambda x: x.item() if isinstance(x, np.generic) else x)


In [26]:
from psycopg2.extras import execute_values

# Function to insert job data into PostgreSQL
def insert_jobs_to_db(jobs_df):
    # Convert numpy types to native Python types for each column
    for col in jobs_df.columns:
        jobs_df[col] = convert_numpy_types(jobs_df[col])
    
    # Establish the connection
    conn = connect_db()
    cursor = conn.cursor()
    
    # Define the insert query
    insert_query = """
    INSERT INTO job_listings (job_title, salary_estimate, job_description, rating, company_name, location)
    VALUES (%s, %s, %s, %s, %s, %s)
    ON CONFLICT (job_title, company_name, location) DO NOTHING;
    """
    
    # Insert each row into the database
    for row in jobs_df.itertuples(index=False):
        print(f"Inserting row: {row[0]}")
        cursor.execute(insert_query, row)
    
    # Commit and close the connection
    conn.commit()
    cursor.close()
    conn.close()

insert_jobs_to_db(df)


postgres://scraper_user:scraper_pass@localhost:5432/job_scraper
Inserting row: Data Scientist - AI & Computer Vision Specialist
Inserting row: Senior Data Scientist
Inserting row: Data Scientist Lead
Inserting row: Data Scientist
Inserting row: Senior Data Scientist
Inserting row: Data Exchange and Data Science Specialist
Inserting row: Sales Director- Media Vertical
Inserting row: PowerBI engineer
Inserting row: Sales Manager- Media Vertical
Inserting row: AI/ML Sales Specialist, MENAT AGS Specialist Team
