In [2]:
# necessary for scraping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
import pandas as pd
import numpy as np
import time
import random

In [3]:
# scraping url for indeed search : data science job in Vancouver BC , radius 50km 
url = 'https://ca.indeed.com/jobs?q=data+science&l=Vancouver%2C+BC&radius=50&sort=date&start='

In [4]:
# function to scrape job on indeed given url
def indeed_scrape(url):
    # set up web browser for scraping
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(  
        options=options
    )
    driver.get(url)
    driver.maximize_window()
    #  collect all job listing elements
    job_cards = driver.find_elements(By.CSS_SELECTOR, ".cardOutline")
    # create list to store job listing
    jobs = []
    # create variable to store data in job posting
    date = None
    applications = None
    job_title = None
    company_name = None
    company_reviews = None
    location = None
    pay = None
    job_type = None
    benefits = None
    job_description = None
    # iterate through job listing elements
    for job_card in job_cards:
        # extract job date posted
        try:
            date_element = job_card.find_element(By.CSS_SELECTOR, ".date")
            date = date_element.text
        except NoSuchElementException: pass
        # close pop-up dialog
        try:
            dialog_element = driver.find_element(By.CSS_SELECTOR, "[role='dialog']")
            close_button = dialog_element.find_element(By.CSS_SELECTOR, "[aria-label=close][type='button']")
            close_button.click()
        except NoSuchElementException: pass
        # click job listing
        job_card.click()
        # wait for listing to load by check if job title element exists
        # extract job titles
        try:
            title_element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, ".jobsearch-JobInfoHeader-title")))
            job_title = title_element.text.replace("\n- job post", "")
        except NoSuchElementException: pass
        # navigate to detail job listing panel
        job_details_element = driver.find_element(By.CSS_SELECTOR, ".jobsearch-RightPane")
        # extract company name, review, location
        company_info_element = job_details_element.find_element(By.CSS_SELECTOR, "[data-testid='jobsearch-CompanyInfoContainer']")
        company_info = company_info_element.text.split("\n")
        company_name = company_info[0]
        if len(company_info) == 2:
            company_location = company_info[1]
        if len(company_info) == 3:
            company_reviews = company_info[1]
            company_location = company_info[2]
        # extract salary and job type
        for div in job_details_element.find_elements(By.CSS_SELECTOR, "#jobDetailsSection div"):
            if div.text == "Pay":
                pay_element = div.find_element(By.XPATH, "following-sibling::*")
                pay = pay_element.text
            elif div.text == "Job type":
                job_type_element = div.find_element(By.XPATH, "following-sibling::*")
                job_type = job_type_element.text
        # extract benefits
        try:
            benefits_element = job_details_element.find_element(By.ID, "benefits")
            benefits = []
            for benefit_element in benefits_element.find_elements(By.TAG_NAME, "li"):
                benefit = benefit_element.text
                benefits.append(benefit)
        except NoSuchElementException: pass
        try:
            description_element = job_details_element.find_element(By.ID, "jobDescriptionText")
            job_description = description_element.text
        except NoSuchElementException: pass
        # create dictionary to store data for job posting
        job_posting = {}
        job_posting["date"] = date
        job_posting["job_title"] = job_title 
        job_posting["company_name"] = company_name
        job_posting["company_reviews"] = company_reviews
        job_posting["company_location"] = company_location
        job_posting["pay"] = pay
        job_posting["job_type"] = job_type
        job_posting["benefits"] = benefits
        job_posting["job_description"] = job_description
        jobs.append(job_posting)
        time.sleep(random.uniform(1, 5))
    # close the browser
    driver.quit()
    # return dictionary of job postings
    return jobs
    print("Scraping done!!")

In [5]:
# scrape data for first 5 pages of indeed
job_df = pd.DataFrame()
for page in ['0','10','20','30','40','50']:
    job_df  = pd.concat([job_df,pd.DataFrame(indeed_scrape(url+page))], ignore_index=True)

In [6]:
# export data table to csv file
job_df.to_csv("indeed_jobs.csv")
job_df

Unnamed: 0,date,job_title,company_name,company_reviews,company_location,pay,job_type,benefits,job_description
0,Posted\nPosted 1 day ago,Senior Data Analyst [BHJOB13022_12997],Ignite Technical Resources.,2 reviews,"Richmond, BC",,,,"On behalf of our Healthcare client, Ignite Tec..."
1,Posted\nPosted 2 days ago,Data Scientist,University of British Columbia,598 reviews,"Vancouver, BC","$6,068.92–$8,724.25 a month",Full-time,,Staff - Non Union\nJob Category\nM&P - AAPS\nJ...
2,Posted\nPosted 3 days ago,Senior Financial Data Analyst (Adaptive Insigh...,Central 1 Credit Union,41 reviews,"Vancouver, BC•Hybrid remote","$96,300 a year",Full-time,"[Paid time off, Tuition reimbursement]",Central 1 cooperatively empowers credit unions...
3,Posted\nPosted 3 days ago,"Lead Data Engineer, AI",Recruiting From Scratch,41 reviews,"Vancouver, BC•Remote","$160,000–$280,000 a year",Full-time,"[Dental care, Vision care]",This is for a client of Recruiting from Scratc...
4,Posted\nPosted 4 days ago,Junior Data Scientist,Theory and Practice,41 reviews,"Vancouver, BC•Hybrid remote","$160,000–$280,000 a year",Full-time,"[Dental care, Vision care]",Description\nThe Junior Data Scientist will jo...
...,...,...,...,...,...,...,...,...,...
85,Posted\nPosted 30+ days ago,"Senior Data Analyst, Competitive Intelligence",Klue,13 reviews,"Vancouver, BC•Hybrid remote","$90,000–$125,000 a year",Full-time,"[Dental care, Extended health care, Paid time ...",\uD83D\uDC4B You found us. Awesome. Something ...
86,Employer\nActive 5 days ago,Senior Data Analyst - FC Live,Electronic Arts,599 reviews,"Vancouver, BC","$77,600–$130,500 a year",Full-time,"[Dental care, Disability insurance, Extended h...","At EA, we inspire the world to play by develop..."
87,Posted\nPosted 30+ days ago,Assistant Professor of Teaching in Biostatisti...,University of British Columbia,598 reviews,"Vancouver, BC","$77,600–$130,500 a year",Full-time,"[Dental care, Disability insurance, Extended h...",Academic\nJob Category\nFaculty Bargaining\nJo...
88,Posted\nPosted 30+ days ago,"Manager, Cyber Analytics, Intelligence & Repor...",Teck Resources Limited,481 reviews,"Vancouver, BC","$141,000–$175,000 a year",Full-time,"[Dental care, Disability insurance, Employee a...","Summary\n\nReporting to the Director, Digital ..."
