# How to Web Scrape [DataJobs.com](https://datajobs.com/)

<img src="BLOG/DataJObs_Header.png">

This notebook walks you through scraping job entries from DataJobs.com! It serves as a guide to the larger scraper, which also incorporates jobs from [Indeed.com](https://www.indeed.com/).

In [None]:
import logging

import numpy as np
import pandas as pd
import regex as re
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [7]:
# Start a Chrome session and open the DataJobs landing page.

# enables automatic installation of chrome drivers
service = Service(ChromeDriverManager().install())
# set up chrome driver
driver = Chrome(service=service)

# navigate to DataJobs.com
# (the domain is "datajobs" with an "s"; "datajob.com" is a different site)
site_url = "https://datajobs.com/"
driver.get(site_url)

In [10]:
# Timeout handed to WebDriverWait whenever we wait for a page element
wait_time = 3

# The board link is located by its visible text
dsa_link_locator = (
    By.XPATH,
    "//a[contains(text(), 'Data Science Jobs / Analytics')]",
)

# Block until the link can be clicked, then follow it
link_is_clickable = EC.element_to_be_clickable(dsa_link_locator)
dsa_jobs_list = WebDriverWait(driver, wait_time).until(link_is_clickable)
dsa_jobs_list.click()

In [None]:
# Walk every results page of each DataJobs board and collect the job metadata.
#
# NOTE(review): this cell was lifted from the larger scraper class. Besides
# `driver`, `site_url` and `wait_time` from the cells above, it needs two names
# that this notebook does not define yet:
#   * `dj_pattern` - the regex applied to the page HTML; assumed to have several
#     capture groups so that `re.findall` yields one tuple per job - TODO confirm
#   * `job_meta`   - a DataFrame whose columns presumably line up with those
#     groups plus a trailing category column - TODO confirm
board_paths = ["/Data-Science-Jobs", "/Data-Engineering-Jobs"]
max_pages = 300  # hard cap per board, in case "NEXT PAGE" never disappears


def clean_field(value):
    """Replace a few HTML entities in string fields so they do not confuse the CSV format.

    Non-string values are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    return (
        value.replace("&amp;", "&")
        .replace("&amp,", "&")
        .replace("&nbsp;", " ")
        .replace("&nbsp,", " ")
    )


# one dict per job posting, across all boards and pages; turned into a
# DataFrame once at the end instead of re-concatenating on every page
scraped_rows = []

# loop through the boards available
for bp in board_paths:
    cat = "Data Science & Analytics" if bp == "/Data-Science-Jobs" else "Data Engineering"

    # load into the webpage; drop the trailing "/" of site_url so the joined
    # URL does not contain a double slash
    driver.get(site_url.rstrip("/") + bp)

    for page_number in range(1, max_pages + 1):
        # grab page source html
        page_html = driver.page_source

        # grab job info and pair it with the column names
        for fields in re.findall(dj_pattern, page_html):
            # tuple(...) is required here: a bare generator expression cannot
            # be concatenated with the (cat,) tuple and raises TypeError
            row = tuple(clean_field(field) for field in fields) + (cat,)
            scraped_rows.append(dict(zip(job_meta.columns, row)))

        if page_number == max_pages:
            # stop after max_pages pages without clicking any further
            break

        # try to go to next page
        try:
            next_page = WebDriverWait(driver, wait_time).until(
                EC.element_to_be_clickable(
                    (By.XPATH, "//a[contains(text(), 'NEXT PAGE')]")
                )
            )
            next_page.click()
        except Exception:
            # `Exception` (not a bare except) so that interrupting the kernel
            # still stops the crawl; any failure to advance ends this board
            logging.info(f"END OF SEARCH RESULTS: {site_url} || {bp}")
            break

# add to dataframe
job_meta = pd.concat([job_meta, pd.DataFrame(scraped_rows)], ignore_index=True)

In [None]:
 # scrape the job description from the job posting

# NOTE(review): this cell was lifted from the larger scraper class. Besides
# `driver`, `site_url` and `wait_time` from the cells above, it needs two names
# that this notebook does not define yet:
#   * `job_meta`  - the job metadata DataFrame; the loop reads its "url",
#     "job_id", "title" and "company" columns
#   * `cleanhtml` - presumably strips the HTML tags from a string - TODO confirm

# list to hold description text
job_desc_list = []
for _, job in job_meta.iterrows():
    # set up the URL so the driver can navigate there; normalise the slashes
    # on both sides of the join so exactly one "/" separates host and path
    job_url = site_url.rstrip("/") + "/" + job["url"].lstrip("/")
    # navigate to the job posting
    driver.get(job_url)
    # grab job desc element
    try:
        job_descr = WebDriverWait(driver, wait_time).until(
            EC.element_to_be_clickable(
                (
                    By.XPATH,
                    "//div[@id='job_description']//*[@class='jobpost-table-cell-2']",
                )
            )
        )
    except Exception:
        # `Exception` (not a bare except) so that interrupting the kernel
        # actually stops the loop instead of skipping to the next job
        logging.error(f"I can't find this job: {job['title']} || {site_url}")
        continue

    # get html
    job_desc_clean = (
        cleanhtml(job_descr.get_attribute("innerHTML"))
        .replace("&amp;", "&")  # this removes some HTML stuff to not confuse the CSV format
        .replace("&amp,", "&")
        .replace("&nbsp;", " ")
        .replace("&nbsp,", " ")
    )
    job_desc_list.append(
        {
            "job_id": job["job_id"],
            "title": job["title"],
            "company": job["company"],
            "desc": job_desc_clean,
        }
    )

# a top-level `return` is a SyntaxError in a notebook cell, so preview the
# collected descriptions instead; `job_desc_list` stays available to later cells
pd.DataFrame(job_desc_list).head()
