# Scraping job listings from Monster.com, grouped by job type

## Imports

In [5]:
from selenium import webdriver 
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import Remote, ChromeOptions  
from selenium.webdriver.chromium.remote_connection import ChromiumRemoteConnection  
from selenium_stealth import stealth

from dotenv import load_dotenv
import os
from bs4 import BeautifulSoup
import random
import time
import pandas as pd



In [6]:
import warnings

warnings.filterwarnings('ignore')

## All the job types we want to scrape

In [7]:
# Root of every search URL; a per-type path segment is appended to it later.
base_url = 'https://www.monster.com/jobs'

# Job categories to scrape, in the order they are processed.
job_type_names = [
    'accounting', 'administration', 'banking', 'finance',
    'research', 'communications', 'construction', 'engineering',
    'science', 'education', 'entertainment', 'environmental',
    'government', 'healthcare', 'hospitality', 'human-resources',
    'it', 'legal', 'logistics', 'manufacturing',
    'marketing', 'media', 'military', 'retail',
    'real-estate', 'sales', 'telecommunications', 'transportation',
    'agriculture', 'animal-care', 'art', 'automotive',
    'aviation', 'customer-service', 'facilities', 'insurance',
    'oil-gas', 'production', 'quality-control', 'security',
    'skilled-trades', 'social-service', 'sports',
]
print(len(job_type_names))

43


In [8]:
def get_job_details(driver):
    """
    Scrape the details of the job currently rendered in the page: title, company, location, description.
    A field whose element is not present in the page is returned as an empty string instead of
    aborting the whole scrape.
    @param driver: the selenium driver object (only its page_source is read)
    @return: a tuple of the job details: title, company, location, description
    @raise ValueError: if the 'job-view-header' container is missing, i.e. no job view is rendered
    """
    def _text_or_empty(parent, tag, **criteria):
        # find() returns None when nothing matches; map that to '' rather than crashing on .text
        node = parent.find(tag, **criteria)
        return node.text.strip() if node is not None else ''

    soup_desc = BeautifulSoup(driver.page_source, 'html.parser')
    job_header = soup_desc.find('div', id='job-view-header')
    if job_header is None:
        # Without the header there is no job to describe; fail with a message that says so.
        raise ValueError("job view header ('job-view-header') not found in the page")

    h2_title = _text_or_empty(job_header, 'h2', attrs={'data-testid': 'jobTitle'})
    li_company = _text_or_empty(job_header, 'li', attrs={'data-testid': 'company'})
    li_location = _text_or_empty(job_header, 'li', attrs={'data-testid': 'jobDetailLocation'})
    print(f"Title: {h2_title}\nCompany: {li_company}\nLocation: {li_location}")

    # The first 11 characters of the stripped description text are dropped.
    # NOTE(review): presumably a fixed heading rendered inside the container
    # (e.g. "Description") - confirm against the live page.
    job_desc = _text_or_empty(soup_desc, 'div', id='svx-job-view-description')[11:]
    return h2_title, li_company, li_location, job_desc


In [9]:
def load_more_jobs(driver):
    """
    Scroll the job-card container so the website loads additional job cards.
    The container is scrolled to the bottom, back to the top, and - after a pause -
    to the bottom once more.
    @param driver: the selenium driver object
    @return: True if every scroll step ran without raising, False otherwise
    """
    container_id = 'card-scroll-container'
    jump_to_bottom = "arguments[0].scrollTop = arguments[0].scrollHeight"
    jump_to_top = "arguments[0].scrollTop = 0"
    try:
        container = driver.find_element(By.ID, container_id)
        driver.execute_script(jump_to_bottom, container)
        # going back up a little is what makes the page load more jobs
        driver.execute_script(jump_to_top, container)
        time.sleep(2)

        # look the container up again before the second pass
        container = driver.find_element(By.ID, container_id)
        driver.execute_script(jump_to_bottom, container)
        time.sleep(2)
    except Exception as e:
        print(f'Error while loading more jobs: {e}')
        return False
    else:
        return True

In [10]:
def _xpath_literal(text):
    """Return `text` as a valid XPath 1.0 string literal, whatever quotes it contains."""
    if "'" not in text:
        return f"'{text}'"
    if '"' not in text:
        return f'"{text}"'
    # XPath 1.0 has no escape character, so a string containing both quote kinds
    # has to be assembled with concat(): 'part', "'", 'part', ...
    pieces = ", \"'\", ".join(f"'{chunk}'" for chunk in text.split("'"))
    return f"concat({pieces})"


def click_by_attribute(driver, attribute, value):
    """
    This function will click on a <button> element selected by one attribute and its value.
    The value is quoted as a proper XPath literal, so values containing apostrophes or
    double quotes still produce a valid selector.
    @param driver: the selenium driver object
    @param attribute: the attribute of the element
    @param value: the value of the attribute
    @return: True if the element is clicked successfully, False otherwise
    """
    try:
        selector = f"//button[@{attribute}={_xpath_literal(str(value))}]"
        element = driver.find_element(By.XPATH, selector)
        # Click through JavaScript rather than element.click()
        driver.execute_script("arguments[0].click();", element)
    except Exception as e:
        print(e)
        return False
    return True


In [11]:
# Empty frame that fixes the column layout used for every scraped job.
df = pd.DataFrame(columns=['title', 'company', 'location', 'description'])


def add_row(df, row):
    """
    Append one row to the dataframe (the dataframe is modified in place).
    @param df: the dataframe to extend
    @param row: the values of the new row, one per column
    @return: the same dataframe, now containing the new row
    """
    # The current row count is used as the label of the new row.
    next_label = len(df)
    df.loc[next_label] = row
    return df

In [12]:
# Make sure the jobs folder exists. exist_ok=True makes this a no-op when the
# folder is already there, without a separate (racy) existence check.
os.makedirs('jobs', exist_ok=True)

In [13]:
# Non remote testing:
# save_period = 10
# for type in job_type_names[:1]:
#     try:
#         if os.path.exists(f'jobs/{type}_jobs.csv'):
#             print(f'file exists for {type}')
#             continue
#         # first phase - Load Page and get all jobs
#         print(f'Getting jobs for {type}')
#         add_url = f'q-{type}-jobs'
#         driver.get(f'{base_url}/{add_url}')
#         time.sleep(5) # captcha
#         load_more_jobs(driver)
#         time.sleep(1) # wait for jobs to load
#         soup = BeautifulSoup(driver.page_source, 'html.parser')
#         job_list_buttons = soup.find_all('button', attrs={'data-testid':'JobCardButton'})
#         print(f'found {len(job_list_buttons)} jobs')
#         # second phase - get job details
#         try:
#             for job_list in job_list_buttons:
#                 aria_label = job_list.get('aria-label') 
#                 click_by_attribute(driver, 'aria-label', aria_label)
#                 time.sleep(2)
#                 title, company, location, desc = get_job_details(driver)
#                 # third phase - add row and then save to file
#                 add_row(df, [title, company, location, desc])
#                 if len(df) % save_period == 0:
#                     df.to_csv(f'jobs/{type}_jobs.csv', index=False)
#                     print(f'saved {len(df)} jobs')
#         except Exception as e:
#             print(f'got error in job details: {e}')
#             continue
#     except Exception as e:
#         print(f'got error in job type {type}: {e}')
#         continue

### Setting up logging in case something goes wrong and the run crashes

In [14]:
import logging

# Set up logging
# log_filename = "logging_stealth.txt"
log_filename = "./logs/monster_jobs.log"
# FileHandler opens the file immediately and does not create missing parent
# directories, so make sure the log folder exists first.
os.makedirs(os.path.dirname(log_filename), exist_ok=True)
logging.basicConfig(level=logging.INFO, 
                    format="%(message)s", 
                    handlers=[
                        logging.StreamHandler(),  # Print to console
                        logging.FileHandler(log_filename, mode="w")  # Save to file
                    ])
logging.info("Logging started")

Logging started


## Scraping using Selenium's remote WebDriver
The Bright Data username and password are read from the `USER` and `PASS` environment variables, which are expected to be defined in a `.env` file.

In [102]:
# Load the Bright Data credentials from the .env file. override=True is needed
# because load_dotenv() leaves already-set variables untouched by default, and
# USER is commonly pre-set by the operating system to the login name.
load_dotenv(override=True)
USER = os.getenv('USER')
PASS = os.getenv('PASS')
if not USER or not PASS:
    # Fail early instead of connecting with a "None:None" credential pair.
    raise RuntimeError('USER and PASS must be defined (e.g. in the .env file) before scraping')

AUTH = f'{USER}:{PASS}'
SBR_WEBDRIVER = f'https://{AUTH}@brd.superproxy.io:9515'

sbr_connection = ChromiumRemoteConnection(SBR_WEBDRIVER, 'goog', 'chrome') 
options = ChromeOptions()
options.add_argument('--incognito')
options.add_argument('--start-maximized')
with Remote(sbr_connection, options=options) as driver:
    save_period = 10        # write the CSV every `save_period` collected jobs
    max_types_per_run = 11  # need to stop before bright data stops
    types_saved = 0
    # `job_type` (not `type`) so the builtin type() is not shadowed for the rest of the notebook
    for job_type in job_type_names:
        if types_saved >= max_types_per_run:
            break
        csv_path = f'jobs/{job_type}_jobs.csv'
        try:
            if os.path.exists(csv_path):
                logging.info(f'file already exists for {job_type}')
                continue
            df = pd.DataFrame(columns=['title', 'company', 'location', 'description'])

            # first phase - Load Page and get all jobs
            logging.info(f'Getting jobs for {job_type}')
            # NOTE(review): this yields '<base_url>/q-<type>-jobs&where=' - the '&where=' part is
            # appended without a '?'; confirm this is the URL shape the site expects.
            add_url = f'q-{job_type}-jobs' + '&where='
            driver.get(f'{base_url}/{add_url}')
            time.sleep(5) # captcha
            load_more_jobs(driver)
            time.sleep(5) # wait for jobs to load
            soup = BeautifulSoup(driver.page_source, 'html.parser')
            job_list_buttons = soup.find_all('button', attrs={'data-testid':'JobCardButton'})
            logging.info(f'found {len(job_list_buttons)} jobs')
            logging.info(driver.current_url)

            # second phase - get job details
            try:
                for i, job_list in enumerate(job_list_buttons):
                    aria_label = job_list.get('aria-label') 
                    if not click_by_attribute(driver, 'aria-label', aria_label):
                        # If the click failed, the page still shows the previously opened job;
                        # scraping it again would add a duplicate row, so skip this card.
                        logging.info(f'Job {i}: could not open the job card, skipping it')
                        continue
                    time.sleep(2)
                    logging.info(f'Job {i}')
                    title, company, location, desc = get_job_details(driver)
                    # third phase - add row and then save to file
                    add_row(df, [title, company, location, desc])
                    if len(df) % save_period == 0:
                        df.to_csv(csv_path, index=False)
                        logging.info(f'saved {len(df)} jobs')
                # Final save for the rows collected since the last periodic save. This does not
                # depend on the number of rows matching the number of buttons, which no longer
                # holds once a card has been skipped.
                if len(df) % save_period != 0:
                    df.to_csv(csv_path, index=False)
                    logging.info(f'saved {len(df)} jobs')
            except Exception as e:
                logging.error(f'got error in job details: {e}')
                continue
            types_saved += 1
        except Exception as e:
            logging.error(f'got error in job type {job_type}: {e}')
            continue

file already exists for accounting
file already exists for administration
file already exists for banking
file already exists for finance
file already exists for research
file already exists for communications
file already exists for construction
file already exists for engineering
file already exists for science
file already exists for education
file already exists for entertainment
file already exists for environmental
file already exists for government
file already exists for healthcare
file already exists for hospitality
file already exists for human-resources
file already exists for it
file already exists for legal
file already exists for logistics
file already exists for manufacturing
file already exists for marketing
file already exists for media
file already exists for military
file already exists for retail
file already exists for real-estate
file already exists for sales
file already exists for telecommunications
file already exists for transportation
file already exists for 

## Post processing the data:

In [15]:
# Merge the per-type CSV files into one dataframe, tagging every row with its job type.
final_columns = ['type', 'title', 'company', 'location', 'description']
type_frames = []
for job_type in job_type_names:
    csv_path = f'./jobs/{job_type}_jobs.csv'
    if not os.path.exists(csv_path):
        # Not every job type is necessarily scraped yet; report and skip instead of crashing.
        print(f'No file found for {job_type}, skipping')
        continue
    # dtype=str keeps every column textual, which the later .str clean-up relies on
    df = pd.read_csv(csv_path, dtype=str)
    df['type'] = job_type
    type_frames.append(df)

if type_frames:
    # Concatenate once (instead of growing the frame inside the loop), then put the
    # expected columns first while keeping any extra columns a file may contain.
    final_df = pd.concat(type_frames, ignore_index=True)
    extra_columns = [name for name in final_df.columns if name not in final_columns]
    final_df = final_df.reindex(columns=final_columns + extra_columns)
else:
    final_df = pd.DataFrame(columns=final_columns)

print(f'There are {final_df.shape[0]} jobs in the final dataframe')

There are 1207 jobs in the final dataframe


In [17]:
# Flatten line breaks and tabs inside every column to single spaces, trim the
# surrounding whitespace, then write the combined dataset to disk.
for column_name in final_df.columns:
    text = final_df[column_name]
    for control_char in ('\n', '\r', '\t'):
        text = text.str.replace(control_char, ' ')
    final_df[column_name] = text.str.strip()
final_df.to_csv('monster_jobs.csv', index=False)