# Exercise 2: Bumeran — Web Scrapping

## 1. Setup

We already have created an env and installed requirements.txt

Now we call libraries:

In [21]:
# this library is to manipulate browser
from selenium import webdriver

# it allows you to work with differen versions of drivers
# We call ChromeDriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import re
import time 
from selenium.webdriver.common.by import By

## 2. Web Scrapping
### Stage 1: Extract Job Posting Links

We'll scrape all the job listing URLs based on the following filters:
  - Menor a 15 días
  - Tecnologías, Sistemas y Telecomunicaciones
  - Programación
  - Lima
  - Full-time

We start by launching the driver:

In [22]:
options = Options()
options.add_argument("--start-maximized")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

And set the driver url:

In [23]:
url = 'https://www.bumeran.com.pe/empleos.html'
driver.get(url)
time.sleep(3)

driver.maximize_window()
driver.execute_script("document.body.style.zoom='100%'")

In [24]:
print('Title: ', driver.title, '\nCurrent Page URL: ', driver.current_url)

Title:  Trabajos en Perú - Empleos Bumeran 2020 
Current Page URL:  https://www.bumeran.com.pe/empleos.html


Now, we apply filters by code:

In [25]:
steps = [
    # Fecha de publicación
    ("Open Fecha de publicación", "//button[contains(.,'Fecha de publicación')]"),
    ("Select Menor a 15 días", "//button[contains(.,'Menor a 15 días')]"),

    # Área
    ("Open menú Área", "//button[contains(.,'Área')]"),
    ("Select Tecnologías, Sistemas y Telecomunicaciones", "//button[contains(.,'Tecnología, Sistemas y Telecomunicaciones')]"),

    # Subárea
    ("Open Subárea", "//button[contains(.,'Subárea')]"),
    ("Select Programación", "//button[contains(.,'Programación')]"),

    # Departamento
    ("Open Departamento", "//button[contains(.,'Departamento')]"),
    ("Select Lima", "//button[contains(.,'Lima')]"),

    # Carga horaria
    ("Open Carga horaria", "//button[contains(.,'Carga horaria')]"),
    ("Select Full-time", "//button[contains(.,'Full-time')]")
]

for desc, xpath in steps:
    btn = driver.find_element(By.XPATH, xpath)
    driver.execute_script("arguments[0].click();", btn)
    time.sleep(2)

print("Filters applied.")

filtered_url = driver.current_url

Filters applied.


After that, we can scrape the page to get all urls:

In [26]:
all_job_links = []
unique_job_links = []
base_url = "https://www.bumeran.com.pe"
page = 1

while True:
    job_posts = driver.find_elements(
        By.XPATH,
        "//div[@id='listado-avisos']//a[contains(@href,'/empleos/')]"
    )
    for post in job_posts:
        href = post.get_attribute("href")
        if href.startswith("/"):
            href = base_url + href

        # guardar todos
        all_job_links.append(href)

        # guardar únicos
        if href not in unique_job_links:
            unique_job_links.append(href)

    print(f"Page {page} scraped. Total so far: {len(all_job_links)} (todos), {len(unique_job_links)} (únicos)")

    # pasar a la siguiente página
    page += 1
    try:
        next_page = driver.find_element(By.XPATH, f"//a[span[text()='{page}']]")
        driver.execute_script("arguments[0].click();", next_page)
        time.sleep(3)
    except Exception:
        print("Last page reached.")
        break

print(f"Total links (con duplicados): {len(all_job_links)}")
print(f"Total unique links: {len(unique_job_links)}")


Page 1 scraped. Total so far: 20 (todos), 20 (únicos)
Page 2 scraped. Total so far: 40 (todos), 20 (únicos)
Page 3 scraped. Total so far: 60 (todos), 40 (únicos)
Page 4 scraped. Total so far: 80 (todos), 40 (únicos)
Page 5 scraped. Total so far: 100 (todos), 60 (únicos)
Last page reached.
Total links (con duplicados): 100
Total unique links: 60


### Stage 2: Scrape Job Details

For each job URL collected in Stage 1, extract the following:
  - Job Title
  - Description (up to the "Benefits" section)
  - District
  - Work Mode (e.g., on-site, remote, hybrid)

We start by launching the driver: