# Exercise 2: Bumeran — Web Scrapping

## 1. Setup

We already have created an env and installed requirements.txt

Now we call libraries:

In [2]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import re
import time 
from selenium.webdriver.common.by import By

## 2. Web Scrapping
### Stage 1: Extract Job Posting Links

We'll scrape all the job listing URLs based on the following filters:
  - Menor a 15 días
  - Tecnologías, Sistemas y Telecomunicaciones
  - Programación
  - Lima
  - Full-time

We start by launching the driver:

In [16]:
options = Options()
options.add_argument("--start-maximized")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=options)

And set the driver url:

In [17]:
url = 'https://www.bumeran.com.pe/empleos.html'
driver.get(url)
time.sleep(3)

driver.maximize_window()
driver.execute_script("document.body.style.zoom='100%'")

In [None]:
print('Title: ', driver.title, '\nCurrent Page URL: ', driver.current_url)

Title:  Trabajos en Perú - Empleos Bumeran 2020 
Current Page URL:  https://www.bumeran.com.pe/empleos.html


Now, we apply filters by code:

In [18]:
steps = [
    # Fecha de publicación
    ("Open Fecha de publicación", "//button[contains(.,'Fecha de publicación')]"),
    ("Select Menor a 15 días", "//button[contains(.,'Menor a 15 días')]"),

    # Área
    ("Open menú Área", "//button[contains(.,'Área')]"),
    ("Select Tecnologías, Sistemas y Telecomunicaciones", "//button[contains(.,'Tecnología, Sistemas y Telecomunicaciones')]"),

    # Subárea
    ("Open Subárea", "//button[contains(.,'Subárea')]"),
    ("Select Programación", "//button[contains(.,'Programación')]"),

    # Departamento
    ("Open Departamento", "//button[contains(.,'Departamento')]"),
    ("Select Lima", "//button[contains(.,'Lima')]"),

    # Carga horaria
    ("Open Carga horaria", "//button[contains(.,'Carga horaria')]"),
    ("Select Full-time", "//button[contains(.,'Full-time')]")
]

for desc, xpath in steps:
    btn = driver.find_element(By.XPATH, xpath)
    driver.execute_script("arguments[0].click();", btn)
    time.sleep(2)

print("Filters applied.")

filtered_url = driver.current_url

Filters applied.


After that, we can scrape the page to get all urls:

In [None]:
all_job_links = []
unique_job_links = []
base_url = "https://www.bumeran.com.pe"
page = 1

while True:
    job_posts = driver.find_elements(
        By.XPATH,
        "//div[@id='listado-avisos']//a[contains(@href,'/empleos/')]"
    )
    for post in job_posts:
        href = post.get_attribute("href")
        if href.startswith("/"):
            href = base_url + href

        # guardar todos
        all_job_links.append(href)

        # guardar únicos
        if href not in unique_job_links:
            unique_job_links.append(href)

    print(f"Page {page} scraped. Total so far: {len(all_job_links)} (todos), {len(unique_job_links)} (únicos)")

    # pasar a la siguiente página
    page += 1
    try:
        next_page = driver.find_element(By.XPATH, f"//a[span[text()='{page}']]")
        driver.execute_script("arguments[0].click();", next_page)
        time.sleep(3)
    except Exception:
        print("Last page reached.")
        break

print(f"Total links (con duplicados): {len(all_job_links)}")
print(f"Total unique links: {len(unique_job_links)}")


Page 1 scraped. Total so far: 20 (todos), 20 (únicos)
Page 2 scraped. Total so far: 40 (todos), 40 (únicos)
Page 3 scraped. Total so far: 60 (todos), 60 (únicos)
Page 4 scraped. Total so far: 80 (todos), 80 (únicos)
Page 5 scraped. Total so far: 100 (todos), 100 (únicos)
Page 6 scraped. Total so far: 110 (todos), 110 (únicos)
Last page reached.
Total links (con duplicados): 110
Total unique links: 110


### Stage 2: Scrape Job Details

For each job URL collected in Stage 1, extract the following:
  - Job Title
  - Description (up to the "Benefits" section)
  - District
  - Work Mode (e.g., on-site, remote, hybrid)

We start by launching the driver:

In [None]:
# ===== Stage 2 (robust): extract Job Title, Description, District, Work Mode =====
# Reuses: driver, unique_job_links from Stage 1

import os, re, csv, time, json, html, unicodedata
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, WebDriverException

output_dir = "./output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

OUTPUT_CSV = os.path.join(os.getcwd(), "output", "output_stage2.csv")
WAIT_ELEM = 9
PAGELOAD_TIMEOUT = 7.5
SCRIPT_TIMEOUT = 5
PAUSE = 0.2

DISTRICTS = [
    "Lima","Callao","Miraflores","San Isidro","Surco","La Molina","San Borja",
    "Barranco","Chorrillos","La Victoria","San Miguel","Pueblo Libre","Magdalena",
    "Jesús María","Jesus Maria","Magdalena del Mar","Santiago de Surco","Ate","Lurín","Lurin","Cercado de Lima"
]

# timeouts (ignore if not supported)
try:
    driver.set_page_load_timeout(PAGELOAD_TIMEOUT)
    driver.set_script_timeout(SCRIPT_TIMEOUT)
except Exception:
    pass

def norm(s: str) -> str:
    if not s: return ""
    return re.sub(r"\s+", " ", s, flags=re.M).strip()

def strip_accents(s: str) -> str:
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def strip_html(s: str) -> str:
    if not s: return ""
    s = html.unescape(s)
    s = re.sub(r"<br\s*/?>", "\n", s, flags=re.I)
    s = re.sub(r"</p\s*>", "\n", s, flags=re.I)
    s = re.sub(r"<.*?>", "", s)  # remove tags
    return norm(s)

def safe_get(url: str):
    try:
        driver.get(url)
    except TimeoutException:
        try:
            driver.execute_script("window.stop();")
        except WebDriverException:
            pass

def wait_job_loaded(timeout=WAIT_ELEM):
    xp = ("//h1 | //script[@type='application/ld+json']"
          " | //div[contains(@id,'jobDescription') or contains(@class,'job-description') or @data-qa='job-description']")
    WebDriverWait(driver, timeout).until(EC.presence_of_element_located((By.XPATH, xp)))

def is_valid_job_url(u: str) -> bool:
    return bool(re.search(r"https?://(www\.)?bumeran\.com\.pe/empleos/.+?-([0-9]{6,})\.html", u, re.I))

# ---------------- JSON-LD first, DOM second ----------------

def parse_jsonld():
    """Return dict with any of: title, description, district, mode from JobPosting JSON-LD, else {}."""
    out = {}
    scripts = driver.find_elements(By.XPATH, "//script[@type='application/ld+json']")
    for sc in scripts:
        raw = sc.get_attribute("textContent") or ""
        raw = raw.strip()
        if not raw:
            continue
        # some sites wrap multiple JSON objects or arrays
        try:
            data = json.loads(raw)
        except Exception:
            # try to salvage by fixing trailing commas etc. If it fails, skip.
            continue

        candidates = []
        if isinstance(data, list):
            candidates = data
        else:
            candidates = [data]

        for obj in candidates:
            # Some pages nest under @graph
            if isinstance(obj, dict) and "@graph" in obj and isinstance(obj["@graph"], list):
                candidates.extend(obj["@graph"])
                continue

            if not isinstance(obj, dict): 
                continue
            t = obj.get("@type") or obj.get("type")
            if isinstance(t, list):
                t = next((x for x in t if isinstance(x,str) and x.lower()=="jobposting"), t[0] if t else None)
            if not isinstance(t, str) or "jobposting" not in t.lower():
                continue

            # title
            title = obj.get("title") or obj.get("name")
            if title: out.setdefault("title", norm(title))

            # description (may be HTML)
            desc = obj.get("description")
            if desc:
                desc = strip_html(desc)
                # cut at Beneficios with/without accent
                lo = strip_accents(desc.lower())
                m = re.search(r"\bbeneficios?\b", lo)
                if m: desc = desc[:m.start()].rstrip()
                out.setdefault("description", desc[:10000])

            # work mode: remote/hybrid/on site
            # schema fields may include jobLocationType, employmentType, or "applicantLocationRequirements"
            jltype = (obj.get("jobLocationType") or "")
            etype  = obj.get("employmentType")
            text_pool = " ".join([str(jltype), str(etype or "")])
            if re.search(r"remote|telecommute|home", text_pool, re.I):
                out.setdefault("mode", "Remoto")
            # hybrid often not explicit; leave to DOM if not remote/presencial

            # district: jobLocation.address.addressLocality or addressRegion
            jl = obj.get("jobLocation")
            if isinstance(jl, list):
                jl = jl[0] if jl else None
            if isinstance(jl, dict):
                addr = jl.get("address") or {}
                locality = addr.get("addressLocality") or addr.get("addressRegion") or addr.get("streetAddress")
                if locality:
                    out.setdefault("district", norm(locality))

            # If we already have description and district, good enough
    return out

def dom_title():
    for xp in ["//h1", "//header//h1", "//h1[contains(@class,'title') or contains(@class,'job')]"]:
        try:
            el = driver.find_element(By.XPATH, xp)
            t = norm(el.text)
            if t: return t
        except Exception:
            pass
    return None

def dom_description():
    for xp in [
        "//section[contains(@class,'job-description')]",
        "//div[@data-qa='job-description']",
        "//div[contains(@id,'jobDescription') or contains(@class,'job-description')]",
        "//article[contains(@class,'job-description')]",
        # broader fallbacks
        "//main//section[contains(.,'Requis') or contains(.,'Respons')]",
        "//main//div[contains(.,'Requis') or contains(.,'Respons')]",
        "//article"
    ]:
        try:
            el = driver.find_element(By.XPATH, xp)
            txt = norm(el.text)
            if not txt: 
                continue
            lo = strip_accents(txt.lower())
            m = re.search(r"\bbeneficios?\b", lo)
            if m: txt = txt[:m.start()].rstrip()
            return txt[:10000]
        except Exception:
            continue
    return None

def dom_blocks():
    blocks = []
    for xp in [
        "//*[contains(.,'Lugar de trabajo') or contains(.,'Ubicación')]/..",
        "//li[contains(.,'Lugar de trabajo') or contains(.,'Ubicación')]",
        "//*[contains(@data-qa,'job-location') or contains(@class,'job-location')]",
        "//section[contains(@class,'job-details') or contains(@class,'metadata') or contains(@class,'job-info')]",
        "//dl | //ul"
    ]:
        try:
            els = driver.find_elements(By.XPATH, xp)
            for el in els:
                t = norm(el.text)
                if t:
                    blocks.append(t)
        except Exception:
            pass
    return blocks

def dom_location_and_mode():
    blocks = dom_blocks()
    district, best = None, 10**9
    for t in blocks:
        for hint in DISTRICTS:
            if re.search(rf"\b{re.escape(hint)}\b", t, re.I):
                parts = [p.strip() for p in re.split(r"[•|,;/\-]\s*", t) if p.strip()]
                for p in parts:
                    if re.search(rf"\b{re.escape(hint)}\b", p, re.I) and len(p) < best:
                        district, best = p, len(p)

    joined = " | ".join(blocks)
    mode = None
    if re.search(r"\b(remoto|teletrabajo)\b", joined, re.I):
        mode = "Remoto"
    elif re.search(r"\b(h[ií]brido|mixto|semi ?presencial)\b", joined, re.I):
        mode = "Híbrido"
    elif re.search(r"\b(presencial|on ?site)\b", joined, re.I):
        mode = "Presencial"

    return district, mode

def scrape_one(url: str):
    safe_get(url)
    try:
        wait_job_loaded()
    except TimeoutException:
        return None

    data = parse_jsonld()

    title = data.get("title") or dom_title()
    desc  = data.get("description") or dom_description()
    district = data.get("district")
    mode = data.get("mode")

    if not district or not mode:
        d2, m2 = dom_location_and_mode()
        district = district or d2
        mode = mode or m2

    # sanitize district further: keep the shortest district-like token
    if district:
        tokens = [p.strip() for p in re.split(r"[•|,;/\-]\s*", district) if p.strip()]
        for tk in sorted(tokens, key=len):
            for hint in DISTRICTS:
                if re.search(rf"\b{re.escape(hint)}\b", tk, re.I):
                    district = tk
                    break
            if district == tk:
                break

    return {
        "Job Title": title or None,
        "Description": desc or None,
        "District": district or None,
        "Work Mode": mode or None,
        "url": url
    }

# ---------- run Stage 2 on your Stage-1 links ----------
try:
    LINKS = list(dict.fromkeys(unique_job_links))
except NameError:
    LINKS = []

VALID = [u for u in LINKS if is_valid_job_url(u)]
rows, fails = [], 0

for i, url in enumerate(VALID, 1):
    print(f"[{i}/{len(VALID)}] {url}")
    try:
        row = scrape_one(url)
        if row:
            rows.append(row)
        else:
            fails += 1
    except Exception as e:
        print("skip:", e)
        fails += 1
    time.sleep(PAUSE)

with open(OUTPUT_CSV, "w", encoding="utf-8-sig", newline="") as f:
    w = csv.DictWriter(f, fieldnames=["Job Title","Description","District","Work Mode","url"])
    w.writeheader()
    for r in rows:
        w.writerow({k: r.get(k) if r.get(k) is not None else "" for k in w.fieldnames})

print(f"\nStage 2 done. Rows OK: {len(rows)} | Failed: {fails} | CSV: {OUTPUT_CSV}")

[1/110] https://www.bumeran.com.pe/empleos/software-engineer-senior-grupo-gloria-1117976663.html
[2/110] https://www.bumeran.com.pe/empleos/programador-frontend-senior-sonda-del-peru-s.a.-1117973000.html
[3/110] https://www.bumeran.com.pe/empleos/analista-programador-java-1117972678.html
[4/110] https://www.bumeran.com.pe/empleos/analista-programador-sede-ate-bumeran-selecta-1117969140.html
[5/110] https://www.bumeran.com.pe/empleos/trainee-programador-jr-building-software-1117968407.html
[6/110] https://www.bumeran.com.pe/empleos/desarrollador-full-stack-node.js-c-java-angularjs-ibr-peru-s.a.-1117967655.html
[7/110] https://www.bumeran.com.pe/empleos/java-developer-senior-inetum-peru-1117979249.html
[8/110] https://www.bumeran.com.pe/empleos/senior-react-developer-encora-1117979242.html
[9/110] https://www.bumeran.com.pe/empleos/desarrollador-backend-semi-senior-metrica-peru-1117979121.html
[10/110] https://www.bumeran.com.pe/empleos/desarrollador-lider-tecnico-protiviti-peru-11179789

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\ADMIN\\Desktop\\DIPLOMADO CIENCIA DE DATOS PARA LAS CIENCIAS SOCIALES Y LA GESTION PUBLICA\\PYTHON\\INTERMEDIO\\web_scrapping\\code\\./output/output_stage2.csv'

In [21]:
import pandas as pd
df = pd.read_csv("output_stage2.csv")

print("Rows:", len(df))
print("Missing Description:", df['Description'].isna().sum())
print("Missing District:", df['District'].isna().sum())
print("Work modes:", df['Work Mode'].dropna().unique().tolist())

print("\nSample:")
display(df.sample(5, random_state=0)[['Job Title','District','Work Mode']])

# sanity: descriptions shouldn’t contain menu junk
bad_desc = df['Description'].fillna('').str.contains(r'Buscar empleo|Trabajos en Lima', case=False)
print("\nDescriptions with menu noise:", bad_desc.sum())

Rows: 110
Missing Description: 0
Missing District: 0
Work modes: ['Presencial', 'Remoto', 'Híbrido']

Sample:


Unnamed: 0,Job Title,District,Work Mode
85,Desarrollador .Net / PHP,Lima,Remoto
10,Desarrollador de integración OSB,Lima,Remoto
75,Analista programador,Jesús María,Presencial
2,Analista Programador Java,Lima,Presencial
24,Analista Programador JAVA,Lima,Híbrido



Descriptions with menu noise: 0
