# Bumeran — Web Scraping

## 1. Installing libraries

In the Anaconda Prompt, create a new environment and install all the libraries needed to perform web scraping with Selenium. These libraries are listed in the requirements.txt file.

In [None]:
# requirements.txt

## Core scraping dependencies
### selenium==4.23.1
### webdriver-manager==4.0.2

## Parsing and data handling
### beautifulsoup4==4.12.3
### lxml==5.3.0
### pandas==2.2.2

## Environment variables
### python-dotenv==1.0.1

## Progress bars
### tqdm==4.66.5

## Interactive widgets for Jupyter
### ipywidgets==8.1.5

## Jupyter Notebook/Lab
### jupyter==1.0.0

## 2. Calling libraries

In [183]:
import pandas as pd
import re
import time 

# This library is used to control the browser
from selenium import webdriver

# These imports let you work with different versions of drivers
# Here we use ChromeDriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

## 3. Setting the Driver

In [184]:
# Start a Chrome session driven by Selenium. The value returned by
# ChromeDriverManager().install() is handed to Service, and the window is
# asked to open maximized.
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")  # maximize the window
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(options=options, service=chrome_service)

# Open the website, then pause 12 seconds (presumably so the page can finish
# loading before the filters are clicked).
url = "https://www.bumeran.com.pe/empleos.html"
driver.get(url)
time.sleep(12)

# Force the page zoom to 100% to avoid errors when working collaboratively.
driver.execute_script("document.body.style.zoom='100%'")

## 4. Applying filters

In [185]:
def _click_when_ready(driver, xpath, pause, timeout=10):
    """Click the element at *xpath* once it is clickable, then pause.

    Args:
        driver: the Selenium WebDriver controlling the browser.
        xpath: absolute XPath of the element to click.
        pause: seconds to sleep after the click so the page can re-render.
        timeout: maximum seconds to wait for the element to become clickable.

    Raises:
        selenium.common.exceptions.TimeoutException: if the element does not
            become clickable within *timeout* seconds.
    """
    element = WebDriverWait(driver, timeout).until(
        EC.element_to_be_clickable((By.XPATH, xpath))
    )
    element.click()
    time.sleep(pause)


# Common XPath prefix of the filter bar; the filter buttons live under
# div[1] and the options of the opened dropdown under div[2].
_FILTER_BAR_XPATH = "/html/body/div[1]/div/div[2]/div[1]/div/div[2]/div[1]/div[2]/div"

# Each step is (position of the filter button, position of the option to pick).
# NOTE(review): the first four filters all use button[1] -- presumably an
# applied filter leaves the bar so the next one shifts to position 1; confirm
# against the live page.
_FILTER_STEPS = [
    (1, 8),  # 1. Publication date: less than 15 days
    (1, 3),  # 2. Area: Technology, Systems and Telecommunications
    (1, 2),  # 3. Sub-area: Programming
    (1, 1),  # 4. Department: Lima
    (4, 1),  # 5. Working hours: Full-Time
]

for _button_pos, _option_pos in _FILTER_STEPS:
    # Open the filter dropdown, then choose the option inside it.
    _click_when_ready(
        driver, f"{_FILTER_BAR_XPATH}/div[1]/button[{_button_pos}]/div", pause=1
    )
    _click_when_ready(
        driver, f"{_FILTER_BAR_XPATH}/div[2]/div/div/button[{_option_pos}]", pause=2
    )

## 5. Stage 1: Extracting Job Posting Links

In [186]:
# Stage 1: walk through every results page and collect the unique job-posting
# URLs, preserving the order in which they first appear.
wait = WebDriverWait(driver, 10)

job_links = []
seen_links = set()  # O(1) duplicate check; job_links keeps the ordering

while True:
    # Wait until the anchors that link to job postings are present
    wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "a[href*='/empleos/']")))

    # Extract every job link on the current page
    for job in driver.find_elements(By.CSS_SELECTOR, "a[href*='/empleos/']"):
        link = job.get_attribute("href")
        if link and "bumeran.com.pe/empleos/" in link and link not in seen_links:
            seen_links.add(link)
            job_links.append(link)

    print(f"Links recolectados hasta ahora: {len(job_links)}")

    # Try to move to the next page; any Selenium failure here (button missing
    # or not clickable) is treated as "no more pages".
    try:
        next_button = driver.find_element(By.XPATH, "/html/body/div[1]/div/div[2]/div[1]/div/div[2]/div[2]/div[22]/a[2]/i")
        time.sleep(1)
        next_button.click()
        time.sleep(3)
    except Exception:
        # `except Exception` (not a bare except) so that Ctrl+C / SystemExit
        # still interrupt the run instead of being reported as the last page.
        print("No hay más páginas.")
        break

print(f"Total de links recolectados: {len(job_links)}")

Links recolectados hasta ahora: 20
Links recolectados hasta ahora: 40
Links recolectados hasta ahora: 60
Links recolectados hasta ahora: 80
Links recolectados hasta ahora: 100
Links recolectados hasta ahora: 109
No hay más páginas.
Total de links recolectados: 109


In [187]:
job_links

['https://www.bumeran.com.pe/empleos/software-engineer-senior-grupo-gloria-1117976663.html',
 'https://www.bumeran.com.pe/empleos/programador-frontend-senior-sonda-del-peru-s.a.-1117973000.html',
 'https://www.bumeran.com.pe/empleos/analista-programador-java-1117972678.html',
 'https://www.bumeran.com.pe/empleos/analista-programador-postgre-oracle-hibrido-green-solutions-1117979984.html',
 'https://www.bumeran.com.pe/empleos/analista-programador-sede-ate-bumeran-selecta-1117969140.html',
 'https://www.bumeran.com.pe/empleos/trainee-programador-jr-building-software-1117968407.html',
 'https://www.bumeran.com.pe/empleos/desarrollador-full-stack-node.js-c-java-angularjs-ibr-peru-s.a.-1117967655.html',
 'https://www.bumeran.com.pe/empleos/practicante-pre-profesional-de-automatizacion-canvia-1117980437.html',
 'https://www.bumeran.com.pe/empleos/analista-programador-fullstack-java-hitss-peru-1117980080.html',
 'https://www.bumeran.com.pe/empleos/desarrollador-genexus-remoto-software-enterpr

## 6. Stage 2: Scrape Job Details

In [188]:
def _text_or_empty(driver, xpath):
    """Return the text of the element at *xpath*, or "" if it cannot be read.

    Scraping is best-effort: a posting that lacks a field should yield an
    empty value rather than abort the whole run. Only `Exception` is caught,
    so KeyboardInterrupt and SystemExit still propagate.
    """
    try:
        return driver.find_element(By.XPATH, xpath).text
    except Exception:
        return ""


# Stage 2: visit every collected posting and record one row per job.
data = []

for url in job_links:
    driver.get(url)
    time.sleep(3)  # fixed pause after navigation before reading the page

    job_title = _text_or_empty(
        driver,
        "/html/body/div[1]/div/div[2]/div[2]/div/div[1]/div/div/div[1]/div/div[1]/h1",
    )

    # Keep the description only up to the "Beneficios" section
    raw_description = _text_or_empty(
        driver,
        "/html/body/div[1]/div/div[2]/div[2]/div/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div/div/p",
    )
    description = raw_description.strip().split("Beneficios")[0]

    district = _text_or_empty(
        driver,
        "/html/body/div[1]/div/div[2]/div[2]/div/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div/div/div[1]/div[2]/div/div/li/a/h2",
    )

    work_mode = _text_or_empty(
        driver,
        "/html/body/div[1]/div/div[2]/div[2]/div/div[2]/div[1]/div[2]/div[1]/div[1]/div[2]/div/div/div[4]/div/ul/div[1]/li[1]/a/p",
    )

    data.append({
        "Job Title": job_title,
        "Description": description,
        "District": district,
        "Work Mode": work_mode,
    })

## 7. Exporting CSV

In [177]:
# Turn the list of per-job dictionaries into a table, one row per posting.
df = pd.DataFrame(data=data)

# Write the table to disk without the index column; the file is encoded as
# "utf-8-sig".
df.to_csv(
    path_or_buf="bumeran_data_science_jobs.csv",
    encoding="utf-8-sig",
    index=False,
)

In [None]:
# End the Selenium session now that scraping and export are finished.
driver.quit()