In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.common.exceptions import TimeoutException, WebDriverException
import os


In [2]:
# Get the Wuzzuf sitemap URL
sitemap_url = "https://wuzzuf.net/sitemap-job-1.xml"
response = requests.get(sitemap_url)

# Parse the XML
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "xml")
    urls = [loc.text for loc in soup.find_all("loc")]
else:
    print("Failed to fetch the sitemap.")
    exit()

In [3]:
options = Options()
options.headless = True 
driver = webdriver.Firefox(options=options)

In [4]:
def get_element_text(by, value, index=0):
    try:
        elements = driver.find_elements(by, value)
        return elements[index].text.strip() if elements else "N/A"
    except:
        return "N/A"

In [5]:
data = []
failed_urls = []
batch_size = 500  

for i, url in enumerate(urls[4000:], start=4000):
    try:
        driver.get(url)
        sleep(7)  

        job_data = {
            "Title": get_element_text(By.XPATH, "//section/div/h1"),
            "Company": get_element_text(By.XPATH, "//section/div/div/div/a/span"),
            "Working Place": get_element_text(By.XPATH, "//section/div/div/a/span"),
            "Location": get_element_text(By.XPATH, "//strong/div/a"),
            "Post Date": get_element_text(By.XPATH, "//section/div/span"),
            "Number of Positions": get_element_text(By.XPATH, "//section/div/div/div/strong"),
            "Applicants": get_element_text(By.XPATH, "//div/div/span/span", index=1),
            "Career Level": get_element_text(By.XPATH, "//main//section/div/span/span"),
            "Education": get_element_text(By.XPATH, "//main//section/div/span/span", index=1),
            "Salary": get_element_text(By.XPATH, "//main//section/div/span/span", index=2),
            "Job Category": get_element_text(By.XPATH, "//main//section/div/span/span", index=3),
            "Skills": get_element_text(By.XPATH, "//div/ul/li/a/span"),
            "Job Description": " ".join([i.text for i in driver.find_elements(By.XPATH, "//div/a/span/span/span")]),
        }

        data.append(job_data)

        if len(data) >= batch_size:
            df = pd.DataFrame(data)
            df.to_csv("wuzzuf_jobs.csv", mode='a', index=False, header=not os.path.exists("wuzzuf_jobs.csv"))
            data.clear()
            print(f"Saved {i} jobs to CSV.")

        # Restart WebDriver every 2000 jobs to free memory
        if i % 2000 == 0:
            driver.quit()
            driver = webdriver.Firefox(options=options)
            print("Restarting WebDriver to free up memory.")

    except (TimeoutException, WebDriverException) as e:
        print(f"Error on {url}: {e}")
        failed_urls.append(url)
        continue  # Skip to next job if there's an error

# Final save for any remaining data
if data:
    df = pd.DataFrame(data)
    df.to_csv("wuzzuf_jobs.csv", mode='a', index=False, header=not os.path.exists("wuzzuf_jobs.csv"))
    print("Final batch saved.")

driver.quit()


Restarting WebDriver to free up memory.
Saved 4499 jobs to CSV.
Saved 4999 jobs to CSV.
Saved 5499 jobs to CSV.
Saved 5999 jobs to CSV.
Restarting WebDriver to free up memory.
Saved 6499 jobs to CSV.
Saved 6999 jobs to CSV.
Saved 7499 jobs to CSV.
Saved 7999 jobs to CSV.
Restarting WebDriver to free up memory.
Saved 8499 jobs to CSV.
Final batch saved.
