In [12]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import lxml.html as lh
from time import sleep
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
import json
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from selenium.common.exceptions import TimeoutException
import os
import numpy as np

In [2]:
# Download the Wuzzuf job sitemap and collect every <loc> entry into `urls`.
sitemap_url = "https://wuzzuf.net/sitemap-job-1.xml"

# Bind `urls` up front so it always exists (and is empty) when the request does
# not succeed; otherwise later cells would hit a NameError or silently reuse a
# stale `urls` left over from a previous run of this cell.
urls = []

# requests has no default timeout, so without one a stalled connection would
# block this cell indefinitely.
response = requests.get(sitemap_url, timeout=30)

# Parse the XML
if response.status_code == 200:
    soup = BeautifulSoup(response.content, "xml")
    urls = [loc.text for loc in soup.find_all("loc")]
else:
    print("Failed to fetch the sitemap.")

In [3]:
# Show how many URLs were extracted from the sitemap.
len(urls)

8571

In [13]:
# One accumulator list per scraped field. The generator yields a brand-new
# list for every name, so the fourteen lists are independent of each other.
(
    title,
    working_hours,
    working_place,
    company,
    location,
    post_date,
    num_of_applicants,
    num_of_positions,
    experience,
    career_level,
    education,
    salary,
    job_category,
    skills,
) = ([] for _ in range(14))

In [14]:
# Start a Firefox browser session controlled by Selenium. The scraping cell
# below drives this instance and closes it with driver.quit().
driver = webdriver.Firefox()

In [15]:
# Visit job pages with the Selenium driver, pull one value per field from each
# page into the module-level accumulator lists, then build `jobs_df` from them.


def _text_at(index):
    """Return an extractor that yields the text of the match at ``index``.

    The extractor raises IndexError when fewer than ``index + 1`` elements
    matched; the scraping loop turns that into a logged ``None`` value.
    """
    return lambda elements: elements[index].text


def _location_from(elements):
    """Take the second match and keep the trimmed text after the last '-'."""
    return elements[1].text.split('-')[-1].strip()


def _all_texts(elements):
    """Return the text of every match (an empty list when nothing matched)."""
    return [element.text for element in elements]


# Experience, career level, education and salary are all read from the same
# XPath and differ only in which match (0..3) is taken.
_DETAIL_XPATH = "//main//section/div/span/span"

# One entry per scraped field, in extraction order:
#   (label used in the error message, list collecting the values,
#    XPath passed to find_elements, function turning the matches into a value)
FIELD_SPECS = [
    ("title", title, "//section/div/h1", _text_at(0)),
    ("working hours", working_hours, "//section/div/div/div/a/span", _text_at(0)),
    ("working place", working_place, "//section/div/div/a/span", _text_at(0)),
    ("company", company, "//strong/div/a", _text_at(1)),
    ("location", location, "//strong", _location_from),
    ("post date", post_date, "//section/div/span", _text_at(0)),
    ("number of applicants", num_of_applicants, "//section/div/div/div/strong", _text_at(0)),
    ("number of positions", num_of_positions, "//div/div/span/span", _text_at(1)),
    ("experience", experience, _DETAIL_XPATH, _text_at(0)),
    ("career level", career_level, _DETAIL_XPATH, _text_at(1)),
    ("education", education, _DETAIL_XPATH, _text_at(2)),
    ("salary", salary, _DETAIL_XPATH, _text_at(3)),
    ("job category", job_category, "//div/ul/li/a/span", _text_at(0)),
    ("skills", skills, "//div/a/span/span/span", _all_texts),
]

try:
    # Loop through URLs and collect data (only the first five URLs are visited).
    for url in urls[:5]:
        try:
            driver.get(url)

            # Wait for the page to load by waiting for the title element to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_element_located((By.XPATH, "//section/div/h1")))

            # Every field gets exactly one append per page (the value, or None
            # on failure), which keeps all accumulator lists the same length.
            for label, values, xpath, extract in FIELD_SPECS:
                try:
                    values.append(extract(driver.find_elements(By.XPATH, xpath)))
                except Exception as e:
                    print(f"Error extracting {label} from {url}: {str(e)}")
                    values.append(None)

        except Exception as e:
            # Reached when loading the page or the wait fails; nothing has been
            # appended for this URL at that point, so the row is simply skipped.
            print(f"Error processing URL {url}: {str(e)}")
finally:
    # Close the browser even if the loop is interrupted or raises unexpectedly,
    # so no Firefox process is left running.
    driver.quit()

# Create DataFrame
jobs_df = pd.DataFrame({
    'Title': title,
    'Working_Hours': working_hours,
    'Working_Place': working_place,
    'Company': company,
    'Location': location,
    'Post_Date': post_date,
    'Number_of_Applicants': num_of_applicants,
    'Number_of_Positions': num_of_positions,
    'Experience': experience,
    'Career_Level': career_level,
    'Education': education,
    'Salary': salary,
    'Job_Category': job_category,
    'Skills': skills
})

In [17]:
# Write the scraped jobs to jobs.csv (path relative to the working directory),
# leaving the DataFrame index out of the file.
jobs_df.to_csv("jobs.csv", index=False)