In [10]:
from bs4 import BeautifulSoup
import csv
import glob

def _text_or_empty(parent, tag, data_test):
    """Return the stripped text of the first `tag` under `parent` whose
    `data-test` attribute equals `data_test`, or "" when no such tag exists."""
    node = parent.find(tag, {'data-test': data_test})
    return node.text.strip() if node is not None else ""


def parse_job_offers(input_folder, output_csv):
    """
    Parses job listings from a folder with HTML files and writes the data to a CSV file.

    Every ``*.html`` file directly inside ``input_folder`` is parsed. For each
    ``div`` with ``data-test="positioned-offer"`` one CSV row is produced with
    the columns Title, Company, Technologies, Job, Location, Salary. "Job" is
    the ``value`` of the page's ``input`` named ``kw``. Any element that is
    missing from a page or an offer yields an empty string instead of
    aborting the run.

    Arguments:
    input_folder (str): Path to the folder with raw data (with or without a
        trailing path separator).
    output_csv (str): Path to the output CSV file. It is overwritten.
    """
    # Local import so this block does not depend on the file-level imports.
    import os

    # os.path.join inserts the separator only when needed, so both
    # "data/raw" and "data/raw/" work. Sorting makes the row order reproducible.
    html_files = sorted(glob.glob(os.path.join(input_folder, "*.html")))
    data = []
    for html_file in html_files:

        # Load and parse HTML
        with open(html_file, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f, 'html.parser')

        # Value of the "kw" input, shared by every offer on the page
        keyword_input = soup.find('input', {'name': 'kw'})
        job = keyword_input.get('value', "") if keyword_input is not None else ""

        # Find Jobs Sections
        for offer in soup.find_all('div', {'data-test': 'positioned-offer'}):
            title = _text_or_empty(offer, 'h2', 'offer-title')
            company = _text_or_empty(offer, 'h3', 'text-company-name')
            location = _text_or_empty(offer, 'h4', 'text-region')
            salary = _text_or_empty(offer, 'span', 'offer-salary')

            # An offer can list several technologies; they share one CSV cell
            technologies = [
                tech.text.strip()
                for tech in offer.find_all('span', {'data-test': 'technologies-item'})
            ]

            # Adding to data (order must match the header row below)
            data.append([title, company, ', '.join(technologies), job, location, salary])

    # Save to CSV
    with open(output_csv, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Company', 'Technologies', 'Job', 'Location', 'Salary'])
        writer.writerows(data)

# Example of use (forward slashes are accepted on Windows and POSIX alike):
parse_job_offers("../data/raw/", "../data/interim/job_offers.csv")


In [11]:
# Path to HTML file (forward slashes are accepted on Windows and POSIX alike)
file_path = "../data/raw/data analyst_1.html"

# Loading HTML file
with open(file_path, 'r', encoding='utf-8') as file:
    html_content = file.read()

soup1 = BeautifulSoup(html_content, 'html.parser')

# Text of every technologies-item span found anywhere on the page
technologies = [tech.text.strip() for tech in soup1.find_all('span', {'data-test': 'technologies-item'})]

# Full text of each positioned-offer div; the loop variable is named `offer`
# so that it does not shadow `soup1`
jobs = [offer.text.strip() for offer in soup1.find_all('div', {'data-test': 'positioned-offer'})]

# First offer-salary span on the page with non-breaking spaces removed;
# "" when the page contains no such span
salary_tag = soup1.find('span', {'data-test': 'offer-salary'})
salary = salary_tag.text.strip().replace('\xa0', '') if salary_tag is not None else ""

# The input named "kw" and its value attribute (None when the input is absent)
job = soup1.find('input', {'name': 'kw'})
job_value = job.get('value') if job is not None else None