### Pobieranie danych

In [None]:
import time
import os
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchWindowException

# Seconds to sleep after opening a page before its HTML source is read
WAIT_TIME = 10
# Directory into which the downloaded HTML pages are written
OUTPUT_DIRECTORY = 'data1/raw'

def get_page_html(job_name, page_number):
    """Open a NoFluffJobs search results page in Chrome and return its HTML.

    Args:
        job_name: Search phrase placed in the ``criteria`` query parameter.
        page_number: Number of the results page to request.

    Returns:
        The page source as a string, or ``None`` when the browser window
        was closed before the page could be read.
    """
    url = f'https://nofluffjobs.com/pl/jobs?criteria={job_name}&page={page_number}'
    try:
        driver = webdriver.Chrome()
        try:
            driver.get(url)

            # Fixed pause after navigation before the source is read
            # (presumably so dynamically loaded offers can appear).
            time.sleep(WAIT_TIME)

            return driver.page_source
        finally:
            # Always shut the browser down, also when loading failed;
            # otherwise every failed page leaves a Chrome process behind.
            driver.quit()
    except NoSuchWindowException:
        print("Okno przeglądarki zostało zamknięte.")
        return None

def has_more_jobs_in_page(page_html):
    """Tell whether the page contains a "next page" pagination link.

    Args:
        page_html: HTML source of a results page.

    Returns:
        ``True`` when an ``<a>`` element with class ``page-link`` whose text
        is ``»`` exists in the page, ``False`` otherwise.
    """
    soup = BeautifulSoup(page_html, 'html.parser')
    # ``string=`` is the current name of the deprecated ``text=`` argument.
    next_page_link = soup.find('a', class_='page-link', string='»')
    return next_page_link is not None

def save_page_to_file(job_name, page_number, page_html):
    """Write a downloaded page to ``OUTPUT_DIRECTORY`` as UTF-8 text.

    The file is named ``<job_name>_<page_number>.html``.

    Args:
        job_name: Search phrase the page was downloaded for.
        page_number: Number of the results page.
        page_html: HTML source to store.
    """
    # Create the target directory on first use; without it ``open`` fails
    # with FileNotFoundError on a fresh checkout.
    os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)
    filename = os.path.join(OUTPUT_DIRECTORY, f'{job_name}_{page_number}.html')
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(page_html)

def extract_job_names(page_html):
    """Collect the titles of all job postings found in a results page.

    Args:
        page_html: HTML source of a results page.

    Returns:
        A list with the stripped text of every
        ``<h3 class="posting-title__position">`` element, in document order.
    """
    parsed_page = BeautifulSoup(page_html, 'html.parser')
    return [
        heading.text.strip()
        for heading in parsed_page.find_all('h3', class_='posting-title__position')
    ]

def display_job_info(job_name, page_number, job_names_on_page):
    """Print the searched position, the page number and a numbered offer list.

    Args:
        job_name: Search phrase the page belongs to.
        page_number: Number of the results page.
        job_names_on_page: Iterable of offer titles found on that page.
    """
    header_lines = (
        f"Stanowisko: {job_name}",
        f"Numer strony: {page_number}",
        "Oferty na stronie:",
    )
    for line in header_lines:
        print(line)

    # Offers are numbered starting from 1.
    for position, offer_title in enumerate(job_names_on_page, start=1):
        print(f"{position}. {offer_title}")

    print("\n")

def scrape_jobs():
    """Ask for job names and download every results page for each of them.

    The user enters a comma-separated list of search phrases. For each phrase
    pages are fetched starting from page 1, saved to disk and summarised on
    stdout. Paging for a phrase stops when the browser window was closed
    (``get_page_html`` returned ``None``) or when a page contains no offers.
    """
    raw_names = input("Podaj oferty pracy (oddzielone przecinkami): ").split(',')
    # Strip whitespace around each entry and drop empty ones, so input such as
    # "python, java," does not put stray spaces in the URL / file name or
    # trigger a search for an empty phrase.
    job_names = [name.strip() for name in raw_names if name.strip()]

    for job_name in job_names:
        page_number = 1
        while True:
            page_html = get_page_html(job_name, page_number)
            if page_html is None:
                break
            save_page_to_file(job_name, page_number, page_html)
            job_names_on_page = extract_job_names(page_html)
            if not job_names_on_page:
                print(f"Brak ofert dla stanowiska: {job_name}. Przechodzę do następnego stanowiska.")
                break
            display_job_info(job_name, page_number, job_names_on_page)
            page_number += 1
            # Pause between consecutive page requests.
            time.sleep(5)

# Start the scraper only when this code runs as the main module.
if __name__ == "__main__":
    scrape_jobs()


### Scrapowanie danych - Plik

In [89]:
import os
from bs4 import BeautifulSoup
import csv
from datetime import date

def _text_or_default(tag, default):
    """Return the stripped text of ``tag``, or ``default`` if it is missing or blank."""
    if tag is None:
        return default
    text = tag.text.strip()
    return text if text else default


def extract_job_info(html_code, file_name):
    """Extract one record per job posting from a saved results page.

    Args:
        html_code: HTML source of a saved results page.
        file_name: Name of the source file (without extension); copied into
            every record.

    Returns:
        A list of dicts with the keys ``file_name``, ``position_name``,
        ``company``, ``job_information``, ``salary_low``, ``salary_high``,
        ``currency`` and ``location``. Missing values are replaced by
        placeholder strings.

    Note:
        BeautifulSoup matches a multi-class ``class_`` string against the
        exact value of the ``class`` attribute, so the long selectors below
        depend on the current page markup.
    """
    soup = BeautifulSoup(html_code, 'html.parser')
    job_blocks = soup.find_all('a', class_='posting-list-item')

    job_info_list = []
    for job_block in job_blocks:
        job_info = {}

        # Source file name without extension
        job_info['file_name'] = file_name

        # Position name; a posting without a title heading gets a placeholder
        # instead of aborting the whole extraction with AttributeError.
        title_tag = job_block.find('h3', class_='posting-title__position')
        job_info['position_name'] = _text_or_default(title_tag, 'No Data')

        # Company name
        company_tag = job_block.find('h4', class_='tw-text-gray-60 company-name tw-w-[50%] lg:tw-w-auto tw-mb-0 !tw-text-xs !lg:tw-text-sm tw-font-semibold lg:tw-font-normal')
        job_info['company'] = _text_or_default(company_tag, 'Nie znaleziono nazwy firmy')

        # Offer information
        offer_technology_info = job_block.find('span', class_='lg:tw-text-gray-60 lg:tw-border-2 lg:tw-border-gray-ddd tw-text-xs tw-lowercase lg:tw-py-0.5 lg:tw-px-2 tw-text-gray-60')
        job_info['job_information'] = _text_or_default(offer_technology_info, 'No Data')

        # Salary range; the currency is recorded as PLN whenever a salary
        # badge with text is present.
        salary = job_block.find('span', class_='text-truncate badgy salary lg:tw-btn tw-text-ink lg:tw-btn-secondary-outline tw-text-xs lg:tw-py-0.5 lg:tw-px-2 ng-star-inserted')
        salary_text = _text_or_default(salary, '')
        if salary_text:
            low, high = parse_salary_info(salary_text)
            job_info['salary_low'] = low
            job_info['salary_high'] = high
            job_info['currency'] = 'PLN'
        else:
            job_info['salary_low'] = 'N/A'
            job_info['salary_high'] = 'N/A'
            job_info['currency'] = 'N/A'

        # Location
        location_info = job_block.find('span', class_='tw-text-ellipsis tw-inline-block tw-overflow-hidden tw-whitespace-nowrap tw-max-w-[100px] md:tw-max-w-[200px] tw-text-right')
        job_info['location'] = _text_or_default(location_info, 'No Data')

        job_info_list.append(job_info)

    return job_info_list

def parse_salary_info(salary):
    """Parse a salary range such as ``"10 000 – 15 000 PLN"``.

    All whitespace is removed, the text is split on the en dash (``–``) and
    the digits of each side are joined into an integer.

    Args:
        salary: Salary text taken from the offer.

    Returns:
        ``(low, high)`` as integers when the text consists of exactly two
        dash-separated parts that both contain digits; otherwise
        ``('N/A', 'N/A')``.
    """
    compact = ''.join(salary.split())
    parts = compact.split("–")
    if len(parts) != 2:
        return 'N/A', 'N/A'

    digit_strings = [''.join(filter(str.isdigit, part)) for part in parts]
    try:
        low, high = (int(digits) for digits in digit_strings)
    except ValueError:
        # One side of the dash had no usable digits (e.g. "–" alone);
        # report it as unknown instead of aborting the whole export.
        return 'N/A', 'N/A'
    return low, high

def main():
    """Convert all saved HTML pages into one dated CSV file.

    Reads every ``.html`` file from ``data1/raw/``, extracts the job records
    with ``extract_job_info`` and writes them to
    ``interim/job_data_<today>.csv``. When no records were found only a
    message is printed and no file is written.
    """
    # Directory holding the downloaded HTML files
    raw_data_directory = 'data1/raw/'

    job_data = []

    # ``os.listdir`` returns entries in arbitrary order; sorting keeps the
    # row order of the CSV reproducible between runs.
    for filename in sorted(os.listdir(raw_data_directory)):
        if not filename.endswith('.html'):
            continue

        filepath = os.path.join(raw_data_directory, filename)
        file_name = os.path.splitext(filename)[0]

        with open(filepath, 'r', encoding='utf-8') as file:
            html_code = file.read()

        job_data.extend(extract_job_info(html_code, file_name))

    if not job_data:
        print("Brak ofert pracy.")
        return

    # Output location; the file name carries today's date
    today = date.today()
    output_directory = 'interim'
    os.makedirs(output_directory, exist_ok=True)
    output_file = os.path.join(output_directory, f'job_data_{today}.csv')

    # The first record's keys define the CSV columns
    with open(output_file, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.DictWriter(csv_file, fieldnames=job_data[0].keys())
        csv_writer.writeheader()
        csv_writer.writerows(job_data)

    print(f"Dane zapisano do pliku: {output_file}")

# Build the interim CSV only when this code runs as the main module.
if __name__ == "__main__":
    main()


Dane zapisano do pliku: interim\job_data_2024-02-04.csv


### Transformacja danych 

In [87]:
import pandas as pd

def clean_and_save_csv(input_file, output_dir):
    """Add derived columns to the interim CSV and save it as a dated file.

    Adds ``salary_avg`` (mean of ``salary_low`` and ``salary_high``) and the
    0/1 flags ``is_senior``, ``is_lead``, ``is_mid`` and ``is_junior``, set
    when the keyword occurs anywhere in ``position_name`` (case-insensitive).
    The result is written to ``<output_dir>/job_offers_<YYYY-MM-DD>.csv``
    with ``;`` as separator.

    Args:
        input_file: Path of the CSV file to read.
        output_dir: Directory for the output file; created if missing.
    """
    import os  # local import: this cell itself only imports pandas

    df = pd.read_csv(input_file)

    # Average salary of the position. Values are coerced to numbers so that
    # stray non-numeric entries become NaN instead of breaking the arithmetic.
    if 'salary_high' in df.columns and 'salary_low' in df.columns:
        salary_high = pd.to_numeric(df['salary_high'], errors='coerce')
        salary_low = pd.to_numeric(df['salary_low'], errors='coerce')
        df['salary_avg'] = (salary_high + salary_low) / 2
    else:
        df['salary_avg'] = 'N/A'

    # Seniority flags. ``na=False`` treats a missing position name as "no
    # match"; without it the NaN result cannot be cast to int.
    seniority_flags = (
        ('is_senior', 'senior'),
        ('is_lead', 'lead'),
        ('is_mid', 'mid'),
        ('is_junior', 'junior'),
    )
    for column, keyword in seniority_flags:
        df[column] = (
            df['position_name']
            .str.contains(keyword, case=False, na=False)
            .astype(int)
        )

    # Make sure the target directory exists before writing into it.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save under a name that carries today's date
    today = pd.to_datetime('today').strftime('%Y-%m-%d')
    output_file = f'{output_dir}/job_offers_{today}.csv'
    df.to_csv(output_file, sep=';', encoding='utf-8', index=False)
    print(f"Przetworzone dane zapisano do pliku: {output_file}")

# Run the transformation only when this code runs as the main module.
if __name__ == "__main__":
    # NOTE(review): the input path is pinned to the 2024-02-04 export; it has
    # to be changed by hand to process an interim file from another day.
    input_csv_file = 'interim/job_data_2024-02-04.csv'
    output_directory = 'processed'

    clean_and_save_csv(input_csv_file, output_directory)


Przetworzone dane zapisano do pliku: processed/job_offers_2024-02-04.csv


Dane zapisano do pliku: interim\job_data_2024-02-04.csv
