In [1]:
#importujemy biblioteki

import pandas as pd
import re
import os

from bs4 import BeautifulSoup

In [None]:
# Upewniamy się, że scraper nie pójdzie do kolejnej strony, jeśli bieżąca jest pusta

def is_page_empty(bs) -> bool:
    """Report whether a parsed page carries the "empty page" marker.

    The page counts as empty when it contains an ``<h2>`` element whose
    class attribute is ``text-white font-weight-bold``.

    Args:
        bs: Parsed page exposing a BeautifulSoup-style ``find`` method.

    Returns:
        True when the marker heading is found, False otherwise.
    """
    marker = bs.find('h2', class_='text-white font-weight-bold')
    return marker is not None

In [None]:
# Scrapujemy widełki wynagrodzenia stanowiska

def parse_salary(salary) -> dict:
    """Split a salary label such as ``'10 000 - 15 000 PLN'`` into its parts.

    Args:
        salary: Salary text; digit groups may be separated by any kind of
            whitespace, and the last whitespace-separated token is taken
            as the currency.

    Returns:
        Dict with the keys ``'low'``, ``'high'`` and ``'currency'``. The
        bounds are digit strings; ``'high'`` equals ``'low'`` when only one
        amount is present. When the text contains no digits at all, every
        value is ``'N/A'``.
    """
    # Remove every kind of whitespace, not just ASCII spaces: amounts may use
    # non-breaking spaces as thousands separators, and leaving them in would
    # split a single amount into several numbers.
    compact = re.sub(r'\s+', '', salary)
    bounds = re.findall('[0-9]+', compact)

    if not bounds:
        # No amount found (empty or non-numeric label): there is nothing to
        # index into, so report the fields as unavailable instead of raising.
        return {'low': 'N/A', 'high': 'N/A', 'currency': 'N/A'}

    low_bound = bounds[0]
    high_bound = bounds[1] if len(bounds) > 1 else low_bound
    # str.split() without arguments splits on any whitespace run.
    currency = salary.split()[-1]

    return {'low': low_bound,
            'high': high_bound,
            'currency': currency}

In [None]:
# Scrapujemy miasta

def parse_city(city) -> dict:
    """Split a location label into city and country.

    Args:
        city: Location text, e.g. ``'Warszawa, PL'`` or a label containing
            ``'Zdalna'`` (remote).

    Returns:
        Dict with the keys ``'city'`` and ``'country'``. Any label containing
        ``'Zdalna'`` maps to city ``'Zdalna'`` with country ``'N/A'``.
        Otherwise the text after the last comma is the country and the rest
        is the city; the country is ``'N/A'`` when it is missing.
    """
    if 'Zdalna' in city:
        return {'city': 'Zdalna', 'country': 'N/A'}

    # rpartition never raises: with no comma the whole text lands in the last
    # slot, and with several commas only the last one separates the country.
    _city, separator, country = city.rpartition(',')
    if not separator:
        return {'city': city.strip(), 'country': 'N/A'}

    return {'city': _city.strip(), 'country': country.strip() or 'N/A'}

In [None]:
def parse_jobs(jobs) -> list:
    """Parse every posting in ``jobs`` with ``parse_job``.

    Args:
        jobs: Iterable of posting elements accepted by ``parse_job``.

    Returns:
        List with one parsed result per posting, in input order.
    """
    return [parse_job(posting) for posting in jobs]

In [None]:
# Zbieramy wszystkie dane - miasto, widełki, nazwę stanowiska, firmę, stack technologiczny

def parse_job(job) -> dict:
    """Extract location, salary, title, company and technology of one posting.

    Args:
        job: BeautifulSoup element wrapping a single job posting.

    Returns:
        Dict with the keys ``'location_city'`` (result of ``parse_city``),
        ``'salary'`` (result of ``parse_salary``), ``'name'``, ``'company'``
        and ``'technology'`` (``'N/A'`` when the posting has no technology
        button).

    Raises:
        AttributeError: If the info box, title or company element is missing.
        IndexError: If the info box holds fewer spans than are read here.
    """
    info_box = job.find('div', class_='posting-info position-relative d-none d-lg-flex flex-grow-1')
    job_info = info_box.find_all('span')

    # The first span of the info box holds the salary label.
    salary = job_info[0].text.strip()
    salary_data = parse_salary(salary)

    # Prefer the dedicated city element; otherwise fall back to the second
    # span of the info box.
    location = job.find('nfj-posting-item-city')
    if location is None:
        location = job_info[1]
    location = parse_city(location.text.strip())

    name = job.find('h2', class_='posting-title__position').text.strip()

    company = job.find('span', class_='posting-title__company').text.strip()
    # Remove only a leading standalone "w" (presumably the Polish preposition
    # shown before the company name). A blanket replace('w', '') would also
    # delete every "w" inside the name itself.
    company = re.sub(r'^w\s+', '', company)

    technology = job.find('a', class_='btn btn-outline-secondary btn-sm text-truncate')
    technology = technology.text.strip() if technology else 'N/A'

    return {
        'location_city': location,
        'salary': salary_data,
        'name': name,
        'company': company,
        'technology': technology
    }

In [None]:
def get_data(page) -> list:
    """Collect the parsed offers found on one page.

    Each posting is located through its ``<div class="posting-image">``
    element; the parent of that div is handed to ``parse_jobs``.

    Args:
        page: Parsed page exposing a BeautifulSoup-style ``find_all`` method.

    Returns:
        New list with the parsed offers of the page.
    """
    image_divs = page.find_all('div', class_='posting-image')
    jobs = [image_div.parent for image_div in image_divs]
    return list(parse_jobs(jobs))

In [None]:
# Odpalamy wszystkie zapisane wcześniej html i iterujemy w celu zapisania w formie .csv

data_dir = '../data/raw'
results = []

# List the directory inside a context manager so the scandir handle is closed.
# Keep regular files only (open() would fail on a sub-directory) and sort by
# name so the row order is the same on every run and operating system.
with os.scandir(data_dir) as entries:
    html_files = sorted(
        (entry for entry in entries if entry.is_file()),
        key=lambda entry: entry.name)

for entry in html_files:
    with open(entry.path, encoding='UTF-8') as f:
        html = f.read()

    # The job category is the part of the file name before the first underscore.
    job = entry.name.split('_')[0]
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser is installed, so results could differ between environments.
    bs = BeautifulSoup(html, 'html.parser')

    # Tag every offer of this file with its job category before collecting it.
    offers = get_data(bs)
    for offer in offers:
        offer['job'] = job
    results += offers

In [None]:
# Flatten the list of offer dicts into a table with one row per offer; keys of
# nested dicts are joined to their parent key with '_'.
df = pd.json_normalize(results, sep='_')
# Preview the first rows to check the resulting columns.
df.head()

In [None]:
#Zapisujemy w .csv
# Create the output folder first: to_csv does not create missing directories
# and would fail when '../data/interim' does not exist yet.
os.makedirs('../data/interim', exist_ok=True)
# Semicolon-separated file without the index column ('UTF' is a Python codec
# alias for UTF-8).
df.to_csv('../data/interim/job_offers.csv', sep=';', encoding='UTF', index=False)