In [130]:
import pandas as pd
from bs4 import BeautifulSoup
import glob
import re
import os
import datetime

In [123]:
def parse_location(location):
    """Split a location label into a city and a short country code.

    Args:
        location: Location text of an offer, e.g. ``"Warszawa, POL"``.

    Returns:
        dict with keys ``'city'`` and ``'country'``. Any label containing
        ``"Zdalna"`` (Polish for "remote") yields city ``"Zdalna"`` and
        country ``"N/A"``. Otherwise the city is the text before the first
        comma and the country is the first three characters of the text after
        the last comma; a missing or empty part is reported as ``"N/A"``.
    """
    if "Zdalna" in location:
        return {'city': "Zdalna", 'country': "N/A"}

    # Tolerate labels with no comma ("Warszawa") or with extra segments
    # ("Kraków, Małopolskie, POL") instead of failing on tuple unpacking.
    parts = [part.strip() for part in location.split(",")]

    # Whitespace is trimmed before the "$" marker so that a marker preceded
    # by spaces is still removed.
    city = parts[0].strip("$").strip()
    country = parts[-1][:3] if len(parts) > 1 else ""

    return {'city': city or "N/A", 'country': country or "N/A"}

In [101]:
def parse_salary(salary):
    """Extract the salary bounds and currency from a salary label.

    Args:
        salary: Salary text such as ``"8 000 - 12 000 PLN"``.

    Returns:
        dict with keys ``'low'`` and ``'high'`` (digit strings) and
        ``'currency'`` (the last whitespace-separated token of the label).
        When the label holds a single figure, ``'high'`` equals ``'low'``.

    Raises:
        IndexError: if the label contains no digits at all.
    """
    # \s also matches non-breaking and thin spaces used as thousands
    # separators; removing only ASCII spaces would split "8 000" into
    # "8" and "000" and report the wrong bounds.
    compact = re.sub(r"\s+", "", salary)
    bounds = re.findall(r"[0-9]+", compact)
    if not bounds:
        raise IndexError(f"no numeric salary bound found in {salary!r}")

    low = bounds[0]
    # A fixed (non-range) salary has only one figure.
    high = bounds[1] if len(bounds) > 1 else low
    currency = salary.split()[-1]
    return {'low': low, 'high': high, 'currency': currency}

In [126]:
def parse_job(offer):
    """Build a record for a single job-offer element.

    Args:
        offer: A parsed HTML element supporting ``find`` (the caller passes
            the anchors returned by ``find_all`` on the listing page).

    Returns:
        dict with the keys ``'name'``, ``'company'``, ``'technology'``,
        ``'salary'`` (output of ``parse_salary``) and ``'location'``
        (output of ``parse_location``). ``'technology'`` is ``"N/A"`` when
        the info section contains no link.
    """
    title = offer.find("h3", class_="posting-title__position").text.strip()

    # The company label carries an "@" that is dropped before trimming.
    employer = offer.find("span", class_="posting-title__company").text.replace("@", "").strip()

    info = offer.find("div", class_="posting-info")

    salary_text = info.find("span", class_="salary").text.strip()
    salary_data = parse_salary(salary_text)

    location_text = info.find("span", class_="posting-info__location").text.strip()
    location_data = parse_location(location_text)

    # The first link inside the info section names the technology, if any.
    tech_link = info.find("a")
    technology = tech_link.text.strip() if tech_link else "N/A"

    record = {
        'name': title,
        'company': employer,
        'technology': technology,
        'salary': salary_data,
        'location': location_data,
    }
    return record

In [137]:
data_dir = "../data/raw"

# Collect one record per offer across every saved listing page.
result = []
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the row order stable between runs.
for entry in sorted(glob.glob(f"{data_dir}/*")):
    if not os.path.isfile(entry):
        continue  # "*" also matches sub-directories, which open() cannot read

    # Decode explicitly instead of relying on the platform default, which
    # garbles Polish characters on non-UTF-8 locales.
    # NOTE(review): assumes the raw pages were saved as UTF-8 — confirm against the scraper.
    with open(entry, encoding="utf-8") as f:
        html = f.read()

    # The job label is the file-name prefix before the first underscore.
    job = os.path.basename(entry).split("_")[0]
    # Name the parser so the parse tree does not depend on which optional
    # parsers are installed (and to avoid bs4's guessed-parser warning).
    bs = BeautifulSoup(html, "html.parser")
    offers = bs.find_all("a", class_="posting-list-item")

    for offer in offers:
        parsed_offer = parse_job(offer)
        parsed_offer['job'] = job
        result.append(parsed_offer)

In [139]:
# Flatten the nested 'salary' and 'location' dicts into top-level columns
# whose names are joined with "_" (e.g. salary_low, location_city).
df = pd.json_normalize(data=result, sep="_")
df.head()

Unnamed: 0,name,company,technology,job,salary_low,salary_high,salary_currency,location_city,location_country
0,Data Scientist,Kontomatik Sp. z o.o.,python,data scientist,8000,12000,PLN,Warszawa,POL
1,Data scientist/mathematical modeller,IDEA team within Narodowe Centrum Badań Jądrowych,,data scientist,7000,13000,PLN,Warszawa,POL
2,Data Scientist,Devire,python,data scientist,8000,16000,PLN,Zdalna,
3,Data Scientist,MindMics.com,python,data scientist,8000,15000,PLN,Kraków,POL
4,Data Scientist,Orange Polska,python,data scientist,9000,18000,PLN,Warszawa,POL


In [140]:
# Sanity check: (number of parsed offers, number of flattened columns).
df.shape

(93, 9)

In [141]:
# Write the flattened offers to a date-stamped, semicolon-separated CSV.
interim_dir = "../data/interim"
# to_csv raises OSError when the target directory is missing (e.g. on a
# fresh checkout), so make sure it exists first.
os.makedirs(interim_dir, exist_ok=True)
df.to_csv(
    f"{interim_dir}/job_offers_{datetime.datetime.now().date()}.csv",
    sep=";",
    encoding="UTF-8",
    index=False
)