In [43]:
import pandas as pd
import re
import os

from bs4 import BeautifulSoup

data_raw_dir = '../data/raw'

In [44]:
def get_job_list(bs: BeautifulSoup) -> list:
    """
    Function converts file with html code into pieces with information required
    """
    jobs_list_class_attribute = "list-container ng-star-inserted"
    all_offers = bs.findAll("div",attrs={"class":jobs_list_class_attribute})[0]
    
    return all_offers.findAll("a", class_="posting-list-item")

In [45]:
def parse_job_offer(offer, job):
    """
    Assign values with details of a single offer extracted from html code to dictionary 
    """
    #get job name and company name
    jobname_companyname_div = offer.findAll("div", class_="posting-title__wrapper")[0]
    
    name_div = jobname_companyname_div.findAll("h3", class_="posting-title__position")[0]
    name = name_div.getText().strip()
    
    company_html = jobname_companyname_div.findAll("span", class_="posting-title__company")[0]
    company = company_html.getText().strip()
        
    # get technology, salary and location
    offer_details_html = offer.findAll("div",class_="posting-info")[0]
    try:
        technology_html = offer_details_html("common-posting-item-tag")[0].findAll("span")[0]
        technology = technology_html.getText().strip()    
    except Exception:
        technology = ""
        
    try:
        location_html = offer_details_html("nfj-posting-item-city")[0].findAll("span")[0]
        city = location_html.getText().strip()
        country = "N/A"
    
        location = {
            'city': city, "country": country
        }
    except Exception:
        location = {}

    salary_html = offer_details_html.findAll("span", class_="salary")[0]
    salary_txt = salary_html.getText().strip().replace(" ", "").replace("\xa0", "").replace(b'\xe2\x80\x93'.decode(), "-")
    
    currency = salary_txt.split()[-1]
    salary_range = salary_txt.split()[0].split("-")
    low = salary_range[0]
    if len(salary_range) > 1:
        high = salary_range[1]
    else:
        high = low
    
    salary = {
        "low": low, "high": high, "currency": currency,
    }
    
    return {
        'name': name,
        'company': company,
        'technology': technology,
        'job': job,
        'location': location,
        'salary': salary 
    }

In [46]:
# creating a list with offer dictionaries

data_dir = '../data/raw'
results = []

for entry in os.scandir(data_dir):
    print(entry.name)
    if entry.name.find('.html') != -1:
        entry_path = os.path.join(data_dir, entry.name)
        job = entry.name.split('_')[0]
        with open(entry_path, 'r', encoding='UTF-8') as file:
            bs = BeautifulSoup(file.read())
        
        print(f'Liczba ofert na badanej stronie: {len(get_job_list(bs))}')
        
        res = []
        for job_offer in get_job_list(bs):
            res.append(parse_job_offer(job_offer, job))
            
        results.extend(res)

data engineer_2_20231105_210252.html
Liczba ofert na badanej stronie: 20
.DS_Store
data analyst_1_20231105_210252.html
Liczba ofert na badanej stronie: 1
data analyst_0_20231105_210252.html
Liczba ofert na badanej stronie: 20
data engineer_0_20231105_210252.html
Liczba ofert na badanej stronie: 20
.ipynb_checkpoints
data scientist_0_20231105_210252.html
Liczba ofert na badanej stronie: 19
data engineer_1_20231105_210252.html
Liczba ofert na badanej stronie: 20


### Results to pandas dataframe

In [47]:
df = pd.json_normalize(results, sep='_')
df.head()

Unnamed: 0,name,company,technology,job,location_city,location_country,salary_low,salary_high,salary_currency
0,Senior Cloud Data Engineer,SEB (Skandinaviska Enskilda Banken),Python,data engineer,Warszawa,,30000,35000,PLN
1,Senior Data Engineer,SEB (Skandinaviska Enskilda Banken),Python,data engineer,Warszawa,,34000,37000,PLN
2,Mid Data Engineer,Green Minds Sp.z o.o.,Python,data engineer,Zdalnie,,25200,30240,PLN
3,Data Engineer (Snowflake),GetInData | Part of Xebia,Snowflake,data engineer,Zdalnie,,18480,26880,PLN
4,Data Engineer (Dataiku/Linkurious),Sopra Steria - Apps Services,Dataiku,data engineer,Zdalnie,,16000,21000,PLN


In [48]:
df['is_senior'] = 0
df.loc[df['name'].str.lower().str.find('senior') >= 0, 'is_senior'] = 1
df.head()

Unnamed: 0,name,company,technology,job,location_city,location_country,salary_low,salary_high,salary_currency,is_senior
0,Senior Cloud Data Engineer,SEB (Skandinaviska Enskilda Banken),Python,data engineer,Warszawa,,30000,35000,PLN,1
1,Senior Data Engineer,SEB (Skandinaviska Enskilda Banken),Python,data engineer,Warszawa,,34000,37000,PLN,1
2,Mid Data Engineer,Green Minds Sp.z o.o.,Python,data engineer,Zdalnie,,25200,30240,PLN,0
3,Data Engineer (Snowflake),GetInData | Part of Xebia,Snowflake,data engineer,Zdalnie,,18480,26880,PLN,0
4,Data Engineer (Dataiku/Linkurious),Sopra Steria - Apps Services,Dataiku,data engineer,Zdalnie,,16000,21000,PLN,0


In [49]:
df = pd.json_normalize(results, sep='_')
df.head()

Unnamed: 0,name,company,technology,job,location_city,location_country,salary_low,salary_high,salary_currency
0,Senior Cloud Data Engineer,SEB (Skandinaviska Enskilda Banken),Python,data engineer,Warszawa,,30000,35000,PLN
1,Senior Data Engineer,SEB (Skandinaviska Enskilda Banken),Python,data engineer,Warszawa,,34000,37000,PLN
2,Mid Data Engineer,Green Minds Sp.z o.o.,Python,data engineer,Zdalnie,,25200,30240,PLN
3,Data Engineer (Snowflake),GetInData | Part of Xebia,Snowflake,data engineer,Zdalnie,,18480,26880,PLN
4,Data Engineer (Dataiku/Linkurious),Sopra Steria - Apps Services,Dataiku,data engineer,Zdalnie,,16000,21000,PLN


In [50]:
from datetime import datetime
dt = datetime.now().strftime("%Y%m%d_%H%M%S")
df.to_csv(f'../data/interim/job_offers_{dt}.csv', sep=';', encoding='UTF', index=False)

In [51]:
df.shape

(100, 9)