In [19]:
from datetime import datetime
from pathlib import Path
from time import sleep
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service as ChromeService

In [20]:
def download_page(page_number: int, job_name: str) -> BeautifulSoup:
    """
    Opens one page of search results in the shared browser and parses it

    @param page_number: number of the results page to request
    @param job_name: search query; it is percent-encoded before being put in the URL
    @returns: the browser's page source parsed into a BeautifulSoup tree

    Uses the module-level `browser` driver, which must already be created.
    """
    # Percent-encode the query so that spaces and reserved characters
    # (&, #, ', ...) cannot break out of the `criteria` parameter.
    encoded_job = quote(job_name, safe="")
    url = f"https://nofluffjobs.com/pl/?criteria=jobPosition%3D'{encoded_job}'&page={page_number}"
    browser.get(url)
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning.
    return BeautifulSoup(browser.page_source, "html.parser")

In [21]:
def page_without_next_page_arrow(page: BeautifulSoup) -> bool:
    """
    Checks if the page is empty/the last one
    Condition: there's no "next page" arrow on the website

    @param page: parsed page to inspect
    @returns: True when no <a aria-label="Next"> element is found, False otherwise
    """
    # `find_all` is the current name of the deprecated `findAll` alias;
    # an empty result is falsy, so negating it yields the answer directly.
    return not page.find_all("a", attrs={"aria-label": "Next"})

def other_offers_of_interest(page: BeautifulSoup) -> bool:
    """
    Checks if the page contains the "Mogą Cię też zainteresować" heading

    Only <h1 class="list-title"> elements are inspected. Leading and trailing
    whitespace in the heading text is ignored, so the check does not depend
    on the exact padding used in the site's markup.

    @param page: parsed page to inspect
    @returns: True when at least one such heading has that text, False otherwise
    """
    headings = page.find_all("h1", attrs={"class": "list-title"})
    # any() stops at the first matching heading.
    return any(
        heading.get_text().strip() == "Mogą Cię też zainteresować"
        for heading in headings
    )

In [22]:
def get_job_offers(job_name: str, max_pages=None) -> list:
    """
    Downloads the current result pages for a query

    Pages are requested one after another starting from page 1. Downloading
    stops after the first page that has no "next page" arrow or that shows
    the "Mogą Cię też zainteresować" heading; that page is still included
    in the result.

    @param job_name: search query
    @param max_pages: optional upper bound on the number of pages to download;
        None (default) means no limit. At least one page is always downloaded.
    @returns: list of parsed pages (BeautifulSoup objects) in page order
    """
    results = []

    page_number = 1
    while True:
        page = download_page(page_number, job_name)
        is_last_page = page_without_next_page_arrow(page) or other_offers_of_interest(page)
        results.append(page)
        # Safety net: stop at the cap even if the site never signals a last page.
        reached_limit = max_pages is not None and page_number >= max_pages
        if is_last_page or reached_limit:
            break
        page_number += 1
    return results

In [23]:
# Location of the chromedriver binary, made absolute so the driver is found
# from this path no matter how the working directory changes later.
drivers_path = Path('../drivers/chromedriver').resolve()

# Build the service from the resolved path instead of repeating the literal.
service = ChromeService(str(drivers_path))
browser = Chrome(service=service)
# Implicit wait (in seconds) applied to element lookups made through the driver.
browser.implicitly_wait(10)

In [24]:
# Search queries to scrape.
jobs = [
    'data analyst',
    'data scientist',
    'data engineer',
]

# Download the result pages for every query listed in `jobs`,
# keyed by the query string.
job_offers = dict()
for job in jobs:
    job_offers[job] = get_job_offers(job)

In [26]:
# Save every downloaded page to its own HTML file:
# ../data/raw/<query>_<page index>_<timestamp>.html
dt = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = Path("../data/raw")
try:
    # Create the target directory if it is missing so the writes cannot
    # fail just because the folder was never created.
    output_dir.mkdir(parents=True, exist_ok=True)
    for job in job_offers:
        for page_num, page in enumerate(job_offers[job]):
            with open(output_dir / f"{job}_{page_num}_{dt}.html", "w", encoding='utf-8') as f:
                f.write(page.prettify(formatter='html'))
finally:
    # Close the browser even when saving fails, so no Chrome process is left behind.
    browser.quit()