In [2]:
import re
import warnings
from datetime import datetime, timedelta

import requests
from bs4 import BeautifulSoup

In [3]:
def dict_creator(
    publication_date, company, job_title, position, website_name, link_url
):
    """Bundle the fields of a single job offer into one dictionary.

    The resulting keys, in insertion order, are ``publication_date``,
    ``company``, ``title``, ``position``, ``website`` and ``link_url``.
    Note that ``job_title`` is stored under ``"title"`` and ``website_name``
    under ``"website"``. Values are stored as given, without validation.
    """
    field_names = (
        "publication_date",
        "company",
        "title",
        "position",
        "website",
        "link_url",
    )
    field_values = (
        publication_date,
        company,
        job_title,
        position,
        website_name,
        link_url,
    )
    return dict(zip(field_names, field_values))

In [4]:
def _stripped_text(element):
    """Return ``element.text`` stripped of surrounding whitespace, or ``None`` if ``element`` is ``None``."""
    return None if element is None else element.text.strip()


def get_nofluffjobs_job_details(job_element, timeout=10):
    """Collect the details of one offer from its listing card and its detail page.

    The company and job title are read from ``job_element`` itself; the
    position and the publication date are read from the offer's own page,
    which is downloaded with one additional HTTP request.

    Parameters
    ----------
    job_element
        Listing-card element. It must support ``find(name, class_=...)`` and
        ``["href"]`` lookup, with ``href`` being a path relative to
        ``https://nofluffjobs.com``.
    timeout : float, optional
        Seconds to wait for the detail page before ``requests`` gives up.
        Without a timeout a stalled server would block the cell forever.

    Returns
    -------
    tuple
        ``(publication_date, company, job_title, position, website_name,
        link_url)``. ``publication_date`` is a ``"%Y-%m-%d"`` string. Any of
        ``publication_date``, ``company``, ``job_title`` and ``position`` is
        ``None`` when the element it is read from is not found, so a markup
        change degrades one field instead of aborting the whole scrape.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated unchanged if the detail page cannot be downloaded
        (including ``Timeout`` once ``timeout`` seconds have elapsed).
    """
    company = _stripped_text(
        job_element.find(
            "span", class_="d-block posting-title__company text-truncate"
        )
    )
    job_title = _stripped_text(
        job_element.find(
            "h3",
            class_="posting-title__position text-truncate color-main ng-star-inserted",
        )
    )

    link_url = "https://nofluffjobs.com" + job_element["href"]
    # Scheme plus host only, i.e. everything up to the first "/" of the path.
    website_name = re.match(r"https?://[\w.\-]+", link_url).group(0)

    page_job_element = requests.get(link_url, timeout=timeout)
    soup_page = BeautifulSoup(page_job_element.content, "html.parser")
    position = _stripped_text(
        soup_page.find("span", class_="mr-10 font-weight-medium")
    )

    posting_time_text = _stripped_text(
        soup_page.find("div", class_="posting-time-row")
    )
    if posting_time_text is None:
        publication_date = None
    else:
        # The first standalone integer in the row is taken as the offer's age
        # in days (presumably text like "N days ago" - confirm against the
        # live page); a row without any number is treated as published today.
        day_counts = re.findall(r"\b\d+\b", posting_time_text)
        days_ago = int(day_counts[0]) if day_counts else 0
        publication_date = (
            datetime.today() - timedelta(days=days_ago)
        ).strftime("%Y-%m-%d")

    return (publication_date, company, job_title, position, website_name, link_url)


In [5]:
def nofluffjobs_page_job_offers(
    url="https://nofluffjobs.com/pl/praca-zdalna/python?criteria=city%3Dwarszawa%20seniority%3Dtrainee,junior%20%20salary<pln12000m&page=1",
    timeout=10,
) -> list:
    """Scrape one nofluffjobs listing page into a list of offer dictionaries.

    Only the single page addressed by ``url`` is downloaded; every offer link
    found on it is then expanded with ``get_nofluffjobs_job_details`` and
    packed with ``dict_creator``.

    Parameters
    ----------
    url : str, optional
        Address of the listing page to scrape.
    timeout : float, optional
        Seconds to wait for the listing page before giving up. Without a
        timeout a stalled server would block the cell forever.

    Returns
    -------
    list
        One dictionary per offer link on the page. The list is empty when the
        page has no matching links, or when the listing page could not be
        fetched (connection failure, timeout, or a URL without a schema); in
        the latter case a warning describing the failure is emitted so that
        an empty result is not mistaken for "no offers".
    """
    try:
        page = requests.get(url, timeout=timeout)
    except (
        requests.exceptions.ConnectionError,
        requests.exceptions.MissingSchema,
        requests.exceptions.Timeout,
    ) as err:
        # Best-effort scrape: report the failure, then hand back no offers.
        warnings.warn(f"Could not fetch {url!r}: {err}", stacklevel=2)
        return []

    soup = BeautifulSoup(page.content, "html.parser")
    job_elements = soup.select('a[class*="posting-list-item posting-list-item--"]')
    return [
        dict_creator(*get_nofluffjobs_job_details(job_element))
        for job_element in job_elements
    ]


In [6]:
import pandas as pd
import numpy as np

# Column layout of one offer record, in the key order produced by dict_creator.
# Passing it explicitly keeps the frame well-formed (and the steps below from
# raising KeyError) when the scraper returns no offers at all.
OFFER_COLUMNS = [
    "publication_date",
    "company",
    "title",
    "position",
    "website",
    "link_url",
]

nofluffjobs_list = nofluffjobs_page_job_offers()
df_raw = pd.DataFrame(nofluffjobs_list, columns=OFFER_COLUMNS)

# Build the cleaned frame without mutating df_raw, so re-running the cell is
# idempotent: parse the "%Y-%m-%d" date strings, then drop repeated postings.
df = df_raw.assign(
    publication_date=lambda frame: pd.to_datetime(
        frame["publication_date"], format="%Y-%m-%d"
    )
).drop_duplicates(
    subset=["publication_date", "company", "title"], ignore_index=True
)




In [7]:
# Show the date-parsed, de-duplicated offers as the cell's rich output.
df

Unnamed: 0,publication_date,company,title,position,website,link_url
0,2022-09-13,ImpiCode,Junior Full Stack Developer Remote,Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/junior-full-sta...
1,2022-09-11,ImpiCode,Junior Full Stack Developer Remote,Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/junior-full-sta...
2,2022-09-11,Astrotectonic Sp. z o.o.,Junior Fullstack Developer,Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/junior-fullstac...
3,2022-09-09,Amartus Polska Sp. z o.o.,Junior Phyton Developer,Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/junior-phyton-d...
4,2022-09-09,IDEA team within Narodowe Centrum Badań Jądrowych,Junior Python Developer,Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/junior-python-d...
5,2022-09-11,Antal Poland,Remote Junior DevOps,Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/remote-junior-d...
6,2022-09-11,Jamf,DevOps Cloud Engineer I (Signing Bonus),Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/devops-cloud-en...
7,2022-09-09,Deloitte Advisory,AI LAB,Trainee,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/ai-lab-deloitte...
8,2022-09-08,Walksee,Junior Python AI Developer,Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/junior-python-a...
9,2022-09-08,Formeld Poland,Remote Junior Test Engineer with German,Junior,https://nofluffjobs.com,https://nofluffjobs.com/pl/job/remote-junior-t...


In [None]:
# List the column labels of the offers frame.
df.columns

Index(['publication_date', 'company', 'title', 'position', 'link_url',
       'website'],
      dtype='object')