In [3]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from requests_html import HTMLSession

# Session object that getdata() uses to download pages.
s = HTMLSession()
# Root page of the site being scraped.
url = 'https://pythonjobs.github.io/'
# Plain one-off GET of the root page; its status code is checked further down.
response = requests.get(url)

def getdata(url):
    """Download `url` with the shared session `s` and parse it.

    Args:
        url: Address of the page to fetch.

    Returns:
        A BeautifulSoup tree built from the body of the page that was just
        fetched, parsed with the built-in 'html.parser'.
    """
    r = s.get(url)
    # Parse the response of *this* request. The module-level `response` belongs
    # to the initial connectivity check and would always yield the root page.
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

# Report the outcome of the initial request; keep the raw body on success.
if response.status_code != 200:
    print('Request failed with status code:', response.status_code)
else:
    html_content = response.content
    print('Connected Successfully')

Connected Successfully


In [4]:
# Preview the parsed page without dumping all of it: only the first
# `limit_lines` lines of the prettified HTML are printed.
# The limit is set high here because the whole structure had to be inspected.

soup = getdata(url)

limit_lines = 2000
pretty_lines = soup.prettify().splitlines()
preview = pretty_lines[:limit_lines]
print('\n'.join(preview))

<!DOCTYPE html>
<!-- https://github.com/paulirish/html5-boilerplate/blob/master/index.html -->
<!-- paulirish.com/2008/conditional-stylesheets-vs-css-hacks-answer-neither/ -->
<!--[if lt IE 7 ]> <html lang="en" class="no-js ie6"> <![endif]-->
<!--[if IE 7 ]>    <html lang="en" class="no-js ie7"> <![endif]-->
<!--[if IE 8 ]>    <html lang="en" class="no-js ie8"> <![endif]-->
<!--[if (gte IE 9)|!(IE)]><!-->
<html class="no-js" lang="en">
 <!--<![endif]-->
 <head>
  <!-- meta element for compatibility mode needs to be before
        all elements except title & meta
        msdn.microsoft.com/en-us/library/cc288325(VS.85).aspx -->
  <meta charset="utf-8"/>
  <!-- Always force latest IE rendering engine (even in intranet) & Chrome Frame
       Remove this if you use the .htaccess -->
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <title>
   The Free Python Job Board
  </title>
  <meta content="An international job board for Python roles" name="description"/>
  <!--  Because we h

In [5]:
# Print the title, location, company and teaser text of every job card
# (each <div class="job">) on the page. Anything missing from a card is
# reported as "N/A" instead of raising.

job_card = soup.find_all('div', class_='job')

for job_element in job_card:
    title_element = job_element.find("h1")
    # All metadata values live in identically classed <span class="info"> tags,
    # so they can only be told apart by position: 0 is the location, 3 the company.
    location_element = job_element.find_all("span", class_="info")
    company_element = location_element[3].get_text(strip=True) if len(location_element) > 3 else "N/A"
    detail_element = job_element.find("p", class_="detail")

    print("Title:", title_element.text.strip() if title_element is not None else "N/A")
    print("Location:", location_element[0].text.strip() if location_element else "N/A")
    print("Company:", company_element)
    print("Details:", detail_element.text.strip() if detail_element is not None else "N/A")
    print()  # Add an empty line for spacing between job listings

Title: Strats Python Developer
Location: London, UK
Company: HBK Europe Management LLP
Details: Overview HBK is searching for a Python software developer to join our Strats team in London on a full-time basis. The Strats group works closely and primarily with investment professionals in all our offices to help...

Title: Python Software Developer
Location: Remote, UK-only
Company: Open Data Services Co-operative
Details: We’re hiring a Python Software Developer to join our interdisciplinary team, working with data publishers and users. To find out more about this role and working at Open Data Services check out this twitter thread....

Title: Senior Software Engineer, Back-End (Remote)
Location: Galway, Ireland, Remote
Company: Oomnitza
Details: Oomnitza offers enterprise IT a unique solution to manage the entirety of the digital estate. Unlike our competitors, who deliver siloed solutions, Oomnitza offers granular control and orchestration across the...

Title: Python Backend Engineer

In [6]:
# A tag name can be used as an attribute on an element to reach its first
# matching descendant, so `.h1` below is shorthand for `.find("h1")`.

for job_element in job_card:
    heading = job_element.h1
    print(heading.text)

Strats Python Developer
Python Software Developer
Senior Software Engineer, Back-End (Remote)
Python Backend Engineer
Senior Backend Engineer


In [7]:
# Show the "read more" target of each job card: the href of its first anchor.

for job_element in job_card:
    first_anchor = job_element.find("a")
    link_url = first_anchor["href"]
    # The trailing newline produces the blank separator line between entries.
    print(f"Read More: {link_url}\n")

Read More: /jobs/hbk-strats-developer.html

Read More: /jobs/open-data-services-co-operative-python-software-developer.html

Read More: /jobs/oomnitza-back-end-sw-enginneer-irl-remote.html

Read More: /jobs/bmat-python-backend-engineer.html

Read More: /jobs/bmat-senior-backend-engineer.html



In [8]:
def getnextpage(soup, base_url='https://pythonjobs.github.io'):
    """Collect the absolute URLs behind every `go_button` anchor in `soup`.

    Args:
        soup: Parsed page (anything offering a BeautifulSoup-style `find_all`).
        base_url: Site root that relative hrefs are resolved against.

    Returns:
        A list of absolute URLs, in document order. Anchors without an
        `href` attribute are skipped.
    """
    links = []
    for go in soup.find_all("a", class_='go_button'):
        href = go.get("href")
        if not href:
            continue
        # urljoin copes with hrefs with or without a leading slash, and leaves
        # already-absolute URLs untouched, unlike plain string concatenation.
        links.append(urljoin(base_url, href))
    return links

# Example usage: parse the root page, then print every link that
# getnextpage() collects from it, one per line.
soup = getdata(url)
link_list = getnextpage(soup)
for link in link_list:
    print(link)

https://pythonjobs.github.io/jobs/hbk-strats-developer.html
https://pythonjobs.github.io/jobs/open-data-services-co-operative-python-software-developer.html
https://pythonjobs.github.io/jobs/oomnitza-back-end-sw-enginneer-irl-remote.html
https://pythonjobs.github.io/jobs/bmat-python-backend-engineer.html
https://pythonjobs.github.io/jobs/bmat-senior-backend-engineer.html


In [34]:
def get_job_details(link):
    """Download one job detail page and extract its fields into a dict.

    Args:
        link: URL of the job detail page.

    Returns:
        A dict with the keys "Title", "Company", "Posted Date",
        "Contract Type", "Location", "Tags" (list of str), "Contact Name",
        "Email", "Website", "Phone" and "Description". A value that cannot be
        located on the page is an empty string (an empty list for "Tags").
        An empty dict is returned when the response status is not 200.
    """
    response = requests.get(link)
    if response.status_code != 200:
        return {}

    soup = BeautifulSoup(response.content, 'html.parser')

    # The first "head" div carries the company link; the following ones carry
    # the metadata rows, identified by the icon span they contain.
    head_divs = soup.find_all("div", class_="head")
    company_link = head_divs[0].find("a") if head_divs else None

    posted_date = contract_type = location = ""
    for head_div in head_divs[1:]:
        if head_div.find("span", class_="i-calendar") is not None:
            posted_date = _colon_field(head_div.text, 1)
        elif head_div.find("span", class_="i-chair") is not None:
            # This row holds two "label: value" pairs, hence pieces 1 and 2.
            contract_type = _colon_field(head_div.text, 1)
            location = _colon_field(head_div.text, 2)

    # Look the contact block up once; its fields are addressed by position.
    contact_div = soup.find("div", class_="contact")
    fields = contact_div.find_all("div", class_="field") if contact_div is not None else []
    website_link = _field_child(fields, 2, "a")

    # Description: first <p> sibling following the "Overview" <h1> in the body.
    body_div = soup.find("div", class_="body")
    overview_heading = body_div.find("h1", string="Overview") if body_div is not None else None
    overview_paragraph = (
        overview_heading.find_next_sibling("p") if overview_heading is not None else None
    )

    return {
        "Title": _element_text(soup.find("h1")),
        "Company": _element_text(company_link),
        "Posted Date": posted_date,
        "Contract Type": contract_type,
        "Location": location,
        "Tags": [tag.text.strip() for tag in soup.find_all("a", class_="tag")],
        "Contact Name": _element_text(_field_child(fields, 0, "span")),
        "Email": _element_text(_field_child(fields, 1, "a")),
        "Website": website_link.get("href", "") if website_link is not None else "",
        # The phone is read from whichever field comes last.
        "Phone": _element_text(_field_child(fields, -1, "span")),
        "Description": _element_text(overview_paragraph),
    }


def _element_text(element):
    """Return the stripped text of `element`, or "" when `element` is None."""
    return element.text.strip() if element is not None else ""


def _colon_field(text, index):
    """Return the stripped `index`-th piece of `text` split on ':', or "" if there is none."""
    pieces = text.strip().split(":")
    return pieces[index].strip() if len(pieces) > index else ""


def _field_child(fields, position, tag_name):
    """Return the first `tag_name` child of `fields[position]`, or None if that field is absent."""
    try:
        field = fields[position]
    except IndexError:
        return None
    return field.find(tag_name)




def process_job(job_element):
    """Print a summary of one job card, enriched with data from its detail page.

    The title, location, company and teaser text come from the card itself.
    Website, contact name, e-mail and tags come from get_job_details(), which
    is called with the absolute URL of the card's `go_button` link. Anything
    that cannot be found is printed as "N/A" (tags are simply omitted).

    Args:
        job_element: One job card element (BeautifulSoup-style `find` /
            `find_all` interface).

    Returns:
        None; the summary is written to standard output.
    """
    title_element = job_element.find("h1")
    # The metadata spans share one class and are told apart by position only:
    # index 0 is read as the location, index 3 as the company.
    info_spans = job_element.find_all("span", class_="info")
    detail_element = job_element.find("p", class_="detail")

    title = title_element.text.strip() if title_element is not None else "N/A"
    location = info_spans[0].text.strip() if info_spans else "N/A"
    company = info_spans[3].get_text(strip=True) if len(info_spans) > 3 else "N/A"
    details = detail_element.text.strip() if detail_element is not None else "N/A"

    print("Title:", title)
    print("Location:", location)
    print("Company:", company)

    go_button = job_element.find("a", class_="go_button")
    href = go_button.get("href") if go_button is not None else None
    # urljoin avoids the doubled slash that plain concatenation of the
    # slash-terminated site root and a slash-prefixed href produces.
    job_details = get_job_details(urljoin('https://pythonjobs.github.io/', href)) if href else {}

    print("Website:", job_details.get("Website", "N/A"))
    print("Contact Name:", job_details.get("Contact Name", "N/A"))
    print("Email:", job_details.get("Email", "N/A"))

    tags = job_details.get("Tags", [])
    if tags:
        print("Tags:", ', '.join(tags))

    print("Details:", details)
    print()



# Run the full report: collect every <div class="job"> card from the parsed
# page and print one summary per card (process_job also fetches each card's
# detail page, so this issues one extra request per job).
job_card = soup.find_all('div', class_='job')
for job_element in job_card:
    process_job(job_element)

Title: Strats Python Developer
Location: London, UK
Company: HBK Europe Management LLP
Website: https://www.hbk.com/
Contact Name: Pav
Email: pandriychenko@hbk.com
Tags: python, finance, london, pandas, jupyter, sql
Details: Overview HBK is searching for a Python software developer to join our Strats team in London on a full-time basis. The Strats group works closely and primarily with investment professionals in all our offices to help...

Title: Python Software Developer
Location: Remote, UK-only
Company: Open Data Services Co-operative
Website: https://app.beapplied.com/apply/6cvrnot04p?utm_source=pythonjobs
Contact Name: Ben Webb
Email: ben.webb+please+apply+through+website@opendataservices.coop
Tags: opendata, opensource, cooperative, python
Details: We’re hiring a Python Software Developer to join our interdisciplinary team, working with data publishers and users. To find out more about this role and working at Open Data Services check out this twitter thread....

Title: Senior S