<a href="https://colab.research.google.com/github/AuraFrizzati/wecare-jobs-webscraper/blob/main/wecare_jobs_webscraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Webscraper for job adverts from "wecare.wales"**
https://wecare.wales/jobs/results

In [None]:
## Load relevant libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import math
import re
import html
import datetime

# main webscraping function
def _text_or_na(element):
    """Return the stripped text of a parsed element, or 'N/A' if it is None."""
    return element.text.strip() if element is not None else 'N/A'


def scrape_wecare_jobs(url, page_num, timeout=30):
    """Scrape one results page and return its job adverts as a DataFrame.

    Every ``<article>`` element found on the page is treated as one job
    advert. Any field that cannot be located inside an advert is recorded as
    ``'N/A'`` (the placeholder already used for a missing closing date), so a
    single malformed advert no longer aborts the whole scrape.

    Args:
        url: Address of the results page to download.
        page_num: Page number; stored unchanged in the ``page_scraped`` column.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        A pandas DataFrame with one row per advert and the columns ``title``,
        ``closing_date``, ``company``, ``location``, ``job_type``, ``salary``,
        ``description``, ``Job_tag``, ``link_id``, ``page_scraped`` and
        ``date_of_scraping``. The DataFrame is empty (no columns) when the
        page contains no ``<article>`` element.

    Raises:
        requests.HTTPError: The server answered with a 4xx or 5xx status.
        requests.Timeout: The server did not answer within ``timeout`` seconds.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Loop invariants: one scrape date for every row of the page, and the
    # closing-date pattern compiled a single time.
    scraped_on = datetime.datetime.now().strftime("%Y-%m-%d")
    closing_date_label = 'Closing date:'
    closing_date_pattern = re.compile(closing_date_label)

    jobs_data = []  # one dictionary per advert
    for job_item in soup.find_all('article'):
        # title
        title = _text_or_na(job_item.find('h2', class_='font-h4'))

        # closing date: keep whatever follows the "Closing date:" label
        closing_date = 'N/A'
        closing_date_element = job_item.find('p', string=closing_date_pattern)
        if closing_date_element is not None:
            after_label = closing_date_element.text.partition(closing_date_label)[2]
            closing_date = after_label.strip() or 'N/A'

        # company, location, job type and salary are the first four <li>
        # entries of the details list; pad with 'N/A' when some are missing
        details_list = job_item.find('ul', class_='list-none')
        detail_items = details_list.find_all('li') if details_list is not None else []
        details = [item.text.strip() for item in detail_items[:4]]
        details.extend(['N/A'] * (4 - len(details)))
        company, location, job_type, salary = details
        # Strip stray 'Â' characters from the salary text.
        # NOTE(review): presumably mojibake from a mis-decoded '£' or
        # non-breaking space — confirm against the live page encoding.
        salary = salary.replace('Â', '')

        # job description
        description = _text_or_na(job_item.find('div', class_='small-text'))

        # job tag(s), joined into a single comma-separated string
        job_tags = job_item.select('span.tag.meta.flex-shrink-0[class*="bg-"]')
        job_tag_text = ', '.join(tag.text.strip() for tag in job_tags)

        # link to the advert's detail page
        link = 'N/A'
        link_container = job_item.find('div', class_='bg-cyan-20')
        if link_container is not None:
            link_element = link_container.find('a', class_='button')
            if link_element is not None:
                link = link_element.get('href', 'N/A')

        jobs_data.append({
            'title': title,
            'closing_date': closing_date,
            'company': company,
            'location': location,
            'job_type': job_type,
            'salary': salary,
            'description': description,
            'Job_tag': job_tag_text,
            'link_id': link,
            'page_scraped': page_num,
            'date_of_scraping': scraped_on,
        })

    return pd.DataFrame(jobs_data)

## Scraper configuration and the accumulator for the scraped adverts
all_jobs_df: pd.DataFrame = pd.DataFrame()  # starts empty; holds every scraped advert
max_jobs_per_page: int = 12  # page size used to derive the number of result pages
base_url: str = "https://wecare.wales/jobs/results/"  # first results page

# Announce which site is about to be scraped
print(f"webscraping of {base_url}")

# Get the total number of posted jobs in the day
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# The first <h1> is expected to start with the total job count. Match a
# leading number (tolerating surrounding whitespace and thousands separators)
# instead of blindly converting the first space-separated token.
heading = soup.find('h1')
count_match = re.match(r'\s*(\d[\d,]*)', heading.text) if heading is not None else None
if count_match is None:
    raise ValueError(f"Could not read the total job count from {base_url}")
total_jobs = int(count_match.group(1).replace(',', ''))
print(f"There are a total of {total_jobs} job adverts posted")

# calculate the number of pages to retrieve (the last page may be partly filled)
num_pages = math.ceil(total_jobs / max_jobs_per_page)
print(f"There are {num_pages} pages to retrieve")

# Loop through each page
print(".....................")
print("Webscraping started")

# Collect one DataFrame per page and concatenate once at the end: calling
# pd.concat inside the loop re-copies every previously scraped row each time.
page_frames = []
for page_num in range(1, num_pages + 1):
    # The first page is served at the base URL; later pages append "p<N>".
    page_url = f"{base_url}p{page_num}" if page_num > 1 else base_url
    print(f"Scraping data from: {page_url}")
    page_job_df = scrape_wecare_jobs(page_url, page_num)
    page_frames.append(page_job_df)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when there were no pages to scrape.
if page_frames:
    all_jobs_df = pd.concat(page_frames, ignore_index=True)
else:
    all_jobs_df = pd.DataFrame()

print(".....................")
print("Webscraping completed")

#all_jobs_df.head()

## Write the collected adverts to a dated CSV file

print(".....................")
print("Creation of CSV file")

if all_jobs_df.empty:
    # Nothing was collected, so no file is written.
    print("No job adverts found on any page, or an error occurred.")
else:
    print("All job adverts found and saved to CSV:")
    # Today's date in YYYY-MM-DD form becomes part of the file name.
    today_date = datetime.date.today().isoformat()
    csv_filename = f"wecare_jobs_webscraped_{today_date}.csv"
    all_jobs_df.to_csv(csv_filename, encoding='utf-8', index=False)
    print(f"Data saved to {csv_filename}")


print(".....................")
print("Process ended")

webscraping of https://wecare.wales/jobs/results/
There are a total of 269 job adverts posted
There are 23 pages to retrieve
.....................
Webscraping started
Scraping data from: https://wecare.wales/jobs/results/
Scraping data from: https://wecare.wales/jobs/results/p2
Scraping data from: https://wecare.wales/jobs/results/p3
Scraping data from: https://wecare.wales/jobs/results/p4
Scraping data from: https://wecare.wales/jobs/results/p5
Scraping data from: https://wecare.wales/jobs/results/p6
Scraping data from: https://wecare.wales/jobs/results/p7
Scraping data from: https://wecare.wales/jobs/results/p8
Scraping data from: https://wecare.wales/jobs/results/p9
Scraping data from: https://wecare.wales/jobs/results/p10
Scraping data from: https://wecare.wales/jobs/results/p11
Scraping data from: https://wecare.wales/jobs/results/p12
Scraping data from: https://wecare.wales/jobs/results/p13
Scraping data from: https://wecare.wales/jobs/results/p14
Scraping data from: https://weca