### Web scraping seek for data job information

Grab information about the newest data-related jobs listed on Seek (seek.com.au).

In [44]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

# Seek search results for "data" jobs across all of Australia. The query string
# filters on four classification ids (comma-separated, URL-encoded as %2C) and
# sets sortmode=listeddate.
# NOTE(review): what each classification id stands for is not recorded here - confirm.
URL = "https://www.seek.com.au/data-jobs/in-All-Australia?classification=1209%2C1211%2C6281%2C1223&sortmode=listeddate"

In [45]:
# Fetch the search results page. The timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() surfaces HTTP errors
# here instead of letting the extractors below silently parse an error page.
page = requests.get(URL, timeout=30)
page.raise_for_status()
# Parse the returned HTML with Python's built-in "html.parser" backend
soup = BeautifulSoup(page.text, "html.parser")

# Uncomment to view the block of HTML
#print(soup.prettify())

**Pull out job titles**

In [48]:
def extract_job_title_from_result(soup):
    """Collect the job titles from a parsed results page.

    Looks inside every <div class="_3MPUOLE">, then every <article> within
    it, and returns the text of each <h1> found there, in document order.

    Args:
        soup: parsed page (or any object exposing a compatible ``find_all``).

    Returns:
        list of title strings; empty when nothing matches.
    """
    result_containers = soup.find_all(name="div", attrs={"class": "_3MPUOLE"})
    return [
        heading.text
        for container in result_containers
        for article in container.find_all(name="article")
        for heading in article.find_all(name="h1")
    ]
# Run the title extractor on the parsed page; the returned list is the cell output.
extract_job_title_from_result(soup)

['Data Engineer - Energy Efficiency',
 'Electrical engineer; Industrial IoT',
 'Project Officer (ICT Requirement Review)',
 'Health Information Manager/Clinical Coder',
 'Data Warehouse Developer',
 'Senior, Quality and Improvement Coordinator',
 'Head of IT Operations',
 'Data Reporting Analyst',
 'Aged Care Quality Coordinator',
 'Power BI Solution Analyst',
 'Creative Experience Team Lead',
 'Security & Communications Installer',
 'Senior SQL Developer',
 'ICT System Administrator',
 'Technical Application Developer',
 'Data Scientist, Analytics, Machine Learning - Data Scientist, Data Statistician',
 'Service Desk and Senior Service Desk Technician',
 'C# Developer - Front Office Trading',
 'Senior Software Developer',
 'Senior ETL Developer',
 'Business Systems Consultant',
 'Full Stack Developer']

**Pull out employer names**  
This is a tiny bit more challenging: the links (`<a>` tags) with class *_3AMdmRg* inside each article reliably give the employer names, but they also include some entries we're not interested in. The employer links have a title that starts with a "Jobs at ___" prefix, so we select on that prefix and slice it off.

In [36]:
def extract_employer_from_result(soup):
    """Collect employer names from a parsed results page.

    Searches every <article> inside a <div class="_3MPUOLE"> for
    <a class="_3AMdmRg"> links. Links whose ``title`` attribute starts with
    "Jobs at" are employer links; the employer name is the rest of the title.

    Args:
        soup: parsed page (or any object exposing a compatible ``find_all``).

    Returns:
        list of employer name strings, in document order.
    """
    # The name starts after "Jobs at" plus the separating space (8 characters).
    name_offset = len("Jobs at ")
    employers = []
    for div in soup.find_all(name="div", attrs={"class": "_3MPUOLE"}):
        for art in div.find_all(name="article"):
            for link in art.find_all(name="a", attrs={"class": "_3AMdmRg"}):
                # .get() instead of ["title"]: a matching link without a title
                # attribute is skipped rather than aborting the scrape with KeyError.
                title = link.get("title") or ""
                if title.startswith("Jobs at"):
                    employers.append(title[name_offset:])
    return employers
# Run the employer extractor on the parsed page; the returned list is the cell output.
extract_employer_from_result(soup)

['Eutility Pty Ltd',
 'Collective Intelligence Group',
 'Arafmi Queensland Inc.',
 'St John of God Health Care',
 'Tabcorp',
 'Melbourne Health',
 'PM-Partners Group',
 'Hays Recruitment',
 'Healthcare Australia',
 'Perigon Group',
 'Map Talent Group',
 'Woodside Energy',
 'Helpdesk Computer Systems',
 'Aurec Human Capital',
 'Biviano Direct',
 'Woodside Energy',
 'Infinity Pro',
 'Kinetic IT',
 'GQR Global Markets',
 'Optimum Consulting',
 'Robert Walters',
 'Aurec Human Capital']

**Grab each location**

In [37]:
def extract_locations_from_result(soup):
    """Collect job locations from a parsed results page.

    Scans every <span class="Eadjc1o"> on the page and keeps those whose text
    starts with "location"; the first 10 characters of the text are dropped
    and the remainder is returned as the location.

    Args:
        soup: parsed page (or any object exposing a compatible ``find_all``).

    Returns:
        list of location strings, in document order.
    """
    candidate_spans = soup.find_all(name="span", attrs={"class": "Eadjc1o"})
    return [
        tag.text[10:]
        for tag in candidate_spans
        if tag.text.startswith("location")
    ]

In [38]:
# Run the location extractor on the parsed page; the function returns a list of location strings.
extract_locations_from_result(soup)

22

**Grab salary (if present)**  
This is a little more difficult, as a lot of postings don't list a salary so we'll need a placeholder for those that don't.

In [125]:
def extract_salaries_from_result(soup):
    """Collect one salary entry per job card from a parsed results page.

    For every <div class="xxz8a1h"> the first <span class="lwHBT6d"> inside it
    is read. If the span is missing, or its text contains no "$" (i.e. it is
    not a dollar amount), the placeholder "Not listed" is used so the result
    stays aligned one-to-one with the matched divs. Otherwise the span text is
    returned with the dollar signs removed.

    Args:
        soup: parsed page (or any object exposing compatible ``find_all`` /
            ``find`` methods).

    Returns:
        list of strings, one per matched div.
    """
    salaries = []
    for div in soup.find_all(name="div", attrs={"class": "xxz8a1h"}):
        salary_block = div.find(name="span", attrs={"class": "lwHBT6d"})
        # find() returns None when the card has no salary span; test for that
        # explicitly rather than hiding every possible error behind a bare except.
        text = salary_block.text if salary_block is not None else ""
        if "$" in text:
            # Strip the dollar signs only; other punctuation is kept as-is.
            salaries.append(text.replace("$", ""))
        else:
            salaries.append("Not listed")
    return salaries

In [127]:
# Build the jobs table with one column per extractor, all run against the same
# parsed page. pandas needs the four lists to be the same length.
job_df = pd.DataFrame({
    column: extract(soup)
    for column, extract in (
        ("job_title", extract_job_title_from_result),
        ("employer", extract_employer_from_result),
        ("location", extract_locations_from_result),
        ("salary", extract_salaries_from_result),
    )
})

In [128]:
# Display the assembled jobs table as the cell output.
job_df

Unnamed: 0,job_title,employer,location,salary
0,Data Engineer - Energy Efficiency,Eutility Pty Ltd,Sydney,Not listed
1,Electrical engineer; Industrial IoT,Collective Intelligence Group,Sydney,Not listed
2,Project Officer (ICT Requirement Review),Arafmi Queensland Inc.,Brisbane,Not listed
3,Health Information Manager/Clinical Coder,St John of God Health Care,"Bendigo, Goldfields & Macedon Ranges",Not listed
4,Data Warehouse Developer,Tabcorp,Brisbane,Not listed
5,"Senior, Quality and Improvement Coordinator",Melbourne Health,Melbourne,Not listed
6,Head of IT Operations,PM-Partners Group,Sydney,Not listed
7,Data Reporting Analyst,Hays Recruitment,Darwin,Not listed
8,Aged Care Quality Coordinator,Healthcare Australia,Adelaide,Not listed
9,Power BI Solution Analyst,Perigon Group,Sydney,Not listed
