# Scraping seek.com.au for the latest data job information

Load the required modules and request the first page of search results.

In [326]:
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from dateutil import parser

# Search URL for "data" jobs across all of Australia. The query string
# restricts results to four classification ids and sorts by listed date.
URL = "https://www.seek.com.au/data-jobs/in-All-Australia?classification=1209%2C1211%2C6281%2C1223&sortmode=listeddate"

#Page 2 used for testing purposes
#URL = "https://www.seek.com.au/data-jobs/in-All-Australia?classification=1209%2C1211%2C6281%2C1223&page=2&savedsearchid=d3e6e41c-2133-11e9-9978-6b65baccf6af&sortmode=listeddate"

#Make a request for our URL
#A timeout is set because requests otherwise waits indefinitely on a stalled connection
page = requests.get(URL, timeout=30)

#Stop here on a 4xx/5xx response instead of silently parsing an error page
page.raise_for_status()

#Read in the page with BeautifulSoup
soup = BeautifulSoup(page.text, "html.parser")

#Full html code-block can be viewed with print(soup.prettify())

I'll start by pulling out **job titles**. All information for each job post falls within a div (*class _3MPUOLE*), so this selector will be reused a lot. The actual job title is listed as the <h1\> heading inside each post's <article\>.

In [327]:
def extract_job_title_from_result(soup):
    """Collect the job-title headings from a parsed results page.

    Every ``div`` with class ``_3MPUOLE`` is searched for ``article``
    elements, and the text of each ``h1`` inside those articles is gathered
    in document order.

    Args:
        soup: Parsed page (or any node) exposing ``find_all``.

    Returns:
        A list of job-title strings.
    """
    card_divs = soup.find_all(name="div", attrs={"class": "_3MPUOLE"})
    return [
        heading.text
        for card in card_divs
        for article in card.find_all(name="article")
        for heading in article.find_all(name="h1")
    ]

Pulling out **employer names** is a tiny bit more challenging, as some jobs do not list an employer name. These jobs are listed as *Private Advertiser* within a different section (a <span\>, as opposed to an <a\>) to where employer names are usually listed. I ended up checking the span for each div first to see if it was listed as private advertiser, in which case I list it as such and move on to the next div. Otherwise, I grab the employer name for each posting from <a\>.

In [328]:
def extract_employer_from_result(soup):
    """Collect the employer name for each job post on a parsed results page.

    Each ``div`` with class ``_3MPUOLE`` is searched for the advertiser
    ``span``. If the span's text contains "Private Advertiser" the entry is
    recorded as "Private advertiser"; otherwise the employer name is taken
    from the ``title`` attribute of each ``a`` (class ``_3AMdmRg``) inside the
    span whose title starts with "Jobs at".

    Args:
        soup: Parsed page (or any node) exposing ``find_all``.

    Returns:
        A list of employer-name strings. Anchors without a matching title
        contribute nothing.
    """
    title_prefix = "Jobs at"
    employers = []
    for div in soup.find_all(name="div", attrs={"class": "_3MPUOLE"}):
        for span in div.find_all(name="span", attrs={"class": "_3FrNV7v _3PZrylH E6m4BZb"}):
            if "Private Advertiser" in span.text:
                employers.append("Private advertiser")
                continue
            for a in span.find_all(name="a", attrs={"class": "_3AMdmRg"}):
                # .get() rather than [] so an anchor with no title attribute
                # is skipped instead of raising KeyError.
                title = a.get("title") or ""
                if title.startswith(title_prefix):
                    # Drop the prefix plus the single space that follows it.
                    employers.append(title[len(title_prefix) + 1:])
    return employers

**Locations** were easy to grab, and it seems mandatory for each posting to have a location listed.

In [329]:
def extract_locations_from_result(soup):
    """Collect the location of each job post on a parsed results page.

    Looks at every ``span`` with class ``Eadjc1o`` and keeps those whose text
    starts with "location", dropping the first ten characters of the text.

    Args:
        soup: Parsed page (or any node) exposing ``find_all``.

    Returns:
        A list of location strings in document order.
    """
    labelled_spans = soup.find_all(name="span", attrs={"class": "Eadjc1o"})
    return [
        node.text[10:]
        for node in labelled_spans
        if node.text.startswith("location")
    ]

Grabbing the **salary** required adding a placeholder to cover instances where no salary was listed. This is quite common.

In [330]:
def extract_salaries_from_result(soup):
    """Collect the advertised salary for each job post on a parsed results page.

    One entry is produced per ``div`` with class ``xxz8a1h``. The text of the
    first ``span`` with class ``lwHBT6d`` inside it is used as the salary.
    When that span is missing, or its text contains no "$" (i.e. it is not
    actually a salary), the entry is "Not listed". Dollar signs are stripped
    from real salaries to simplify later processing.

    Args:
        soup: Parsed page (or any node) exposing ``find_all``.

    Returns:
        A list of salary strings, one per matching ``div``.
    """
    salaries = []
    for div in soup.find_all(name="div", attrs={"class": "xxz8a1h"}):
        salary_block = div.find(name="span", attrs={"class": "lwHBT6d"})
        # find() yields None when there is no salary span; check for that
        # explicitly rather than hiding every possible error behind a bare
        # except.
        salary_text = salary_block.text if salary_block is not None else ""
        if "$" in salary_text:
            salaries.append(salary_text.replace("$", ""))
        else:
            salaries.append("Not listed")
    return salaries

Finally, I wanted to grab the **time since the ad was posted**. The top two ads on the page are **always** featured ads that don't list a time. It could be a task for the next section to pull this information from the job page directly with Selenium, but it's not super important, so labelling them as *featured* for now will do.

In [331]:
def extract_time_posted_from_result(soup, featured_count=2):
    """Collect the "time since posted" label for each job post on a results page.

    Featured ads at the top of the page do not list a posting time, so the
    result starts with ``featured_count`` "Featured" placeholders. After
    those, the text of each time ``span`` inside every ``div`` with class
    ``_3MPUOLE`` is added, keeping only the part before the first space
    (e.g. "45m ago" becomes "45m").

    Args:
        soup: Parsed page (or any node) exposing ``find_all``.
        featured_count: Number of leading "Featured" placeholders to emit.
            Defaults to 2, the number of featured ads the page is expected
            to show.

    Returns:
        A list of strings: the placeholders followed by the posting times.
    """
    # Named ``posted`` rather than ``time`` to avoid shadowing the time module.
    posted = ["Featured"] * featured_count
    for div in soup.find_all(name="div", attrs={"class": "_3MPUOLE"}):
        for span in div.find_all(name="span", attrs={"class": "_3FrNV7v _1DHNXoa _1SYpJTv _3PZrylH _2heRYaN E6m4BZb"}):
            posted.append(span.text.split(" ")[0])
    return posted

A quick comparison of the number of results returned by each function, used to troubleshoot cases where job postings were being missed.

In [332]:
# Count the entries each extractor returns for the same page. The five totals
# should be equal; a mismatch means one extractor missed or duplicated a post.
x, y, z, zx, zxc = (
    len(extractor(soup))
    for extractor in (
        extract_employer_from_result,
        extract_job_title_from_result,
        extract_locations_from_result,
        extract_salaries_from_result,
        extract_time_posted_from_result,
    )
)

print("employers = {}\njob titles = {}\nlocations = {}\nsalaries = {}\ntime = {}".format(x, y, z, zx, zxc))

employers = 22
job titles = 22
locations = 22
salaries = 22
time = 22


Pool everything I've got so far into a dataframe. At this point I could loop through each search page to make a much more comprehensive dataframe, but for now I'll focus on just the first page of strictly the most recent results.

In [333]:
# Build one column per extractor. The extractors run in the order listed, and
# all of them must return the same number of entries for the frame to build.
job_df = pd.DataFrame({
    column: extractor(soup)
    for column, extractor in (
        ("time_posted", extract_time_posted_from_result),
        ("job_title", extract_job_title_from_result),
        ("employer", extract_employer_from_result),
        ("location", extract_locations_from_result),
        ("salary", extract_salaries_from_result),
    )
})

In [334]:
# Preview the first five rows of the scraped results.
job_df.head(5)

Unnamed: 0,time_posted,job_title,employer,location,salary
0,Featured,Data Engineer - Energy Efficiency,Eutility Pty Ltd,Sydney,Not listed
1,Featured,Civil Engineer,East Arm Civil Pty Ltd,Darwin,"100,000 - 149,999"
2,45m,Senior Systems Engineer,Robert Half,Perth,90k - 115k p.a. + super
3,1h,Application Security Specialist,Balance Recruitment,Sydney,Not listed
4,1h,Network Operator,Foxtel,Sydney,Not listed


## Part 2 - Grab job descriptions with Selenium