# Scraping seek.com.au for the latest data job information

Load the required modules and fetch the first page of search results.

In [79]:
import os
import requests
import bs4
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
from dateutil import parser

URL = "https://www.seek.com.au/data-jobs/in-All-Australia?classification=1209%2C1211%2C6281%2C1223&sortmode=listeddate"

#Page 2 used for testing purposes
#URL = "https://www.seek.com.au/data-jobs/in-All-Australia?classification=1209%2C1211%2C6281%2C1223&page=2&savedsearchid=d3e6e41c-2133-11e9-9978-6b65baccf6af&sortmode=listeddate"

#Make a request for our URL; the timeout stops the cell hanging forever on a stalled connection
page = requests.get(URL, timeout=30)

#Fail loudly on an HTTP error status instead of silently parsing an error page
page.raise_for_status()

#Read in the page with BeautifulSoup
soup = BeautifulSoup(page.text, "html.parser")

#Full html code-block can be viewed with print(soup.prettify())

I'll start by pulling out **job titles**. All information for each job post falls within a div (*class _3MPUOLE*), so this will be reused a lot. The actual job title is the <h1\> heading inside each post's <article\>.

In [80]:
def extract_job_title_from_result(soup):
    """Return the heading text of every job listing found in ``soup``.

    Walks each ``div`` with class ``_3MPUOLE``, then each ``article`` inside
    it, and collects the ``text`` of every ``h1`` in the order encountered.
    """
    listing_divs = soup.find_all(name="div", attrs={"class": "_3MPUOLE"})
    return [
        heading.text
        for listing in listing_divs
        for article in listing.find_all(name="article")
        for heading in article.find_all(name="h1")
    ]

Pulling out **employer names** is a tiny bit more challenging, as some jobs do not list an employer name. These jobs are listed as *Private Advertiser* within a different section (a <span\>, as opposed to an <a\>) to where employer names are usually listed. I ended up checking the span for each div first to see if it was listed as private advertiser, in which case I list it as such and move on to the next div. Otherwise, I grab the employer name for each posting from <a\>.

In [81]:
def extract_employer_from_result(soup):
    """Return the advertiser name for each job listing found in ``soup``.

    For every advertiser ``span`` inside a ``_3MPUOLE`` div:

    * if its text contains "Private Advertiser", record "Private advertiser";
    * otherwise record the employer name taken from each ``_3AMdmRg`` link
      whose ``title`` starts with "Jobs at" (the first 8 characters of the
      title are dropped).
    """
    advertiser_names = []
    for listing in soup.find_all(name="div", attrs={"class": "_3MPUOLE"}):
        advertiser_spans = listing.find_all(
            name="span", attrs={"class": "_3FrNV7v _3PZrylH E6m4BZb"}
        )
        for advertiser in advertiser_spans:
            if "Private Advertiser" in advertiser.text:
                advertiser_names.append("Private advertiser")
                continue
            # Named employers sit in a link titled "Jobs at <employer>".
            advertiser_names.extend(
                link["title"][8:]
                for link in advertiser.find_all(name="a", attrs={"class": "_3AMdmRg"})
                if link["title"].startswith("Jobs at")
            )
    return advertiser_names

**Locations** were easy to grab, and it seems mandatory for each posting to have a location listed.

In [82]:
def extract_locations_from_result(soup):
    """Return the location of each job listing found in ``soup``.

    Only ``span`` elements with class ``Eadjc1o`` whose text starts with
    "location" are used; the first 10 characters of the text are dropped
    from each entry.
    """
    labelled_spans = soup.find_all(name="span", attrs={"class": "Eadjc1o"})
    return [
        label.text[10:]
        for label in labelled_spans
        if label.text.startswith("location")
    ]

Grabbing the **salary** required adding a placeholder to cover instances where no salary was listed. This is quite common.

In [83]:
def extract_salaries_from_result(soup):
    """Return one salary string per job listing found in ``soup``.

    Each ``div`` with class ``xxz8a1h`` yields exactly one entry, so the
    result stays aligned with the other per-listing lists:

    * the text of its ``lwHBT6d`` span with every "$" removed, when that
      text contains a "$";
    * the placeholder string "NaN" when the span is missing or its text has
      no "$" (i.e. it is not real salary information).
    """
    salaries = []
    for div in soup.find_all(name="div", attrs={"class": "xxz8a1h"}):
        salary_block = div.find(name="span", attrs={"class": "lwHBT6d"})
        # find() gives None when the listing has no salary span; check for
        # that explicitly instead of swallowing every exception.
        text = salary_block.text if salary_block is not None else "NaN"
        # Strip the dollar signs to clean the data up for later use.
        salaries.append(text.replace("$", "") if "$" in text else "NaN")
    return salaries

I wanted to grab the **time since the ad was posted**. The top two ads on the page are **always** featured ads that don't list a time. It could be a task for the next section to pull this information from the job page directly with Selenium, but it's not super important, so labelling them as *featured* for now will do. It also seems that when posts are over 1 day old they appear in a new class, so we loop through that after grabbing our first set.

In [84]:
def extract_time_posted_from_result(soup):
    """Return the posting age of each job listing found in ``soup``.

    The list always starts with two 'Featured' entries, followed by the
    text of the spans matching the first class string and then those
    matching the second. Every entry is cut down to the part before its
    first space.
    """
    first_class = "_3FrNV7v _1DHNXoa _1SYpJTv _3PZrylH _2heRYaN E6m4BZb"
    second_class = "_3FrNV7v _1DHNXoa _3PZrylH _2heRYaN E6m4BZb"

    posted = ['Featured', 'Featured']
    for css_class in (first_class, second_class):
        matching_spans = soup.find_all(name="span", attrs={"class": css_class})
        posted.extend(span.text for span in matching_spans)
    return [label.split(" ")[0] for label in posted]

In [85]:
# Display the extracted posting times for the fetched page as a quick check
extract_time_posted_from_result(soup)

['Featured',
 'Featured',
 '1h',
 '1h',
 '1h',
 '5h',
 '6h',
 '13h',
 '13h',
 '13h',
 '17h',
 '17h',
 '20h',
 '22h',
 '23h',
 '23h',
 '1d',
 '1d',
 '1d',
 '1d',
 '1d',
 '1d']

Finally I'll grab the URLs (href) for each job posting.

In [86]:
def grab_urls_from_result(soup):
    """Return the absolute URL of each job listing found in ``soup``.

    Collects the ``href`` of every link inside an ``h1`` inside an
    ``article`` inside a ``_3MPUOLE`` div, and prefixes each with the
    site root.
    """
    site_root = "http://www.seek.com.au"
    return [
        site_root + link['href']
        for listing in soup.find_all(name="div", attrs={"class": "_3MPUOLE"})
        for article in listing.find_all(name="article")
        for heading in article.find_all(name='h1')
        for link in heading.find_all(name="a")
    ]
# Display the extracted job URLs for the fetched page as a quick check
grab_urls_from_result(soup)

['http://www.seek.com.au/job/38144608?type=promoted&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://www.seek.com.au/job/38177007?type=promoted&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://www.seek.com.au/job/38310299?type=standard&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://www.seek.com.au/job/38310293?type=standard&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://www.seek.com.au/job/38310283?type=standard&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://www.seek.com.au/job/38310064?type=standard&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://www.seek.com.au/job/38310051?type=standard&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://www.seek.com.au/job/38309931?type=standout&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://www.seek.com.au/job/38309917?type=standard&searchrequesttoken=04b61032-5f25-4b5a-af1f-7b607d106ef0',
 'http://w

Quick comparison of the output of each function used to troubleshoot cases where job postings were being missed.

In [87]:
# Count the entries produced by each extractor, in the order they are
# reported below; the counts should all agree if no posting was missed.
x, y, z, zx, zxc, zxcv = [
    len(extract(soup))
    for extract in (
        extract_employer_from_result,
        extract_job_title_from_result,
        extract_locations_from_result,
        extract_salaries_from_result,
        extract_time_posted_from_result,
        grab_urls_from_result,
    )
]

print(
    "employers = {}\njob titles = {}\nlocations = {}\nsalaries = {}\ntime = {}\nurls = {}"
    .format(x, y, z, zx, zxc, zxcv)
)

employers = 22
job titles = 22
locations = 22
salaries = 22
time = 22
urls = 22


Pool everything I've got so far into a dataframe. At this point I could loop through each search page to make a much more comprehensive dataframe, but for now I'll focus on just the first page of strictly the most recent results.

In [88]:
# Build the results table with one column per extractor, all run against the
# same parsed page.
job_df = pd.DataFrame(dict(
    time_posted=extract_time_posted_from_result(soup),
    job_title=extract_job_title_from_result(soup),
    employer=extract_employer_from_result(soup),
    location=extract_locations_from_result(soup),
    salary=extract_salaries_from_result(soup),
    url=grab_urls_from_result(soup),
))

In [89]:
# Preview the first five rows of the scraped table
job_df.head(5)

Unnamed: 0,time_posted,job_title,employer,location,salary,url
0,Featured,Data Engineer - Energy Efficiency,Eutility Pty Ltd,Sydney,,http://www.seek.com.au/job/38144608?type=promo...
1,Featured,Electrical engineer; Industrial IoT,Collective Intelligence Group,Sydney,,http://www.seek.com.au/job/38177007?type=promo...
2,1h,Process Control Network Engineer,Yokogawa,Perth,,http://www.seek.com.au/job/38310299?type=stand...
3,1h,Senior .NET Developer for large Queensland Gov...,AgileDev Solutions Pty Ltd,Brisbane,,http://www.seek.com.au/job/38310293?type=stand...
4,1h,"Clinical Psychologist | 3 month contract, poss...",Sugarman Australia,Brisbane,50 - 55 p.h. + Super + Additional Benefits,http://www.seek.com.au/job/38310283?type=stand...


## Part 2 - Grab job descriptions with Selenium

In [90]:
from selenium import webdriver
import time

It seems each paragraph <p\> is contained within a div that can belong to one of two classes. Here I've simply checked each class in turn and, if it grabbed any info, appended that to my list.

In [91]:
# NOTE(review): the driver is never quit in this cell; call driver.quit()
# once no later cell needs the browser.
driver = webdriver.Chrome('C:/Users/chris/chromedriver.exe')

# The description lives in a div of one of these two classes; try them in order.
description_xpaths = ('//div[@class="templatetext"]', '//div[@class="_2e4Pi2B"]')

descriptions = []
for url in grab_urls_from_result(soup):
    driver.get(url)
    for xpath in description_xpaths:
        info = driver.find_elements_by_xpath(xpath)
        if info:
            descriptions.append(info[0].text)
            break
    else:
        # Neither container was found. Append ONE placeholder so the list
        # stays aligned with the job rows ("+=" would add 'N', 'a', 'N'
        # as three separate entries).
        descriptions.append("NaN")
    # Pause between page loads to avoid hammering the site
    time.sleep(2)

Add the descriptions to a new column in my dataframe.

In [92]:
# Store the scraped descriptions as a new column of the results table
job_df['description'] = descriptions
#job_df.to_excel("jobs.xlsx")

### Description Search

The full job descriptions end up being a bit long-winded, but really I just want to see if a few key words are present in the description, then maybe I'll have a closer look at the job description directly.

In [93]:
def key_word_search(descriptions):
    """Report which keywords of interest appear in each job description.

    Args:
        descriptions: iterable of job description strings.

    Returns:
        A list with one string per description: the matched keywords joined
        by ", " in the order junior, graduate, python, R, scripting, or the
        placeholder 'NaN' when none matched.

    All keywords except "R" are matched case-insensitively as substrings.
    "R" (the language) is matched case-sensitively as a standalone word,
    because a lower-cased search for it could never work reliably; an "R"
    followed by "&" (as in "R&D") is ignored.
    """
    key_words = ('junior', 'graduate', 'python', 'R', 'scripting')
    r_language = re.compile(r'\bR\b(?!\s*&)')

    words_found = []
    for description in descriptions:
        lowered = description.lower()
        found = []
        for word in key_words:
            if word == 'R':
                # Must be tested against the original text: the lower-cased
                # copy no longer contains an upper-case "R" at all.
                hit = r_language.search(description) is not None
            else:
                hit = word in lowered
            if hit:
                found.append(word)
        words_found.append(', '.join(found) if found else 'NaN')
    return words_found

Similarly it would be good to know if the job requires a certain amount of experience at a glance, so I'll do a quick search of each description and if anything matches I'll save it.

In [94]:
def experience_search(descriptions):
    """Find the first "<n> years" style phrase in each job description.

    Args:
        descriptions: iterable of job description strings.

    Returns:
        A list with one string per description: the first matched phrase
        (e.g. "3+ years", "1 year", "5 Years"), or the placeholder 'NaN'
        when nothing matches or the entry is not a string.

    Matching ignores case and accepts any whitespace between the number and
    the word, since scraped text often contains capitalised headings and
    non-breaking spaces.
    """
    years_pattern = re.compile(r'\d+\+*\s+years*', re.IGNORECASE)

    experience = []
    for description in descriptions:
        # Non-string entries get the placeholder rather than raising.
        match = years_pattern.search(description) if isinstance(description, str) else None
        experience.append(match.group() if match is not None else 'NaN')
    return experience

Append the keywords and experience to new columns in a dataframe.

In [95]:
# Derive both summary columns from the scraped descriptions
job_df['experience'] = experience_search(descriptions)
job_df['key_words'] = key_word_search(descriptions)

Save the dataframe to an excel file and open it.

In [96]:
# Write the table to an Excel workbook without the index column
job_df.to_excel("jobs.xlsx", index=False)

# os.startfile only exists on Windows; elsewhere the file is saved but not opened
if hasattr(os, "startfile"):
    os.startfile("jobs.xlsx")

In [97]:
# Preview the first ten rows, now including the description-derived columns
job_df.head(10)

Unnamed: 0,time_posted,job_title,employer,location,salary,url,description,experience,key_words
0,Featured,Data Engineer - Energy Efficiency,Eutility Pty Ltd,Sydney,,http://www.seek.com.au/job/38144608?type=promo...,Eutility have a new role within an expanding e...,3 years,python
1,Featured,Electrical engineer; Industrial IoT,Collective Intelligence Group,Sydney,,http://www.seek.com.au/job/38177007?type=promo...,The Collective Intelligence Group is a global ...,3+ years,"python, scripting"
2,1h,Process Control Network Engineer,Yokogawa,Perth,,http://www.seek.com.au/job/38310299?type=stand...,The Company\nYokogawa Australia and New Zealan...,10 years,
3,1h,Senior .NET Developer for large Queensland Gov...,AgileDev Solutions Pty Ltd,Brisbane,,http://www.seek.com.au/job/38310293?type=stand...,AgileDev solutions (www.agiledevsolutions.com....,10 years,
4,1h,"Clinical Psychologist | 3 month contract, poss...",Sugarman Australia,Brisbane,50 - 55 p.h. + Super + Additional Benefits,http://www.seek.com.au/job/38310283?type=stand...,The purpose of this role is to provide clinica...,,
5,5h,Registered Psychologist | South Brisbane | Chi...,Sugarman Australia,Brisbane,80k - 85k p.a. + Super + Salary sacrifice and mo,http://www.seek.com.au/job/38310064?type=stand...,As a successful Registered Psychologist you wi...,,
6,6h,TECHNICAL BUSINESS ANALYST!,Saul Recruitment,Sydney,,http://www.seek.com.au/job/38310051?type=stand...,"Master Data Management!\nEstablished, reputabl...",3 years,
7,13h,Senior Developer,Tabcorp,Sydney,,http://www.seek.com.au/job/38309931?type=stand...,Our Company\nTabcorp is a world-class diversif...,5+ years,
8,13h,Information Management Technical Specialist,AMP,Sydney,,http://www.seek.com.au/job/38309917?type=stand...,Technology and Operation’s vision is to delive...,10+ years,
9,13h,IT Security Advisor,"Department of Environment, Land, Water and Pla...",Melbourne,,http://www.seek.com.au/job/38309865?type=stand...,We are building an inclusive workplace to help...,,
