In [1]:
import time
import random
time.sleep(random.random()*3)

In [2]:
import re
import pandas as pd
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import requests
from bs4 import BeautifulSoup
import json

In [22]:
salary_ranges = ['0-70000','70000-120000','120000-999999']
# declare an empty list for storing urls
job_urls = {}

for salary_range in salary_ranges:
    job_urls[salary_range]=[]
    for page in range(1,20):
        # iteraterate through the 10 pages of jobs
        r = requests.get("https://www.seek.com.au/Data-Engineer-jobs?page={0}&salaryrange={1}&salarytype=annual".format(str(page),salary_range))

        # turn into a BeautifulSoup object
        soup = BeautifulSoup(r.text, 'lxml')

        # find urls on page
        urls = soup.find_all(attrs={'data-automation':"jobTitle"})
        
        if urls != []: 
            job_urls[salary_range].extend(urls)

If there is one job which belongs to two or more salary ranges, the concatenated list will be a mess. Thus, I create different dataframes for different salary ranges and then combine them as one.

In [23]:
df_1 = pd.DataFrame()
df_1['Link'] = job_urls['0-70000']
df_1['Salary Range'] = '0-70000'

In [24]:
df_2 = pd.DataFrame()
df_2['Link'] = job_urls['70000-120000']
df_2['Salary Range'] = '70000-120000'

In [25]:
df_3 = pd.DataFrame()
df_3['Link'] = job_urls['120000-999999']
df_3['Salary Range'] = '120000-999999'

In [26]:
url_pd = df_1.append(df_2).append(df_3)

In [27]:
url_pd['Link_1'] =['https://www.seek.com.au' + re.search('(\/job\/[0-9]{8})',i.attrs['href']).group(0) for i in url_pd['Link']]

In [28]:
url_pd['Title'] = [i.text for i in url_pd['Link']]

In [29]:
url_pd = url_pd.drop('Link',axis='columns')

In [30]:
columns = ['Salary Range', 'Link', "Title"]
url_pd.columns=columns

We can obtain a DataFrame which contains the urls for BI jobs for different salary ranges.

In [31]:
url_pd.head(10)

Unnamed: 0,Salary Range,Link,Title
0,0-70000,https://www.seek.com.au/job/39690163,Junior Data Engineer
1,0-70000,https://www.seek.com.au/job/39749280,Data Engineer - 6 Month Contract
2,0-70000,https://www.seek.com.au/job/39663464,Junior Data Engineer
3,0-70000,https://www.seek.com.au/job/39661130,Agile Data Engineer (x2)
4,0-70000,https://www.seek.com.au/job/39688588,Big Data Engineer
5,0-70000,https://www.seek.com.au/job/39663050,Data Engineer
6,0-70000,https://www.seek.com.au/job/39638588,Big Data Engineer
7,0-70000,https://www.seek.com.au/job/39551948,Data Engineer
8,0-70000,https://www.seek.com.au/job/39581505,Big Data Engineer
9,0-70000,https://www.seek.com.au/job/39754088,Data Centre Engineer


In [32]:
#for stemming
import re

def replace(string, substitutions):

    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)

Start scrapying details for each url.

In [34]:
job_listing_date=[]
job_expiry_date=[]
job_title = []
job_teaser=[]
job_advertiser=[]
job_area=[]
job_worktype=[]
job_classification=[]
job_salary=[]
job_salary_type=[]
job_description_details=[]
job_description_strong = []
job_location =[]

for index, job in enumerate(url_pd['Link']):
    # iterate through all the urls
    r = requests.get(job)
    
    # turn into a BeautifulSoup object
    soup = BeautifulSoup(r.text, 'lxml')
    
    # find job descriptions
    jd_data = soup.find_all(attrs={'data-automation':'server-state'})[0]
    jd_raw= jd_data.text[3:].split('\n  ')[1][25:][:-1]
    jd_dict = json.loads(jd_raw)
    jd_dashboard = jd_dict['jobdetails']['result']
    
    if jd_dashboard['listingDate']:
        job_listing_date.append(jd_dashboard['listingDate'])
    else:
        job_listing_date.append('N/A')
        
    if jd_dashboard['expiryDate']:
        job_expiry_date.append(jd_dashboard['expiryDate'])
    else:
        job_expiry_date.append('N/A')
        
    if jd_dashboard['title']:
        job_title.append(jd_dashboard['title'])
    else:
        job_title.append('N/A')
        
    if jd_dashboard['teaser']:
        job_teaser.append(jd_dashboard['teaser'])
    else:
        job_teaser.append('N/A')
        
    if jd_dashboard['advertiser']['description']:
        job_advertiser.append(jd_dashboard['advertiser']['description'])
    else:
        job_advertiser.append('N/A')
        
    if jd_dashboard['locationHierarchy']['area']:
        job_area.append(jd_dashboard['locationHierarchy']['area'])
    else:
        job_area.append('N/A')
        
    if jd_dashboard['workType']:
        job_worktype.append(jd_dashboard['workType'])
    else:
        job_worktype.append('N/A')
        
    if jd_dashboard['classification']['description']:
        job_classification.append(jd_dashboard['classification']['description'])
    else:
        job_classification.append('N/A')
        
    if jd_dashboard['salary']:
        job_salary.append(jd_dashboard['salary'])
    else:
        job_salary.append('N/A')
        
    if jd_dashboard['salaryType']:
        job_salary_type.append(jd_dashboard['salaryType'])
    else:
        job_salary_type.append('N/A')
        
    if jd_dashboard['locationHierarchy']['city']:
        job_location.append(jd_dashboard['locationHierarchy']['city'])
    else:
        job_location.append('N/A')
    
    jd_detail = soup.find(attrs={'data-automation':'mobileTemplate'})
    job_description_details.append(jd_detail)
    

In [35]:
url_pd['Listing Date'] = job_listing_date
url_pd['Expiry Date'] =job_expiry_date
url_pd['Job Title']=job_title
url_pd['Job Teaser']=job_teaser
url_pd['Advertiser'] = job_advertiser
url_pd['Area'] = job_area
url_pd['Work Type'] = job_worktype
url_pd['Classification'] = job_classification
url_pd['Salary'] = job_salary
url_pd['Salary Type'] = job_salary_type
url_pd['JD'] = job_description_details
url_pd['Location'] =job_location

Check the type for 'JD' column and start cleaning.

In [36]:
type(job_description_details)

list

In [37]:
job_description_clean=[]
job_description_strong = []
for i in range(0,len(url_pd['JD'])):
    try:
        strong_word = job_description_details[i].find_all('strong')
        strong_word_list = [u.get_text(strip=True) for u in strong_word if strong_word != False]
        job_description_strong.append(strong_word_list)

        string = job_description_details[i].get_text()
        repla = {u'\xa0':'  ',u'\xe2\x80\x9d':'  ', u'\n':'  '}
        job_description_clean.append(replace(string,repla))
    except:
        job_description_strong.append('N/A')
        job_description_clean.append('N/A')

In [38]:
url_pd['Job Description'] = job_description_clean
url_pd['Strong Words'] = job_description_strong 

In [39]:
data_engineer_jobs = url_pd.copy()

In [40]:
data_engineer_jobs.to_csv('data_engineer_scraping.csv')

In [41]:
data_engineer_jobs.tail(10)

Unnamed: 0,Salary Range,Link,Title,Listing Date,Expiry Date,Job Title,Job Teaser,Advertiser,Area,Work Type,Classification,Salary,Salary Type,JD,Location,Job Description,Strong Words
370,120000-999999,https://www.seek.com.au/job/39745017,Quality Automation Tester (API Testing - Rest/...,2019-08-20T04:00:23.000Z,2019-09-19T13:59:59.000Z,Quality Automation Tester (API Testing - Rest/...,Permanent Role - Quality Eye for The Team Wor...,Revolution IT,,Full Time,Information & Communication Technology,,AnnualPackage,"[[ , [ , <div><strong>About the Company:</stro...",Sydney,About the Company:Revolution IT is a leading...,"[About the Company:, About the Role:, , Requir..."
371,120000-999999,https://www.seek.com.au/job/39676957,Product Owner,2019-08-11T14:40:10.000Z,2019-09-11T13:59:59.000Z,Product Owner,A data driven Product Owner to work on a large...,Woolworth Liquor Group,"CBD, Inner West & Eastern Suburbs",Full Time,Information & Communication Technology,,AnnualPackage,"[[ ], [ ], , [[About Us]], , [The Endeavour ...",Sydney,About Us The Endeavour Drinks Group ambiti...,"[About Us, Product Owner - Endeavour X, Furthe..."
372,120000-999999,https://www.seek.com.au/job/39575722,Python Developers,2019-07-29T04:05:46.000Z,2019-08-28T04:05:46.000Z,Python Developers,We have multiple positions for Python Develope...,Zone IT Solutions,"CBD, Inner West & Eastern Suburbs",Full Time,Information & Communication Technology,"$100,000 - $149,999",AnnualPackage,[[We are hiring multiple Python Developers for...,Sydney,We are hiring multiple Python Developers for S...,"[Required Skills and experience:, About Zone I..."
373,120000-999999,https://www.seek.com.au/job/39598892,SCADA Engineer,2019-07-31T07:31:31.000Z,2019-08-30T13:59:59.000Z,SCADA Engineer,SCADA analyst/engineer bridge real-time tech p...,Paul Ingle & Associates,CBD & Inner Suburbs,Contract/Temp,Information & Communication Technology,,HourlyRate,[[SCADA Analyst/Engineer to establish and main...,Melbourne,SCADA Analyst/Engineer to establish and mainta...,[Expertise and Skills Requirements]
374,120000-999999,https://www.seek.com.au/job/39656857,MINE PRODUCTION ENGINEER,2019-08-08T00:51:33.000Z,2019-09-07T00:51:33.000Z,MINE PRODUCTION ENGINEER,We are looking for a pro-active Production Eng...,Sandfire Resources NL,,Full Time,"Mining, Resources & Energy",,AnnualPackage,"[[[<strong>Who we are</strong>]], , [[Sandfir...","Geraldton, Gascoyne & Midwest",Who we are Sandfire is a leading Australian co...,"[Who we are, Our People, The Role:, Production..."
375,120000-999999,https://www.seek.com.au/job/39587578,IBM MDM Engineer - Multiple Multi Year Contrac...,2019-07-30T06:01:21.000Z,2019-08-29T13:59:59.000Z,IBM MDM Engineer - Multiple Multi Year Contrac...,IBM MDM Advanced Edition - Largest Project in ...,TRS - Accounting & Finance,CBD & Inner Suburbs,Contract/Temp,Information & Communication Technology,$700 > $1000 p/d,HourlyRate,"[Our client is looking to hire experienced , [...",Melbourne,Our client is looking to hire experienced IBM...,"[IBM MDM Engineer, Contract and Permanent Appl..."
376,120000-999999,https://www.seek.com.au/job/39763542,Senior Civil Designer,2019-08-22T02:29:18.000Z,2019-09-21T02:29:17.000Z,Senior Civil Designer,Senior Civil Designer required to help a growi...,Technical Resources Pty Ltd,"CBD, Inner & Western Suburbs",Contract/Temp,Engineering,Permanent staff or contract available,HourlyRate,"[[[About the Business and Role]], , [Senior C...",Perth,About the Business and Role Senior Civil Desig...,"[About the Business and Role, Responsibilities..."
377,120000-999999,https://www.seek.com.au/job/39574660,Senior Technical Business Analyst,2019-07-29T02:54:12.000Z,2019-08-28T13:59:59.000Z,Senior Technical Business Analyst,A leading Australian Banking Institution curre...,Huxley Associates,,Contract/Temp,Banking & Financial Services,competitive,HourlyRate,[[A leading Australian Banking Institution cur...,Sydney,A leading Australian Banking Institution curre...,[The candidate MUST have Financial Services ex...
378,120000-999999,https://www.seek.com.au/job/39761345,Data Analyst,2019-08-22T00:04:11.000Z,2019-09-21T13:59:59.000Z,Data Analyst,Seeking a passionate Data Analyst who loves wo...,GFG Alliance,"CBD, Inner West & Eastern Suburbs",Full Time,Information & Communication Technology,,AnnualPackage,"[[GFG Alliance], has an agile, entrepreneuria...",Sydney,"GFG Alliance has an agile, entrepreneurial cul...","[GFG Alliance, About our team, About the role,..."
379,120000-999999,https://www.seek.com.au/job/39723365,Control Systems Engineer,2019-08-16T05:40:08.000Z,2019-09-15T05:40:07.000Z,Control Systems Engineer,"Market leading, global provider of intelligent...",Kapsch TrafficCom Australia,"CBD, Inner West & Eastern Suburbs",Full Time,Engineering,,AnnualPackage,"[[[About us]], , [Kapsch TrafficCom is a mark...",Sydney,About us Kapsch TrafficCom is a market leading...,"[About us, The role, About you]"
