In [3]:
import time
import random
# Pause once for a random interval of up to three seconds.
time.sleep(3 * random.random())

In [4]:
import re
import pandas as pd
from scrapy.selector import Selector
from scrapy.http import HtmlResponse
import requests
from bs4 import BeautifulSoup
import json

In [25]:
# Salary bands, passed verbatim as the `salaryrange` query value (salarytype=annual).
salary_ranges = ['0-70000','70000-120000','120000-999999']
# Maps each salary band to the list of job-title anchor tags scraped for it.
job_urls = {}

for salary_range in salary_ranges:
    job_urls[salary_range]=[]
    # Walk result pages 1 through 14 of the search for this salary band.
    for page in range(1,15):
        page_url = "https://www.seek.com.au/BI-jobs?page={0}&salaryrange={1}&salarytype=annual".format(str(page),salary_range)
        try:
            # A timeout keeps one stalled connection from hanging the whole crawl.
            r = requests.get(page_url, timeout=30)
        except requests.RequestException as err:
            # Best effort: report the failed page and carry on with the next one.
            print('Skipping {0}: {1!r}'.format(page_url, err))
            continue

        # turn into a BeautifulSoup object
        soup = BeautifulSoup(r.text, 'lxml')

        # Collect the job-title elements on this page; extending with an empty
        # result is a no-op, so no emptiness check is needed.
        urls = soup.find_all(attrs={'data-automation':"jobTitle"})
        job_urls[salary_range].extend(urls)

        # Be polite to the server: wait up to three seconds between requests.
        time.sleep(random.random()*3)

If a job belongs to two or more salary ranges, a single concatenated list of links would be a mess. I therefore create a separate DataFrame for each salary range and then combine them into one.

In [26]:
# Anchors scraped for the 0-70000 band, each row tagged with that band.
df_1 = pd.DataFrame().assign(Link=job_urls['0-70000'])
df_1 = df_1.assign(**{'Salary Range': '0-70000'})

In [27]:
# Anchors scraped for the 70000-120000 band, each row tagged with that band.
df_2 = pd.DataFrame().assign(Link=job_urls['70000-120000'])
df_2 = df_2.assign(**{'Salary Range': '70000-120000'})

In [28]:
# Anchors scraped for the 120000-999999 band, each row tagged with that band.
df_3 = pd.DataFrame().assign(Link=job_urls['120000-999999'])
df_3 = df_3.assign(**{'Salary Range': '120000-999999'})

In [29]:
# Stack the three per-band frames. pd.concat replaces DataFrame.append, which
# was deprecated in pandas 1.4 and removed in 2.0; each frame keeps its own
# row labels, exactly as the chained append did.
url_pd = pd.concat([df_1, df_2, df_3])

In [30]:
# Build the absolute listing URL from the "/job/<id>" part of each anchor's href.
# The pattern is a raw string (the old '\/' was an invalid escape sequence) and
# accepts an id of any length instead of exactly eight digits, so longer ids are
# not truncated and shorter ones still match.
url_pd['Link_1'] = [
    'https://www.seek.com.au' + re.search(r'/job/[0-9]+', i.attrs['href']).group(0)
    for i in url_pd['Link']
]

In [31]:
# The visible text of each job-title anchor becomes the listing title.
url_pd['Title'] = [anchor.get_text() for anchor in url_pd['Link']]

In [32]:
# The raw anchor tags are no longer needed once URL and title are extracted.
url_pd = url_pd.drop(columns='Link')

In [33]:
# Final column labels for the URL table.
columns = ['Salary Range', 'Link', "Title"]
# Rename by name rather than by position, so a change in column order upstream
# cannot silently attach the wrong label to a column.
url_pd = url_pd.rename(columns={'Link_1': 'Link'})
# Fail loudly if the table does not have exactly the expected columns.
assert list(url_pd.columns) == columns, 'unexpected columns: {0}'.format(list(url_pd.columns))

This gives a DataFrame containing the URLs of the BI jobs for each salary range.

In [34]:
# Preview the first ten rows.
url_pd.iloc[:10]

Unnamed: 0,Salary Range,Link,Title
0,0-70000,https://www.seek.com.au/job/39734679,Business Intelligence Developer
1,0-70000,https://www.seek.com.au/job/39749416,BI Developer - 6 Month Contract
2,0-70000,https://www.seek.com.au/job/39748504,Business Analyst - BI
3,0-70000,https://www.seek.com.au/job/39641568,Business Intelligence Analyst (BI)
4,0-70000,https://www.seek.com.au/job/39725154,BI Lead Consultant / Qlikview / Qlik Sense - S...
5,0-70000,https://www.seek.com.au/job/39757708,Experienced SQL + Power BI Developer
6,0-70000,https://www.seek.com.au/job/39738616,IT Analyst - Integration
7,0-70000,https://www.seek.com.au/job/39712539,Business Intelligence Developer - SQL
8,0-70000,https://www.seek.com.au/job/39693139,Senior Power BI Developer (located in Ballarat)
9,0-70000,https://www.seek.com.au/job/39731094,SQL BI Developer - Jnr/Mid level


In [35]:
# Helper for replacing several substrings in one pass.
import re

def replace(string, substitutions):
    """Replace every occurrence of each key of `substitutions` in `string`.

    All keys are matched literally in a single regex pass, so text produced by
    one substitution is never re-scanned by another. When keys overlap, the
    longest key wins.

    Parameters
    ----------
    string : str
        Text to process.
    substitutions : dict
        Maps each literal substring to its replacement text.

    Returns
    -------
    str
        `string` with all substitutions applied; returned unchanged when
        `substitutions` is empty.
    """
    # With no keys the joined pattern would be empty, match at every position
    # and fail the lookup below with KeyError(''), so short-circuit.
    if not substitutions:
        return string

    # Longest keys first so the alternation prefers them over their prefixes.
    substrings = sorted(substitutions, key=len, reverse=True)
    regex = re.compile('|'.join(map(re.escape, substrings)))
    return regex.sub(lambda match: substitutions[match.group(0)], string)

Start scraping the details for each URL.

In [44]:
# One list per output column. Every list receives exactly one entry per job URL,
# so the lists stay aligned with the rows of url_pd.
job_listing_date=[]
job_expiry_date=[]
job_title = []
job_teaser=[]
job_advertiser=[]
job_area=[]
job_worktype=[]
job_classification=[]
job_salary=[]
job_salary_type=[]
job_description_details=[]
job_description_strong = []  # not filled here; the cleaning cell re-initialises it
job_location =[]

# (target list, key path inside the listing's 'result' dict) for each scalar field.
job_field_paths = [
    (job_listing_date, ('listingDate',)),
    (job_expiry_date, ('expiryDate',)),
    (job_title, ('title',)),
    (job_teaser, ('teaser',)),
    (job_advertiser, ('advertiser', 'description')),
    (job_area, ('locationHierarchy', 'area')),
    (job_worktype, ('workType',)),
    (job_classification, ('classification', 'description')),
    (job_salary, ('salary',)),
    (job_salary_type, ('salaryType',)),
    (job_location, ('locationHierarchy', 'city')),
]


def _field_or_na(record, path):
    """Follow `path` through nested dicts; return the value, or 'N/A' if it is missing or falsy."""
    value = record
    for key in path:
        if not isinstance(value, dict):
            return 'N/A'
        value = value.get(key)
    return value if value else 'N/A'


for job in url_pd['Link']:
    # Defaults used when the page cannot be fetched or parsed: every scalar
    # field becomes 'N/A' and the description element is None.
    jd_dashboard = {}
    jd_detail = None
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(job, timeout=30)

        # turn into a BeautifulSoup object
        soup = BeautifulSoup(r.text, 'lxml')

        # Element holding the job description body (None if not present).
        jd_detail = soup.find(attrs={'data-automation':'mobileTemplate'})

        # The listing metadata is embedded as JSON in the 'server-state' element.
        jd_data = soup.find_all(attrs={'data-automation':'server-state'})[0]
        # NOTE(review): these fixed offsets assume a specific layout of the
        # embedded script text; confirm against a current page before relying on them.
        jd_raw= jd_data.text[3:].split('\n  ')[1][25:][:-1]
        jd_dict = json.loads(jd_raw)
        jd_dashboard = jd_dict['jobdetails']['result']
    except (requests.RequestException, IndexError, KeyError, TypeError, ValueError) as err:
        # One bad listing must not abort the crawl or misalign the lists:
        # report it and fall through to record placeholder values.
        print('Could not parse {0}: {1!r}'.format(job, err))

    for target, path in job_field_paths:
        target.append(_field_or_na(jd_dashboard, path))
    job_description_details.append(jd_detail)

    # Be polite to the server: wait up to three seconds between requests.
    time.sleep(random.random()*3)
    

In [45]:
# Attach each scraped list to the URL table as a new column, in this order.
scraped_columns = {
    'Listing Date': job_listing_date,
    'Expiry Date': job_expiry_date,
    'Job Title': job_title,
    'Job Teaser': job_teaser,
    'Advertiser': job_advertiser,
    'Area': job_area,
    'Work Type': job_worktype,
    'Classification': job_classification,
    'Salary': job_salary,
    'Salary Type': job_salary_type,
    'JD': job_description_details,
    'Location': job_location,
}
for column_name, column_values in scraped_columns.items():
    url_pd[column_name] = column_values

Check the type of the 'JD' column and start cleaning.

In [46]:
# Show the type of the scraped-description container. This inspects the Python
# list itself, not the 'JD' column of the DataFrame.
type(job_description_details)

list

In [47]:
# Plain-text description and list of <strong> headings, one entry per job.
job_description_clean=[]
job_description_strong = []

# Substrings collapsed to a double space in the plain-text description.
# NOTE(review): '\xe2\x80\x9d' is the UTF-8 byte sequence of a right double
# quotation mark read as individual characters; confirm whether '\u201d' was intended.
repla = {u'\xa0':'  ',u'\xe2\x80\x9d':'  ', u'\n':'  '}

for description_tag in job_description_details:
    # A missing description element is the expected failure case; handle it
    # explicitly so both lists always gain exactly one entry per job.
    if description_tag is None:
        job_description_strong.append('N/A')
        job_description_clean.append('N/A')
        continue

    # Text of every <strong> element, stripped of surrounding whitespace.
    strong_word = description_tag.find_all('strong')
    strong_word_list = [u.get_text(strip=True) for u in strong_word]
    job_description_strong.append(strong_word_list)

    # Full description text with the substrings above replaced by spaces.
    string = description_tag.get_text()
    job_description_clean.append(replace(string,repla))

In [48]:
# Attach the cleaned text and the extracted headings as new columns.
for column_name, cleaned_values in (
    ('Job Description', job_description_clean),
    ('Strong Words', job_description_strong),
):
    url_pd[column_name] = cleaned_values

In [49]:
# Work on an independent copy so later edits do not touch url_pd.
BI_jobs = url_pd.copy(deep=True)

In [50]:
# Persist the scraped table, including its row labels, next to the notebook.
BI_jobs.to_csv('BI_scraping.csv', index=True)

In [51]:
# Show a bounded preview instead of dumping the entire table (which includes
# the full description markup for every listing) into the cell output.
BI_jobs.head(10)

Unnamed: 0,Salary Range,Link,Title,Listing Date,Expiry Date,Job Title,Job Teaser,Advertiser,Area,Work Type,Classification,Salary,Salary Type,JD,Location,Job Description,Strong Words
0,0-70000,https://www.seek.com.au/job/39734679,Business Intelligence Developer,2019-08-19T04:12:50.000Z,2019-09-18T04:12:50.000Z,Business Intelligence Developer,The Business Intelligence Developer is respons...,Peoplecare,,Full Time,Information & Communication Technology,,AnnualPackage,[[We are on the hunt for a well-rounded and ta...,"Wollongong, Illawarra & South Coast",We are on the hunt for a well-rounded and tale...,"[ETL/Business Intelligence Developer, Essentia..."
1,0-70000,https://www.seek.com.au/job/39749416,BI Developer - 6 Month Contract,2019-08-20T10:09:26.000Z,2019-09-19T13:59:59.000Z,BI Developer - 6 Month Contract,Contino are a global organisation and one of t...,Contino,,Contract/Temp,Information & Communication Technology,,AnnualPackage,[[Contino are a global organisation and one of...,Sydney,Contino are a global organisation and one of t...,"[Requirements, Benefits]"
2,0-70000,https://www.seek.com.au/job/39748504,Business Analyst - BI,2019-08-20T07:26:07.000Z,2019-09-19T13:59:59.000Z,Business Analyst - BI,provides support for major Works Division prov...,Talent – Winner ‘Seek Large Recruitment Agency...,"CBD, Inner West & Eastern Suburbs",Contract/Temp,Information & Communication Technology,Competitive,AnnualPackage,"[[], Talent International is working in partne...",Sydney,Talent International is working in partnership...,"[Key Accountabilities, Experience Required, Wo..."
3,0-70000,https://www.seek.com.au/job/39641568,Business Intelligence Analyst (BI),2019-08-06T05:25:46.000Z,2019-09-05T13:59:59.000Z,Business Intelligence Analyst (BI),"Pegasus is expanding rapidly, our BI Analyst w...",Pegasus Management Pty Ltd,,Full Time,Information & Communication Technology,Salary + Super,AnnualPackage,"[[[About Pegasus]], [Pegasus is one of the fas...","Newcastle, Maitland & Hunter",About PegasusPegasus is one of the fastest gro...,"[About Pegasus, The Pegasus team, The Role, Ke..."
4,0-70000,https://www.seek.com.au/job/39725154,BI Lead Consultant / Qlikview / Qlik Sense - S...,2019-08-16T07:00:11.000Z,2019-09-15T13:59:59.000Z,BI Lead Consultant / Qlikview / Qlik Sense - S...,BI Lead Consultant / Qlikview / Qlik Sense - S...,Nigel Frank International PTY LTD,,Full Time,Information & Communication Technology,,AnnualPackage,[[Qlikview Consultant / Qlik Sense - Sydney - ...,Sydney,Qlikview Consultant / Qlik Sense - Sydney - Co...,[Experience sought:]
5,0-70000,https://www.seek.com.au/job/39757708,Experienced SQL + Power BI Developer,2019-08-21T06:33:24.000Z,2019-09-20T13:59:59.000Z,Experienced SQL + Power BI Developer,ABOUT USThe EDGE10 Group is the award-winning ...,EDGE10 (UK) Ltd,,Full Time,Sport & Recreation,,AnnualPackage,"[[], [[ABOUT US], []], [The EDGE10 Group is th...",Melbourne,ABOUT USThe EDGE10 Group is the award-winning ...,"[ABOUT US, THE JOB, Requirements, Benefits]"
6,0-70000,https://www.seek.com.au/job/39738616,IT Analyst - Integration,2019-08-19T07:56:07.000Z,2019-09-18T13:59:59.000Z,IT Analyst - Integration,Excellent 12m FTC Business Intelligence Analys...,Sirius Technology Melbourne part of Sirius Peo...,Bayside & South Eastern Suburbs,Full Time,Information & Communication Technology,,AnnualPackage,[You will be joining a subsidiary of one of th...,Melbourne,You will be joining a subsidiary of one of the...,"[Your Benefits:, Your role:, You will need:]"
7,0-70000,https://www.seek.com.au/job/39712539,Business Intelligence Developer - SQL,2019-08-15T04:56:54.000Z,2019-09-14T04:56:53.000Z,Business Intelligence Developer - SQL,Put those BI and analytical skills to work in ...,Pareto Phone,CBD & Inner Suburbs,Full Time,Information & Communication Technology,,AnnualPackage,[[Are you a developer/analyst looking for more...,Brisbane,Are you a developer/analyst looking for more o...,"[The Role:, Key Selection Criteria, Desired Ex..."
8,0-70000,https://www.seek.com.au/job/39693139,Senior Power BI Developer (located in Ballarat),2019-08-13T04:22:08.000Z,2019-09-12T13:59:59.000Z,Senior Power BI Developer (located in Ballarat),An opportunity for an experience Business Inte...,Davidson Technology,,Full Time,Information & Communication Technology,,AnnualPackage,"[[About the Company], [], Davidson Technology...",Ballarat & Central Highlands,About the Company Davidson Technology have rec...,"[About the Company, About the Role, About You,..."
9,0-70000,https://www.seek.com.au/job/39731094,SQL BI Developer - Jnr/Mid level,2019-08-19T00:40:05.000Z,2019-09-18T00:40:05.000Z,SQL BI Developer - Jnr/Mid level,Dynamic Valley based organisation | Career opp...,Digital Native,,Full Time,Information & Communication Technology,,AnnualPackage,[Our client is a progressive and dynamic organ...,Brisbane,Our client is a progressive and dynamic organi...,"[Responsibilities:, Technical skills:]"
