In [20]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
import requests
import re

# import sleep
from time import sleep

plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [21]:
def getting_data(url):
    ###
    ###  This function send request to the web server
    ###  Get the response back, Check Status Code
    ###  Grab HTML and turns it to Beautiful Soup
    ###
    
    #send request to the url given
    response = requests.get(url)
    
    if response.status_code == 200:
        #received the data basck
        html = response.text
    
        #parse HTML to Beautiful Soup
        soup = BeautifulSoup(html, 'lxml')
    
        return soup
    
    else:
        return None

In [22]:
def get_job_summary(link):
    
    ###
    ###  This function get the Link to the Job Ad
    ###  Get the response back and Extract Job Summary
    ###  Turns it to Beautiful Soup and Extract Text
    ###
    
    # open that page and get the job summary
    url = baseurl + link
    soup = getting_data(url)
    
    try:
        jobDescription = soup.find('div', {'data-automation':'mobileTemplate'})
        
        return jobDescription.get_text()
        
    except:
        
        return None

In [23]:
def get_job_details(job_article):
    
    ###
    ###  Each job advertised on Seek on the list displayed is called Article
    ###  This function receive one article at a time to extract job related info
    ###  Then return it as a list of data
    ###
    
    
    try:
        #send one job article at a time to extract each job data
        job_ad = job_article.find('a', {'data-automation':'jobTitle'})
        title = job_ad.text
        link = job_ad['href']
        #print ( 'title:', title)
        #print ( 'link:', link)

        job_ad = job_article.find('span', {'data-automation':'jobSalary'})
        if (job_ad is None):
            salary = 'NA'
        else:
            salary = job_ad.text
        #print ('salary:', salary)

        job_ad = job_article.find('a', {'data-automation':'jobCompany'})
        if (job_ad is None):
            company = 'NA'
        else:
            company = job_ad.text
        #print ('company:', company)

        job_ad = job_article.find('a', {'data-automation':'jobLocation'})
        if (job_ad is None):
            location = 'NA'
        else:
            location = job_ad.text
        #print ('location:', location)


        job_ad = job_article.find('a', {'data-automation':'jobClassification'})
        if (job_ad is None):
            classification = 'NA'
        else:
            classification = job_ad.text
        #print ('classification:', classification)
        
        
        jobsummary = get_job_summary(link)
        
        return [title, link, salary, location, company, classification, jobsummary]

    except:
        
        return None

In [24]:
def get_job_ads_no(jobSearch):
    
    job_listings = []
    
    
    #send the html request for this job search
    html = getting_data(jobSearch + '1')
    
    # get no of toal jobs found
    jobcounts = html.find('strong',{'data-automation':'totalJobsCount'})
    
    # extract the jobs no
    jobsfound = int(jobcounts.text.replace(",", ""))

    #calculate how many pages need to collect the job listing
    pages =  int(jobsfound / 20) + 1
    
    print (jobsfound, pages)
    
    
    return pages

In [25]:
def get_job_articles(jobSearch, startPage, endPage):
    
    articleList = []
    # getting job listing for each page
    # each job is an article
    for p in np.arange(startPage,endPage):

        pageUrl = jobSearch + str(p + 1)
        print (pageUrl)
        
        # wait one second before getting a page
        sleep(1)
        html = getting_data(pageUrl)
        
        # find each job listing
        articleListPreminum = html.find_all('article',{'data-automation':'premiumJob'})
        articleList.extend(articleListPreminum)
        
        articleListNormal = html.find_all('article',{'data-automation':'normalJob'})
        articleList.extend(articleListNormal)
    
    return articleList

In [26]:
def get_df_Article(articleList):

    ###
    ###  This function recieved list of articles, then split the article list into half
    ###  Then feed an article at a time into another function to extract job info
    ###  Once processed return dataframes
    ###
    # ------------------------------------------------
    # Process the articles
    # ------------------------------------------------   
    job_postings = []
    
    for i in range(len(articleList)):
        new_postings = get_job_details(articleList[i])
        
        if new_postings is not None:
            job_postings.append(new_postings)    
    
    #print (len(job_postings))
    
    # now create a dataframe frome the data
    df = pd.DataFrame(job_postings, columns=['title', 'link', 'salary', 'location', 'company', 'classification', 'summary'])
    
    print ('finish process article')
    
    
    return df

In [35]:
### 
### Define variables related to the URL for web scraping on SEEK
### Each sets are group based on Salary and to make it scrape in smaller chunks
###
baseurl = "https://www.seek.com.au/"

sixty_suffix = '?salarytype=annual&page='
eighty_suffix = '?salaryrange=60000-80000&salarytype=annual&page='
hundred_suffix = '?salaryrange=80000-100000&salarytype=annual&page='
onetwenty_suffix = '?salaryrange=100000-120000&salarytype=annual&page='
onefifty_suffix = '?salaryrange=120000-150000&salarytype=annual&page='
twohundred_suffix = '?salaryrange=150000-200000&salarytype=annual&page='
twofifty_suffix = '?salaryrange=200000-999999&salarytype=annual&page='

dataUrl = baseurl + "software-engineer-jobs/in-New-South-Wales-NSW/"

dataSixty = dataUrl + sixty_suffix
dataEighty = dataUrl + eighty_suffix
dataHundred = dataUrl + hundred_suffix
dataOnetwenty = dataUrl + onetwenty_suffix
dataOnefifty = dataUrl + onefifty_suffix
dataTwohundred = dataUrl + twohundred_suffix
dataTwofifty = dataUrl + twofifty_suffix

In [36]:
# For those salary between 0 - 60K

# get job ads numbers
pages = get_job_ads_no(dataSixty)

# get articles in chucks based on number of pages, make it 10 pages or 200~220 jobs increment at a time
for startPage in np.arange(0, pages, 10):
    
    endPage = startPage + 10
    articleList = get_job_articles(dataSixty, startPage, endPage)
    
    # sometimes Seek cuts off the link - no more articles are returned
    if not articleList:
        print("List is empty")
        #break
    else:
        new_df = get_df_Article(articleList)
        if startPage == 0:
            df = new_df
        else:
            df = pd.concat([df, new_df])
            print(len(df))

2238 112
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=1
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=2
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=3
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=4
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=5
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=6
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=7
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=8
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=9
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=10
finish process article
https:/

https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=84
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=85
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=86
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=87
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=88
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=89
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=90
finish process article
1870
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=91
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=92
https://www.seek.com.au/software-engineer-jobs/in-New-South-Wales-NSW/?salarytype=annual&page=93
ht

In [37]:
df.to_csv('./datasets/dataSixty.csv')
df.shape

FileNotFoundError: [Errno 2] No such file or directory: './datasets/dataSixty.csv'