## Install Required Libraries

In [1]:
# Uncomment to install the scraping dependencies into the kernel's environment.
# %pip install beautifulsoup4
# %pip install selenium

## Import Required Libraries

In [2]:
import json
import math
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium.webdriver import Chrome
from selenium.webdriver.chrome.service import Service

In [3]:
# Path handed to Selenium as the ChromeDriver executable. A raw string keeps
# the Windows backslashes literal instead of relying on unrecognised escape
# sequences (which newer Python versions warn about).
path = r"\jobstreet\chromedriver_win32"
# Selenium 4 deprecates the `executable_path` keyword on Chrome(); the driver
# location is passed through a Service object instead.
driver = Chrome(service=Service(executable_path=path))
# Search URL template: first slot is the search keyword, second the page number.
base_url = "https://www.jobstreet.com.sg/en/job-search/{}-jobs/{}/"

  driver = Chrome(executable_path=path)


## Functions

In [4]:
def remove_html_tags(text):
    """Return `text` with every ``<...>`` span deleted.

    Each tag is matched non-greedily from ``<`` to the nearest ``>`` on the
    same line and replaced with the empty string; nothing is inserted in its
    place, so text on either side of a tag ends up adjacent.
    """
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, '', text)

In [5]:
def get_page_number(keyword, results_per_page=30):
    """Return how many result pages the job search has for `keyword`.

    Loads the first results page through the shared `driver`, reads the
    result-count label and divides the total by the page size.

    Args:
        keyword: search term substituted into `base_url`.
        results_per_page: number of postings assumed per results page
            (defaults to 30, the value that used to be hard-coded).

    Returns:
        int: number of pages, rounded up; 0 when the result-count label is
        not present on the loaded page.

    Raises:
        IndexError, ValueError: if the label text does not have an integer as
            its second-to-last word.
    """
    driver.get(base_url.format(keyword, 1))
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Span holding the result summary ("page and total" text).
    # NOTE(review): this is matched on a long generated-looking class string;
    # presumably it changes when the site is redeployed - confirm it still matches.
    result_text = soup.find("span",{"class": "sx2jih0 zcydq84u es8sxo0 es8sxo1 es8sxo21 _1d0g9qk4 es8sxo7"})

    # Without the label there is no total to read; report zero pages instead of
    # failing with an AttributeError on `None.text`.
    if result_text is None:
        print('No result count found for "{}"'.format(keyword))
        return 0

    # The total is the second-to-last word of the label; drop thousands separators.
    words = result_text.text.split()
    total = int(words[-2].replace(',', ''))

    return math.ceil(total / results_per_page)

In [6]:
def _extract_redux_state(soup):
    """Return the object assigned to ``window.REDUX_STATE`` in an inline script, or None."""
    marker = 'window.REDUX_STATE = '
    for script in soup.find_all("script"):
        if not script.contents:
            continue
        txt = script.contents[0].strip()
        if marker in txt:
            payload = txt.split(marker)[1].strip()
            # raw_decode parses the first complete JSON value and ignores the
            # JavaScript that follows it, so no manual trimming of the tail
            # (searching for a closing-brace run) is needed.
            state, _ = json.JSONDecoder().raw_decode(payload)
            return state
    return None


def job_page_scraper(link):
    """Scrape one job posting page and return its fields as a flat list.

    Args:
        link: site-relative path of the posting; appended to the
            "https://www.jobstreet.com.sg" host.

    Returns:
        list: 16 values in the order job_title, job_salary_min,
        job_salary_max, job_salary_currency, company, job_post_date,
        job_internship, company_overview, job_description,
        job_requirement_career_level, job_requirement_yearsOfExperience,
        job_requirement_qualification, job_employment_type, job_apply_url,
        job_location, job_country. An empty list is returned when the page
        carries no ``window.REDUX_STATE`` script or the job id is empty.

    Raises:
        json.JSONDecodeError: if the embedded state is not valid JSON.
        KeyError, IndexError: if a non-salary field is missing from the state.
    """
    url = "https://www.jobstreet.com.sg"+link
    print("scraping...", url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    state = _extract_redux_state(soup)
    if state is None:
        # Previously this fell through to an unbound-variable error.
        print("no REDUX_STATE found, skipping", url)
        return []

    job = state['details']
    if job['id'] == '':
        return []

    header = job['header']
    # Salary is optional: if any of the three fields is unavailable, blank all of them.
    try:
        job_salary_min = header['salary']['min']
        job_salary_max = header['salary']['max']
        job_salary_currency = header['salary']['currency']
    except (KeyError, TypeError):
        job_salary_min = ''
        job_salary_max = ''
        job_salary_currency = ''

    job_detail = job['jobDetail']
    requirement = job_detail['jobRequirement']

    # Overview and description arrive as HTML; keep only their text.
    company_overview = remove_html_tags(job['companyDetail']['companyOverview']['html'])
    job_description = remove_html_tags(job_detail['jobDescription']['html'])

    return [
        header['jobTitle'],
        job_salary_min,
        job_salary_max,
        job_salary_currency,
        header['company']['name'],
        header['postedDate'],
        header['isInternship'],
        company_overview,
        job_description,
        requirement['careerLevel'],
        requirement['yearsOfExperience'],
        requirement['qualification'],
        requirement['employmentType'],
        job['applyUrl']['url'],
        job['location'][0]['location'],
        job['sourceCountry'],
    ]

In [7]:
def page_crawler(keyword):
    """Crawl every result page for `keyword` and scrape each linked posting.

    Args:
        keyword: search term for the job postings.

    Returns:
        pandas.DataFrame: one row per scraped posting, with the 16 columns
        listed in `columns` below. Postings for which `job_page_scraper`
        returns an empty list are skipped.
    """
    columns = [
        "job_title", "job_salary_min", "job_salary_max", "job_salary_currency",
        "company", "job_post_date", "job_internship", "company_overview",
        "job_description", "job_requirement_career_level",
        "job_requirement_yearsOfExperience", "job_requirement_qualification",
        "job_employment_type", "job_apply_url", "job_location", "job_country",
    ]

    page_number = get_page_number(keyword)

    # Collect the posting URLs (query string removed) from every results page.
    job_links = []
    for n in range(page_number):
        print('Loading page {} ...'.format(n+1))
        driver.get(base_url.format(keyword, n+1))
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        for anchor in soup.find_all('a',{'rel':'nofollow noopener noreferrer'}):
            href = anchor.get('href')
            # Anchors without an href cannot be followed; skip them instead of
            # raising KeyError.
            if href:
                job_links.append(href.strip().split('?', 1)[0])

    jobs = []
    for job_link in job_links:
        row = job_page_scraper(job_link)
        # The scraper returns [] for postings it cannot read. Such rows have no
        # values to align with the column names, so they are left out rather
        # than ending up as blank rows or tripping the column-count check in
        # the DataFrame constructor.
        if row:
            jobs.append(row)

    return pd.DataFrame(jobs, columns=columns)

# def main():

#     # a list of job roles to be crawled
#     key_words = ['frontend ux developer morgan']
#     dfs = []

#     for key in key_words:
#         key_df = page_crawler(key)
#         dfs.append(key_df)

#     # save scraped information as csv
#     pd.concat(dfs).to_csv("job_postings_results.csv")

# if __name__ == '__main__':
#     main()

In [8]:
# Ask the user for the job search keyword to crawl.
search_term = input("Enter Search Term: ")

# List of search keywords to crawl; currently just the one entered above.
# Example of a hard-coded list: key_words = ['frontend ux developer DBS']
key_words = [search_term]
# One DataFrame of scraped postings per keyword.
dfs = []

# `key_df` keeps the frame for the last keyword once the loop finishes; the
# cells below read it directly.
for key in key_words:
    key_df = page_crawler(key)
    dfs.append(key_df)

Enter Search Term: frontend ux developer DBS
Loading page 1 ...
scraping... https://www.jobstreet.com.sg/en/job/vp-avp-reactjs-developer-branch-and-self-serviced-banking-consumer-banking-group-technology-technology-operations-wd39175-jobsthatmatter-10327510
scraping... https://www.jobstreet.com.sg/en/job/vp-avp-reactjs-developer-branch-and-self-serviced-banking-consumer-banking-group-technology-technology-operations-wd39175-jobsthatmatter.-10329892
scraping... https://www.jobstreet.com.sg/en/job/full-stack-developer-1034134405
scraping... https://www.jobstreet.com.sg/en/job/copywriter-1034392814


In [9]:
# Display the postings scraped for the last keyword processed above.
key_df

Unnamed: 0,job_title,job_salary_min,job_salary_max,job_salary_currency,company,job_post_date,job_internship,company_overview,job_description,job_requirement_career_level,job_requirement_yearsOfExperience,job_requirement_qualification,job_employment_type,job_apply_url,job_location,job_country
0,"VP/AVP, ReactJS Developer, Branch and Self-ser...",,,SGD,DBS Bank Limited,10-Jan-23,False,DBS is a leading financial services group in A...,"Business FunctionAs the leading bank in Asia, ...",Senior Manager,8 years,"Bachelor's Degree, Post Graduate Diploma, Prof...",Full-Time,https://careers.dbs.com/careersection/dbs_prof...,Singapore,sg
1,"VP/AVP, - ReactJS Developer, Branch and Self-s...",,,SGD,DBS Bank Limited,10-Jan-23,False,DBS is a leading financial services group in A...,"Business FunctionAs the leading bank in Asia, ...",Senior Manager,8 years,"Bachelor's Degree, Post Graduate Diploma, Prof...",Full-Time,https://careers.dbs.com/careersection/dbs_prof...,Singapore,sg
2,Full Stack Developer,,,SGD,Tow Me Sg Pte. Ltd.,1 minute ago,False,,Full Stack Developer Job DescriptionAs the ful...,Not Specified,,Not Specified,Full-Time,https://sg.jobsdb.com/job/rd/8216d70245e46abea...,Singapore,sg
3,Copywriter,,,SGD,DDB Worldwide Pte Ltd,14-Jan-23,False,About DDB Group Singapore – DDB Group is one o...,About DDB Group Singapore – DDB Group is one o...,Not Specified,,Not Specified,Full-Time,https://sg.jobsdb.com/job/rd/f9cba4fc20187b8cc...,Kallang,sg


## Convert to JSON

In [10]:
# Write the postings to disk as a JSON array with one object per row.
key_df.to_json('dataframe2.json', orient='records')

# With no path argument, to_json returns the same JSON as a string (not a dict).
key_df3 = key_df.to_json(orient='records')

In [13]:
# key_df3 is a JSON string, so index 0 is its first character, not the first record.
key_df3[0]

'['

In [11]:
# Parse the JSON string back into Python objects; each element is a dict
# holding one posting.
y2 = json.loads(key_df3)
type(y2[0])
# print(y2[2])

dict