## Install Required Libraries

In [24]:
# !pip install beautifulsoup4
# !pip install selenium

## Import Required Libraries

In [79]:
import pandas as pd 
from bs4 import BeautifulSoup 
from selenium.webdriver import Chrome
import re 
import time
import json
import math

In [88]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'}

path = "\jobstreet\chromedriver_win32"

driver = Chrome(executable_path=path)
# time.sleep(2)
base_url = "https://www.jobstreet.com.sg/en/job-search/{}-jobs/{}/"

def remove_html_tags(text):
    """Remove html tags from a string"""
    import re
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)

def get_page_number(keyword):
    #input: keyword for job_postings
    #output: number of pages

    url = base_url.format(keyword, 1)
    #print(url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    #Finds the number of search results (Page and Total)
    result_text = soup.find("span",{"class": "sx2jih0 zcydq84u es8sxo0 es8sxo1 es8sxo21 _1d0g9qk4 es8sxo7"})
    print(result_text)
    
    #Splits the search results into a list
    results = result_text.text.split()
    print(results)
    
    #Replace comma from result and gets the total number of results returned
    result = int(result_text.text.split()[-2].replace(',', ''))
    page_number = math.ceil(result/30)
    
    return page_number

def job_page_scraper(link):

    url = "https://www.jobstreet.com.sg"+link
    print("scraping...", url)
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    scripts = soup.find_all("script")
    #print(scripts)

    for script in scripts:
        if script.contents:
            txt = script.contents[0].strip()
            if 'window.REDUX_STATE = ' in txt:
                jsonStr = script.contents[0].strip()
                jsonStr = jsonStr.split('window.REDUX_STATE = ')[1].strip()
                jsonStr = jsonStr.split('}}}};')[0].strip()
                jsonStr = jsonStr+"}}}}"
                jsonObj = json.loads(jsonStr)
    
    job = jsonObj['details']
    #print(job)
    
    if(job['id']!=''):    
        #job_id = job['id']
        #print(job_id)
        #job_expired = job['isExpired']
        #job_confidential = job['isConfidential']

        try:
            job_salary_min = job['header']['salary']['min']
            job_salary_max = job['header']['salary']['max']
            job_salary_currency = job['header']['salary']['currency']
        except Exception:
            job_salary_min =''
            job_salary_max = ''
            job_salary_currency = ''

        job_salary_min = job['header']['salary']['min']
        job_salary_max = job['header']['salary']['max']
        job_salary_currency = job['header']['salary']['currency']

        job_title = job['header']['jobTitle']

        company = job['header']['company']['name']
        #print(company)
        job_post_date = job['header']['postedDate']
        job_internship = job['header']['isInternship']
        #company_website = job['companyDetail']['companyWebsite']
        #company_avgProcessTime = job['companyDetail']['companySnapshot']['avgProcessTime']
        #company_registrationNo = job['companyDetail']['companySnapshot']['registrationNo']
        #company_workingHours = job['companyDetail']['companySnapshot']['workingHours']
        #company_facebook = job['companyDetail']['companySnapshot']['facebook']
        #company_size = job['companyDetail']['companySnapshot']['size']
        #company_dressCode = job['companyDetail']['companySnapshot']['dressCode']
        #company_nearbyLocations = job['companyDetail']['companySnapshot']['nearbyLocations']
        company_overview = job['companyDetail']['companyOverview']['html']
        company_overview = remove_html_tags(company_overview)
        
        job_description = job['jobDetail']['jobDescription']['html']
        #Remove html tags
        job_description = remove_html_tags(job_description)
        
        
        #job_summary = job['jobDetail']['summary']
        job_requirement_career_level = job['jobDetail']['jobRequirement']['careerLevel']
        job_requirement_yearsOfExperience = job['jobDetail']['jobRequirement']['yearsOfExperience']
        job_requirement_qualification = job['jobDetail']['jobRequirement']['qualification']
        #job_requirement_fieldOfStudy = job['jobDetail']['jobRequirement']['fieldOfStudy']
        #job_requirement_industry = job['jobDetail']['jobRequirement']['industryValue']['label']
        #job_requirement_skill = job['jobDetail']['jobRequirement']['skills']
        job_employment_type = job['jobDetail']['jobRequirement']['employmentType']
        #job_languages = job['jobDetail']['jobRequirement']['languages']
        #job_benefits = job['jobDetail']['jobRequirement']['benefits']
        job_apply_url = job['applyUrl']['url']
        #job_location_zipcode = job['location'][0]['locationId']
        job_location = job['location'][0]['location']
        job_country = job['sourceCountry']

        return [job_title, job_salary_min, job_salary_max, job_salary_currency, company, job_post_date, job_internship, company_overview, job_description, job_requirement_career_level, job_requirement_yearsOfExperience, job_requirement_qualification, job_employment_type, job_apply_url, job_location, job_country]
    else:
        return []

def page_crawler(keyword):
    # input: keyword for job postings
    # output: dataframe of links scraped from each page

    # page number
    page_number = get_page_number(keyword)
    job_links = []

    for n in range(page_number):
        print('Loading page {} ...'.format(n+1))
        url = base_url.format(keyword, n+1)
        driver.get(url)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
    
        #extract all job links
        links = soup.find_all('a',{'class':'_1hr6tkx5 _1hr6tkx7 _1hr6tkxa sx2jih0 sx2jihf zcydq8h'})
        job_links += links
 
    jobs = []

    for link in job_links:
        job_link = link['href'].strip().split('?', 1)[0]
        #Combine the search keyword and job link as the first two columns
        if(len(job_page_scraper(job_link))):
            jobs.append(job_page_scraper(job_link))
    
    #Creates dataframe with jobs as values, and columns as column names
    result_df = pd.DataFrame(jobs, columns = ["job_title", "job_salary_min", "job_salary_max", "job_salary_currency", "company", "job_post_date", "job_internship", "company_overview", "job_description", "job_requirement_career_level", "job_requirement_yearsOfExperience", "job_requirement_qualification", "job_employment_type", "job_apply_url", "job_location", "job_country"])
    return result_df

# def main():

#     # a list of job roles to be crawled
#     key_words = ['frontend ux developer morgan']
#     dfs = []

#     for key in key_words:
#         key_df = page_crawler(key)
#         dfs.append(key_df)

#     # save scraped information as csv
#     pd.concat(dfs).to_csv("job_postings_results.csv")

# if __name__ == '__main__':
#     main()

  driver = Chrome(executable_path=path)


In [89]:
#Request keyword
search_term = input("Enter Search Term: ")

# a list of job roles to be crawled
# key_words = ['frontend ux developer DBS']
key_words = [search_term]
dfs = []

for key in key_words:
    key_df = page_crawler(key)
    dfs.append(key_df)

# save scraped information as csv
#pd.concat(dfs).to_csv("job_postings_results.csv")

Enter Search Term: frontend ux developer DBS
<span class="sx2jih0 zcydq84u es8sxo0 es8sxo1 es8sxo21 _1d0g9qk4 es8sxo7"><strong class="es8sxo3">1-4</strong> of 6 jobs </span>
['1-4', 'of', '6', 'jobs']
Loading page 1 ...
scraping... https://www.jobstreet.com.sg/en/job/vp-avp-reactjs-developer-branch-and-self-serviced-banking-consumer-banking-group-technology-technology-operations-wd39175-jobsthatmatter-10327510
scraping... https://www.jobstreet.com.sg/en/job/vp-avp-reactjs-developer-branch-and-self-serviced-banking-consumer-banking-group-technology-technology-operations-wd39175-jobsthatmatter-10327510
scraping... https://www.jobstreet.com.sg/en/job-search/frontend-ux-developer-dbs-jobs/
scraping... https://www.jobstreet.com.sg/en/job-search/frontend-ux-developer-dbs-jobs/
scraping... https://www.jobstreet.com.sg/en/job-search/frontend-ux-developer-dbs-jobs/
scraping... https://www.jobstreet.com.sg/en/job/vp-avp-reactjs-developer-branch-and-self-serviced-banking-consumer-banking-group-te

In [86]:
key_df

Unnamed: 0,job_title,job_salary_min,job_salary_max,job_salary_currency,company,job_post_date,job_internship,company_overview,job_description,job_requirement_career_level,job_requirement_yearsOfExperience,job_requirement_qualification,job_employment_type,job_apply_url,job_location,job_country
0,"VP/AVP, ReactJS Developer, Branch and Self-ser...",,,SGD,DBS Bank Limited,10-Jan-23,False,DBS is a leading financial services group in A...,"Business FunctionAs the leading bank in Asia, ...",Senior Manager,8 years,"Bachelor's Degree, Post Graduate Diploma, Prof...",Full-Time,https://careers.dbs.com/careersection/dbs_prof...,Singapore,sg
1,"VP/AVP, - ReactJS Developer, Branch and Self-s...",,,SGD,DBS Bank Limited,10-Jan-23,False,DBS is a leading financial services group in A...,"Business FunctionAs the leading bank in Asia, ...",Senior Manager,8 years,"Bachelor's Degree, Post Graduate Diploma, Prof...",Full-Time,https://careers.dbs.com/careersection/dbs_prof...,Singapore,sg
2,Full Stack Developer,,,SGD,Tow Me Sg Pte. Ltd.,17 hours ago,False,,Full Stack Developer Job DescriptionAs the ful...,Not Specified,,Not Specified,Full-Time,https://sg.jobsdb.com/job/rd/8216d70245e46abea...,Singapore,sg
3,Copywriter,,,SGD,DDB Worldwide Pte Ltd,14-Jan-23,False,About DDB Group Singapore – DDB Group is one o...,About DDB Group Singapore – DDB Group is one o...,Not Specified,,Not Specified,Full-Time,https://sg.jobsdb.com/job/rd/f9cba4fc20187b8cc...,Kallang,sg


## Convert to JSON

In [68]:
#Define Dictionary

array = []
#Columns
for x in range(len(key_df.columns)):
    #Rows
    for y in range(len(key_df.index)):
        pikachu = {
            for z in range(len(key_df.columns)):
                str(key_df.columns[z]): key_df.iloc[y][z]
        }
        array.append[pikachu]

SyntaxError: invalid syntax (Temp/ipykernel_3144/2617395563.py, line 9)

In [66]:
key_df.iloc[0][0]

'VP/AVP, - ReactJS Developer, Branch and Self-serviced Banking, Consumer Banking Group Technology, Technology & Operations - (WD39175) #JobsThatMatter.'

In [67]:
key_df.columns[0]

'job_title'

In [59]:
len(key_df.index)

4

In [58]:
X = key_df.to_json('dataframe.json')["job1"]

In [74]:
X = {}

X["job"] = key_df.to_json('dataframe.json')

job1: {
    jobtitle:
    
}
job2: {
    
}

In [95]:
import pprint

X = {}
X = key_df.to_dict(orient='records')
pprint.pprint(X)

[{'company': 'DBS Bank Limited',
  'company_overview': 'DBS is a leading financial services group in Asia, with '
                      'over 280 branches across 18 markets. Headquartered and '
                      'listed in Singapore, DBS has a growing presence in the '
                      'three key Asian axes of growth: Greater China, '
                      'Southeast Asia and South Asia. The bank\'s "AA-" and '
                      '"Aa1" credit ratings are among the highest in the '
                      'world.DBS is at the forefront of leveraging digital '
                      'technology to shape the future of banking, and has been '
                      'named “World’s Best Digital Bank” by Euromoney in 2016 '
                      'and 2018. The bank has also been recognised for its '
                      'leadership in the region, having been named “Asia’s '
                      'Best Bank” by several publications including The '
                      'Banker, Glob

In [75]:
print(X)

{'job': None}


In [71]:
Y = json.load(X)
print(Y.job_title)

AttributeError: 'NoneType' object has no attribute 'read'