In [1]:
from urllib.parse import quote

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
def test_eight_components():
    """Smoke-test a Chrome session against the Selenium demo web form.

    Opens the demo page, checks its title, types "Selenium" into the text
    box, submits the form and checks the confirmation message.

    Raises:
        AssertionError: if the page title or the confirmation message is
            not the expected text.
    """
    driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))

    # try/finally guarantees the browser is closed even when an assertion or
    # an element lookup raises; otherwise the Chrome session is left running.
    try:
        driver.get("https://www.selenium.dev/selenium/web/web-form.html")

        title = driver.title
        assert title == "Web form"

        # Let element lookups retry for up to half a second before failing.
        driver.implicitly_wait(0.5)

        text_box = driver.find_element(by=By.NAME, value="my-text")
        submit_button = driver.find_element(by=By.CSS_SELECTOR, value="button")

        text_box.send_keys("Selenium")
        submit_button.click()

        message = driver.find_element(by=By.ID, value="message")
        value = message.text
        assert value == "Received!"
    finally:
        driver.quit()

In [3]:
# Base URL of the job portal; the search path and query are appended to it.
url = "https://www.mycareersfuture.gov.sg/"

In [4]:
search_term = "analyst"
# Percent-encode the search term so that terms containing spaces, '&', '/'
# or other reserved characters still produce a well-formed query string.
# The URL deliberately ends with "page=0" so the page number stays last.
url_search_term = url + f"search?search={quote(search_term, safe='')}&sortBy=relevancy&page=0"

In [5]:
url_search_term

'https://www.mycareersfuture.gov.sg/search?search=analyst&sortBy=relevancy&page=0'

In [6]:
# Start the Chrome session that the remaining cells share, then open the
# first page of search results.
chrome_service = ChromeService(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service)
driver.get(url_search_term)

In [7]:
# Need to fetch all of the links in the card-list

In [8]:
test_links = driver.find_elements(by=By.TAG_NAME, value="a")

In [9]:
# Keep the href of every anchor whose URL mentions 'job'.
# The href is read once per element: each get_attribute call is a separate
# request to the browser, and the original asked up to three times per link.
links = []

for anchor in test_links:
    href = anchor.get_attribute('href')
    if href and 'job' in href:
        links.append(href)

In [10]:
links

['https://www.mycareersfuture.gov.sg/job/environment/business-analyst-executive-spearing-executive-search-62c43a150cac4298d26077b285b70ebf?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/banking-finance/business-analyst-encora-technologies-e2dd0ab746d5a4140904d3c056cdbee9?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/consulting/business-analyst-sp-digital-e99e44620aa4beed5f7d15b971656d5f?source=MCF&event=SuggestedJob',
 'https://www.mycareersfuture.gov.sg/job/healthcare/assistant-lead-analyst-morgan-mckinley-3e43b4bd6c3f70535481e98d17625c48?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/banking-finance/analyst-brookfield-singapore-95c4bd5987abcc5a0a4fdcf699b641d2?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/banking-finance/analyst-polymer-capital-singapore-ac84eceba7fd514a89b1639e8c3be4cc?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/information-technology/french-speaking-senior-busin

In [11]:
# Inside an individual page

In [12]:
driver.get(links[0])

In [13]:
job_details = {}

In [14]:
# Every field below lives inside the same job-info container.
JOB_INFO_XPATH = "//div[@data-cy='JobDetails__job-info']"


def _job_info_element(relative_xpath):
    """Locate one element inside the job-info container of the current page."""
    return driver.find_element(by=By.XPATH, value=JOB_INFO_XPATH + relative_xpath)


company = _job_info_element("//p[@data-cy='company-hire-info__company']")
position = _job_info_element("//h1[@id='job_title']")
address = _job_info_element("//p[@id='address']")
employment_type = _job_info_element("//p[@id='employment_type']")
seniority = _job_info_element("//p[@id='seniority']")
min_experience = _job_info_element("//p[@id='min_experience']")
job_category = _job_info_element("//p[@id='job-categories']")
salary = _job_info_element("//span[@data-cy='salary-range']")
num_applications = _job_info_element("//span[@id='num_of_applications']")
last_posted_date = _job_info_element("//span[@id='last_posted_date']")
expiry_date = _job_info_element("//span[@id='expiry_date']")

In [15]:
company.text

'SPEARING EXECUTIVE SEARCH PTE. LTD.'

In [16]:
position.text

'Business Analyst Executive'

In [17]:
address.text

'FUNAN, 109 NORTH BRIDGE ROAD 179097'

In [18]:
employment_type.text

'Full Time'

In [19]:
seniority.text

'Executive'

In [20]:
min_experience.text

'3 years exp'

In [21]:
job_category.text

'Environment / Health, Healthcare / Pharmaceutical, Professional Services, Risk Management'

In [22]:
salary.text

'$4,000to$5,500'

In [23]:
num_applications.text

'0 application'

In [24]:
last_posted_date.text

'Posted 26 Jan 2023'

In [25]:
expiry_date.text

'Closing on 25 Feb 2023'

This section will deal with capturing the main bulk of unstructured text.

It needs to capture the Job Requirements (what is needed from the candidate)

versus the Job Description (what the job entails).

Unfortunately, several things get in the way of smooth scraping, namely:
- no fixed format
- Additional sections like good to haves
  - https://www.mycareersfuture.gov.sg/job/accounting/corporate-finance-analyst-green-li-ion-d0f504c7c8e67ca416caacea94dca691?source=MCF&event=Search
- Unconventional formatting
  - https://www.mycareersfuture.gov.sg/job/banking-finance/business-analyst-rapsys-technologies-a85f3d76a3a9fbbd4ff911af99d4d081?source=MCF&event=Search


Examples of the ideal format:

https://www.mycareersfuture.gov.sg/job/banking-finance/business-analyst-nicoll-curtin-technology-df4fdf2b951da4275a24fae5cfe5acd8?source=MCF&event=Search

https://www.mycareersfuture.gov.sg/job/accounting/financial-analyst-starhub-92a414a8c360bd70dacc17e6587dafb8?source=MCF&event=Search

My idea is to capture the text line by line, split by paragraph, bullet point, or line break (inside one big paragraph). When a keyword such as 'responsibility', 'JD' or 'Requirements' is found, it becomes a key, and the content that follows becomes its value until the next keyword is found.

I will save this processing for later

In [26]:
JD = driver.find_element(by=By.XPATH, value="//div[@id='description-content']")

In [27]:
JD.text

'Position Overview\nThe Business Analyst provides analytical support for the development of technology solutions that meet business needs. The candidate assists senior team members with projects, partners with stakeholders to identify, clarify, and document complex issues they are facing. The candidate assists with the assessment of the viability and effectiveness of technical systems and business processes can in meeting these needs as well as facilitates communication between business and technology teams. The candidate has a passion for learning and is comfortable exploring and analysing data.\n\nRole & Responsibilities\nIdentify business needs\nLiaise with internal and/or external stakeholders to identify system requirements\nSupport user requirement analysis\nTranslate system requirements into project briefs\nSupport the preparation of proposals for modified or replacement systems\nAnalyse systems\nSupport the analysis of existing systems through information collection\nDocument s

In [28]:
def get_jobpost_details(link:str)->dict:
    """Open a job posting and collect the text of its detail fields.

    The page is loaded with the notebook-level ``driver``.

    Args:
        link (str): link of the job posting

    Returns:
        dict: the text of each located element, keyed by 'company',
        'position', 'address', 'employment_type', 'seniority',
        'min_experience', 'job_category', 'salary', 'num_applications',
        'last_posted_date', 'expiry_date' and 'JD'.
    """
    driver.get(link)

    info_panel = "//div[@data-cy='JobDetails__job-info']"

    # (result key, XPath) pairs, listed in the order they are looked up.
    field_xpaths = (
        ('company', info_panel + "//p[@data-cy='company-hire-info__company']"),
        ('position', info_panel + "//h1[@id='job_title']"),
        ('address', info_panel + "//p[@id='address']"),
        ('employment_type', info_panel + "//p[@id='employment_type']"),
        ('seniority', info_panel + "//p[@id='seniority']"),
        ('min_experience', info_panel + "//p[@id='min_experience']"),
        ('job_category', info_panel + "//p[@id='job-categories']"),
        ('salary', info_panel + "//span[@data-cy='salary-range']"),
        ('num_applications', info_panel + "//span[@id='num_of_applications']"),
        ('last_posted_date', info_panel + "//span[@id='last_posted_date']"),
        ('expiry_date', info_panel + "//span[@id='expiry_date']"),
        ('JD', "//div[@id='description-content']"),
    )

    # Locate every element first and only then read the texts, so a missing
    # field raises before any text is collected.
    located = [
        (key, driver.find_element(by=By.XPATH, value=xpath))
        for key, xpath in field_xpaths
    ]

    return {key: element.text for key, element in located}

In [29]:
# Go to next link

In [30]:
test_function_res = get_jobpost_details(links[1])

In [31]:
test_function_res

{'company': 'ENCORA TECHNOLOGIES PTE. LTD.',
 'position': 'Business Analyst',
 'address': 'Islandwide',
 'employment_type': 'Contract',
 'seniority': 'Senior Executive',
 'min_experience': '8 years exp',
 'job_category': 'Banking and Finance',
 'salary': '$7,000to$10,000',
 'num_applications': '1 application',
 'last_posted_date': 'Posted 26 Jan 2023',
 'expiry_date': 'Closing on 09 Feb 2023',
 'JD': 'Business Systems Analyst is responsible for managing significant projects (e.g., strategic change management, new business and product initiatives, process re-engineering, etc.), and includes business systems analysis or technical writing. Activities require a broad knowledge of the organization and its key functions. The Business Systems Analysis focus specializes in acting as the primary interface between Technology and specific business areas, identifying business unit requirements, creating project and process specifications, coordinating with project teams, and ensuring adherence to 

In [32]:
def get_links(link:str)->list:
    """Load a page and return the href of every anchor whose URL contains 'job'.

    The page is loaded with the notebook-level ``driver``.

    Args:
        link (str): url of the page

    Returns:
        list: the matching href strings, in the order the driver returned
        the anchors; empty when no anchor matches.
    """
    driver.get(link)
    anchors = driver.find_elements(by=By.TAG_NAME, value="a")

    # Read each href exactly once: every get_attribute call is a separate
    # request to the browser, and the previous version made up to three per
    # anchor. Distinct names also stop the loop variable from shadowing the
    # `link` parameter.
    hrefs = (anchor.get_attribute('href') for anchor in anchors)

    # Substring match on 'job' is kept as-is so existing results do not change.
    return [href for href in hrefs if href and 'job' in href]

In [33]:
url_search_term

'https://www.mycareersfuture.gov.sg/search?search=analyst&sortBy=relevancy&page=0'

In [34]:
# Split on the trailing "page=" parameter and increment the whole number.
# Replacing only the last character gives a wrong URL once the page number
# has several digits and ends in 9 (page=19 would become page=110).
search_base, current_page = url_search_term.rsplit("page=", 1)
next_page = f"{search_base}page={int(current_page) + 1}"

In [35]:
next_page

'https://www.mycareersfuture.gov.sg/search?search=analyst&sortBy=relevancy&page=1'

In [36]:
links_next_page = get_links(next_page)

In [37]:
links_next_page

['https://www.mycareersfuture.gov.sg/job/manufacturing/data-analyst-shell-eastern-petroleum-8a6f3530d09ce46da39372f15a7aa0fa?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/banking-finance/functional-business-analyst-shell-infotech-034b511de2a840b184b13afd4cf20f56?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/information-technology/sap-analyst-fcs-careers-9ae024bba627f64c551e13824f164ebf?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/information-technology/system-analyst-randstad-2c2aa3f068ebffe61d7bb10564e156ff?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/repair-maintenance/system-analyst-sap-west-recruitpedia-554ca620df56f33da4e6b5e7c60f88e7?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/information-technology/analyst-programmer-collabera-technologies-bade631262feaba2b0ef9aef3aeb582d?source=MCF&event=Search',
 'https://www.mycareersfuture.gov.sg/job/banking-finance/analyst-portcullis

In [38]:
# Scrape till no links available: build a page number far past the last
# results page. The whole trailing page number is parsed, not just its last
# character, so this also works when the URL is not on a single-digit page.
results_base, results_page = url_search_term.rsplit("page=", 1)
invalid_page = f"{results_base}page={int(results_page) + 9999}"

In [39]:
links_invalid_page = get_links(invalid_page)

In [40]:
links_invalid_page

[]

In [41]:
# Throwaway single-entry dicts for checking that `+` concatenates two lists
# of dicts into one flat list.
a_dict, b_dict, c_dict, d_dict = ({letter: letter} for letter in 'abcd')
test_list = [a_dict, b_dict]
second_list = [c_dict, d_dict]

In [42]:
test_list + second_list

[{'a': 'a'}, {'b': 'b'}, {'c': 'c'}, {'d': 'd'}]

In [52]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [67]:
from scraper import Scraper

In [68]:
scraper = Scraper(search_term='analyst')

In [69]:
print(scraper)

Chrome Driver with search term: analyst


In [70]:
scraper._test_scrape()

2023-01-26:14:22:10,179 DEBUG    [remote_connection.py:303] POST http://localhost:63921/session/97882d2dd7e02d5448c368581bd17d29/timeouts {"implicit": 2000}
2023-01-26:14:22:10,182 DEBUG    [connectionpool.py:456] http://localhost:63921 "POST /session/97882d2dd7e02d5448c368581bd17d29/timeouts HTTP/1.1" 200 14
2023-01-26:14:22:10,183 DEBUG    [remote_connection.py:319] Remote response: status=200 | data={"value":null} | headers=HTTPHeaderDict({'Content-Length': '14', 'Content-Type': 'application/json; charset=utf-8', 'cache-control': 'no-cache'})
2023-01-26:14:22:10,184 DEBUG    [remote_connection.py:347] Finished Request
2023-01-26:14:22:10,185 DEBUG    [remote_connection.py:303] POST http://localhost:63921/session/97882d2dd7e02d5448c368581bd17d29/url {"url": "https://www.mycareersfuture.gov.sg/search?search=analyst&sortBy=relevancy&page=0"}
2023-01-26:14:22:12,258 DEBUG    [connectionpool.py:456] http://localhost:63921 "POST /session/97882d2dd7e02d5448c368581bd17d29/url HTTP/1.1" 200 

[]