In [2]:
import pandas as pd
import requests
import hashlib 
import time

In [11]:
# Job ad identifiers to look up, held as a one-column DataFrame.
mcf_ids_raw = {
    'Job Ad Id': [
        'MCF-2021-0334514',
        'MCF-2021-0300571',
    ],
}
mcf_ids = pd.DataFrame(data=mcf_ids_raw)

In [12]:
# The uuid used in the API URL is the hex MD5 digest of the job ad id string.
mcf_ids['uuid'] = mcf_ids['Job Ad Id'].map(
    lambda job_ad_id: hashlib.md5(job_ad_id.encode()).hexdigest()
)

In [14]:
# Preview the first rows of the id table, including the derived uuid column.
mcf_ids.head()

Unnamed: 0,Job Ad Id,uuid
0,MCF-2021-0334514,20c875b6cfacd68fe3667f5ab6b15c5c
1,MCF-2021-0300571,dd4ceadcc7174a3be99bcebef566fbc9


In [15]:
base_url = 'https://api.mycareersfuture.gov.sg/v2/jobs'

# Tunables for the fetch loop (all delays/timeouts are in seconds).
REQUEST_TIMEOUT = 10   # without a timeout a stalled connection blocks forever
MAX_ATTEMPTS = 3       # total tries per uuid before giving up on it
POLITE_DELAY = .1      # pause after each successful request
BACKOFF_DELAY = 2      # pause before retrying a failed request

# `output` collects one decoded JSON payload per successfully fetched posting.
# `failed_uuids` records postings that never returned HTTP 200, so that error
# payloads are not appended to `output` and passed on to later cells.
output = []
failed_uuids = []
for uuid in mcf_ids['uuid']:
    job_url = base_url + "/" + uuid
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            req = requests.get(job_url, timeout=REQUEST_TIMEOUT)
        except requests.RequestException as exc:
            # Network-level failure (timeout, connection error): treat it like
            # a non-200 response and retry.
            print('Request error for ' + uuid + ': ' + str(exc))
            req = None

        if req is not None and req.status_code == 200:
            output.append(req.json())
            time.sleep(POLITE_DELAY)
            break

        if attempt < MAX_ATTEMPTS:
            print('Backing off...')
            time.sleep(BACKOFF_DELAY)
    else:
        # Every attempt failed: skip this posting instead of storing a
        # response that is not a job payload.
        failed_uuids.append(uuid)
        print('Giving up on ' + uuid + ' after ' + str(MAX_ATTEMPTS) + ' attempts')

In [17]:
# Inspect the 'skills' entry of the first fetched payload to see its structure.
output[0]['skills']

[{'id': 6782,
  'skill': 'PL/SQL',
  'uuid': '1f93cbc3d076ca8436b48d4d75fb0939',
  'confidence': None,
  '_links': {'self': {'href': 'https://api.mycareersfuture.gov.sg/v2/skills/1f93cbc3d076ca8436b48d4d75fb0939'},
   'jobs': {'href': 'https://api.mycareersfuture.gov.sg/v2/skills/1f93cbc3d076ca8436b48d4d75fb0939/jobs'}}},
 {'id': 100013,
  'skill': 'designed',
  'uuid': '3677b55b4fb1c656b2a6ec776093be88',
  'confidence': None,
  '_links': {'self': {'href': 'https://api.mycareersfuture.gov.sg/v2/skills/3677b55b4fb1c656b2a6ec776093be88'},
   'jobs': {'href': 'https://api.mycareersfuture.gov.sg/v2/skills/3677b55b4fb1c656b2a6ec776093be88/jobs'}}},
 {'id': 4720,
  'skill': 'Issue Management',
  'uuid': '3fa25db427db361eee13817c6f4c7726',
  'confidence': None,
  '_links': {'self': {'href': 'https://api.mycareersfuture.gov.sg/v2/skills/3fa25db427db361eee13817c6f4c7726'},
   'jobs': {'href': 'https://api.mycareersfuture.gov.sg/v2/skills/3fa25db427db361eee13817c6f4c7726/jobs'}}},
 {'id': 106862

In [20]:
def extract_mcf_data(json):
    """Flatten one job-posting payload into a single-level dict.

    Parameters
    ----------
    json : dict
        Decoded job payload. It must provide the top-level job fields copied
        below, a 'skills' list of dicts each holding a 'skill' key, a
        'metadata' dict, a 'salary' dict, and the company dict selected by
        metadata['isPostedOnBehalf'].

    Returns
    -------
    dict
        Keys in insertion order: the job fields, 'skills' (comma-separated
        string), 'company_*' fields, the posting metadata fields, then
        'salary_maximum' and 'salary_minimum'.

    Raises
    ------
    KeyError
        If any expected key is absent from the payload.
    """
    # Top-level job fields are copied over unchanged.
    job_fields = ('uuid', 'title', 'description', 'minimumYearsExperience', 'numberOfVacancies')
    record = {field: json[field] for field in job_fields}

    # Collapse the list of skill entries into one comma-separated string.
    record['skills'] = ', '.join(item['skill'] for item in json['skills'])

    # When the ad is posted on behalf of another firm, report the hiring
    # company; otherwise report the company that posted it.
    company_col = 'hiringCompany' if json['metadata']['isPostedOnBehalf'] else 'postedCompany'
    company_info = json[company_col]
    for field in ('name', 'description', 'ssicCode', 'employeeCount'):
        record['company_' + field] = company_info[field]

    # Posting dates and view/application counters live under 'metadata'.
    posting_meta = json['metadata']
    for field in ('originalPostingDate', 'newPostingDate', 'expiryDate',
                  'totalNumberOfView', 'totalNumberJobApplication'):
        record[field] = posting_meta[field]

    # Salary bounds, maximum first to keep the original column order.
    salary_info = json['salary']
    record.update({'salary_' + bound: salary_info[bound] for bound in ('maximum', 'minimum')})

    return record

In [21]:
# Flatten every fetched payload into one row, then attach those rows to the
# id table. The left join keeps every job ad id, even one with no fetched data.
mcf_details = pd.DataFrame(list(map(extract_mcf_data, output)))
mcf_data = mcf_ids.merge(mcf_details, on='uuid', how='left')

# Remove rows that are exact duplicates across all columns.
mcf_final = mcf_data.drop_duplicates()

In [22]:
# Preview the first rows of the merged, de-duplicated result.
mcf_final.head()

Unnamed: 0,Job Ad Id,uuid,title,description,minimumYearsExperience,numberOfVacancies,skills,company_name,company_description,company_ssicCode,company_employeeCount,originalPostingDate,newPostingDate,expiryDate,totalNumberOfView,totalNumberJobApplication,salary_maximum,salary_minimum
0,MCF-2021-0334514,20c875b6cfacd68fe3667f5ab6b15c5c,Production Support Lead Analyst,<p><strong>Role Description</strong></p>\n<p...,5,1,"PL/SQL, designed, Issue Management, continuous...",FNZ (SINGAPORE) SERVICES PTE. LTD.,,66199,,2021-07-13,2021-07-13,2021-08-12,292,20,10000,7500
1,MCF-2021-0300571,dd4ceadcc7174a3be99bcebef566fbc9,Unit Manager,<p><strong>Job Description</strong></p>\n<p><b...,10,1,"Negotiation, Coaching, business understanding,...",SUZETTE INTERNATIONAL PTE. LTD.,"<p>Henri Charpentier, famous popular Japanese ...",47213,25.0,2021-06-25,2021-06-25,2021-07-25,354,39,9700,9400
