## Pulling

In [1]:
import os
os.chdir('..')

In [2]:
import pandas as pd
import requests
import hashlib 
import time

In [35]:
df_preds = pd.read_csv('Data/Processed/MCF_Training_Set.csv').drop_duplicates()

In [36]:
df_preds['uuid'] = [hashlib.md5(job_id.encode()).hexdigest() for job_id in df_preds['MCF_Job_Ad_ID']]

In [37]:
len(df_preds)

53888

In [43]:
base_url = 'https://api.mycareersfuture.gov.sg/v2/jobs'

output = []
for i, uuid in enumerate(df_preds['uuid']):
    if i < 33653:
        continue
    req = requests.get(base_url + "/" + uuid)
    if (i+1) % 10 == 0:
        print(f"Completed {i+1}...\r", end = '')
        time.sleep(.1)
    if req.status_code == 200:
        output.append(req.json())
    else:
        print('Backing off for 2 seconds...')
        time.sleep(2)
        req = requests.get(base_url + "/" + uuid)
        output.append(req.json())

Backing off for 2 seconds...
Backing off for 2 seconds...
Backing off for 2 seconds...
Backing off for 2 seconds...
Backing off for 2 seconds...
Completed 53880...

In [44]:
len(output)

20235

In [40]:
import copy
saved = copy.deepcopy(output)

In [45]:
len(saved)

33654

In [49]:
final = saved + output[1:]

In [73]:
final[0]

{'uuid': '539df02ed701baaa6d20b5e8fb82b9cc',
 'sourceCode': 'Employer Portal',
 'title': 'Sales director',
 'description': '<h3>Responsibilities:</h3>\n<h3>· Lead the management team and being responsible for the APAC partnerships</h3>\n<h3>· Growing the revenue through orchestrating execution and strategic planning across different channels of clients</h3>\n<h3>· Ability to leverage across Business to Business clients and targeting the cross functionally – offering exemplary customer service support for the food safety clients across the region</h3>\n<h3>· Growth of the services and education portfolio through key customers</h3>\n<h3>Requirements:</h3>\n<h3>· Minimum 10 years in the food industry</h3>\n<h3>· Track record of success across business development and marketing across the region</h3>\n<h3>· Must have excellent communication skills as well as sensitivity towards all markets</h3>\n<h3>· Strong negotiations and ability to strategise and always see further opportunities across

In [51]:
def extract_mcf_data(json):
    
    output = {}
    transfer = ['uuid', 'title', 'description', 'minimumYearsExperience', 'numberOfVacancies']
    for key in transfer:
        output[key] = json[key]

    # Extract skills
    output['skills'] = ', '.join([entry['skill'] for entry in json['skills']])
    
    # Extract hiring company
    company = ['name', 'description', 'ssicCode', 'employeeCount']
    if json['metadata']['isPostedOnBehalf']:
        company_col = 'hiringCompany'
    else:
        company_col = 'postedCompany'
    for key in company:
        output['company_' + key] = json[company_col][key]
        
    # Extract metadata
    metadata = ['originalPostingDate', 'newPostingDate', 'expiryDate', 'totalNumberOfView', 'totalNumberJobApplication']
    for key in metadata:
        output[key] = json['metadata'][key]

    
    # Extract salary
    salary = ['maximum', 'minimum']
    for key in salary:
        output['salary_' + key] = json['salary'][key]
    
    return output

In [68]:
mcf_data = df_preds.merge(pd.DataFrame([extract_mcf_data(entry) for entry in final]), how = 'left', on = 'uuid')


In [71]:
mcf_final = mcf_data[mcf_data['title'].notnull()].drop_duplicates('description')

In [46]:
mcf_final.to_csv('Data/Processed/MCF_Training_Set_Full.csv', index = False)

In [74]:
mcf_final.shape

(42844, 19)

In [1]:
import os
os.chdir('C:\\Users\\shaun\\PycharmProjects\\ssoc-autocoder')
import pandas as pd

In [2]:
mcf_final = pd.read_csv('Data/Processed/MCF_Training_Set_Full.csv', low_memory = False)
output_df = mcf_final[['description', 'Predicted_SSOC_2020']]
output_df.columns = ['Description', 'SSOC 2020']

In [45]:
mcf_final

Unnamed: 0,MCF_Job_Ad_ID,Predicted_SSOC_2020,uuid,title,description,minimumYearsExperience,numberOfVacancies,skills,company_name,company_description,company_ssicCode,company_employeeCount,originalPostingDate,newPostingDate,expiryDate,totalNumberOfView,totalNumberJobApplication,salary_maximum,salary_minimum
0,MCF-2020-0004327,11202,539df02ed701baaa6d20b5e8fb82b9cc,Sales director,<h3>Responsibilities:</h3>\n<h3>· Lead the man...,10,1,"Account Management, Strategic Planning, Sales,...",ARGYLL SCOTT CONSULTING PTE. LTD.,,70201.0,,2020-01-07,2020-01-07,2020-01-21,53,0,15000,12000
1,MCF-2020-0005420,21499,025e20c4553746731c77858dde113dcb,E&I INSPECTOR (COMPEX),<h3>· Involved in Inspection and Commissioning...,5,5,"Document Management, Operation, Engineering Ma...",LISOON INDUSTRIAL ENGINEERING PTE. LTD.,<p>LISOON INDUSTRIAL ENGINEERING PTE. LTD.</p>...,30110.0,,2020-01-08,2020-01-08,2020-02-07,86,0,3200,2500
2,MCF-2020-0033506,14201,65d172acd8ba990726a1c725aa554008,STORE MANAGER,"<p><a href=""https://www.snagajob.com/job-searc...",5,5,"Retail, Plan, Sales, Search, Schedule, Visual ...",SKY CAPITAL HOLDINGS PTE. LTD.,,10750.0,,2020-02-11,2020-02-11,2020-03-12,78,0,6000,4000
3,MCF-2020-0063729,22692,62333158febf9ab5660c7c3f0cb45d12,Associate Chiropractor Wanted!,<h3><strong>WE ARE LOOKING FOR:</strong>​</h3>...,1,2,"Communication, Health Education, Holistic Heal...",ORGANIC FAMILY CHIROPRACTIC PTE. LTD.,<p>ORGANIC FAMILY CHIROPRACTIC PTE. LTD.</p>\r\n,86909.0,,2020-03-15,2020-07-21,2020-08-20,196,0,6000,4000
4,MCF-2020-0073711,41109,e4863be8c042dbb8eda4af601b7338bf,Admin,<p>1) Creating &amp; maintaining filing system...,1,1,"Microsoft Office, Microsoft Excel, maintaining...",BINJAI INTERIOR DESIGN PTE LTD,<p>\r\n\tBINJAI INTERIOR DESIGN PTE LTD</p>\r\n,31001.0,,2020-03-30,2020-07-06,2020-08-05,471,0,2400,1800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42839,MCF-2021-0247058,41109,808271179f2b41f200ed9bd549db79bb,"Talent Acquisition Associate, Group HR, 6 Mont...",<p><strong>Roles &amp; Responsibilities</str...,0,1,"Communication, Onboarding, telecommunation, Ta...",SINGAPORE TELECOMMUNICATIONS LIMITED,<p>The SingTel Group is Asia's leading communi...,61099.0,,2021-05-31,2021-07-15,2021-08-14,1219,124,3500,2200
42840,MCF-2021-0247063,21499,6c33746d5f4434ccfee82df9d0b4cde9,#SGUnitedJobs Senior Research Engineer (Proces...,<p>A qualified candidate will be self-motivate...,3,1,"Cleaning, Plan, autonomous, Development, Analy...",A*STAR RESEARCH ENTITIES,,72104.0,,2021-05-31,2021-05-31,2021-06-30,114,2,10200,5900
42841,MCF-2021-0247083,21441,c76e644a0c5cd0e6ee978b632d48f773,Senior Cold chain system Design Engineer,<p><strong>Purpose of the position</strong><br...,10,1,"project specification, Plan, Product Design, d...",LUCKLAND ASIA PTE. LTD.,,74191.0,,2021-05-31,2021-05-31,2021-06-30,96,2,8500,7500
42842,MCF-2021-0247098,26541,47456be3179841f60c7dfd33038f1579,Video Production Editor #SGUnitedJobs,<h1>What we're looking for</h1>\n<p>Economics ...,5,1,"Editorial, designed, Documentaries, Adobe Prem...",ECONOMICS DESIGN PRIVATE LIMITED,<p>Economics design focuses on the design of e...,62021.0,,2021-05-31,2021-05-31,2021-06-30,31,0,2000,1800


In [47]:
data = mcf_final[~mcf_final['Predicted_SSOC_2020'].fillna('X').str.contains('X')].reset_index(drop = True)
data.to_csv('Data/Processed/MCF_Training_Set_Full.csv', index = False)

In [17]:
import hashlib
import requests

In [44]:
job_id = data.loc[22952]['MCF_Job_Ad_ID']
job_id_hash = hashlib.md5(job_id.encode()).hexdigest()
base_url = 'https://api.mycareersfuture.gov.sg/v2/jobs'
req = requests.get(base_url + "/" + job_id_hash)
print(req.json()['title'])
print(req.json()['ssocCode'])
print(req.json()['description'])

Head of Revenue
14752
<p><strong>ABOUT US</strong></p>
<p>RedDoorz is a Singapore-based travel tech company with offices in Indonesia, Philippines, Vietnam, India and Singapore. Our team have a diverse background - over 7 nationalities and previous work experiences from top companies such as Grab, Apple, Intercontinental, Lazada and Tata.</p>
<p>We are <strong>transforming the hotel industry</strong> through the latest innovation in hospitality after the initial waves of Online Travel Agents and Airbnb. <strong>Unlike traditional hotel chains like Holiday Inn or Marriott</strong>, we don't build physical hotels but partner with hundreds of owners across 50+ cities, therefore being asset light and scalable. <strong>Unlike Online Travel Agents like Booking.com or Expedia</strong>, all our hotels under us are branded as RedDoorz - therefore we can drive quality and predictable stays to our customers. </p>
<p><strong>Like Uber or Grab</strong>, we are the technology behind our hotel partne

In [41]:
req.json()

{'uuid': '1b48cc9e49767b07bc2a60e96ca332bf',
 'sourceCode': 'Employer Portal',
 'title': 'On Site Coordinator ',
 'description': '<p><strong>Responsibilities:</strong></p>\n<ul>\n  <li>Manage and control all ingress/egress in the Defined Area.</li>\n  <li>Daily updating of track access/book-ins and book-outs</li>\n  <li>Implement Workplace Safety and regulate the conduct of Works Train Station Supervisor.</li>\n  <li>Ensure routes are set and secured before authorising consist of movements.</li>\n  <li>Other duties as assigned.</li>\n</ul>\n<p><strong>Requirements:</strong></p>\n<ul>\n  <li>Local Diploma in Engineering Discipline</li>\n  <li>Min 5 years relevant past working experiences</li>\n  <li>Possess a relevant Class 3 or 4 local driving licence.</li>\n  <li>Knowledge of Microsoft Office application and able to provide weekly reports to the Work Train Manager.</li>\n  <li>To work independently and perform overtime work on a regular basis.</li>\n  <li>To conduct safety briefings r

In [34]:
req.json()

{'uuid': '434ddd4eb00de81a9b6dabd968f36078',
 'sourceCode': 'Employer Portal',
 'title': 'Senior Staff Module Development Engineer ',
 'description': '<p>Maxeon is seeking a world-class <strong>Senior Staff Module Development Engineer </strong>based in Singapore. It is expected that most of this work will be carried out at a new state-of-the-art Maxeon RD&amp;D Laboratory to be created in Singapore. Job responsibilities include research and development of new solar technology that pushes the limit of efficiency at lower cost, working closely with Maxeon’s renowned US based R&amp;D team, and supporting existing teams in deploying and optimizing solar technology at Maxeon’s factories. This work is expected to include both cell and module technology development for Maxeon’s advanced product portfolio. The goal of these activities is to support Maxeon’s business objectives, including market expansion, with targeted advanced technology.</p>\n<p><strong>ESSENTIAL DUTIES AND RESPONSIBILITIES<

In [4]:
from ssoc_autocoder import processing

In [7]:
cleaned = []
for i, desc in enumerate(output_df['Description']):
    if (i+1) % 100 == 0:
        print(f'Processing {i+1}/{len(output_df)}...\r', end = '')
    try:
        cleaned_desc = processing.process_text(desc)
    except KeyboardInterrupt:
        break 
    except:
        print(f'Error found in index {i}...')
        cleaned_desc = processing.final_cleaning(desc)
    cleaned.append(cleaned_desc)

Error found in index 3071...
Error found in index 3688...
Error found in index 7005...
Error found in index 18283...
Error found in index 19336...
Error found in index 25188...
Error found in index 27221...
Error found in index 34863...
Error found in index 38110...
Error found in index 40663...
Error found in index 41018...
Processing 42800/42844...

In [8]:
output_df['Cleaned_Description'] = cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df['Cleaned_Description'] = cleaned


In [9]:
output_df.to_csv('Data/Processed/training/train_full.csv', index = False)

In [37]:
processing.final_cleaning(output_df['Description'][41018])

"1 Data entry duties and administrative duties include: Preparing quotation, final bill of quantity and sales invoices for the project you are assigned to. 2 Support project manager/ M&E coordinator. 3 Handling emails related to the project. 4 Ability to be on time and complete task in an efficient manner. 5 Manage databases and admin systems. 6 Organizing, storing paperwork, documents and computer-based documents. 7 Assisting in other office ad-ho duties when required. Skills Requirements. 1 Neat administrative and good organisational skills. 2 Able to thrive and work individually and independently. 3 Experienced and proficient in Excel. 4 Communication in English. 5 Possess ability to work on own initiative. 6 Fast learner. 7 Candidate must possess at least N's Level and above. Other information. 1 5.5 working days (alternate Saturdays off) (Mon - Fri: 8:30am to 5:30pm, Sat: 8:30am to 1pm) . 2 Location is in Kaki Bukit. 3 Full time job. 4 Work life balance. Kindly state your expected

In [38]:
output_df['Description'][41018]

'<p>Responsibilities\t\t\t\t\t\t\t\t</p>\n<p>\t\t\t\t\t\t\t\t</p>\n<p>1\tData entry duties and administrative duties include: Preparing quotation, final bill of quantity and sales invoices for the project you are assigned to\t\t\t\t\t\t\t</p>\n<p>2\tSupport project manager/ M&amp;E coordinator\t\t\t\t\t\t\t</p>\n<p>3\tHandling emails related to the project\t\t\t\t\t\t\t</p>\n<p>4\tAbility to be on time and complete task in an efficient manner\t\t\t\t\t\t\t</p>\n<p>5\tManage databases and admin systems\t\t\t\t\t\t\t</p>\n<p>6\tOrganizing, storing paperwork, documents and computer-based documents\t\t\t\t\t\t\t</p>\n<p>7\tAssisting in other office ad-ho duties when required\t\t\t\t\t\t\t</p>\n<p>\t\t\t\t\t\t\t\t</p>\n<p>Skills Requirements\t\t\t\t\t\t\t\t</p>\n<p>\t\t\t\t\t\t\t\t</p>\n<p>1\tNeat administrative and good organisational skills\t\t\t\t\t\t\t</p>\n<p>2\tAble to thrive and work individually and independently\t\t\t\t\t\t\t</p>\n<p>3\tExperienced and proficient in Excel\t\t\t\t\t