## Pulling

In [1]:
import os
os.chdir('..')

In [2]:
import pandas as pd
import requests
import hashlib 
import time

In [35]:
df_preds = pd.read_csv('Data/Processed/MCF_Training_Set.csv').drop_duplicates()

In [36]:
df_preds['uuid'] = [hashlib.md5(job_id.encode()).hexdigest() for job_id in df_preds['MCF_Job_Ad_ID']]

In [37]:
len(df_preds)

53888

In [43]:
base_url = 'https://api.mycareersfuture.gov.sg/v2/jobs'

output = []
for i, uuid in enumerate(df_preds['uuid']):
    if i < 33653:
        continue
    req = requests.get(base_url + "/" + uuid)
    if (i+1) % 10 == 0:
        print(f"Completed {i+1}...\r", end = '')
        time.sleep(.1)
    if req.status_code == 200:
        output.append(req.json())
    else:
        print('Backing off for 2 seconds...')
        time.sleep(2)
        req = requests.get(base_url + "/" + uuid)
        output.append(req.json())

Backing off for 2 seconds...
Backing off for 2 seconds...
Backing off for 2 seconds...
Backing off for 2 seconds...
Backing off for 2 seconds...
Completed 53880...

In [44]:
len(output)

20235

In [40]:
import copy
saved = copy.deepcopy(output)

In [45]:
len(saved)

33654

In [49]:
final = saved + output[1:]

In [73]:
final[0]

{'uuid': '539df02ed701baaa6d20b5e8fb82b9cc',
 'sourceCode': 'Employer Portal',
 'title': 'Sales director',
 'description': '<h3>Responsibilities:</h3>\n<h3>· Lead the management team and being responsible for the APAC partnerships</h3>\n<h3>· Growing the revenue through orchestrating execution and strategic planning across different channels of clients</h3>\n<h3>· Ability to leverage across Business to Business clients and targeting the cross functionally – offering exemplary customer service support for the food safety clients across the region</h3>\n<h3>· Growth of the services and education portfolio through key customers</h3>\n<h3>Requirements:</h3>\n<h3>· Minimum 10 years in the food industry</h3>\n<h3>· Track record of success across business development and marketing across the region</h3>\n<h3>· Must have excellent communication skills as well as sensitivity towards all markets</h3>\n<h3>· Strong negotiations and ability to strategise and always see further opportunities across

In [51]:
def extract_mcf_data(json):
    
    output = {}
    transfer = ['uuid', 'title', 'description', 'minimumYearsExperience', 'numberOfVacancies']
    for key in transfer:
        output[key] = json[key]

    # Extract skills
    output['skills'] = ', '.join([entry['skill'] for entry in json['skills']])
    
    # Extract hiring company
    company = ['name', 'description', 'ssicCode', 'employeeCount']
    if json['metadata']['isPostedOnBehalf']:
        company_col = 'hiringCompany'
    else:
        company_col = 'postedCompany'
    for key in company:
        output['company_' + key] = json[company_col][key]
        
    # Extract metadata
    metadata = ['originalPostingDate', 'newPostingDate', 'expiryDate', 'totalNumberOfView', 'totalNumberJobApplication']
    for key in metadata:
        output[key] = json['metadata'][key]

    
    # Extract salary
    salary = ['maximum', 'minimum']
    for key in salary:
        output['salary_' + key] = json['salary'][key]
    
    return output

In [68]:
mcf_data = df_preds.merge(pd.DataFrame([extract_mcf_data(entry) for entry in final]), how = 'left', on = 'uuid')


In [71]:
mcf_final = mcf_data[mcf_data['title'].notnull()].drop_duplicates('description')

In [75]:
mcf_final.to_csv('Data/Processed/MCF_Training_Set_Full.csv', index = False)

In [74]:
mcf_final.shape

(42844, 19)

In [2]:
import os
os.chdir('C:\\Users\\shaun\\PycharmProjects\\ssoc-autocoder')
import pandas as pd

In [3]:
mcf_final = pd.read_csv('Data/Processed/MCF_Training_Set_Full.csv', low_memory = False)
output_df = mcf_final[['description', 'Predicted_SSOC_2020']]
output_df.columns = ['Description', 'SSOC 2020']

In [4]:
from ssoc_autocoder import processing

In [7]:
cleaned = []
for i, desc in enumerate(output_df['Description']):
    if (i+1) % 100 == 0:
        print(f'Processing {i+1}/{len(output_df)}...\r', end = '')
    try:
        cleaned_desc = processing.process_text(desc)
    except KeyboardInterrupt:
        break 
    except:
        print(f'Error found in index {i}...')
        cleaned_desc = processing.final_cleaning(desc)
    cleaned.append(cleaned_desc)

Error found in index 3071...
Error found in index 3688...
Error found in index 7005...
Error found in index 18283...
Error found in index 19336...
Error found in index 25188...
Error found in index 27221...
Error found in index 34863...
Error found in index 38110...
Error found in index 40663...
Error found in index 41018...
Processing 42800/42844...

In [8]:
output_df['Cleaned_Description'] = cleaned

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df['Cleaned_Description'] = cleaned


In [9]:
output_df.to_csv('Data/Processed/training/train_full.csv', index = False)

In [37]:
processing.final_cleaning(output_df['Description'][41018])

"1 Data entry duties and administrative duties include: Preparing quotation, final bill of quantity and sales invoices for the project you are assigned to. 2 Support project manager/ M&E coordinator. 3 Handling emails related to the project. 4 Ability to be on time and complete task in an efficient manner. 5 Manage databases and admin systems. 6 Organizing, storing paperwork, documents and computer-based documents. 7 Assisting in other office ad-ho duties when required. Skills Requirements. 1 Neat administrative and good organisational skills. 2 Able to thrive and work individually and independently. 3 Experienced and proficient in Excel. 4 Communication in English. 5 Possess ability to work on own initiative. 6 Fast learner. 7 Candidate must possess at least N's Level and above. Other information. 1 5.5 working days (alternate Saturdays off) (Mon - Fri: 8:30am to 5:30pm, Sat: 8:30am to 1pm) . 2 Location is in Kaki Bukit. 3 Full time job. 4 Work life balance. Kindly state your expected

In [38]:
output_df['Description'][41018]

'<p>Responsibilities\t\t\t\t\t\t\t\t</p>\n<p>\t\t\t\t\t\t\t\t</p>\n<p>1\tData entry duties and administrative duties include: Preparing quotation, final bill of quantity and sales invoices for the project you are assigned to\t\t\t\t\t\t\t</p>\n<p>2\tSupport project manager/ M&amp;E coordinator\t\t\t\t\t\t\t</p>\n<p>3\tHandling emails related to the project\t\t\t\t\t\t\t</p>\n<p>4\tAbility to be on time and complete task in an efficient manner\t\t\t\t\t\t\t</p>\n<p>5\tManage databases and admin systems\t\t\t\t\t\t\t</p>\n<p>6\tOrganizing, storing paperwork, documents and computer-based documents\t\t\t\t\t\t\t</p>\n<p>7\tAssisting in other office ad-ho duties when required\t\t\t\t\t\t\t</p>\n<p>\t\t\t\t\t\t\t\t</p>\n<p>Skills Requirements\t\t\t\t\t\t\t\t</p>\n<p>\t\t\t\t\t\t\t\t</p>\n<p>1\tNeat administrative and good organisational skills\t\t\t\t\t\t\t</p>\n<p>2\tAble to thrive and work individually and independently\t\t\t\t\t\t\t</p>\n<p>3\tExperienced and proficient in Excel\t\t\t\t\t