In [1]:
import json
import pandas as pd
import os
from typing import List
from datetime import datetime
import re

In [2]:
# Input files in JSON Lines form (one JSON document per line, parsed by return_data).
# Both paths are relative, so they resolve against the kernel's working directory.
ML_JOB_PATH = "machine-learning-jobs20221109161820.jsonl"
DS_JOB_PATH = "sample20221109145023.jsonl"

# NOTE(review): not referenced in the cells visible here; presumably a seed for
# later sampling/modelling steps — confirm against the rest of the notebook.
RANDOM_STATE = 42

In [3]:
def validate_data_path(data_path:str):
    """Checks that something exists at the given data path

    Args:
        data_path (str): location of the data to check

    Raises:
        FileNotFoundError: Nothing exists at ``data_path``
    """
    path_is_present = os.path.exists(data_path)
    if path_is_present:
        # Nothing to report; callers only care about the exception
        return
    raise FileNotFoundError(data_path)

In [4]:
# Not every data point is well-formed; the ones with errors are ignored

def return_data(data_path:str)->List:
    """Loads the job postings stored in a JSON Lines file

    Every non-blank line is parsed as one JSON document. A line whose
    top-level value is a dictionary is skipped (per the note below, errors
    are only contained in dictionaries); every other parsed value is kept,
    in file order.

    Args:
        data_path (str): Path of the data

    Returns:
        List: The kept job postings, one entry per retained line. The
            visible cells index each entry as a list of dictionaries.

    Raises:
        FileNotFoundError: File does not exist
        json.JSONDecodeError: A non-blank line is not valid JSON
    """
    validate_data_path(data_path)
    data_list = []
    # Explicit UTF-8 so decoding does not depend on the platform's locale default
    with open(data_path, 'r', encoding='utf-8') as reader:
        # Stream the file line by line instead of materialising it with readlines()
        for line in reader:
            # A blank line (e.g. a trailing newline at end of file) is not a record
            if not line.strip():
                continue
            data = json.loads(line)
            # Errors are only contained in the dictionary
            if isinstance(data,dict):
                continue
            data_list.append(data)
    return data_list

In [5]:
# Load both input files; each result is a list with one parsed record per kept line
ML_JOB_RAW_DATA = return_data(ML_JOB_PATH)
DS_JOB_RAW_DATA = return_data(DS_JOB_PATH)

In [6]:
len(ML_JOB_RAW_DATA)

224

In [7]:
len(DS_JOB_RAW_DATA)

1212

In [8]:
# Plain concatenation (ML postings first); records present in both files are not de-duplicated here
TOTAL_JOB_RAW_DATA = ML_JOB_RAW_DATA + DS_JOB_RAW_DATA

In [9]:
len(TOTAL_JOB_RAW_DATA)

1436

In [10]:
# Count how often every key shows up at every column index of a posting.
# Tally names have the form "<key>_<column index>"; index 0 is not scanned.
headers = {}
for row in TOTAL_JOB_RAW_DATA:
    for col, section in enumerate(row[1:], start=1):
        for key in section.keys():
            tally_name = f"{key}_{col}"
            headers[tally_name] = headers.get(tally_name, 0) + 1

In [11]:
len(headers)

1299

In [12]:
# Keep only the first 15 headers (in insertion order) for display; the full mapping is too big to show
headers_slice = dict(list(headers.items())[:15])

In [13]:
headers_slice

{'company_name_1': 1436,
 'small_section_2': 1436,
 'Responsibilities:\xa0_3': 6,
 '\xa0_3': 10,
 'Essential Skills:_3': 2,
 'Career Level_4': 1436,
 'Qualification_4': 1434,
 'Years of Experience_4': 1241,
 'Job Type_4': 1436,
 'Job Specializations_4': 1436,
 'Registration No._4': 1429,
 'Company Size_4': 1411,
 'Industry_4': 1434,
 'Benefits & Others_4': 1151,
 'Company Overview_5': 1419}

In [14]:
# Headers whose tally equals the number of postings, i.e. the ones present for every job
n_postings = len(TOTAL_JOB_RAW_DATA)
full_header = [name for name, seen in headers.items() if seen == n_postings]
full_header

['company_name_1',
 'small_section_2',
 'Career Level_4',
 'Job Type_4',
 'Job Specializations_4',
 'url_6']

I am expecting all information to be filled in:
- First col: Job position
- Second col: Details of the job, including pay and location (general and/or specific)
- Fourth col: Misc details of the company including size of company, dress code etc
- Fifth col: Company Overview, details on the company
- Sixth col: URL

Within the second col, I expect some headers will need to be broken down further.

The largest one will be the second col, since the job description is the most variable component of a posting. I suspect most of the work will be done here, to uncover as much information as possible.

The desired outcome will be a consolidated pandas dataframe with as little information loss as possible.

# Preprocess

## Full Columns

In [15]:
# Job titles carry artifacts that are stripped before use:
# - Job reference number
# - Hashtags
# Asterisks and hyphens are removed from the title as well.
# NOTE(review): the reference pattern matches any run of word characters that
# ends in a digit (optionally parenthesised), so ordinary tokens such as
# "Python3" or "2022" are removed too — confirm this is acceptable.

job_reference_pattern = r"\(?\w+\d+\)?"
hashtag_pattern = r"#\w+"

job_title, company_name, career_level, job_type, job_specialisations, url = [], [], [], [], [], []
for row in TOTAL_JOB_RAW_DATA:
    # Clean the raw title: reference numbers first, then hashtags, then stray symbols
    position = row[0]['position']
    position = re.sub(job_reference_pattern, "", position)
    position = re.sub(hashtag_pattern, "", position)
    position = position.replace("*", "").replace("-", "")
    job_title.append(position)

    # These fields were found for every posting (see full_header), so direct indexing is used
    company_name.append(row[1]['company_name'])
    career_level.append(row[4]['Career Level'])
    job_type.append(row[4]['Job Type'])
    job_specialisations.append(row[4]['Job Specializations'])
    url.append(row[6]['url'])

# Column name -> list of values, one value per posting, in posting order
data = {
    'job_title': job_title,
    'company_name': company_name,
    'career_level': career_level,
    'job_type': job_type,
    'job_specialisations': job_specialisations,
    'url': url,
}

# Every column (company_name included) must hold exactly one value per posting
assert all(len(column) == len(TOTAL_JOB_RAW_DATA) for column in data.values())

In [16]:
# Some fields already mix in other information: job_title, for instance, can hold the job type, location or company name
pd.DataFrame(data).tail()

Unnamed: 0,job_title,company_name,career_level,job_type,job_specialisations,url
1431,"Research Associate (Bioprinting),",Nanyang Technological University,Junior Executive,Full-Time,"Sciences, Science & Technology",http://www.jobstreet.com.sg/en/job/research-as...
1432,"Data Science Analyst, TikTok",TikTok,Entry Level,Full-Time,"Computer/Information Technology, IT-Software",http://www.jobstreet.com.sg/en/job/data-scienc...
1433,"Data Analyst Senior (1 year contract, Central)",PERSOLKELLY Singapore Pte Ltd (Formerly Kelly ...,Senior Executive,Temporary,"Manufacturing, Purchasing/Material Mgmt",http://www.jobstreet.com.sg/en/job/data-analys...
1434,Junior Machine Learning Engineer,Dell Asia Pte Ltd,Junior Executive,Full-Time,"Computer/Information Technology, IT-Software",http://www.jobstreet.com.sg/en/job/junior-mach...
1435,Senior Process Development Scientist (Organic ...,PERSOLKELLY Singapore Pte Ltd (Formerly Kelly ...,Senior Executive,Full-Time,"Sciences, Chemistry",http://www.jobstreet.com.sg/en/job/senior-proc...
