# Developing code for Data Processing

Last updated: 8th September 2021

Changelog:
* dd

### Importing libraries, data, and functions

In [1]:
import pandas as pd
import re
import spacy
import string
nlp = spacy.load('en_core_web_lg')

In [2]:
mcf_df = pd.read_csv("..\Data\Processed\WGS_Dataset_JobInfo_precleaned.csv")
mcf_df = mcf_df[["Job_ID", "Title", "Description", "SSOC_2015"]].sample(frac=0.1, random_state=1).reset_index(drop = True)
#mcf_df.to_csv("..\Data\Processed\Artifacts\Raw_Text.csv", index=False)

In [49]:
def remove_html_tags_newline(text):
    """
    Removes HTML and newline tags from a string with generic regex

    Parameters:
        text (str): Selected text

    Returns:
        cleaned_text(text) : Text with html tags and new line removed
    """

    clean = re.compile('<.*?>')
    newline_clean = re.compile('\n')
    return re.sub(newline_clean, ' ', re.sub(clean, '', text)).lower()

def lemmatize_remove_punct(doc):
    '''
    Take the `token.lemma_` of each non-stop word
    '''
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

def max_similarity(lst, word):
    return nlp(' '.join(lst)).similarity(nlp(word))

# Extracting based on ul tag to get title and description
def extracting_job_desc_ultag(text):
    """
    Extract job description using specific header text

    Parameters:
        text (str): Selected text

    Returns:
        list_extracted_text(list[text]): Extracted text
    """

    # Extract all lists in the HTML with a list tag (<ol> or <ul>)
    # Regex explanation: 
    # (?=<ol>|<ul>) is the lookahead for the <ol> or <ul> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</ol>|</ul>) is the lookbehind for the </ol> or </ul> tag
    list_pattern = re.compile(r'(?=<ol>|<ul>).*?(?<=</ol>|</ul>)')
    list_elements = list_pattern.findall(text)

    # Check if there are any matches at all
    # If none, we return an empty list
    if len(list_elements) == 0:
        return []
    
    # Extract all the non-list items in the text
    # Iteratively split the text by the match until the last match
    non_list_elements = []
    text_placeholder = text
    for list_element in list_elements:
        non_list_elements.append(text_placeholder.split(match)[0])
        text_placeholder = text_placeholder.split(match)[1]
    if len(text_placeholder) != 0: # If last item is non-empty
        non_list_elements.append(text_placeholder)
    
    # Clean out all other HTML tags in the list items
    html_tag_pattern = re.compile()
    list_tags = ['<ul>' '<li>', '<ol>', '</ul>' '</li>', '</ol>']
    list_tags_repl = ['ul_element_op' 'li_element_op', 'ol_element_op', 'ul_element_cl' 'li_element_cl', 'ol_element_cl']
    for list_element in list_elements:
        for list_tag, list_tag_repl in zip(list_tags, list_tags_repl):
            list_element.replace(list_tag, list_tags_repl)
            re.sub('<.*?>', '')
            list_element.replace(list_tag_repl, list_tags)

            
    # Initialise an empty list to store our cleaned texts
    cleaned_texts = []

    # For each regex match
    for match in matches:

        # Split the entire HTML string on the closing </p> tag
        texts = match.split(r'</p>')

        # Take the last 2 items: second-last item is the <p> element
        # directly preceding the list, and the last item is the
        # HTML list that we are interested in.
        cleaned_texts.append({
            # Note: should merge the functions for lemmatisation and removing HTML into a single function with multiple options
            'job_title': lemmatize_remove_punct(nlp(remove_html_tags_newline(texts[-2]))),
            'job_description': remove_html_tags_newline(texts[-1]),
            'match': match
        })

    # Iterate over each cleaned text to generate the similarity score
    # Note: Should be merged in with the previous iteration
    for cleaned_text in cleaned_texts:
        relevance_score = max_similarity(cleaned_text['job_title'], "description duty responsibility role expect")
        irrelevance_score = max_similarity(cleaned_text['job_title'], "possess quality requirement skills ideal candidate")

        # max dissimilarity?
        cleaned_text['relevance_score'] = (relevance_score, irrelevance_score)

    # Create a list with all the relevance scores and find the
    # index of the top relevance scores
    relevance_scores_all = [cleaned_text['relevance_score'] for cleaned_text in cleaned_texts]
    most_relevant_text_idx = relevance_scores_all.index(max(relevance_scores_all))

    return cleaned_texts, cleaned_texts[most_relevant_text_idx]


### Processing the raw description data

In [50]:
output = mcf_df["Description"].apply(extracting_job_desc_ultag)
print(f"Coverage: {len(output[output.apply(len) != 0]) / len(output)*100:.2f}%")

  return nlp(' '.join(lst)).similarity(nlp(word))


Coverage: 1.34%


In [80]:
text = mcf_df["Description"][23545].replace('\n', '')
text

'<p>Job Description:</p> <p>\xa0</p> <ul> <li>Provide client servicing advice (face to face, phone and email) and all membership related administrative functions (including card printing and membership kit mailing) for the National Registry of Coaches (NROC)</li> </ul> <p><strong>\xa0</strong></p> <ul> <li>Coordinate and provide procurement, logistics and administrative support (such as registration, catering, letters, feedback collation and set up) for COACHSG’s courses, events and programmes<br /><br /></li> <li>Perform data entry and verification of information using in-house IT systems<br /><br /></li> <li>Contribute content on social media platforms (Facebook-CoachSG1 & Instagram-CoachSGOfficial) to generate publicity<br /><br /></li> <li>Conduct research for various initiatives and programmes (e.g. coaching statistics, coaching programmes from other countries) for policy and benchmarking purposes<br /><br /></li> </ul> <p>Manage feedback from coaches and stakeholders (e.g conduct

In [81]:
pattern = re.compile(r'(?=<ol>|<ul>).*?(?<=<\/ol>|<\/ul>)')
matches = pattern.findall(text)
matches

['<ul> <li>Provide client servicing advice (face to face, phone and email) and all membership related administrative functions (including card printing and membership kit mailing) for the National Registry of Coaches (NROC)</li> </ul>',
 '<ul> <li>Coordinate and provide procurement, logistics and administrative support (such as registration, catering, letters, feedback collation and set up) for COACHSG’s courses, events and programmes<br /><br /></li> <li>Perform data entry and verification of information using in-house IT systems<br /><br /></li> <li>Contribute content on social media platforms (Facebook-CoachSG1 & Instagram-CoachSGOfficial) to generate publicity<br /><br /></li> <li>Conduct research for various initiatives and programmes (e.g. coaching statistics, coaching programmes from other countries) for policy and benchmarking purposes<br /><br /></li> </ul>']

In [378]:
def check_if_first_word_is_verb(string):
    override_false_list = ['proven']
    override_true_list = ['review' ,'responsible']
    if string.split(' ')[0].lower() in override_false_list:
        return False
    if string.split(' ')[0].lower() in override_true_list:
        return True
    if string.lower()[0:8] == 'you are':
        string = string.replace(string[0:8], '')
    return nlp(string)[0].pos_ == 'VERB'

#178
#308
text = mcf_df["Description"][318]

def clean_raw_string(string):
    
    # Identify some common problematic strings to remove
    to_remove = ['\n', '\xa0', '&nbsp;', '&amp;', '\t']
    for item in to_remove:
        string = string.replace(item, '')
        
    to_replace = [('No.', 'Number')]
    for item1, item2 in to_replace:
        string = string.replace(item1, item2)
        
    # Remove all non-unicode characters
    #string = ''.join([i if ord(i) < 128 else ' ' for i in string])
    
    return string

def process_li_tag(text):

    # Extract all lists in the HTML with a list tag (<ol> or <ul>)
    # Regex explanation: 
    # (?=<ol>|<ul>) is the lookahead for the <ol> or <ul> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</ol>|</ul>) is the lookbehind for the </ol> or </ul> tag
    list_pattern = re.compile(r'(?=<ol>|<ul>).*?(?<=</ol>|</ul>)')
    list_elements = list_pattern.findall(text)
    
    if len(list_elements) == 0:
        return []
    
    # Iterate through each list item to see which list has more
    # items with a verb as the first word
    verb_scores = []
    for list_element in list_elements:
        
        # Use regex to split up the list into items
        list_items_pattern = re.compile(r'(?=<li>).*?(?<=</li>)')
        list_items = list_items_pattern.findall(list_element)
        
        # Initialise a count of number of items beginning with a verb
        count = 0
        
        # Iterate through each item in the list
        for list_item in list_items:
            
            # Remove all the HTML tags and check if the first word is a verb
            list_item = re.sub('<.*?>', '', list_item)
            if check_if_first_word_is_verb(list_item):
                count += 1
        
        # Append the verb score to the list
        # with a exception for very short lists
        if len(list_items) < 3:
            verb_scores.append(min(count/len(list_items), 0.5))
        else:
            verb_scores.append(count/len(list_items))
    
    # Return the list with maximum verb score, assuming at least
    # 50% of the list contains verbs (to avoid situations of only
    # a single list)
    if max(verb_scores) > 0.5:
        return list_elements[verb_scores.index(max(verb_scores))]
    else:
        return []
    
def process_p_list(text):
    
    # Extract all lists in the HTML with a paragraph tag (<p>)
    # Regex explanation: 
    # (?=<p>) is the lookahead for the <p> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</p>) is the lookbehind for the </p> tag
    para_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')
    para_elements = para_pattern.findall(text)
            
    unicode_to_check = ['\u2022', '\u002d']
    unicode_presence = []
    for para_element in para_elements:
        
        # Remove all the HTML tags and check if the first word is a verb
        para_element_cleaned = re.sub('<.*?>', '', para_element).strip()
        
        # Check if the string is non-empty, and if it is, check if it has the unicodes
        if len(para_element_cleaned) > 0:
            unicode_presence.append(para_element_cleaned[0] in unicode_to_check)
        else:
            unicode_presence.append(False)
    
    # Initialise the lists
    output = []
    p_list = []
    
    # Iterate through the boolean list indicating if there is
    # a bullet point character at the start of the string
    for i, value in enumerate(unicode_presence):
        
        # If there is a bullet point character
        if value:
            
            # Append the string to the para list
            p_list.append(para_elements[i])
            
        # If there is no bullet point character
        else:
            
            # Append the para list if it is non-empty
            if len(p_list) > 0:
                output.append(' '.join(p_list))
            
            # Reset the para list
            p_list = []
            
    if len(output) == 0:
        return []

    # Iterate through each list item to see which list has more
    # items with a verb as the first word
    verb_scores = []
    for list_element in output:
        
        # Use regex to split up the list into items
        list_items_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')
        list_items = list_items_pattern.findall(list_element)
        
        # Initialise a count of number of items beginning with a verb
        count = 0
        
        # Iterate through each item in the list
        for list_item in list_items:
            
            # Remove all the HTML tags and check if the first word is a verb
            list_item = re.sub('<.*?>', '', list_item).replace('\t', '').strip()
            if check_if_first_word_is_verb(list_item):
                count += 1
        
        # Append the verb score to the list
        # with a exception for very short lists
        if len(list_items) < 3:
            verb_scores.append(min(count/len(list_items), 0.5))
        else:
            verb_scores.append(count/len(list_items))
    print(verb_scores)
    # Return the list with maximum verb score, assuming at least
    # 50% of the list contains verbs (to avoid situations of only
    # a single list)
    if max(verb_scores) > 0.5:
        return output[verb_scores.index(max(verb_scores))]
    else:
        return []

def process_p_tag(text):
    
    # Extract all lists in the HTML with a paragraph tag (<p>)
    # Regex explanation: 
    # (?=<p>) is the lookahead for the <p> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</p>) is the lookbehind for the </p> tag
    para_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')
    para_elements = para_pattern.findall(text)
    
    if len(para_elements) == 0:
        return []
    
    # Iterate through each paragraph element to see which one starts
    # with a verb, and we keep that paragraph element
    output = []
    for para_element in para_elements:

        # Remove all the HTML tags and check if the first word is a verb
        para_element_cleaned = re.sub("[^\w\s]", "", re.sub('<.*?>', '', para_element)).strip()
        if len(para_element_cleaned) > 0:
            if check_if_first_word_is_verb(para_element_cleaned):
                output.append(para_element)

    return " ".join(output)
            
def process_text(raw_text):
    
    # Remove problematic characters
    text = clean_raw_string(raw_text)
    
    li_results = process_li_tag(text)
    p_list_results = process_p_list(raw_text)
    p_results = process_p_tag(text)
    
    if len(li_results) > 0:
        return li_results
    elif len(p_list_results) > 0:
        return p_list_results
    elif len(p_results) > 0:
        return p_results
    else:
        return re.sub('<.*?>', ' ', text)
        
    
    
    # Extract all the non-list items in the text
    # Iteratively split the text by the match until the last match
#     
#     non_list_elements = []
#     text_placeholder = text
#     for list_element in list_elements:
#         non_list_elements.append(re.sub('<.*?>', '', text_placeholder.split(list_element)[0]).strip())
#         text_placeholder = text_placeholder.split(list_element)[1]
#     if len(text_placeholder) != 0: # If last item is non-empty
#         non_list_elements.append(text_placeholder)


#def clean_p_tag():
    
        #list_pattern = re.compile(r'(?=<li>|<p>).*?(?:(?<=</li>)|(?<=</p>))')


In [379]:
print(process_text(mcf_df["Description"][19277]))
print('----------------------------------------')
print(mcf_df["Description"][19277])

[0.375, 0.5, 0.3333333333333333]
<p>•Responsible for the design and preparation of technical proposal, generating system and equipment specifications for the Security and Traffic solutions in compliances for bid projects.</p> <p>•Prepare technical presentation, taking lead in technical reviews and clarification meetings with customers before and during bid phase</p> <p>•Work with vendors, analyze and identify best suitable product and maintain database of it for future use and group sharing.</p> <p>•Negotiate with stakeholders related to tender proposals, budgetary quotations, company’s advantage and operational availability.</p> <p>•Drafting and reviewing contracts between company and client in compliance with legal requirements</p> <p>•Keep abreast with updated technical knowledge of competitive solutions.</p> <p>•Identifying and mapping business strengths and customer needs</p> <p>•Handle various stages of solution sales from presales to fulfillment.</p> <p>•Performs technical suppo

In [272]:
ssoc = pd.read_csv('../Data/Raw/ssoc_v2018.csv', encoding='iso-8859-1')
ssoc.dropna(inplace = True)
ssoc['ssoc_f'] = ssoc['ssoc_f'].astype('float').astype('int').astype('str')
mcf_df = mcf_df[(mcf_df['SSOC_2015'] != 'X5000') & (mcf_df['SSOC_2015'].notnull())]
mcf_df['SSOC_2015'] = mcf_df['SSOC_2015'].astype('float').astype('int').astype('str')
mcf_data_final = mcf_df.merge(ssoc, left_on = 'SSOC_2015', right_on = 'ssoc_f', how = 'left')
mcf_data_final.rename({'ssoc_desc': "Reported SSOC Desc"}, axis = 1, inplace = True)

In [319]:
print(mcf_df['Description'][idx])
process_text(mcf_df['Description'][idx])

<p>If you are a Backend Developer using Node.JS , looking for an opportunity to work in a great team and advance your skills and career, we have the perfect role for you.</p>
<p>Mandatory Skill-set</p>
<ul>
  <li>Degree in Computer Science/ Computer Engineering or equivalent;</li>
  <li>Minimum 2 years experience as a Node.js developer, especially on large scale custom build, complex projects;</li>
  <li>Hands on in SQL Server, WCF, Web API, HTML, CSS,JavaScript;</li>
  <li>Should have good exposure in Web Applications Development using object oriented Programming;</li>
  <li>Familiarity with the whole web stack, including protocols and web server optimization techniques;</li>
  <li>Strong analytical skills and problem solving aptitude;</li>
  <li>A good team player with the ability to function independently;</li>
  <li>Good communication and documentation skills.</li>
</ul>
<p>Desired Skill-set</p>
<ul>
  <li>React JS experience.</li>
</ul>
<p>Responsibilities</p>
<ul>
  <li>Gather re

'<ul>  <li>Gather requirements from the business users and translate them into technical specifications;</li>  <li>Undertake product impact analysis, development, testing and support activities;</li>  <li>Responsible for developing technical specifications, coding, implementation, integration, documentation and user guide according to agreed standards;</li>  <li>Aid in troubleshooting problems and providing effective solutions for UAT and SIT;</li>  <li>Provide continuous post implementation support and enhancement based on business needs;</li>  <li>Working closely with co-developers, testers and various business users to develop and improve operations processes and procedures;</li>  <li>Participate in evaluation of technical risks and perform impact analysis to ensure effort estimate is accurate;</li>  <li>Be proactive in putting up recommendations to achieve better system performance and efficiency.</li></ul>'

In [293]:
check_if_first_word_is_verb('Selfmotivated with excellent interpersonal and communication skills')

True

In [330]:
'•	Responsible for the design and preparation of technical proposal, generating system and equipment specifications for the Security and Traffic solutions in compliances for bid projects.'[0] == '\u2022'

True

In [331]:
print('\u002d')

-


In [324]:
import random
idx = random.sample(mcf_df.index.tolist(), 1)[0]
#idx = 23272
#idx = 18514
#idx = 835
print(f"Index: {idx}")
print(f"Job title: {mcf_df['Title'][idx]}")
print(f"SSOC: {mcf_df['SSOC_2015'][idx]}")
print(f"Job tasks: {process_text(mcf_df['Description'][idx])}")
print("-----------------------")
print("==============================================")
print(mcf_df['Description'][idx])

Index: 19277
Job title: pre-sales engineer
SSOC: 24331
Job tasks: <p> 	Prepare technical presentation, taking lead in technical reviews and clarification meetings with customers before and during bid phase</p> <p> 	Work with vendors, analyze and identify best suitable product and maintain database of it for future use and group sharing.</p> <p> 	Negotiate with stakeholders related to tender proposals, budgetary quotations, company s advantage and operational availability.</p> <p> 	Drafting and reviewing contracts between company and client in compliance with legal requirements</p> <p> 	Keep abreast with updated technical knowledge of competitive solutions.</p> <p> 	Identifying and mapping business strengths and customer needs</p> <p> 	Handle various stages of solution sales from presales to fulfillment.</p> <p> 	Performs technical support duties, including product configuration and troubleshooting of customer issues during Project and Operations.</p> <p> 	Managing and retaining relatio

In [249]:
process_text(mcf_df["Description"][103])

'  Proven software development experience and Android skills development (minimum 3 years)  Proven working experience in Android app development (minimum 3 years)  Experience with Android SDK  Experience working with remote data via REST and JSON  Experience with third-party libraries and APIs  Working knowledge of the general mobile landscape, architectures, trends, and emerging technologies  Solid understanding of the full mobile development life cycle.  Proficient understanding of code versioning git  Familiarity with continuous integration'

In [250]:
mcf_df["Description"][103]

'<ul>\n  <li>Proven software development experience and Android skills development (minimum 3 years)</li>\n  <li>Proven working experience in Android app development (minimum 3 years)</li>\n  <li>Experience with Android SDK</li>\n  <li>Experience working with remote data via REST and JSON</li>\n  <li>Experience with third-party libraries and APIs</li>\n  <li>Working knowledge of the general mobile landscape, architectures, trends, and emerging technologies</li>\n  <li>Solid understanding of the full mobile development life cycle.</li>\n  <li>Proficient understanding of code versioning git</li>\n  <li>Familiarity with continuous integration</li>\n</ul>'

In [156]:
for token in nlp(output[0].split(' ')[0]):
    print(token.pos_)
    print(nlp(output[0].split(' ')[0])[0].pos_ == 'VERB')

NOUN
False


In [157]:
list_elements

['<ul>  <li>Qualified lawyer with a minimum of 3- 5yrs PQE in a common law jurisdiction including England &amp; Wales, Australian qualifications due to the cross border nature of our work</li>  <li>Experience in complex cross border and multi-jurisdicitonal banking work</li>  <li>Strong Technical banking knowledge</li>  <li>Excellent Client Relationships skills</li>  <li>Strong commercial/business acumen with the ability to deliver creative and pragmatic solutions and advice</li>  <li>Excellent communication and interpersonal skills, with the ability to interface at all levels and particularly to win the respect of the partner and fee-earner community, building strong relationships with both internal and external stakeholders</li>  <li>Collaborative team player with the ability to develop and work in a fast paced, intellectually rigorous environment</li></ul>']

In [158]:
non_list_elements

['Allen &amp; Overy is a leading global law firm operating in over thirty countries. Embracing new trends and harnessing decades of experience, we’ve earned our place at the forefront of the legal industry. Our partners are recognised as leaders in their field and our deals and cases often make headline business news. As a result, we’ve developed a reputation for delivering excellence, in all that we do. Today, we’re continuing to break new ground. By embracing new ways of thinking and integrating technology into our everyday work, we’ve been named as the Most Innovative Law Firm in Europe by the Financial Times six times. Join our team and you’ll be part of a flexible, inclusive culture underpinned by openness and acceptance. We’re driven by the belief that, to perform, people need support and space to collaborate. By combining those values with an ambitious outlook, we can give you the opportunity to thrive.Department &amp; RoleAllen &amp; Overy Singapore is a hub for the firms work 

In [38]:
output[output.apply(len) != 0]

196      ([{'job_title': [' ', 'ux', 'designer'], 'job_...
220      ([{'job_title': [' ', 'key', 'member', 'digita...
315      ([{'job_title': ['   ', 'coordinate', 'centre'...
320      ([{'job_title': ['responsibility'], 'job_descr...
456      ([{'job_title': [' ', 'key', 'responsibility',...
                               ...                        
22895    ([{'job_title': ['responsibility'], 'job_descr...
23191    ([{'job_title': [' ', 'human', 'resource'], 'j...
23272    ([{'job_title': [' ', 'expect'], 'job_descript...
23345    ([{'job_title': [' ', 'job', 'responsibility']...
23545    ([{'job_title': [' \xa0'], 'job_description': ...
Name: Description, Length: 317, dtype: object

In [115]:
mcf_df[mcf_df["Description"].str.contains('<ol>')]

Unnamed: 0,Job_ID,Title,Description,SSOC_2015
76,MCF-2020-0221485,business development manager,<p>Genesis-Global is a purveyor of fine luxury...,12212.0
77,MCF-2020-0078824,security officer #sgunitedjobs,<p><u><strong>Duties &amp; Responsibilities</s...,54142
94,MCF-2020-0103552,junior accountant #sgunitedtraineeships,<p><u><strong>Traineeship Description:</strong...,33130
136,MCF-2020-0142738,#sgunitedtraineeships executive (clinician rem...,<p><strong>Job Description:</strong></p>\n<p>T...,29090
160,MCF-2020-0024938,biofire market development manager aspac,<p><strong>Position Summary:</strong></p>\n<p>...,11202
...,...,...,...,...
23516,MCF-2020-0144392,associate (marketing & development) #sgunitedt...,<p><u><strong>Traineeship Description:</strong...,39990
23568,MCF-2020-0124783,founders office trainee associate #sgunitedtra...,<p><strong>Traineeship Description:</strong></...,13499
23569,MCF-2020-0175196,automation software engineer #sgunitedtrainees...,<p>The Analyst Programmer (Automation COE) wil...,25121
23593,MCF-2020-0066592,senior / lead mobile developer,<p>Do you believe your designs and code could ...,29090.0


In [116]:
mcf_df["Description"][76]

'<p>Genesis-Global is a purveyor of fine luxury gems and jewellery with an increasingly global outreach, whilst having business presence in Singapore and Hong Kong for more than 7 years.</p>\n<p>Having consolidated its operational activities into its new head office in Singapore in May 2019 as part of its new focus away from the Hong Kong market, Genesis-Global is now seeking to expand its team in Singapore to capture market share in the luxury gems market in Singapore and Europe, with special focus in expansion into the French luxury market as the initial footprint in Europe.</p>\n<p>With that in mind, the company now wishes to hire, on a full-time basis, a Business Development Manager with specific abilities and experience as more particularly described below, who shall already be physically located in Singapore as at the time of job application. The person should possess the following qualities:</p>\n<ol>\n  <li>At least 3 years of living and working in Singapore.</li>\n  <li>Had be

In [None]:
# to do
# replace <p> tags within <li> tags !
# pick <p> tag with closest match instead of preceding <p> tag only? or preceding 2 <p> tags
# if <p> tag is empty then hunt for another tag (maybe h1?)
# max length of <p> tag?
# need to capture all <li> tags instead of just those with <p> preceding
# should we just go straight to labelling?

In [100]:
mcf_df.iloc[18]['Description']

'<p>Unveiled in 1989, Expressions is a recognized pioneer in the beauty industry. It adopts the philosophy of total wellness, offering a wide range of facial and slimming treatment, as well as spa therapy. All of which are supported by high technology equipment and service-excellence standards.</p>\n<p><br></p>\n<p>Trainee will be stationed in the retail store setting to work with other retail staff and undergo training to nurture skills in retail operation management. Trainee will also be in direct interactions with the customers and be responsible to oversee customer service.</p>\n<p><br></p>\n<p>Responsibilities include, but not limited to:</p>\n<ul>\n  <li>Support the Operations Manager in the supervision of store operations and the management of retail manpower</li>\n  <li>Manage front desk operations (appointment system, point-of-sales system, credit card and digital payment terminals).</li>\n  <li>Attend to customer enquiries and advise customers on the benefits of massages</li>

In [96]:
import random
idx = random.sample(output[output.apply(len) != 0].index.tolist(), 1)[0]
#idx = 23272
#idx = 18514
#idx = 835
print(f"Index: {idx}")
for i in output[idx][0]:
    print(f"Job title: {i['job_title']}")
    print(f"Relevance score: {i['relevance_score']}")
    print(f"Job description: {i['job_description']}")
    print(f"Match: {i['match']}")
    print("-----------------------")
print("==============================================")
print(mcf_df.iloc[idx]['Description'])

Index: 6535
Job title: [' \xa0\xa0\xa0\xa0\xa0\xa0 ', 'job', 'responsibility']
Relevance score: (0.8285877897785132, 0.6670268372784524)
Job description:   support requests for data or data related collaboration or projects, and ensure that all deliverables and outcomes are accurate and timely; manage day-day data administration and management issues; expected to present imd and ica in data-driven collaborations/projects within ica and across wog respectively; assist in the project management of ica’s analytic projects and initiatives; manage the entire end to end lifecycle of an analytics project, including scoping of data inputs, data preparation, data pre-processing, feature engineering, basic data modelling, model deployment and monitoring; ensure projects adhere to ica’s data management framework, data quality and protection standards pertaining to ica’s information; responsible for the management of ica’s data resources; perform all other duties as and when assigned by the superv

In [52]:
output[320]

([{'job_title': ['responsibility'],
   'job_description': '  strong background in computer science, computer or electronics engineering, information technology or related technical discipline experience/knowledge in front-end and backend languages and libraries (e.g. html/ css, javascript, xml, jquery, java) experience in modern web application technology stack such as node.js, react.js, spring familiar with version-control software (e.g git, serena dimensions) familiar with best practices, such as tdd and ci/cd experience in an agile environment ',
   'match': '<p><strong><u>Responsibilities</u></strong></p> <ul> <li>Strong background in Computer Science, Computer or Electronics Engineering, Information Technology or related technical discipline</li> <li>Experience/Knowledge in front-end and backend languages and libraries (e.g. HTML/ CSS, JavaScript, XML, jQuery, Java)</li> <li>Experience in modern web application technology stack such as Node.js, React.js, Spring</li> <li>Familiar w