# Developing code for Data Processing

Last updated: 10th September 2021

### Importing libraries and data

In [1]:
import pandas as pd
import re
import spacy
import string
nlp = spacy.load('en_core_web_lg')

In [2]:
mcf_df = pd.read_csv("..\Data\Processed\WGS_Dataset_JobInfo_precleaned.csv")
mcf_df = mcf_df[["Job_ID", "Title", "Description", "SSOC_2015"]].sample(frac=0.1, random_state=1).reset_index(drop = True)

### Writing our functions

To Ben: Can you help me check through the code? If it's not clear or potentially buggy, please flag it to me so I can fix it. Thanks!

In [563]:
def check_if_first_word_is_verb(string):
    """
    Checks if the first word of the string is a verb
    
    Parameters:
        string (str): Text to check for
        
    Returns:
        Whether the first word is a verb or not
    """
    
    # Define some words that should be False
    # regardless of what Spacy says
    override_false_list = ['proven', 'possess']
    
    # Define some words that should be True
    # regardless of what Spacy says
    override_true_list = ['review' ,'responsible', 'design', 'to', 'able']
    
    # If the string is a zero length, return False
    if len(string) == 0:
        return False
    
    # If the first word is in the override false list, return False
    if string.split(' ')[0].lower() in override_false_list:
        return False
    
    # If the first word is in the override True list, return True
    if string.split(' ')[0].lower() in override_true_list:
        return True
    
    # If the first two words are "you are", we truncate it
    if string.lower()[0:8] == 'you are':
        string = string[8:]
        
    # Check if the first word is a verb
    return nlp(string)[0].pos_ == 'VERB'

def clean_raw_string(string):
    """
    Cleans the raw text from problematic strings or abbreviations
    
    Parameters:
        string (str): Text to clean for
    
    Returns:
        Cleaned text without problematic strings or abbreviations
    """
    
    # Identify some common problematic strings to remove
    to_remove = ['\n', '\xa0', '&nbsp;', '&amp;', '\t', '&rsquo;']
    
    # Remove these strings
    for item in to_remove:
        string = string.replace(item, '')
        
    # Identify some common abbreviations to replace
    to_replace = [('No.', 'Number')]
    
    # Replace these strings
    for item1, item2 in to_replace:
        string = string.replace(item1, item2)
        
    # Remove all non-unicode characters
    # Deprecated due to reliance on bullet points
    # string = ''.join([i if ord(i) < 128 else ' ' for i in string])
    
    return string

def clean_html_unicode(string):
    
    # Initialise the output string
    cleaned_string = string
    
    # This is run in order, so be careful!
    cleaning_regex= ['<.*?>', '^\d+\.{0,1}', '[^\w\s]']
    
    # Iteratively apply each regex
    for regex in cleaning_regex:
        cleaned_string = re.sub(regex, '', cleaned_string).strip()
    
    return cleaned_string

def check_list_for_verbs(list_elements):

    # Initialise a list to store the output
    verb_scores = []
    
    # Iterate through each of the list elements passed in
    for list_element in list_elements:
        
        # Use regex to split up the list into items
        # Note this depends on whether the list elements
        # passed in are lists (ol/ul) or paragraph lists (p)
        if list_element[0:4] in ['<ul>', '<ol>']:
            list_items_pattern = re.compile(r'(?=<li>).*?(?<=</li>)')
        else:
            list_items_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')

        # Split each list up into the constituent items
        list_items = list_items_pattern.findall(list_element)
        
        # Initialise a count of number of items beginning with a verb
        count = 0
        
        # Iterate through each item in the list
        for list_item in list_items:
            
            # Remove all the HTML tags and check if the first word is a verb
            list_item = clean_html_unicode(list_item)
            #list_item = re.sub("[^\w\s]", "", re.sub('^\d+\.{0,1}', '', re.sub('<.*?>', '', list_item.replace('\t', '')).strip())).strip()
            
            # Check if the first word is a verb, and add to score if it is
            if check_if_first_word_is_verb(list_item):
                count += 1
        
        # Add the list length and verb score to the output
        verb_scores.append((len(list_items), count/len(list_items)))
        
    # Initialise the list to store the new set of 
    # list elements which we are merging if they
    # are short lists with lots of verbs
    for_recursive = []

    # Iterating over each verb score
    for i, verb_score in enumerate(verb_scores):
        
        # Always append the first list item
        if i == 0:
            for_recursive.append(list_elements[i])
        
        # For other items, check if there are less than 6 items
        # and the verb score is at least 70%. If so, we merge it
        elif (verb_score[0] < 6) and (verb_score[1] >= .7):
                        
            # Remove the starting list tag if it is included
            list_elements_i_cleaned = re.sub(r'(<ul>|<ol>|</ul>|</ol>)', '', list_elements[i])

            # If the preceding list has a </ul> or </ol> tag
            # then we should remove it before concatenating the
            # strings, but otherwise we just concat the strings directly
            if for_recursive[-1][:-5] in ['</ul>', '</ol>']:
                for_recursive[-1] = for_recursive[-1][:-5] + " " + list_elements_i_cleaned + for_recursive[-1][-5:]
            else:
                for_recursive[-1] += list_elements[i]
            
        # Otherwise we just append it back to the list
        else:
            for_recursive.append(list_elements[i])
            
    # Run the recursive function if we have merged some lists together
    if len(for_recursive) != len(list_elements):
        return check_list_for_verbs(for_recursive)
        
    # Otherwise, we output the verb scores
    else:

        # Append the verb score to the list
        # with a exception for very short lists
        final_verb_scores = []
        for verb_score in verb_scores:
            
            # If the length is less than 3
            if verb_score[0] < 3:
                final_verb_scores.append(min(verb_score[1], 0.5)) # cap the score at 50%
            else:
                final_verb_scores.append(verb_score[1])

        # Return the list with maximum verb score, assuming at least
        # 50% of the list contains verbs
        if max(final_verb_scores) >= 0.5:
            return list_elements[final_verb_scores.index(max(final_verb_scores))]
        else:
            return []

def process_li_tag(text):

    # Extract all lists in the HTML with a list tag (<ol> or <ul>)
    # Regex explanation: 
    # (?=<ol>|<ul>) is the lookahead for the <ol> or <ul> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</ol>|</ul>) is the lookbehind for the </ol> or </ul> tag
    list_pattern = re.compile(r'(?=<ol>|<ul>).*?(?<=</ol>|</ul>)')
    list_elements = list_pattern.findall(text)
    
    if len(list_elements) == 0:
        return []
    
    return check_list_for_verbs(list_elements)
    
def process_p_list(text):
    
    # Extract all lists in the HTML with a paragraph tag (<p>)
    # Regex explanation: 
    # (?=<p>) is the lookahead for the <p> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</p>) is the lookbehind for the </p> tag
    para_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')
    para_elements = para_pattern.findall(text)
            
    # Check for specific unicode characters that can be used as bullet points
    unicode_to_check = ['\u2022', '\u002d', '\u00b7']
    bullet_pt_presence = []
    for para_element in para_elements:
        
        # Remove all the HTML tags
        para_element_cleaned = re.sub('<.*?>', '', para_element).strip()
        
        # Check if the string is non-empty
        if len(para_element_cleaned) > 0:
            
            # Check if the first character has any bullet points
            result1 = para_element_cleaned[0] in unicode_to_check
            
            # Check if the first character is a numbered list
            # by checking if the re.match() returns anything
            result2 = re.match(r'^\d+\.', para_element_cleaned) is not None
            
            bullet_pt_presence.append(result1 or result2)
            
        # If it is empty, then it doesn't contain any bullet points
        else:
            bullet_pt_presence.append(False)
    
    # Initialise the lists
    output = []
    p_list = []
    
    # Build an equivalent list of list items by iterating
    # through the boolean list indicating if there is
    # a bullet point character at the start of the string
    for i, value in enumerate(bullet_pt_presence):
        
        # If there is a bullet point character
        if value:
            
            # Append the string to the para list
            p_list.append(para_elements[i])
            
        # If there is no bullet point character
        else:
            
            # Append the para list if it is non-empty
            if len(p_list) > 0:
                output.append(' '.join(p_list))
            
            # Reset the para list
            p_list = []
            
    if len(output) == 0:
        return []
    
    return check_list_for_verbs(output)

def process_p_tag(text):
    
    # Extract all lists in the HTML with a paragraph tag (<p>)
    # Regex explanation: 
    # (?=<p>) is the lookahead for the <p> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</p>) is the lookbehind for the </p> tag
    para_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')
    para_elements = para_pattern.findall(text)
    
    if len(para_elements) == 0:
        return []
    
    # Iterate through each paragraph element to see which one starts
    # with a verb, and we keep that paragraph element
    output = []
    for para_element in para_elements:

        # Remove all the HTML tags and check if the first word is a verb
        para_element_cleaned = re.sub("[^\w\s]", "", re.sub('<.*?>', '', para_element)).strip()
        if len(para_element_cleaned) > 0:
            if check_if_first_word_is_verb(para_element_cleaned):
                output.append(para_element)

    return " ".join(output)
            
def process_text(raw_text):
    
    # Remove problematic characters
    text = clean_raw_string(raw_text)
    
    li_results = process_li_tag(text)
    p_list_results = process_p_list(raw_text)
    p_results = process_p_tag(text)
    
    if len(li_results) > 0:
        print('List object detected')
        return li_results
    elif len(p_list_results) > 0:
        print('Paragraph list detected')
        return p_list_results
    elif len(p_results) > 0:
        print('Paragraphs detected')
        return p_results
    else:
        print('None detected, returning all')
        return re.sub('<.*?>', ' ', text)
        


### Checking the functions

In [572]:
print(process_text(mcf_df["Description"][835]))
print('----------------------------------------')
print(mcf_df["Description"][835])

List object detected
<ul> <li>Oversee the technical team to ensure examination materials and learning materials production are running smoothly.</li> <li>Oversee and monitor that the examination materials are set conducted in accordance to standard operating procedures and policies.</li> <li>Oversee all examination setting and marking contracts managed and maintain by the team and vendors.</li> <li>Identify gaps or deficiencies in standard operating procedures and suggest improvements.</li> <li>Redesign processes to strengthen controls and or to improve productivity.</li> <li>Manage and build relationship with the Learning and Assessment Committees, Professional Educational Councilandthe examination teams.</li> </ul><ul> <li>Oversee the team to ensure that the modules remain up to date, relevant and meet the needs of the sector.</li> <li>Perform appropriate periodic benchmarkingto ensure that the modules and examination materials, and learning materials are in line with international s

In [578]:
# Consider edge cases
# 23021 - starts with manual numbering, need to remove numbers before processing # ok addressed
# 18218 - multiple lists (ul list) # ok addressed
# 817 - multiple lists (p list) # ok addressed
# 12802 - 'possess'
# 1657 - combined list
# 11007 - 'process', 'monitor', 'update' not recognised as verbs
# 5361 - should have picked the first list but didnt
# 9458 - should have picked up both lists but didn't
# 13850, 21925 - didn't pick the first list, para should have a minimum element
# 1566 - should have picked list

In [588]:
import random
idx = random.sample(mcf_df.index.tolist(), 1)[0]
print(f"Index: {idx}")
print(f"Job title: {mcf_df['Title'][idx]}")
print(f"SSOC: {mcf_df['SSOC_2015'][idx]}")
print("-----------------------")
print(f"Job tasks: {process_text(mcf_df['Description'][idx])}")
print("==============================================")
print(mcf_df['Description'][idx])

Index: 1566
Job title: web content / marketing collaterals writer
SSOC: 26419
-----------------------
Paragraphs detected
Job tasks: <p>Beginning with the essential understanding of specific businesses plus learning the ins and outs of our trade and discussing key elements of our events, you'll have to write persuasive, attention grabbing, compelling, search engine optimized content for website and marketing collaterals for various summits.</p>
<p>Beginning with the essential understanding of specific businesses plus learning the ins and outs of our trade and discussing key elements of our events, you'll have to write persuasive, attention grabbing, compelling, search engine optimized content for website and marketing collaterals for various summits.</p>
<p><br></p>
<p>Requirements</p>
<p><br></p>
<ul>
  <li>&nbsp;&nbsp;&nbsp;&nbsp;Proven writing, editing and proofreading skills</li>
  <li>&nbsp;&nbsp;&nbsp;&nbsp;Some relevant experience in business writing, advertising, journalism, or

### Archived Code

In [None]:
output = mcf_df["Description"].apply(process_text)
print(f"Coverage: {len(output[output.apply(len) != 0]) / len(output)*100:.2f}%")

In [80]:
text = mcf_df["Description"][23545].replace('\n', '')
text

'<p>Job Description:</p> <p>\xa0</p> <ul> <li>Provide client servicing advice (face to face, phone and email) and all membership related administrative functions (including card printing and membership kit mailing) for the National Registry of Coaches (NROC)</li> </ul> <p><strong>\xa0</strong></p> <ul> <li>Coordinate and provide procurement, logistics and administrative support (such as registration, catering, letters, feedback collation and set up) for COACHSG’s courses, events and programmes<br /><br /></li> <li>Perform data entry and verification of information using in-house IT systems<br /><br /></li> <li>Contribute content on social media platforms (Facebook-CoachSG1 & Instagram-CoachSGOfficial) to generate publicity<br /><br /></li> <li>Conduct research for various initiatives and programmes (e.g. coaching statistics, coaching programmes from other countries) for policy and benchmarking purposes<br /><br /></li> </ul> <p>Manage feedback from coaches and stakeholders (e.g conduct

In [81]:
pattern = re.compile(r'(?=<ol>|<ul>).*?(?<=<\/ol>|<\/ul>)')
matches = pattern.findall(text)
matches

['<ul> <li>Provide client servicing advice (face to face, phone and email) and all membership related administrative functions (including card printing and membership kit mailing) for the National Registry of Coaches (NROC)</li> </ul>',
 '<ul> <li>Coordinate and provide procurement, logistics and administrative support (such as registration, catering, letters, feedback collation and set up) for COACHSG’s courses, events and programmes<br /><br /></li> <li>Perform data entry and verification of information using in-house IT systems<br /><br /></li> <li>Contribute content on social media platforms (Facebook-CoachSG1 & Instagram-CoachSGOfficial) to generate publicity<br /><br /></li> <li>Conduct research for various initiatives and programmes (e.g. coaching statistics, coaching programmes from other countries) for policy and benchmarking purposes<br /><br /></li> </ul>']

In [553]:
def check_if_first_word_is_verb(string):
    override_false_list = ['proven', 'possess']
    override_true_list = ['review' ,'responsible', 'design', 'to', 'able']
    if len(string) == 0:
        return False
    if string.split(' ')[0].lower() in override_false_list:
        return False
    if string.split(' ')[0].lower() in override_true_list:
        return True
    if string.lower()[0:8] == 'you are':
        string = string.replace(string[0:8], '')
    return nlp(string)[0].pos_ == 'VERB'

#178
#308
text = mcf_df["Description"][318]

def clean_raw_string(string):
    
    # Identify some common problematic strings to remove
    to_remove = ['\n', '\xa0', '&nbsp;', '&amp;', '\t', '&rsquo;']
    for item in to_remove:
        string = string.replace(item, '')
        
    to_replace = [('No.', 'Number')]
    for item1, item2 in to_replace:
        string = string.replace(item1, item2)
        
    # Remove all non-unicode characters
    #string = ''.join([i if ord(i) < 128 else ' ' for i in string])
    
    return string

def check_list_for_verbs(list_elements):
    print(list_elements)
    verb_scores = []
    for list_element in list_elements:
        
        # Use regex to split up the list into items

        if list_element[0:4] in ['<ul>', '<ol>']:
            list_items_pattern = re.compile(r'(?=<li>).*?(?<=</li>)')
        else:
            list_items_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')

        list_items = list_items_pattern.findall(list_element)
        
        # Initialise a count of number of items beginning with a verb
        count = 0
        
        # Iterate through each item in the list
        for list_item in list_items:
            
            # Remove all the HTML tags and check if the first word is a verb
            list_item = re.sub("[^\w\s]", "", re.sub('^\d+\.{0,1}', '', re.sub('<.*?>', '', list_item.replace('\t', '')).strip())).strip()
            if check_if_first_word_is_verb(list_item):
                count += 1
        
        verb_scores.append((len(list_items), count/len(list_items)))
        
    for_recursive = []
    print(verb_scores)
    for i, verb_score in enumerate(verb_scores):
        
        # Always append the first list item
        if i == 0:
            for_recursive.append(list_elements[i])
        
        # For other items, check if there are less than 6 items
        # and the verb score is 100%. If so, we merge it
        elif (verb_score[0] < 6) and (verb_score[1] >= .6):
                        
            # Remove the starting list tag if it is included
            list_elements_i_cleaned = re.sub(r'(<ul>|<ol>|</ul>|</ol>)', '', list_elements[i])
            print(list_elements_i_cleaned)
            # If the preceding list has a </ul> or </ol> tag
            # then we should remove it before concatenating the
            # strings, but otherwise we just concat the strings directly
            
            if for_recursive[-1][:-5] in ['</ul>', '</ol>']:
                for_recursive[-1] = for_recursive[-1][:-5] + " " + list_elements_i_cleaned + for_recursive[-1][-5:]
            else:
                for_recursive[-1] += list_elements[i]
            
        # Otherwise we just append it back to the list
        else:
            for_recursive.append(list_elements[i])
            
    # Run the recursive function if we have merged some lists together
    if len(for_recursive) != len(list_elements):
        return check_list_for_verbs(for_recursive)
        
    # Otherwise, we output the verb scores
    else:

        # Append the verb score to the list
        # with a exception for very short lists
        final_verb_scores = []
        for verb_score in verb_scores:
            
            if verb_score[0] < 3:
                final_verb_scores.append(min(verb_score[1], 0.5))
            else:
                final_verb_scores.append(verb_score[1])

        # Return the list with maximum verb score, assuming at least
        # 50% of the list contains verbs (to avoid situations of only
        # a single list)
        if max(final_verb_scores) > 0.5:
            return list_elements[final_verb_scores.index(max(final_verb_scores))]
        else:
            return []

def process_li_tag(text):

    # Extract all lists in the HTML with a list tag (<ol> or <ul>)
    # Regex explanation: 
    # (?=<ol>|<ul>) is the lookahead for the <ol> or <ul> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</ol>|</ul>) is the lookbehind for the </ol> or </ul> tag
    list_pattern = re.compile(r'(?=<ol>|<ul>).*?(?<=</ol>|</ul>)')
    list_elements = list_pattern.findall(text)
    
    if len(list_elements) == 0:
        return []
    
    return check_list_for_verbs(list_elements)
    
#     # Iterate through each list item to see which list has more
#     # items with a verb as the first word
#     verb_scores = []
#     for list_element in list_elements:
        
#         # Use regex to split up the list into items
#         list_items_pattern = re.compile(r'(?=<li>).*?(?<=</li>)')
#         list_items = list_items_pattern.findall(list_element)
        
#         # Initialise a count of number of items beginning with a verb
#         count = 0
        
#         # Iterate through each item in the list
#         for list_item in list_items:
            
#             # Remove all the HTML tags and check if the first word is a verb
#             list_item = re.sub('<.*?>', '', list_item)
#             if check_if_first_word_is_verb(list_item):
#                 count += 1
        
#         # Append the verb score to the list
#         # with a exception for very short lists
#         if len(list_items) < 3:
#             verb_scores.append(min(count/len(list_items), 0.5))
#         else:
#             verb_scores.append(count/len(list_items))
    
def process_p_list(text):
    
    # Extract all lists in the HTML with a paragraph tag (<p>)
    # Regex explanation: 
    # (?=<p>) is the lookahead for the <p> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</p>) is the lookbehind for the </p> tag
    para_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')
    para_elements = para_pattern.findall(text)
            
    # Check for specific unicode characters that can be used as bullet points
    unicode_to_check = ['\u2022', '\u002d', '\u00b7']
    bullet_pt_presence = []
    for para_element in para_elements:
        
        # Remove all the HTML tags
        para_element_cleaned = re.sub('<.*?>', '', para_element).strip()
        
        # Check if the string is non-empty
        if len(para_element_cleaned) > 0:
            
            # Check if the first character has any bullet points
            result1 = para_element_cleaned[0] in unicode_to_check
            
            # Check if the first character is a numbered list
            # by checking if the re.match() returns anything
            result2 = re.match(r'^\d+\.', para_element_cleaned) is not None
            
            bullet_pt_presence.append(result1 or result2)
            
        # If it is empty, then it doesn't contain any bullet points
        else:
            bullet_pt_presence.append(False)
    
    # Initialise the lists
    output = []
    p_list = []
    
    # Build an equivalent list of list items by iterating
    # through the boolean list indicating if there is
    # a bullet point character at the start of the string
    for i, value in enumerate(bullet_pt_presence):
        
        # If there is a bullet point character
        if value:
            
            # Append the string to the para list
            p_list.append(para_elements[i])
            
        # If there is no bullet point character
        else:
            
            # Append the para list if it is non-empty
            if len(p_list) > 0:
                output.append(' '.join(p_list))
            
            # Reset the para list
            p_list = []
            
    if len(output) == 0:
        return []
    
    return check_list_for_verbs(output)

#     # Iterate through each list item to see which list has more
#     # items with a verb as the first word
#     verb_scores = []
#     for list_element in output:
        
#         # Use regex to split up the list into items
#         list_items_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')
#         list_items = list_items_pattern.findall(list_element)
        
#         # Initialise a count of number of items beginning with a verb
#         count = 0
        
#         # Iterate through each item in the list
#         for list_item in list_items:
            
#             # Remove all the HTML tags and check if the first word is a verb
#             # !!! this needs to be cleaned up, very messy
#             # consider changing \d to \w to include a b c lists
#             list_item = re.sub("[^\w\s]", "", re.sub('^\d+\.{0,1}', '', re.sub('<.*?>', '', list_item.replace('\t', '')).strip())).strip()
#             if check_if_first_word_is_verb(list_item):
#                 count += 1
        
#         # Append the verb score to the list
#         # with a exception for very short lists
#         if len(list_items) < 3:
#             verb_scores.append(min(count/len(list_items), 0.5))
#         else:
#             verb_scores.append(count/len(list_items))
#     print(verb_scores)
    
#     # Return the list with maximum verb score, assuming at least
#     # 50% of the list contains verbs (to avoid situations of only
#     # a single list)
#     if max(verb_scores) > 0.5:
#         return output[verb_scores.index(max(verb_scores))]
#     else:
#         return []

def process_p_tag(text):
    
    # Extract all lists in the HTML with a paragraph tag (<p>)
    # Regex explanation: 
    # (?=<p>) is the lookahead for the <p> tag
    # .* captures everything between the tags, ? restricts it to capturing one set only
    # (?<=</p>) is the lookbehind for the </p> tag
    para_pattern = re.compile(r'(?=<p>).*?(?<=</p>)')
    para_elements = para_pattern.findall(text)
    
    if len(para_elements) == 0:
        return []
    
    # Iterate through each paragraph element to see which one starts
    # with a verb, and we keep that paragraph element
    output = []
    for para_element in para_elements:

        # Remove all the HTML tags and check if the first word is a verb
        para_element_cleaned = re.sub("[^\w\s]", "", re.sub('<.*?>', '', para_element)).strip()
        if len(para_element_cleaned) > 0:
            if check_if_first_word_is_verb(para_element_cleaned):
                output.append(para_element)

    return " ".join(output)
            
def process_text(raw_text):
    
    # Remove problematic characters
    text = clean_raw_string(raw_text)
    
    li_results = process_li_tag(text)
    p_list_results = process_p_list(raw_text)
    p_results = process_p_tag(text)
    
    if len(li_results) > 0:
        print('List object detected')
        return li_results
    elif len(p_list_results) > 0:
        print('Paragraph list detected')
        return p_list_results
    elif len(p_results) > 0:
        print('Paragraphs detected')
        return p_results
    else:
        print('None detected, returning all')
        return re.sub('<.*?>', ' ', text)
        
    
    
    # Extract all the non-list items in the text
    # Iteratively split the text by the match until the last match
#     
#     non_list_elements = []
#     text_placeholder = text
#     for list_element in list_elements:
#         non_list_elements.append(re.sub('<.*?>', '', text_placeholder.split(list_element)[0]).strip())
#         text_placeholder = text_placeholder.split(list_element)[1]
#     if len(text_placeholder) != 0: # If last item is non-empty
#         non_list_elements.append(text_placeholder)


#def clean_p_tag():
    
        #list_pattern = re.compile(r'(?=<li>|<p>).*?(?:(?<=</li>)|(?<=</p>))')


In [533]:
print(process_text(mcf_df["Description"][18218]))
print('----------------------------------------')
print(mcf_df["Description"][18218])

['<ul>  <li>Build relationships with customers and talents (staffs) through informal meetings, networking and team building activities.</li>  <li>Seek business opportunities from customers and fulfill their business needs promptly in order to convert them into sales.</li>  <li>Strategize, develop, plan and execute the talent sourcing program to support the project needs.</li>  <li>Work on tenders, proposals and quotations to support the customers’ business need.</li></ul>', '<ul>  <li>Strategize, develop, plan and execute the talent sourcing and customer development programs to support the revenue target.</li>  <li>Create and conduct branding, as well as, marketing on relevant social media and internet platforms to enhance our image and generate more public awareness about our expertise and services.</li>  <li>Assist in the recruitment of ICT / Healthcare / Education / Science / Research Talents for projects and customers’ needs.</li></ul>', '<ul>  <li>Perform regular review of the bus

In [534]:
print(process_text(mcf_df["Description"][817]))
print('----------------------------------------')
print(mcf_df["Description"][817])

['<p>•\tDecoupling growth from environmental impact.</p> <p>•\tHelping more than a billion people take action to improve their health and well-being.</p> <p>•\tEnhancing the livelihoods of millions of people by 2020.</p>', '<p>•\tHolds visibility of Volume plan, forecast and drive delivery of volume forecasts</p> <p>•\tChannelize Business Development Managers (BDM)/Customers input into Sales &amp; Operations Planning (S&amp;OP) process</p> <p>•\tDefines volume implications of the brand marketing plan by channel or key customer (up/down elevators)</p> <p>•\tManages agreed customer spends as per agreed budgets</p> <p>•\tBrainstorms and evaluates gap closure opportunities with account management, including customer specific special packs and displays</p>', '<p>•\tConsolidates, Drives &amp; follow up on execution of the Key Innovations projects</p> <p>•\tDevelops key input to innovation forecast with BDMs</p> <p>•\tTracks Innovation Success KPIs (Actual vs Business Case numbers, on time de

In [446]:
ssoc = pd.read_csv('../Data/Raw/ssoc_v2018.csv', encoding='iso-8859-1')
ssoc.dropna(inplace = True)
ssoc['ssoc_f'] = ssoc['ssoc_f'].astype('float').astype('int').astype('str')
mcf_df = mcf_df[(mcf_df['SSOC_2015'] != 'X5000') & (mcf_df['SSOC_2015'].notnull())]
mcf_df['SSOC_2015'] = mcf_df['SSOC_2015'].astype('float').astype('int').astype('str')
mcf_data_final = mcf_df.merge(ssoc, left_on = 'SSOC_2015', right_on = 'ssoc_f', how = 'left')
mcf_data_final.rename({'ssoc_desc': "Reported SSOC Desc"}, axis = 1, inplace = True)

In [548]:
check_if_first_word_is_verb('Supporting delivery of curricular programmes to students as required by the school&rsquo;s academic board.')

True

In [541]:
# Consider edge cases
# 23021 - starts with manual numbering, need to remove numbers before processing # ok addressed
# 18218 - multiple lists (ul list) # ok addressed
# 817 - multiple lists (p list) # ok addressed
# 12802 - 'possess'
# 1657 - combined list
# 11007 - 'process', 'monitor', 'update' not recognised as verbs
# 5361 - should have picked the first list but didnt
# 9458 - should have picked up both lists but didn't

In [561]:
import random
idx = random.sample(mcf_df.index.tolist(), 1)[0]
#idx = 23272
#idx = 18514
#idx = 835
print(f"Index: {idx}")
print(f"Job title: {mcf_df['Title'][idx]}")
print(f"SSOC: {mcf_df['SSOC_2015'][idx]}")
print(f"Job tasks: {process_text(mcf_df['Description'][idx])}")
print("-----------------------")
print("==============================================")
print(mcf_df['Description'][idx])

Index: 9458
Job title: project coordinator (electrical)
SSOC: 29090
['<p>1. Cordinating and sequencing works in accordance to master and detailed scheduled</p> <p>2. Supervising site works to ensure quality works and completion within schedule</p> <p>3. Supervise on-site technical support in various stages : project execution, &nbsp;attendance of site meetings, testing &nbsp;&nbsp;and commissioning and project handover etc&nbsp;</p> <p>4. Supervise site inspection and to resolve site matters with regards to the project, sub-contractors works and progress of works</p> <p>5. Supervise/train the workers</p>']
[(5, 0.8)]
Paragraph list detected
Job tasks: <p>1. Cordinating and sequencing works in accordance to master and detailed scheduled</p> <p>2. Supervising site works to ensure quality works and completion within schedule</p> <p>3. Supervise on-site technical support in various stages : project execution, &nbsp;attendance of site meetings, testing &nbsp;&nbsp;and commissioning and proj

In [554]:
process_text(mcf_df["Description"][18203])

['<ul>  <li>Able to perform book keeping</li>  <li>Able to prepare documents such as Invoice and Packing list</li>  <li>Able to prepare GST Submission</li>  <li>Able to liaise with forwarder and prepare shipping documents</li>  <li>Able to communicate with clients</li>  <li>Handle general admin tasks such as filing, phone calls and e-mails</li>  <li>Handle any administrative or miscellaneous tasks if required</li>  <li>Accounting background preferred but not required.</li>  <li>Proficient in MS Word and Excel</li>  <li>Able to work independently</li>  <li>Meticulous, motivated, proactive</li>  <li>Responsible and willing to learn</li>  <li>Good communication and interpersonal skills</li>  <li>Able to commence work immediately</li></ul>']
[(14, 0.7142857142857143)]
List object detected


'<ul>  <li>Able to perform book keeping</li>  <li>Able to prepare documents such as Invoice and Packing list</li>  <li>Able to prepare GST Submission</li>  <li>Able to liaise with forwarder and prepare shipping documents</li>  <li>Able to communicate with clients</li>  <li>Handle general admin tasks such as filing, phone calls and e-mails</li>  <li>Handle any administrative or miscellaneous tasks if required</li>  <li>Accounting background preferred but not required.</li>  <li>Proficient in MS Word and Excel</li>  <li>Able to work independently</li>  <li>Meticulous, motivated, proactive</li>  <li>Responsible and willing to learn</li>  <li>Good communication and interpersonal skills</li>  <li>Able to commence work immediately</li></ul>'

In [250]:
mcf_df["Description"][103]

'<ul>\n  <li>Proven software development experience and Android skills development (minimum 3 years)</li>\n  <li>Proven working experience in Android app development (minimum 3 years)</li>\n  <li>Experience with Android SDK</li>\n  <li>Experience working with remote data via REST and JSON</li>\n  <li>Experience with third-party libraries and APIs</li>\n  <li>Working knowledge of the general mobile landscape, architectures, trends, and emerging technologies</li>\n  <li>Solid understanding of the full mobile development life cycle.</li>\n  <li>Proficient understanding of code versioning git</li>\n  <li>Familiarity with continuous integration</li>\n</ul>'

In [156]:
for token in nlp(output[0].split(' ')[0]):
    print(token.pos_)
    print(nlp(output[0].split(' ')[0])[0].pos_ == 'VERB')

NOUN
False


In [157]:
list_elements

['<ul>  <li>Qualified lawyer with a minimum of 3- 5yrs PQE in a common law jurisdiction including England &amp; Wales, Australian qualifications due to the cross border nature of our work</li>  <li>Experience in complex cross border and multi-jurisdicitonal banking work</li>  <li>Strong Technical banking knowledge</li>  <li>Excellent Client Relationships skills</li>  <li>Strong commercial/business acumen with the ability to deliver creative and pragmatic solutions and advice</li>  <li>Excellent communication and interpersonal skills, with the ability to interface at all levels and particularly to win the respect of the partner and fee-earner community, building strong relationships with both internal and external stakeholders</li>  <li>Collaborative team player with the ability to develop and work in a fast paced, intellectually rigorous environment</li></ul>']

In [158]:
non_list_elements

['Allen &amp; Overy is a leading global law firm operating in over thirty countries. Embracing new trends and harnessing decades of experience, we’ve earned our place at the forefront of the legal industry. Our partners are recognised as leaders in their field and our deals and cases often make headline business news. As a result, we’ve developed a reputation for delivering excellence, in all that we do. Today, we’re continuing to break new ground. By embracing new ways of thinking and integrating technology into our everyday work, we’ve been named as the Most Innovative Law Firm in Europe by the Financial Times six times. Join our team and you’ll be part of a flexible, inclusive culture underpinned by openness and acceptance. We’re driven by the belief that, to perform, people need support and space to collaborate. By combining those values with an ambitious outlook, we can give you the opportunity to thrive.Department &amp; RoleAllen &amp; Overy Singapore is a hub for the firms work 

In [38]:
output[output.apply(len) != 0]

196      ([{'job_title': [' ', 'ux', 'designer'], 'job_...
220      ([{'job_title': [' ', 'key', 'member', 'digita...
315      ([{'job_title': ['   ', 'coordinate', 'centre'...
320      ([{'job_title': ['responsibility'], 'job_descr...
456      ([{'job_title': [' ', 'key', 'responsibility',...
                               ...                        
22895    ([{'job_title': ['responsibility'], 'job_descr...
23191    ([{'job_title': [' ', 'human', 'resource'], 'j...
23272    ([{'job_title': [' ', 'expect'], 'job_descript...
23345    ([{'job_title': [' ', 'job', 'responsibility']...
23545    ([{'job_title': [' \xa0'], 'job_description': ...
Name: Description, Length: 317, dtype: object

In [115]:
mcf_df[mcf_df["Description"].str.contains('<ol>')]

Unnamed: 0,Job_ID,Title,Description,SSOC_2015
76,MCF-2020-0221485,business development manager,<p>Genesis-Global is a purveyor of fine luxury...,12212.0
77,MCF-2020-0078824,security officer #sgunitedjobs,<p><u><strong>Duties &amp; Responsibilities</s...,54142
94,MCF-2020-0103552,junior accountant #sgunitedtraineeships,<p><u><strong>Traineeship Description:</strong...,33130
136,MCF-2020-0142738,#sgunitedtraineeships executive (clinician rem...,<p><strong>Job Description:</strong></p>\n<p>T...,29090
160,MCF-2020-0024938,biofire market development manager aspac,<p><strong>Position Summary:</strong></p>\n<p>...,11202
...,...,...,...,...
23516,MCF-2020-0144392,associate (marketing & development) #sgunitedt...,<p><u><strong>Traineeship Description:</strong...,39990
23568,MCF-2020-0124783,founders office trainee associate #sgunitedtra...,<p><strong>Traineeship Description:</strong></...,13499
23569,MCF-2020-0175196,automation software engineer #sgunitedtrainees...,<p>The Analyst Programmer (Automation COE) wil...,25121
23593,MCF-2020-0066592,senior / lead mobile developer,<p>Do you believe your designs and code could ...,29090.0


In [116]:
mcf_df["Description"][76]

'<p>Genesis-Global is a purveyor of fine luxury gems and jewellery with an increasingly global outreach, whilst having business presence in Singapore and Hong Kong for more than 7 years.</p>\n<p>Having consolidated its operational activities into its new head office in Singapore in May 2019 as part of its new focus away from the Hong Kong market, Genesis-Global is now seeking to expand its team in Singapore to capture market share in the luxury gems market in Singapore and Europe, with special focus in expansion into the French luxury market as the initial footprint in Europe.</p>\n<p>With that in mind, the company now wishes to hire, on a full-time basis, a Business Development Manager with specific abilities and experience as more particularly described below, who shall already be physically located in Singapore as at the time of job application. The person should possess the following qualities:</p>\n<ol>\n  <li>At least 3 years of living and working in Singapore.</li>\n  <li>Had be

In [None]:
# to do
# replace <p> tags within <li> tags !
# pick <p> tag with closest match instead of preceding <p> tag only? or preceding 2 <p> tags
# if <p> tag is empty then hunt for another tag (maybe h1?)
# max length of <p> tag?
# need to capture all <li> tags instead of just those with <p> preceding
# should we just go straight to labelling?

In [100]:
mcf_df.iloc[18]['Description']

'<p>Unveiled in 1989, Expressions is a recognized pioneer in the beauty industry. It adopts the philosophy of total wellness, offering a wide range of facial and slimming treatment, as well as spa therapy. All of which are supported by high technology equipment and service-excellence standards.</p>\n<p><br></p>\n<p>Trainee will be stationed in the retail store setting to work with other retail staff and undergo training to nurture skills in retail operation management. Trainee will also be in direct interactions with the customers and be responsible to oversee customer service.</p>\n<p><br></p>\n<p>Responsibilities include, but not limited to:</p>\n<ul>\n  <li>Support the Operations Manager in the supervision of store operations and the management of retail manpower</li>\n  <li>Manage front desk operations (appointment system, point-of-sales system, credit card and digital payment terminals).</li>\n  <li>Attend to customer enquiries and advise customers on the benefits of massages</li>

In [96]:
import random
idx = random.sample(output[output.apply(len) != 0].index.tolist(), 1)[0]
#idx = 23272
#idx = 18514
#idx = 835
print(f"Index: {idx}")
for i in output[idx][0]:
    print(f"Job title: {i['job_title']}")
    print(f"Relevance score: {i['relevance_score']}")
    print(f"Job description: {i['job_description']}")
    print(f"Match: {i['match']}")
    print("-----------------------")
print("==============================================")
print(mcf_df.iloc[idx]['Description'])

Index: 6535
Job title: [' \xa0\xa0\xa0\xa0\xa0\xa0 ', 'job', 'responsibility']
Relevance score: (0.8285877897785132, 0.6670268372784524)
Job description:   support requests for data or data related collaboration or projects, and ensure that all deliverables and outcomes are accurate and timely; manage day-day data administration and management issues; expected to present imd and ica in data-driven collaborations/projects within ica and across wog respectively; assist in the project management of ica’s analytic projects and initiatives; manage the entire end to end lifecycle of an analytics project, including scoping of data inputs, data preparation, data pre-processing, feature engineering, basic data modelling, model deployment and monitoring; ensure projects adhere to ica’s data management framework, data quality and protection standards pertaining to ica’s information; responsible for the management of ica’s data resources; perform all other duties as and when assigned by the superv

In [52]:
output[320]

([{'job_title': ['responsibility'],
   'job_description': '  strong background in computer science, computer or electronics engineering, information technology or related technical discipline experience/knowledge in front-end and backend languages and libraries (e.g. html/ css, javascript, xml, jquery, java) experience in modern web application technology stack such as node.js, react.js, spring familiar with version-control software (e.g git, serena dimensions) familiar with best practices, such as tdd and ci/cd experience in an agile environment ',
   'match': '<p><strong><u>Responsibilities</u></strong></p> <ul> <li>Strong background in Computer Science, Computer or Electronics Engineering, Information Technology or related technical discipline</li> <li>Experience/Knowledge in front-end and backend languages and libraries (e.g. HTML/ CSS, JavaScript, XML, jQuery, Java)</li> <li>Experience in modern web application technology stack such as Node.js, React.js, Spring</li> <li>Familiar w