In [25]:
import pandas as pd
import re
import spacy
import string
nlp = spacy.load('en_core_web_lg')

In [None]:
mcf_df = pd.read_csv("..\Data\Processed\WGS_Dataset_JobInfo_precleaned.csv")
mcf_df = mcf_df[["Job_ID", "Title", "Description", "SSOC_2015"]].sample(frac=0.1, random_state=1).reset_index(drop = True)
#mcf_df.to_csv("..\Data\Processed\Artifacts\Raw_Text.csv", index=False)

In [59]:
def remove_html_tags_newline(text):
    """
    Removes HTML and newline tags from a string with generic regex

    Parameters:
        text (str): Selected text

    Returns:
        cleaned_text(text) : Text with html tags and new line removed
    """

    clean = re.compile('<.*?>')
    newline_clean = re.compile('\n')
    return re.sub(newline_clean, ' ', re.sub(clean, '', text)).lower()

def lemmatize_remove_punct(doc):
    '''
    Take the `token.lemma_` of each non-stop word
    '''
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

def max_similarity(lst, word):
    return nlp(' '.join(lst)).similarity(nlp(word))

# Extracting based on ul tag to get title and description
def extracting_job_desc_ultag(text):
    """
    Extract job description using specific header text

    Parameters:
        text (str): Selected text

    Returns:
        list_extracted_text(list[text]): Extracted text
    """

    # Extract all lists in the HTML with a preceding <p> tag
    pattern = re.compile(r'(?=<p>).*?(?<=</p>).*?(?=<ol>|<ul>).*?(?<=</ol>|</ul>)')
    matches = pattern.findall(text)

    # Check if there are any matches at all
    # If none, we return an empty list
    if len(matches) == 0:
        return []

    # Initialise an empty list to store our cleaned texts
    cleaned_texts = []

    # For each regex match
    for match in matches:

        # Split the entire HTML string on the closing </p> tag
        texts = match.split(r'</p>')

        # Take the last 2 items: second-last item is the <p> element
        # directly preceding the list, and the last item is the
        # HTML list that we are interested in.
        cleaned_texts.append({
            # Note: should merge the functions for lemmatisation and removing HTML into a single function with multiple options
            'job_title': lemmatize_remove_punct(nlp(remove_html_tags_newline(texts[-2]))),
            'job_description': remove_html_tags_newline(texts[-1]),
            'match': match
        })

    # Iterate over each cleaned text to generate the similarity score
    # Note: Should be merged in with the previous iteration
    for cleaned_text in cleaned_texts:
        relevance_score = max_similarity(cleaned_text['job_title'], "description duty responsibility role expect")
        irrelevance_score = max_similarity(cleaned_text['job_title'], "possess quality requirement skills ideal candidate")

        # max dissimilarity?
        cleaned_text['relevance_score'] = (relevance_score, irrelevance_score)

    # Create a list with all the relevance scores and find the
    # index of the top relevance scores
    relevance_scores_all = [cleaned_text['relevance_score'] for cleaned_text in cleaned_texts]
    most_relevant_text_idx = relevance_scores_all.index(max(relevance_scores_all))

    return cleaned_texts, cleaned_texts[most_relevant_text_idx]


In [60]:
output = mcf_df["Description"].apply(extracting_job_desc_ultag)

  return nlp(' '.join(lst)).similarity(nlp(word))


In [31]:
output[output.apply(len) != 0]

196      ([{'job_title': [' ', 'ux', 'designer'], 'job_...
220      ([{'job_title': [' ', 'key', 'member', 'digita...
315      ([{'job_title': ['   ', 'coordinate', 'centre'...
320      ([{'job_title': ['responsibility'], 'job_descr...
456      ([{'job_title': [' ', 'key', 'responsibility',...
                               ...                        
22895    ([{'job_title': ['responsibility'], 'job_descr...
23191    ([{'job_title': [' ', 'human', 'resource'], 'j...
23272    ([{'job_title': [' ', 'expect'], 'job_descript...
23345    ([{'job_title': [' ', 'job', 'responsibility']...
23545    ([{'job_title': [' \xa0'], 'job_description': ...
Name: Description, Length: 317, dtype: object

In [None]:
# to do
# replace <p> tags within <li> tags !
# pick <p> tag with closest match instead of preceding <p> tag only? or preceding 2 <p> tags
# if <p> tag is empty then hunt for another tag (maybe h1?)
# max length of <p> tag?
# need to capture all <li> tags instead of just those with <p> preceding
# should we just go straight to labelling?

In [70]:
import random
idx = random.sample(output[output.apply(len) != 0].index.tolist(), 1)[0]
#idx = 23272
#idx = 18514
#idx = 835
print(f"Index: {idx}")
for i in output[idx][0]:
    print(f"Job title: {i['job_title']}")
    print(f"Relevance score: {i['relevance_score']}")
    print(f"Job description: {i['job_description']}")
    print(f"Match: {i['match']}")
    print("-----------------------")
print("==============================================")
print(mcf_df.iloc[idx]['Description'])

Index: 7812
Job title: [' ', 'role', 'require']
Relevance score: (0.8131860515948719, 0.7435575268310051)
Job description:   recommend and implement communication strategies to support organisation development and initiatives conceptualise and execute corporate communications plans through an array of communication platforms and channels including media, social media and event publicity, online and print publications, with a focus on visual communication as key enabler create engaging content through communication platforms ensure consistent messaging across multiple platforms plan and implement publicity efforts for project milestones and events handle media requests and queries 
Match: <p>At DSTA, we develop leading-edge technological and engineering solutions for the defence and security of our nation. Look forward to comprehensive professional and personal development programmes, and grow in a collaborative and dynamic environment. Join us in the engineering, IT or the corporate do

In [52]:
output[320]

([{'job_title': ['responsibility'],
   'job_description': '  strong background in computer science, computer or electronics engineering, information technology or related technical discipline experience/knowledge in front-end and backend languages and libraries (e.g. html/ css, javascript, xml, jquery, java) experience in modern web application technology stack such as node.js, react.js, spring familiar with version-control software (e.g git, serena dimensions) familiar with best practices, such as tdd and ci/cd experience in an agile environment ',
   'match': '<p><strong><u>Responsibilities</u></strong></p> <ul> <li>Strong background in Computer Science, Computer or Electronics Engineering, Information Technology or related technical discipline</li> <li>Experience/Knowledge in front-end and backend languages and libraries (e.g. HTML/ CSS, JavaScript, XML, jQuery, Java)</li> <li>Experience in modern web application technology stack such as Node.js, React.js, Spring</li> <li>Familiar w