# Constructing the pre-training data
1. Import Libraries 
2. Import previously extracted data
3. Extract the actual matches
4. Align the matches with the tokens 

In [None]:
#Import Libraries
import pandas as pd
import re
import numpy as np
import tqdm
from nltk.corpus import stopwords
import spacy
import json 
import os

In [None]:
#Import the earlier extracted and matched dataset
#The later cells read the 'Sentence', 'Sentence_lemmatized' and 'Tokens' columns of this frame
path = "./Extracted_competencies/extracted.csv"
extracted_matched = pd.read_csv(path)

In [None]:
#Remove commas (to ensure clear matching between lemmatized and unprocessed sentences)
def _strip_commas(value):
    """Return ``value`` without commas; non-string cells (e.g. NaN) pass through unchanged."""
    #Empty CSV fields are read as float NaN, which has no .replace() method
    return value.replace(',', '') if isinstance(value, str) else value


extracted_matched['Sentence'] = extracted_matched['Sentence'].apply(_strip_commas)
extracted_matched['Sentence_lemmatized'] = extracted_matched['Sentence_lemmatized'].apply(_strip_commas)

In [None]:
#This function is used to extract the actual matched entities for each competence
def _build_exploded_row(base_data, kind=None, value=pd.NA, fragments=pd.NA):
    """Copy ``base_data`` and add the four competence fields, filling only ``kind``.

    ``kind`` is 'skill', 'knowledge' or None (unmatched sentence); every
    competence field that does not belong to ``kind`` is set to ``pd.NA``.
    """
    new_row = base_data.copy()
    for name in ('skill', 'knowledge'):
        is_target = name == kind
        new_row[name] = value if is_target else pd.NA
        new_row[f'{name}_fragments'] = fragments if is_target else pd.NA
    return new_row


def separate_matches(df):
    """Explode a wide match table into one row per matched competence.

    Columns named ``skill_<n>`` / ``knowledge_<n>`` hold the matched
    competences and ``skill_<n>_fragments`` / ``knowledge_<n>_fragments`` the
    corresponding fragments. Every non-null competence produces one output row
    that carries the remaining (base) columns plus ``skill``,
    ``skill_fragments``, ``knowledge`` and ``knowledge_fragments``. A source row
    without any competence produces a single row with all four fields missing.

    Parameters:
    - df: DataFrame in the wide layout described above.

    Returns:
    - DataFrame with the base columns followed by the four competence columns
      and a fresh 0..n-1 index. An empty input yields an empty frame with the
      same columns.
    """
    #The notebook does `import tqdm`, which binds the (non-callable) module, so
    #the progress-bar class is imported here; without tqdm the rows are simply
    #iterated without a progress bar
    try:
        from tqdm import tqdm as progress_bar
    except ImportError:
        def progress_bar(iterable, **_kwargs):
            return iterable

    #Identify skill and knowledge columns
    skill_pattern = re.compile(r'^skill_(\d+)$')
    knowledge_pattern = re.compile(r'^knowledge_(\d+)$')

    skill_cols = [col for col in df.columns if skill_pattern.match(col)]
    skill_fragment_cols = [col for col in df.columns if col.startswith('skill_') and col.endswith('_fragments')]
    knowledge_cols = [col for col in df.columns if knowledge_pattern.match(col)]
    knowledge_fragment_cols = [col for col in df.columns if col.startswith('knowledge_') and col.endswith('_fragments')]

    #Define base columns (not skill or knowledge)
    competence_cols = set(skill_cols + skill_fragment_cols + knowledge_cols + knowledge_fragment_cols)
    base_cols = [col for col in df.columns if col not in competence_cols]
    columns_order = base_cols + ['skill', 'skill_fragments', 'knowledge', 'knowledge_fragments']

    exploded_rows = []
    for _, row in progress_bar(df.iterrows(), total=len(df)):
        base_data = row[base_cols]
        has_match = False

        #Skills first, then knowledge (same output order as the source columns)
        for kind, cols in (('skill', skill_cols), ('knowledge', knowledge_cols)):
            for col in cols:
                value = row[col]
                if pd.isna(value):
                    continue
                has_match = True
                #'<kind>_<n>' -> '<kind>_<n>_fragments'; missing column -> NA
                fragments = row.get(f'{col}_fragments', pd.NA)
                exploded_rows.append(_build_exploded_row(base_data, kind, value, fragments))

        #For the unmatched examples
        if not has_match:
            exploded_rows.append(_build_exploded_row(base_data))

    #Without any rows the frame would have no columns and the reordering below
    #would raise a KeyError
    if not exploded_rows:
        return pd.DataFrame(columns=columns_order)

    exploded_df = pd.DataFrame(exploded_rows)

    #Reorder columns to match the desired order
    exploded_df = exploded_df[columns_order]
    exploded_df.reset_index(drop=True, inplace=True)

    return exploded_df

In [None]:
#Get the stopwords again (to ensure these are marked as relevant if they appear within the original match)
#The set starts from NLTK's English stop words and is extended with month and weekday names,
#UK place names and a few punctuation/noise tokens.
#NOTE(review): match_fragment_in_lemmatized_tokens lowercases every token before the lookup,
#so the mixed-case entries 'It' and 'iT' cannot match there.
#NOTE(review): 'Â£' looks like a mis-encoded '£' - presumably it mirrors the upstream extraction step; confirm before changing it.
stopset_r = set(stopwords.words('english'))
stopset_r.update(['on','approximately', 'approximately', 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december', ':', 'love', 'please', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday', 'overtime', '()', '( )', 'bonus',' bonuses', 'Â£' , '?', 'be', 'aberdeen', 'armagh', 'bangor', 'bangor', 'bath', 'belfast', 'birmingham', 'bradford', 'brighton hove', 'bristol', 'cambridge', 'canterbury', 'cardiff', 'carlisle', 'chelmsford', 'chester', 'chichester', 'colchester', 'coventry', 'derby', 'doncaster', 'dundee', 'dunfermline', 'durham', 'edinburgh', 'ely', 'exeter', 'glasgow', 'gloucester', 'hereford', 'inverness', 'kingston upon hull', 'lancaster', 'leeds', 'leicester', 'lichfield', 'lincoln', 'lisburn', 'liverpool', 'london', 'londonderry', 'manchester', 'milton keynes', 'newcastle upon tyne', 'newport', 'newry', 'norwich', 'nottingham', 'oxford', 'perth', 'peterborough', 'plymouth', 'portsmouth', 'preston', 'ripon', 'salford', 'salisbury', 'sheffield', 'southampton', 'southend - on - sea', 'st albans', 'st asaph ', 'llanelwy', 'st davids', 'stirling', 'stoke-on-trent', 'sunderland', 'swansea', 'truro', 'wakefield', 'wells', 'westminster', 'winchester', 'wolverhampton', 'worcester', 'wrexham', 'york', 'bedfordshire', 'berkshire', 'bristol', 'buckinghamshire', 'cambridgeshire', 'cheshire', 'cornwall', 'cumbria', 'derbyshire', 'devon', 'dorset', 'durham', 'east riding of yorkshire', 'east sussex', 'essex', 'gloucestershire', 'greater london', 'greater manchester', 'hampshire', 'herefordshire', 'hertfordshire', 'isle of wight', 'kent', 'lancashire', 'leicestershire', 'lincolnshire', 'merseyside', 'middlesex', 'norfolk', 'north yorkshire', 'northamptonshire', 'northumberland', 'nottinghamshire', 'oxfordshire', 'rutland', 'shropshire', 'somerset', 'south yorkshire', 'staffordshire', 'suffolk', 
'surrey', 'tyne and wear', 'warwickshire', 'west midlands', 'west sussex', 'west yorkshire', 'wiltshire', 'worcestershire', 'county antrim', 'county armagh', 'county down', 'county fermanagh', 'county londonderry', 'county tyrone', 'aberdeen', 'aberdeenshire', 'angus', 'argyll and bute', 'clackmannanshire', 'dumfries and galloway', 'dundee', 'east ayrshire', 'east dunbartonshire', 'east lothian', 'east renfrewshire', 'edinburgh', 'falkirk', 'fife', 'glasgow', 'highland', 'inverclyde', 'midlothian', 'moray', 'north ayrshire', 'north lanarkshire', 'orkney', 'perth and kinross', 'renfrewshire', 'scottish borders', 'shetland isles', 'south ayrshire', 'south lanarkshire', 'stirlingshire', 'west dunbartonshire', 'west lothian', 'western isles', 'anglesey / sir fon', 'anglesey/sir fon', 'blaenau gwent', 'bridgend', 'caerphilly', 'cardiff', 'carmarthenshire', 'ceredigion', 'conwy', 'denbighshire', 'flintshire', 'glamorgan', 'gwynedd', 'merthyr tydfil', 'monmouthshire', 'neath port talbot', 'newport', 'newport city', 'pembrokeshire', 'powys', 'rhondda cynon taff', 'swansea', 'torfaen', 'wrexha', 'It', 'iT', 'it'])
#These words are taken out of the stop list again; set.remove raises a KeyError
#if one of them is not present in the set
for kept_word in ('during', 'about', 'other', 'off', 'from', 'down', 'under', 'over', 'own'):
    stopset_r.remove(kept_word)

In [None]:
#Load the spaCy pipeline used to tokenize the sentences
#(the 'ner', 'parser' and 'textcat' components are excluded when loading)
lemmatizer_nlp = spacy.load('en_core_web_lg', exclude=['ner', 'parser', 'textcat']) 

def spacy_tokenize_text(text):
    """Run ``text`` through the module-level spaCy pipeline and return the text of each token as a list."""
    return [piece.text for piece in lemmatizer_nlp(text)]

def tokenize_relevant(df):
    """Add token-list columns for the raw, lemmatized and 'Tokens' text columns.

    The three source columns are cast to ``str`` in place, then each one is
    tokenized with ``spacy_tokenize_text`` into a new column. The input frame
    itself is modified and returned.
    """
    column_pairs = (
        ('Sentence', 'Sentence_tokens'),
        ('Sentence_lemmatized', 'Sentence_lemmatized_tokens'),
        ('Tokens', 'Tokens_tokens'),
    )

    #Ensure every source column is of string type before tokenizing any of them
    for source, _ in column_pairs:
        df[source] = df[source].astype(str)

    #Store the token list of each source column in its own column
    for source, target in column_pairs:
        df[target] = df[source].apply(spacy_tokenize_text)

    return df

In [None]:
# The following functions are used to match the initially extracted matches (lemmatized) with the original text in the sentence data
def annotate_fragments(df, stop_words_set):
    """Locate every extracted fragment in the lemmatized tokens of its sentence.

    For each row, the 'skill_fragments' and 'knowledge_fragments' strings are
    split into match groups on ';' and into single fragments on ','. Each
    fragment is searched in 'Sentence_lemmatized_tokens' with
    ``match_fragment_in_lemmatized_tokens``; every matched position is set to 1
    in the row's token mask.

    Parameters:
    - df: DataFrame with 'Sentence_lemmatized_tokens' and the columns kept in
      the output; it receives the new 'Annotations' and 'Token_Mask' columns.
    - stop_words_set: Iterable of stopwords that may be bridged inside a match.

    Returns:
    - (annotated, errors): ``annotated`` keeps 'index', 'JobID', 'Sentence',
      'Sentence_tokens', 'skill', 'knowledge', 'Annotations' and 'Token_Mask';
      ``errors`` holds a copy of every row with at least one fragment that was
      not found, plus an 'unmatched_fragments' column.
    """
    #Set for fast stopword lookup
    stop_lookup = set(stop_words_set)

    all_annotations = []
    all_masks = []
    failed_rows = []

    for _, record in df.iterrows():
        lemma_tokens = record['Sentence_lemmatized_tokens']

        #One flag per lemmatized token; the mask is later stored next to 'Sentence_tokens'
        #(assumes lemmatized and raw sentences have the same token structure - not checked here)
        relevance = [0] * len(lemma_tokens)

        found = []
        missing = []

        for column in ('skill_fragments', 'knowledge_fragments'):
            raw = record.get(column, None)
            if not (pd.notna(raw) and raw.strip()):
                continue

            #Multiple matches are split by ";"
            for group in raw.split(';'):
                group = group.strip()
                if not group:
                    continue

                #Fragments within a single match are split by ","
                for piece in group.split(','):
                    piece = piece.strip()
                    if not piece:
                        continue
                    words = piece.lower().split()
                    if not words:
                        continue

                    spans = match_fragment_in_lemmatized_tokens(
                        words, lemma_tokens, stop_lookup
                    )
                    if not spans:
                        #Record the unmatched fragment
                        missing.append({
                            'fragment_type': column,
                            'fragment': piece
                        })
                        continue

                    for span in spans:
                        #Flag every matched position as relevant
                        for position in span:
                            relevance[position] = 1
                        found.append({
                            'fragment': piece,
                            'matched_tokens': [lemma_tokens[position] for position in span],
                            'positions': span
                        })

        all_annotations.append(found)
        all_masks.append(relevance)

        #Store the rows that contain unmatched fragments
        if missing:
            failed = record.copy()
            failed['unmatched_fragments'] = missing
            failed_rows.append(failed)

    df['Annotations'] = all_annotations
    df['Token_Mask'] = all_masks
    annotated = df[['index', 'JobID', 'Sentence', 'Sentence_tokens', 'skill', 'knowledge', 'Annotations', 'Token_Mask']]

    #Return the "error" dataframe for the unmatched examples as well
    return annotated, pd.DataFrame(failed_rows)

def match_fragment_in_lemmatized_tokens(fragment_words, lemmatized_tokens, stop_words):
    """
    Find every occurrence of a fragment in a token sequence, bridging stopwords.

    A match starts at a token equal to the first fragment word. The remaining
    fragment words must follow in order; tokens found in ``stop_words`` may sit
    between them and are included in the match. Tokens are compared in
    lowercase, the fragment words are used as given.

    Parameters:
    - fragment_words: List of words in the fragment (lowercased).
    - lemmatized_tokens: List of lemmatized tokens from the sentence (including stopwords).
    - stop_words: Set of stopwords.

    Returns:
    - List with one entry per match; each entry is the list of token positions
      (fragment words and bridged stopwords) covered by that match. Empty if
      the fragment is empty or does not occur.
    """
    matches = []
    if not fragment_words:
        return matches

    lowered = [token.lower() for token in lemmatized_tokens]
    total = len(lowered)
    needed = len(fragment_words)
    first_word = fragment_words[0]

    for start, token in enumerate(lowered):
        #A match can only start on the first fragment word
        if token != first_word:
            continue

        span = [start]
        consumed = 1
        cursor = start + 1
        while consumed < needed and cursor < total:
            current = lowered[cursor]
            if current == fragment_words[consumed]:
                #Next fragment word found
                consumed += 1
            elif current not in stop_words:
                #Neither the expected word nor a stopword: give up on this start
                break
            #Fragment words and bridged stopwords are both part of the match
            span.append(cursor)
            cursor += 1

        if consumed == needed:
            matches.append(span)

    return matches


In [None]:
#This function transforms the dataset into the list of dictionaries that is used with the skill extraction model
#A single sentence may have been matched with multiple competencies, so the masks of all rows that share an 'index' are merged as well
def transform_dataset(df):
    """Turn the annotated frame into one dictionary per row.

    Rows are grouped by 'index' and their 'Token_Mask' lists are merged with an
    element-wise maximum; the merged mask is attached to every row of the group
    as 'irrelevant_mask_total'. Each row contributes its 'skill' as competence
    (class 's') when it is not null, otherwise its 'knowledge' (class 'k').

    Returns:
    - List of dictionaries with the keys 'index', 'JobID', 'Tokens',
      'competence', 'competence_class', 'relevant_mask' and
      'irrelevant_mask_total'.
    """
    #Step 1: merge the relevant masks of all rows that share an 'index'
    merged_by_index = {}
    for group_key, rows in df.groupby('index'):
        combined = np.zeros(len(rows.iloc[0]['Token_Mask']), dtype=int)
        for mask in rows['Token_Mask']:
            combined = np.maximum(combined, mask).astype(int)
        merged_by_index[group_key] = combined.tolist()

    #Step 2: build one dictionary per row
    records = []
    for _, row in df.iterrows():
        is_skill = pd.notna(row['skill'])
        records.append({
            'index': row['index'],
            'JobID': row['JobID'],
            'Tokens': row['Sentence_tokens'],
            'competence': row['skill'] if is_skill else row['knowledge'],
            'competence_class': 's' if is_skill else 'k',
            'relevant_mask': row['Token_Mask'],
            'irrelevant_mask_total': merged_by_index[row['index']]
        })

    return records

In [None]:
#Tokenize the dataset (tokenize_relevant adds the token columns to extracted_matched in place and returns the same frame)
#NOTE(review): separate_matches is defined in this notebook but is not called in the visible cells, while annotate_fragments
#reads 'skill_fragments'/'knowledge_fragments' - confirm that extracted.csv already holds one competence per row
exploded_matched_tokenized = tokenize_relevant(extracted_matched) 

In [None]:
#Annotate the dataset: the first result keeps the selected columns plus 'Annotations' and 'Token_Mask',
#the second result holds the rows with at least one fragment that could not be found in the lemmatized tokens
exploded_matched_tokenized, annotated_wrong_matched = annotate_fragments(exploded_matched_tokenized, stopset_r)

In [None]:
#Inspect incorrectly matched examples (and remove if necessary)
#The error frame keeps the row labels of the annotated frame, so its index identifies the rows to drop
wrong_index_matched = list(annotated_wrong_matched.index)

#Drop the rows that contain fragments which could not be aligned
exploded_matched_tokenized = exploded_matched_tokenized.drop(wrong_index_matched)
#drop=True: the frame already has an 'index' column (used by transform_dataset to group rows), so keeping
#the old row labels would only add a stray 'level_0' column
exploded_matched_tokenized = exploded_matched_tokenized.reset_index(drop=True)

In [None]:
#Transform the annotated frame into the list of dictionaries
matched_dict = transform_dataset(exploded_matched_tokenized)

#Write the list to matched_dict.json inside the output folder
path_to_save = "./Extracted_competencies"
with open(f"{path_to_save}/matched_dict.json", 'w') as json_file:
    json.dump(matched_dict, json_file, indent=4)