# Matching Competencies with Job Description Data
1. Import the necessary libraries
2. Import your sentence data from job descriptions
3. Lemmatize the dataset
4. Remove the stopwords
5. Extract the competencies

In [43]:
#Import Libraries
import pandas as pd
import os
import regex as re
import spacy 
import datetime
import nltk
from nltk.corpus import stopwords
from tqdm import tqdm
import time as time
from nltk.tokenize import word_tokenize

In [44]:
#Import your job postings data. It must be in a pandas dataframe format. "id" is a unique ID, "Sentence" are the sentences from job posting data and "Class" determines whether the sentence is relevant (i.e., contains competencies) or not-relevant. 
#Example: 
data = {
    "id": [1, 2, 3, 4],
    "Sentence": [
        "Proficiency in Python, SQL, and data visualization tools such as Tableau or Power BI.",
        "Strong analytical and problem-solving skills, with a focus on detail.",
        "Knowledge of cloud computing platforms such as AWS or Google Cloud.",
        "Embrace and uphold the values and ethos of the company.",
    ],
    "Class": [1, 1, 1, 0],
}

example_df = pd.DataFrame(data)

In [45]:
# Load the spaCy 'en_core_web_lg' pipeline that spacylemm_lang uses for lemmatization;
# the 'ner' and 'parser' components are excluded at load time.
lemmatizer_nlp = spacy.load('en_core_web_lg', exclude=['ner', 'parser'])

# Replacement dictionary to handle the "datum" to "data" issue: a lemma equal to one of
# the keys (lower-case, capitalized or upper-case) is swapped for the matching value.
# NOTE(review): presumably the lemmatizer reduces "data" to "datum" - confirm.
rep_dict_2 = {"datum": "data", "Datum": "Data", "DATUM": "DATA"}

def spacylemm_lang(text):
    """Return `text` with every token replaced by its lemma.

    The text is run through the module-level `lemmatizer_nlp` pipeline. Each
    token's lemma is looked up in `rep_dict_2` and replaced when an entry
    exists (the "datum" -> "data" special case); otherwise the lemma is kept
    unchanged. The resulting lemmas are joined with single spaces.
    """
    doc = lemmatizer_nlp(text)
    # dict.get with the lemma itself as fallback leaves ordinary lemmas untouched.
    return " ".join(rep_dict_2.get(tok.lemma_, tok.lemma_) for tok in doc)

In [46]:
#Define stopwords
# Custom tokens removed in addition to NLTK's English stop words.
# stopwords_rem lower-cases the text, strips most punctuation and then compares
# whitespace-separated tokens against this set, so only single-token lower-case
# entries can match there. The multi-word place names, the mixed-case 'It'/'iT'
# and the '()', '( )', '£', '?' entries are kept for completeness but never match
# in that function.
_MISC_STOPWORDS = (
    'on', 'approximately', ':', 'love', 'please', 'overtime', '()', '( )',
    # 'bonuses' previously carried a leading space and therefore could never match a token.
    'bonus', 'bonuses', '£', '?', 'be', 'It', 'iT', 'it',
)

_CALENDAR_STOPWORDS = (
    'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec',
    'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august',
    'september', 'october', 'november', 'december',
    'monday', 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday',
)

_UK_CITY_STOPWORDS = (
    'aberdeen', 'armagh', 'bangor', 'bath', 'belfast', 'birmingham', 'bradford',
    'brighton hove', 'bristol', 'cambridge', 'canterbury', 'cardiff', 'carlisle',
    'chelmsford', 'chester', 'chichester', 'colchester', 'coventry', 'derby',
    'doncaster', 'dundee', 'dunfermline', 'durham', 'edinburgh', 'ely', 'exeter',
    'glasgow', 'gloucester', 'hereford', 'inverness', 'kingston upon hull',
    'lancaster', 'leeds', 'leicester', 'lichfield', 'lincoln', 'lisburn',
    'liverpool', 'london', 'londonderry', 'manchester', 'milton keynes',
    'newcastle upon tyne', 'newport', 'newry', 'norwich', 'nottingham', 'oxford',
    'perth', 'peterborough', 'plymouth', 'portsmouth', 'preston', 'ripon',
    'salford', 'salisbury', 'sheffield', 'southampton', 'southend - on - sea',
    # 'st asaph' previously carried a trailing space.
    'st albans', 'st asaph', 'llanelwy', 'st davids', 'stirling', 'stoke-on-trent',
    'sunderland', 'swansea', 'truro', 'wakefield', 'wells', 'westminster',
    'winchester', 'wolverhampton', 'worcester', 'wrexham', 'york',
)

_UK_REGION_STOPWORDS = (
    # England
    'bedfordshire', 'berkshire', 'bristol', 'buckinghamshire', 'cambridgeshire',
    'cheshire', 'cornwall', 'cumbria', 'derbyshire', 'devon', 'dorset', 'durham',
    'east riding of yorkshire', 'east sussex', 'essex', 'gloucestershire',
    'greater london', 'greater manchester', 'hampshire', 'herefordshire',
    'hertfordshire', 'isle of wight', 'kent', 'lancashire', 'leicestershire',
    'lincolnshire', 'merseyside', 'middlesex', 'norfolk', 'north yorkshire',
    'northamptonshire', 'northumberland', 'nottinghamshire', 'oxfordshire',
    'rutland', 'shropshire', 'somerset', 'south yorkshire', 'staffordshire',
    'suffolk', 'surrey', 'tyne and wear', 'warwickshire', 'west midlands',
    'west sussex', 'west yorkshire', 'wiltshire', 'worcestershire',
    # Northern Ireland
    'county antrim', 'county armagh', 'county down', 'county fermanagh',
    'county londonderry', 'county tyrone',
    # Scotland
    'aberdeen', 'aberdeenshire', 'angus', 'argyll and bute', 'clackmannanshire',
    'dumfries and galloway', 'dundee', 'east ayrshire', 'east dunbartonshire',
    'east lothian', 'east renfrewshire', 'edinburgh', 'falkirk', 'fife', 'glasgow',
    'highland', 'inverclyde', 'midlothian', 'moray', 'north ayrshire',
    'north lanarkshire', 'orkney', 'perth and kinross', 'renfrewshire',
    'scottish borders', 'shetland isles', 'south ayrshire', 'south lanarkshire',
    'stirlingshire', 'west dunbartonshire', 'west lothian', 'western isles',
    # Wales
    'anglesey / sir fon', 'anglesey/sir fon', 'blaenau gwent', 'bridgend',
    'caerphilly', 'cardiff', 'carmarthenshire', 'ceredigion', 'conwy',
    'denbighshire', 'flintshire', 'glamorgan', 'gwynedd', 'merthyr tydfil',
    'monmouthshire', 'neath port talbot', 'newport', 'newport city',
    'pembrokeshire', 'powys', 'rhondda cynon taff', 'swansea', 'torfaen', 'wrexha',
)

# Words that are in NLTK's English list but must survive stop-word removal.
_KEPT_WORDS = ('during', 'about', 'other', 'off', 'from', 'down', 'under', 'over', 'own')

stopset_r = set(stopwords.words('english'))
stopset_r.update(_MISC_STOPWORDS, _CALENDAR_STOPWORDS, _UK_CITY_STOPWORDS, _UK_REGION_STOPWORDS)
# difference_update (unlike remove) does not raise KeyError when a word is absent
# from the installed NLTK stop-word list.
stopset_r.difference_update(_KEPT_WORDS)

In [47]:
#Function to remove stopwords
# Patterns are compiled once instead of being rebuilt from their source on every call.
# Pronoun spellings of "it"; upper-case "IT" is intentionally not matched here.
_PRONOUN_IT_RE = re.compile(r'\bit\b|\bIt\b|\biT\b')
# Everything except word characters, whitespace and / - . : + # is stripped.
_DISALLOWED_CHARS_RE = re.compile(r'[^\w\s\/\-\.:\+#]')
# A '+' or '#' is kept only when it follows "c" (optionally with spaces and/or a
# '+' in between), which preserves tokens such as "c++" and "c#". The text is
# already lower-cased when these run, so only lower-case "c" needs to be tested.
_AFTER_C_LOOKBEHINDS = r'(?<!c )(?<!c)(?<!c\+)(?<!c\+ )(?<!c \+)(?<!c \+ )'
_STRAY_PLUS_RE = re.compile(_AFTER_C_LOOKBEHINDS + r'\+')
_STRAY_HASH_RE = re.compile(_AFTER_C_LOOKBEHINDS + r'\#')


def stopwords_rem(text, stopword_set=None):
    """Lower-case `text`, strip punctuation and drop stop words.

    Steps, in order:
      1. delete the standalone words "it", "It" and "iT";
      2. lower-case the text;
      3. delete every character that is not a word character, whitespace or
         one of ``/ - . : + #``;
      4. delete each '+' and then each '#' that does not follow "c";
      5. split on whitespace and drop tokens found in the stop-word set.

    Parameters
    ----------
    text : str
        The (lemmatized) sentence to clean.
    stopword_set : collection of str, optional
        Tokens to drop. Defaults to the module-level ``stopset_r``.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    if stopword_set is None:
        stopword_set = stopset_r

    # NOTE(review): upper-case "IT" survives this substitution but is lower-cased
    # to "it" below, so it is still dropped whenever "it" is in the stop-word set
    # (as it is in stopset_r). Confirm whether "IT" was meant to be kept.
    cleaned = _PRONOUN_IT_RE.sub('', text).lower()
    cleaned = _DISALLOWED_CHARS_RE.sub('', cleaned)
    cleaned = _STRAY_PLUS_RE.sub('', cleaned)
    cleaned = _STRAY_HASH_RE.sub('', cleaned)

    return " ".join(token for token in cleaned.split() if token not in stopword_set)

def stopwords_remove(new_dataframe_1):
    """Return a copy of the frame with a 'Tokens' column of stop-word-free text.

    'Tokens' holds the result of applying `stopwords_rem` to each value of the
    'Sentence_lemmatized' column. The input frame is not modified and the
    returned frame has a fresh 0..n-1 index.

    `DataFrame.assign` overwrites an existing 'Tokens' column, so calling this
    function again on its own output (e.g. when the calling cell is re-run)
    does not produce a duplicate 'Tokens' column.

    Parameters
    ----------
    new_dataframe_1 : pandas.DataFrame
        Frame containing a 'Sentence_lemmatized' column of strings.

    Returns
    -------
    pandas.DataFrame
        All original columns plus 'Tokens'.
    """
    tokens = new_dataframe_1['Sentence_lemmatized'].apply(stopwords_rem)
    return new_dataframe_1.assign(Tokens=tokens).reset_index(drop=True)

In [48]:
#Lemmatize data
def clean_text(data):
    """Add a 'Sentence_lemmatized' column holding the lemmatized 'Sentence' text.

    Each value of the 'Sentence' column is passed through `spacylemm_lang`.
    The column is written directly from the resulting Series rather than from
    a per-row `pd.Series` wrapper, which yields the same values without
    building one Series object per sentence.

    Note: the frame passed in is modified in place and the same object is
    returned, so the caller's original variable also gains the new column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing a 'Sentence' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same frame, with 'Sentence_lemmatized' added (or overwritten).
    """
    data['Sentence_lemmatized'] = data['Sentence'].apply(spacylemm_lang)
    return data

example_total = clean_text(example_df)

In [49]:
# Remove stopwords: stopwords_remove returns the frame with a 'Tokens' column
# computed from 'Sentence_lemmatized'.
example_total = stopwords_remove(example_total)

In [50]:
#Import the matching function
# extraction_rules.py lives outside the notebook's folder, so its directory has to be
# on sys.path before the import below. Set the path to wherever your copy of the
# rules is stored (relative paths are resolved against the working directory).
import sys
path_to_your_matching_rules = "ContrastSkill/Rules"
# Guarded so that re-running this cell does not keep appending the same entry.
if path_to_your_matching_rules not in sys.path:
    sys.path.append(path_to_your_matching_rules)

from extraction_rules import knowledge, skill, join_lists

In [52]:
#Extract the matches
# skill() is applied to the cleaned frame and knowledge() to its result.
# NOTE(review): both come from extraction_rules; the final object is assumed to be
# a DataFrame because it is written with .to_csv below - confirm.
skills = skill(example_total)
# Named `competencies` rather than `all` so the builtin all() is not shadowed
# for the rest of the kernel session.
competencies = knowledge(skills)

#Save your extracted matches
# Relative to the working directory; change this to your own output folder.
path_to_save = "ContrastSkill/Data/Extracted_competencies"
os.makedirs(path_to_save, exist_ok=True)
competencies.to_csv(os.path.join(path_to_save, 'extracted.csv'), index=False, header=True)

Extracting Skills...: 100%|██████████| 4/4 [00:02<00:00,  1.74it/s]
Extracting Knowledge...: 100%|██████████| 4/4 [00:01<00:00,  3.26it/s]
