Modules

In [1]:
import pandas as pd
import pandas as pd
from rapidfuzz import fuzz, process  # Fuzzy matching for flexibility
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import wordnet
from nltk import pos_tag


In [None]:
# Download all necessary NLTK resources
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')  # NEW - required for POS tagging

# Initialize lemmatizer and stopword list
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nick_\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nick_\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nick_\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\nick_\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


Important Functions

In [30]:
# Helper to convert NLTK POS to WordNet POS
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN  # fallback

# Updated preprocess_text function
def preprocess_text(text):
    if not isinstance(text, str):
        return ""

    words = text.lower().split()
    words = [word for word in words if word not in stop_words]
    tagged_words = pos_tag(words)

    lemmatized_words = []
    for word, pos in tagged_words:
        wn_pos = get_wordnet_pos(pos)
        lemma = lemmatizer.lemmatize(word, wn_pos)
        if lemma == word and wn_pos != wordnet.VERB:
            # Retry as verb if unchanged
            lemma = lemmatizer.lemmatize(word, pos='v')
        lemmatized_words.append(lemma)

    return " ".join(lemmatized_words)

def remove_bracketed_words(text):
    if not isinstance(text, str):
        return ""
    
    # Remove all content inside brackets (including brackets)
    cleaned_text = re.sub(r'\s*\([^)]*\)', '', text).strip()
    
    return cleaned_text


# Function to find synonyms using WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name().replace("_", " "))  # Convert underscores to spaces
    return list(synonyms)



Flattening the ESCO Data Set

In [31]:
# Load the dataset
file_path = "ESCO skill taxonomy dataset.csv"  # Update with your actual file path
df = pd.read_csv(file_path)

# Columns containing skills
skill_columns = [
    "Essential Skills (Skill)",
    "Essential Skills (Knowledge)",
    "Optional Skills (Skill)",
    "Optional Skills (Knowledge)"
]

# Ensure missing values are handled
df[skill_columns] = df[skill_columns].fillna("")

# Split comma-separated skills into lists
for col in skill_columns:
    df[col] = df[col].apply(lambda x: x.split(", ") if isinstance(x, str) else [])

# Explode each skill column separately and merge back
esco_taxonomy_df = df.copy()
for col in skill_columns:
    esco_taxonomy_df = esco_taxonomy_df.explode(col)

# Rename columns for clarity
esco_taxonomy_df = esco_taxonomy_df.rename(columns={
    "Occupation Title": "Job Role",
    "ESCO Code": "ESCO Code",
    "Description": "Job Description",
    "Alternative Labels": "Alternative Titles",
    "Essential Skills (Skill)": "Essential Skill",
    "Essential Skills (Knowledge)": "Essential Knowledge",
    "Optional Skills (Skill)": "Optional Skill",
    "Optional Skills (Knowledge)": "Optional Knowledge"
})



In [32]:
columns_to_preprocess = [
    "Essential Skill",
    "Essential Knowledge",
    "Optional Skill",
    "Optional Knowledge"
]

for col in columns_to_preprocess:
    unique_values = esco_taxonomy_df[col].dropna().unique()
    processed_map = {val: preprocess_text(val) for val in unique_values}
    esco_taxonomy_df[col] = esco_taxonomy_df[col].map(processed_map)

esco_taxonomy_df["Essential Skill"] = esco_taxonomy_df["Essential Skill"].apply(remove_bracketed_words)
esco_taxonomy_df["Essential Knowledge"] = esco_taxonomy_df["Essential Knowledge"].apply(remove_bracketed_words)
esco_taxonomy_df["Optional Skill"] = esco_taxonomy_df["Optional Skill"].apply(remove_bracketed_words)
esco_taxonomy_df["Optional Knowledge"] = esco_taxonomy_df["Optional Knowledge"].apply(remove_bracketed_words)


Matching skills to jobs

In [33]:
# Create a set of all valid skills for fuzzy matching
all_skills = set(
    esco_taxonomy_df["Essential Skill"].dropna().tolist() +
    esco_taxonomy_df["Essential Knowledge"].dropna().tolist() +
    esco_taxonomy_df["Optional Skill"].dropna().tolist() +
    esco_taxonomy_df["Optional Knowledge"].dropna().tolist()
)

essential_skills = set(    
    esco_taxonomy_df["Essential Skill"].dropna().tolist() +
    esco_taxonomy_df["Essential Knowledge"].dropna().tolist()
    )

optional_skills = set(    
    esco_taxonomy_df["Optional Skill"].dropna().tolist() +
    esco_taxonomy_df["Optional Knowledge"].dropna().tolist()
)

In [96]:
# Load Technology Tool Mapping
tech_tools_df = pd.read_csv("tools_grouped.csv")  # File containing tool-category mappings
tool_to_category = {}

for _, row in tech_tools_df.iterrows():
    raw_category = row["Technology Tool"]
    raw_tools = row["Technology Tool Example"].split(", ")

    # Preprocess the category name once
    category = preprocess_text(raw_category)

    for tool in raw_tools:
        preprocessed_tool = preprocess_text(tool)
        tool_to_category[preprocessed_tool] = category

def map_skills_to_matched_tools(skill, threshold=85):
    """
    Matches user-entered skill to either a tool or tool category based on fuzzy score.
    Returns the best-matched tool category.
    """
    tool_names = list(tool_to_category.keys())
    tool_categories = list(set(tool_to_category.values()))
    skill_lower = preprocess_text(skill)

    # Fuzzy match with tool names
    match_result_tool = process.extractOne(skill_lower, tool_names, scorer=fuzz.WRatio)
    tool_match, tool_score = match_result_tool[:2] if match_result_tool else (None, 0)
    tool_category = tool_to_category.get(tool_match) if tool_match else None

    # Fuzzy match with category names
    match_result_category = process.extractOne(skill_lower, tool_categories, scorer=fuzz.WRatio)
    category_match, category_score = match_result_category[:2] if match_result_category else (None, 0)

    # Decide based on best score
    if tool_score >= threshold and (tool_score > category_score):
        print(f"🔁 '{skill}' matched to tool '{tool_match}' → category '{tool_category}' (score: {tool_score})")
        return tool_category
    elif category_score >= threshold:
        print(f"🔁 '{skill}' matched directly to category '{category_match}' (score: {category_score})")
        return category_match

    # No good match
    return None



# Main Matching Function
def match_jobs(user_skills, threshold): 
    """
    Matches user-entered skills with job roles using NLP + fuzzy matching.
    Filters jobs that have at least 2 matched skills.
    """

    matched_skills = set()

    # Preprocess user input skills
    user_skills = [preprocess_text(skill) for skill in user_skills]

    # Ensure all_skills only contains valid skills
    cleaned_skills = set(skill for skill in all_skills if isinstance(skill, str) and skill.strip())

    # Perform fuzzy matching for each user skill
    for skill in user_skills:
        print(f"\n🔍 Matching User Skill: {skill}")

        # ✅ Exact match logic for single-character user skills (e.g., "r")
        if len(skill.strip()) == 1:
            exact_matches = [s for s in cleaned_skills if s.strip().lower() == skill.strip().lower()]
            if exact_matches:
                matched_skills.add(exact_matches[0])
                print(f"✅ Exact Match for Short Skill: {exact_matches[0]}")
            else:
                print("❌ No exact match found for short skill.")
            continue  # Skip fuzzy matching for this skill

        # Tool match
        tool_raw = map_skills_to_matched_tools(skill, threshold=threshold)
        tool_match = preprocess_text(tool_raw) if tool_raw else None
        tool_match_result = process.extractOne(tool_match, cleaned_skills, scorer=fuzz.WRatio) if tool_match else None
        tool_best, tool_score = tool_match_result[:2] if tool_match_result else (None, 0)
        print(f"Found Tool Match (Tech Tool): {tool_best} with score: {tool_score}")
        # Fuzzy match top 5
        top_matches = process.extract(skill, cleaned_skills, scorer=fuzz.WRatio, limit=5)
        print("Top Matches:")
        for i, (match, score, _) in enumerate(top_matches, start=1):
            print(f"  {i}. {match} ({round(score, 1)})")

        # ✅ Match all top skills with the same max score above threshold
        if tool_best and tool_score >= threshold and tool_score > top_matches[0][1]:
            print(f"✅ Selected Tool Match (Tech Tool): {tool_best}")
            matched_skills.add(tool_best)
        elif top_matches:
            max_score = top_matches[0][1]
            if max_score >= threshold:
                for match, score, _ in top_matches:
                    if score == max_score:
                        # ✅ Prevent single-letter *matched* skills unless perfect match
                        if len(match.strip()) == 1 and score < 100:
                            print(f"⚠️ Skipped short matched skill '{match}' (score: {score}) — not an exact match.")
                            continue
                        matched_skills.add(match)
                        print(f"✅ Selected Tie Match: {match} ({round(score, 1)})")
            elif tool_best and tool_score >= threshold:
                matched_skills.add(tool_best)
                print(f"✅ Selected Tool Match (Tech Tool): {tool_best}")
        else:
            print("❌ No good match found.")

    if not matched_skills:
        print("\n🚫 No valid matches found.")
        return pd.DataFrame(columns=["Job Role", "Matched Skills Count"])

    # Convert matched skills to lowercase for consistency
    matched_skills = set(skill.lower().strip() for skill in matched_skills)

    # Step 2: Create 'All Skills' column per job
    esco_taxonomy_df["All Skills"] = (
        esco_taxonomy_df["Essential Skill"].fillna("").astype(str) + ", " +
        esco_taxonomy_df["Essential Knowledge"].fillna("").astype(str) + ", " +
        esco_taxonomy_df["Optional Skill"].fillna("").astype(str) + ", " +
        esco_taxonomy_df["Optional Knowledge"].fillna("").astype(str)
    )

    # Step 3: Group by Job and create a set of lowercase skill strings
    job_skills_df = (
        esco_taxonomy_df.groupby(["Job Role", "Job Description"])["All Skills"]
        .apply(lambda skills: set(s.strip().lower() for line in skills for s in line.split(",") if s.strip()))
        .reset_index(name="All Skills Set")
    )

    # Step 4: Count how many matched skills appear in each job
    job_skills_df["Matched Skills Count"] = job_skills_df["All Skills Set"].apply(
        lambda skills: len(matched_skills.intersection(skills))
    )

    # Step 5: Filter and sort
    matched_jobs = job_skills_df[job_skills_df["Matched Skills Count"] > 1].copy()
    matched_jobs = matched_jobs.sort_values(by="Matched Skills Count", ascending=False)

    # Step 6: Display results
    print("\n📋 Matched Jobs (At Least 2 Skills Matched):")
    print("-" * 58)
    print("| {:<30} | {:<20} |".format("Job Role", "Matched Skills Count"))
    print("-" * 58)
    for _, row in matched_jobs.iterrows():
        print("| {:<30} | {:<20} |".format(row["Job Role"], row["Matched Skills Count"]))
    print("-" * 58)

    return matched_jobs[["Job Role", "Matched Skills Count"]]



In [98]:
cybersecurity_analyst_skills = ["MySQL", "PostgreSQL", "Database backup and recovery", "Performance tuning", "SQL", "Stored procedures", "Data modelling", "Oracle", "ETL", "Indexing"]

match_jobs(cybersecurity_analyst_skills, threshold=85)


🔍 Matching User Skill: mysql
🔁 'mysql' matched to tool 'mysql' → category 'database management software' (score: 100.0)
Found Tool Match (Tech Tool): database with score: 90.0
Top Matches:
  1. mysql (100.0)
  2. sql server integration service (67.5)
  3. postgresql (67.5)
  4. sql server (67.5)
  5. ml (60.0)
✅ Selected Tie Match: mysql (100.0)

🔍 Matching User Skill: postgresql
🔁 'postgresql' matched to tool 'postgresql' → category 'database management software' (score: 100.0)
Found Tool Match (Tech Tool): database with score: 90.0
Top Matches:
  1. postgresql (100.0)
  2. mysql (67.5)
  3. ml (60.0)
  4. sparql (60.0)
  5. n1ql (60.0)
✅ Selected Tie Match: postgresql (100.0)

🔍 Matching User Skill: database backup recovery
🔁 'database backup recovery' matched to tool 'oracle database administration (dba)' → category 'database management software' (score: 85.5)
Found Tool Match (Tech Tool): database with score: 90.0
Top Matches:
  1. database (90.0)
  2. perform backup (85.5)
  3. u

Unnamed: 0,Job Role,Matched Skills Count
55,database integrator,7
47,data centre operator,6
52,database administrator,6
53,database designer,6
51,data warehouse designer,6
54,database developer,6
75,system configurator,5
29,ICT system integration consultant,3
22,ICT security consultant,2
26,ICT system analyst,2


In [95]:
all_skills

{'',
 '3d light',
 '3d model',
 '3d texturing',
 'abap',
 'acquire system component',
 'adapt change situation',
 'adapt change technological development plan',
 'address problem critically',
 'adhere organisational guideline',
 'adjust ict system capacity',
 'administer ict system',
 'adobe illustrator',
 'adobe photoshop',
 'advise client technical possibility',
 'advise environmental remediation',
 'advise safety improvement',
 'advise strengthen security',
 'agile development',
 'agile project management',
 'ajax',
 'algorithm',
 'align software system architecture',
 'analog electronics theory',
 'analyse big data',
 'analyse business process',
 'analyse business requirement',
 'analyse context organisation',
 'analyse decentralise application',
 'analyse ict system',
 'analyse network bandwidth requirement',
 'analyse pipeline database information',
 'analyse software specification',
 'analyse staff capacity',
 'android',
 'angular',
 'ansible',
 'answer incoming call',
 'apache 

To DO:

# have list of available skills and knowledge, allow user to select these pre defined skills/knowledge and show filtered jobs
# check if it can match the skill in ESCO or tech tools rather than individually >= 90 and then return the ESCO skill. 
#check both and see which has the higher match i suppose? 


1. User enters their skills + knowledge and matches relevant jobs
2. User selects their preferred job, enters their skills + knowledge and returns what they are missing
3. User enters their CV and a list of jobs relevant to their CV is returned
4. User selects their preferred job, enters their CV and application tells them what is missing and what has been found. 
