In [14]:
import re
from difflib import get_close_matches
from itertools import combinations
from collections import defaultdict


In [15]:
tech_roles = {
    "backend_roles": {
        "Backend Developer", "API Developer", "Database Developer", "DevOps Engineer",
         "Cloud Backend Engineer", "Platform Engineer",
        "Infrastructure Engineer", "Systems Engineer", "Software Engineer - Backend",
        "Microservices Developer", "Integration Engineer", "Middleware Engineer",
        "Data Engineer", "Server-side Developer", "Security Engineer - Backend",
        "Performance Engineer", "Application Support Engineer", "Backend Architect",
        "Scalability Engineer", "Java Backend Developer", "Python Backend Developer",
        "Node.js Backend Developer", "Golang Backend Developer", "Ruby Backend Developer",
        "C#/.NET Backend Developer", "PHP Backend Developer", "REST API Developer",
        "GraphQL Developer", "Kubernetes Engineer", "NodeJS Developer", "CI/CD Engineer"
    },
    "frontend_roles": {
        "Frontend Developer", "UI/UX Designer", "Web Designer", "JavaScript Developer",
        "React Developer", "Angular Developer", "Vue.js Developer", "HTML/CSS Developer",
        "Mobile Frontend Developer", "Accessibility Engineer", "Frontend Architect",
        "Interaction Designer", "Motion Designer", "Web Animator", "SASS/LESS Developer",

    },
    "fullstack_roles": {
        "Full Stack Developer", "MEAN Stack Developer", "MERN Stack Developer",
        "LAMP Stack Developer", "Software Engineer - Full Stack", "Web Application Developer",
        "Mobile Full Stack Developer", "Frontend-Backend Integrator", "Cross-Platform Developer",
        "Technical Lead - Full Stack","React","Angular"
    },
    "data_roles": {
        "Data Scientist", "Data Analyst", "Data Engineer", "Data Architect",
        "Data Modeler", "Data Storyteller", "Business Intelligence Analyst",
        "Data Visualization Specialist", "Big Data Engineer", "Quantitative Analyst",
        "Statistician", "Data Governance Analyst", "Data Quality Analyst",
        "Data Mining Specialist", "Predictive Modeler", "ETL Developer",
        "Data Warehouse Engineer", "NLP Engineer", "Computer Vision Engineer",
        "Deep Learning Engineer"
    },
    "ai_ml_roles": {
        "AI Engineer", "Machine Learning Engineer", "Deep Learning Engineer",
        "Natural Language Processing (NLP) Engineer", "Computer Vision Engineer",
        "AI Research Scientist", "AI Trainer", "Generative AI Designer", "Prompt Engineer",
        "AI Operations Specialist", "AI Architect", "AI Product Manager",
        "AI Ethics Specialist", "Reinforcement Learning Engineer", "Speech Recognition Engineer",
        "Recommendation Systems Engineer", "AI Solutions Consultant", "AI Data Analyst",
        "AI Software Developer", "AI QA Engineer"
    },
    "cybersecurity_roles": {
        "Security Analyst", "Penetration Tester", "Network Security Engineer",
        "Application Security Engineer", "Security Architect", "Cryptographer",
        "Digital Forensics Analyst", "Threat Intelligence Analyst", "CISO", "ISSO",
        "Incident Response Analyst", "GRC Analyst", "IAM Analyst", "CTI Analyst", "DPO",
        "DRM", "ICS Security Analyst", "SCADA Security Analyst", "IT Auditor", "SOC Analyst"
    },
    "cloud_roles": {
        "Cloud Engineer", "Cloud Architect", "Cloud Consultant", "Cloud Security Engineer",
        "Cloud DevOps Engineer", "Cloud Systems Administrator", "Cloud Network Engineer",
        "Cloud Support Engineer", "Cloud Solutions Architect", "Cloud Operations Manager",
        "Cloud Compliance Manager", "Cloud Automation Engineer", "Cloud Product Manager",
        "Cloud Sales Engineer"
    },
    "devops_roles": {
        "DevOps Engineer", "Site Reliability Engineer", "IaC Engineer",
        "Release Manager", "Automation Engineer", "Build and Release Engineer",
        "Cloud DevOps Engineer", "Platform Engineer", "Monitoring Engineer"
    },
    "qa_roles": {
        "QA Engineer", "Test Automation Engineer", "Manual Tester", "QA",  "Performance Tester",
        "Security Tester", "Mobile QA Engineer", "QA Analyst", "Test Lead", "SDET",
        "Usability Tester"
    },
    "product_roles": {
        "Product Manager", "Product Owner", "Technical Product Manager", "Product Analyst",
        "Product Designer", "UX Researcher", "Growth Product Manager", "Product Marketing Manager",
        "CPO", "Product Strategist"
    },
    "project_management_roles": {
        "Project Manager", "Scrum Master", "Agile Coach", "Product Owner", "Technical Project Manager",
        "Program Manager", "Delivery Manager", "Release Manager", "Project Coordinator",
        "Portfolio Manager"
    },
    "network_roles": {
        "Network Engineer", "Network Administrator", "Network Architect", "Network Analyst",
        "Wireless Network Engineer", "VoIP Engineer", "Network Security Engineer",
        "NOC Engineer", "LAN/WAN Engineer", "Telecom Engineer"
    },
    "database_roles": {
        "Database Administrator (DBA)", "Database Developer", "Data Architect", "SQL Developer",
        "NoSQL Developer", "Database Analyst", "Data Warehouse Developer", "ETL Developer",
        "Big Data Engineer", "Database Reliability Engineer"
    },
    "support_roles": {
        "Technical Support Engineer", "Help Desk Technician", "IT Support Specialist",
        "Application Support Analyst", "Desktop Support Engineer", "Customer Support Engineer",
        "Field Service Technician", "IT Technician", "Support Analyst", "Service Desk Analyst"
    }
}

In [16]:
tech_domains = {k: k.replace("_roles", "") for k in tech_roles}
role_to_category = {}
category_to_dict_name = {}

# Add tech roles
for category, roles in tech_roles.items():
    domain = category.replace("_roles", "")
    category_to_dict_name[domain] = "tech"
    for role in roles:
        role_to_category[role] = domain

In [17]:
DOMAIN_SIMILARITY_MATRIX = {
    "backend": {"backend": 1.0, "devops": 0.6, "cloud": 0.5, "data": 0.4, "ai_ml": 0.3},
    "frontend": {"frontend": 1.0, "fullstack": 0.7, "product": 0.4},
    "fullstack": {"fullstack": 1.0, "frontend": 0.7, "backend": 0.7},
    "data": {"data": 1.0, "ai_ml": 0.7, "cloud": 0.4, "backend": 0.4, "database": 0.6},
    "ai_ml": {"ai_ml": 1.0, "data": 0.7, "cloud": 0.3},
    "cybersecurity": {"cybersecurity": 1.0, "network": 0.6, "cloud": 0.5},
    "cloud": {"cloud": 1.0, "backend": 0.5, "devops": 0.8, "ai_ml": 0.3},
    "devops": {"devops": 1.0, "cloud": 0.8, "backend": 0.6},
    "qa": {"qa": 1.0},
    "product": {"product": 1.0, "project_management": 0.6, "frontend": 0.4},
    "project_management": {"project_management": 1.0, "product": 0.6},
    "network": {"network": 1.0, "cybersecurity": 0.6},
    "database": {"database": 1.0, "data": 0.6},
    "support": {"support": 1.0},
    "management": {"management": 1.0},
    "finance": {"finance": 1.0},
    "marketing": {"marketing": 1.0},
    "sales": {"sales": 1.0},
    "hr": {"hr": 1.0},
    "legal": {"legal": 1.0},
    "consulting": {"consulting": 1.0, "management": 0.5, "finance": 0.4}
}

domain_similarity = {
    (d1, d2): sim for d1, m in DOMAIN_SIMILARITY_MATRIX.items() for d2, sim in m.items()
}

In [26]:
import re
from difflib import get_close_matches

# Basic normalization: lowercase and remove all non-alphanumeric characters
def normalize_text(text: str) -> str:
    return re.sub(r"[^\w]", "", text.lower())

# Build a set of all known roles
all_roles = set(role_to_category.keys())

# Mapping from normalized role strings to their original format
norm_to_role = {normalize_text(r): r for r in all_roles}

# Fuzzy match a given role to the closest known role using normalized text
def normalize_role(role: str) -> str:
    norm = normalize_text(role)
    match = get_close_matches(norm, norm_to_role.keys(), n=1, cutoff=0.6)
    return norm_to_role[match[0]] if match else role

# Extract keywords from role by splitting normalized string (removes special characters)
def extract_keywords(role: str) -> set:
    return set(normalize_text(role).split())

# Compute similarity between two job roles
def get_role_similarity(role1: str, role2: str) -> float:
    # Normalize both roles to the closest known job titles
    role1 = normalize_role(role1)
    role2 = normalize_role(role2)

    # Exact match
    if role1 == role2:
        return 1.0

    # Get categories for each role (e.g., "AI/ML", "Data", "DevOps")
    cat1 = role_to_category.get(role1)
    cat2 = role_to_category.get(role2)

    # If either role is not categorized, return 0 (no similarity)
    if not cat1 or not cat2:
        return 0.0

    # Get domain dictionary group names (e.g., "ai_ml_roles", "data_roles")
    dict1 = category_to_dict_name.get(cat1)
    dict2 = category_to_dict_name.get(cat2)

    # If roles are in different domains (e.g., Data vs DevOps), return 0
    if dict1 != dict2:
        return 0.0

    # If roles belong to the same category (e.g., both "Data"), compute keyword overlap
    if cat1 == cat2:
        kw1, kw2 = extract_keywords(role1), extract_keywords(role2)
        overlap = len(kw1 & kw2)                         # common keywords
        total = max(len(kw1), len(kw2))                  # normalization factor
        return round(0.5 + 0.3 * (overlap / total if total else 0), 2)

    # If roles belong to different categories in the same domain, use domain similarity matrix
    return domain_similarity.get((cat1, cat2), domain_similarity.get((cat2, cat1), 0.0))


In [27]:
if __name__ == "__main__":
    print(get_role_similarity("AI Data Analyst", "DevOps Engineer"))
    print(get_role_similarity("Data Scientist", "AI Data Analyst"))
    print(get_role_similarity("Dataa scientist", "Data Base Admin"))
    print(get_role_similarity("NodeejS Developer", "NodeJS Developer"))
    print(get_role_similarity("Data Scientist", "Data Analyst"))
    print(get_role_similarity("React Developer", "Angular Developer"))
    print(get_role_similarity("DevOps Engineer", "CI/CD Engineer"))
    print(get_role_similarity("Monitoring Engineer", "Platform Engineer"))
    print(get_role_similarity("Site Reliability Engineeer", "Platform Engineer"))
    print(get_role_similarity("SQL Developer","Data Engineer"))


0.0
0.7
0.6
1.0
0.5
0.5
0.6
0.5
0.5
0.6


###REMOVING THE GENERIC DOMAINS

In [4]:
import re
from difflib import get_close_matches
import pprint

# 1. CLEANING FUNCTION AND GENERIC TITLES
generic_titles = [
    "developer", "engineer", "specialist", "expert", "consultant",
    "analyst", "associate", "technician", "architect", "administrator",
    "manager", "lead", "director", "coordinator", "officer",
    "supervisor", "operator", "executive", "intern", "assistant", "junior",
    "senior", "principal", "head", "chief", "staff", "entry", "mid",
    "trainee", "scientist", "researcher", "owner"
]
pattern = re.compile(r'\b(?:' + '|'.join(re.escape(word) for word in generic_titles) + r')\b', flags=re.IGNORECASE)

def clean_title(title):
    title = title.lower()
    title = pattern.sub('', title)
    title = re.sub(r'[^a-z0-9\s\/\.\-\(\)]', '', title)
    title = re.sub(r'\s+', ' ', title).strip()
    return title

def normalize_text(text: str) -> str:
    return re.sub(r"[^\w]", "", text.lower())

def extract_keywords(role: str) -> set:
    return set(normalize_text(role).split())


In [5]:
# 2. INPUT DICTIONARY (only showing a few roles for brevity – replace with your full dict)
tech_roles = {
    "backend_roles": {
        "Backend Developer", "API Developer", "Database Developer", "DevOps Engineer",
         "Cloud Backend Engineer", "Platform Engineer",
        "Infrastructure Engineer", "Systems Engineer", "Software Engineer - Backend",
        "Microservices Developer", "Integration Engineer", "Middleware Engineer",
        "Data Engineer", "Server-side Developer", "Security Engineer - Backend",
        "Performance Engineer", "Application Support Engineer", "Backend Architect",
        "Scalability Engineer", "Java Backend Developer", "Python Backend Developer",
        "Node.js Backend Developer", "Golang Backend Developer", "Ruby Backend Developer",
        "C#/.NET Backend Developer", "PHP Backend Developer", "REST API Developer",
        "GraphQL Developer", "Kubernetes Engineer", "NodeJS Developer", "CI/CD Engineer"
    },
    "frontend_roles": {
        "Frontend Developer", "UI/UX Designer", "Web Designer", "JavaScript Developer",
        "React Developer", "Angular Developer", "Vue.js Developer", "HTML/CSS Developer",
        "Mobile Frontend Developer", "Accessibility Engineer", "Frontend Architect",
        "Interaction Designer", "Motion Designer", "Web Animator", "SASS/LESS Developer",
    },
    "fullstack_roles": {
        "Full Stack Developer", "MEAN Stack Developer", "MERN Stack Developer",
        "LAMP Stack Developer", "Software Engineer - Full Stack", "Web Application Developer",
        "Mobile Full Stack Developer", "Frontend-Backend Integrator", "Cross-Platform Developer",
        "Web Application Developer","React","Angular"
    },
    "data_roles": {
        "Data Scientist", "Data Analyst", "Data Engineer", "Data Architect",
        "Data Modeler", "Data Storyteller", "Business Intelligence Analyst",
        "Data Visualization Specialist", "Big Data Engineer", "Quantitative Analyst",
        "Statistician", "Data Governance Analyst", "Data Quality Analyst",
        "Data Mining Specialist", "Predictive Modeler", "ETL Developer",
        "Data Warehouse Engineer", "NLP Engineer", "Computer Vision Engineer",
        "Deep Learning Engineer"
    },
    "ai_ml_roles": {
        "AI Engineer", "Machine Learning Engineer", "Deep Learning Engineer",
        "Natural Language Processing (NLP) Engineer", "Computer Vision Engineer",
        "AI Research Scientist", "AI Trainer", "Generative AI Designer", "Prompt Engineer",
        "AI Operations Specialist", "AI Architect", "AI Product Manager",
        "AI Ethics Specialist", "Reinforcement Learning Engineer", "Speech Recognition Engineer",
        "Recommendation Systems Engineer", "AI Solutions Consultant", "AI Data Analyst",
        "AI Software Developer", "AI QA Engineer","ML Engineer"
    },
    "cybersecurity_roles": {
        "Security Analyst", "Penetration Tester", "Network Security Engineer",
        "Application Security Engineer", "Security Architect", "Cryptographer",
        "Digital Forensics Analyst", "Threat Intelligence Analyst", "CISO", "ISSO",
        "Incident Response Analyst", "GRC Analyst", "IAM Analyst", "CTI Analyst", "DPO",
        "DRM", "ICS Security Analyst", "SCADA Security Analyst", "IT Auditor", "SOC Analyst"
    },
    "cloud_roles": {
        "Cloud Engineer", "Cloud Architect", "Cloud Consultant", "Cloud Security Engineer",
        "Cloud DevOps Engineer", "Cloud Systems Administrator", "Cloud Network Engineer",
        "Cloud Support Engineer", "Cloud Solutions Architect", "Cloud Operations Manager",
        "Cloud Compliance Manager", "Cloud Automation Engineer", "Cloud Product Manager",
        "Cloud Sales Engineer"
    },
    "devops_roles": {
        "DevOps Engineer", "Site Reliability Engineer", "CI/CD Engineer", "IaC Engineer",
        "Release Manager", "Automation Engineer", "Build and Release Engineer",
        "Cloud DevOps Engineer", "Platform Engineer", "Monitoring Engineer"
    },
    "qa_roles": {
        "QA Engineer", "Test Automation Engineer", "Manual Tester", "Performance Tester",
        "Security Tester", "Mobile QA Engineer", "QA Analyst", "Test Lead", "SDET",
        "Usability Tester"
    },
    "product_roles": {
        "Product Manager", "Product Owner", "Technical Product Manager", "Product Analyst",
        "Product Designer", "UX Researcher", "Growth Product Manager", "Product Marketing Manager",
        "CPO", "Product Strategist"
    },
    "project_management_roles": {
        "Project Manager", "Scrum Master", "Agile Coach", "Product Owner", "Technical Project Manager",
        "Program Manager", "Delivery Manager", "Release Manager", "Project Coordinator",
        "Portfolio Manager"
    },
    "network_roles": {
        "Network Engineer", "Network Administrator", "Network Architect", "Network Analyst",
        "Wireless Network Engineer", "VoIP Engineer", "Network Security Engineer",
        "NOC Engineer", "LAN/WAN Engineer", "Telecom Engineer"
    },
    "database_roles": {
        "Database Administrator (DBA)", "Database Developer", "Data Architect", "SQL Developer",
        "NoSQL Developer", "Database Analyst", "Data Warehouse Developer", "ETL Developer",
        "Big Data Engineer", "Database Reliability Engineer"
    },
    "support_roles": {
        "Technical Support Engineer", "Help Desk Technician", "IT Support Specialist",
        "Application Support Analyst", "Desktop Support Engineer", "Customer Support Engineer",
        "Field Service Technician", "IT Technician", "Support Analyst", "Service Desk Analyst"
    }
}


In [6]:

# 3. CLEAN ROLES
cleaned_tech_roles = {}
for category, roles in tech_roles.items():
    cleaned_tech_roles[category] = set(clean_title(role) for role in roles)

tech_roles = cleaned_tech_roles  # overwrite original


In [7]:


# 4. BUILD ROLE TO CATEGORY MAPPING
role_to_category = {}
category_to_dict_name = {}

for category, roles in tech_roles.items():
    domain = category.replace("_roles", "")
    category_to_dict_name[domain] = "tech"
    for role in roles:
        role_to_category[role] = domain

# 5. DOMAIN SIMILARITY MATRIX
DOMAIN_SIMILARITY_MATRIX = {
    "backend": {"backend": 1.0, "devops": 0.6, "cloud": 0.5, "data": 0.4, "ai_ml": 0.3},
    "frontend": {"frontend": 1.0, "fullstack": 0.7, "product": 0.4},
    "fullstack": {"fullstack": 1.0, "frontend": 0.7, "backend": 0.7},
    "data": {"data": 1.0, "ai_ml": 0.7, "cloud": 0.4, "backend": 0.4, "database": 0.6},
    "ai_ml": {"ai_ml": 1.0, "data": 0.7, "cloud": 0.3},
    "cybersecurity": {"cybersecurity": 1.0, "network": 0.6, "cloud": 0.5},
    "cloud": {"cloud": 1.0, "backend": 0.5, "devops": 0.8, "ai_ml": 0.3},
    "devops": {"devops": 1.0, "cloud": 0.8, "backend": 0.6},
    "qa": {"qa": 1.0},
    "product": {"product": 1.0, "project_management": 0.6, "frontend": 0.4},
    "project_management": {"project_management": 1.0, "product": 0.6},
    "network": {"network": 1.0, "cybersecurity": 0.6},
    "database": {"database": 1.0, "data": 0.6},
    "support": {"support": 1.0},

}
domain_similarity = {
    (d1, d2): sim for d1, m in DOMAIN_SIMILARITY_MATRIX.items() for d2, sim in m.items()
}

In [8]:
# 6. BUILD FUZZY MATCH INDEX
all_roles = set(role_to_category.keys())
norm_to_role = {normalize_text(r): r for r in all_roles}

def normalize_role(role: str) -> str:
    norm = normalize_text(role)
    match = get_close_matches(norm, norm_to_role.keys(), n=1, cutoff=0.6)
    if match:
        return norm_to_role[match[0]]

    # Fallback: if norm is a substring of any known role
    for norm_key, orig_role in norm_to_role.items():
        if norm in norm_key:
            return orig_role

    return role  # return as-is if no match


# 7. SIMILARITY FUNCTION
def get_role_similarity(role1: str, role2: str) -> float:
    role1 = clean_title(role1)
    role2 = clean_title(role2)

    role1 = normalize_role(role1)
    role2 = normalize_role(role2)

    if role1 == role2:
        return 1.0

    cat1 = role_to_category.get(role1)
    cat2 = role_to_category.get(role2)
    if not cat1 or not cat2:
        return 0.0

    dict1 = category_to_dict_name.get(cat1)
    dict2 = category_to_dict_name.get(cat2)
    if dict1 != dict2:
        return 0.0

    if cat1 == cat2:
        kw1, kw2 = extract_keywords(role1), extract_keywords(role2)
        overlap = len(kw1 & kw2)
        total = max(len(kw1), len(kw2))
        return round(0.5 + 0.3 * (overlap / total if total else 0), 2)

    return domain_similarity.get((cat1, cat2), domain_similarity.get((cat2, cat1), 0.0))


In [12]:
if __name__ == "__main__":
    print(get_role_similarity("DevOps Engineer", "CI/CD Engineer"))
    print(get_role_similarity("Monitoring Engineer", "Platform Engineer"))
    print(get_role_similarity("Site Reliability Engineeer", "Platform Engineer"))
    print(get_role_similarity("Dataa scientist", "Data Base Admin"))
    print(get_role_similarity("NodeejS Developer", "NodeJS Developer"))
    print(get_role_similarity("Data Scientist", "Data Analyst"))      # GETTING 1 AS OUTPUT    AS AFTER CLEANING, it would be (DATA, DATA),so Output is 1.     But, roles are not same
    print(get_role_similarity("React Developer", "Angular Developer"))
    print(get_role_similarity("DevOps Engineer", "CI/CD Engineer"))
    print(get_role_similarity("Monitoring Engineer", "Platform Engineer"))
    print(get_role_similarity("Site Reliability Engineeer", "Platform Engineer"))
    print(get_role_similarity("SQL Developer","Data Engineer"))


0.5
0.5
0.5
0.5
1.0
1.0
0.5
0.5
0.5
0.5
0.5
