# 05 - Job Role Categorization

## Objective
Categorize job postings into AI/ML roles, General IT roles, Hybrid roles, and Non-Tech roles using keyword-based classification.


## Expected Output
- New column: `role_category` (AI/ML, General IT, Hybrid, Non-Tech)
- Updated dataset: `data/processed/categorized_jobs.csv`
- Validation report with accuracy metrics
- Categorization summary documentation


## 1. Environment Setup


In [None]:
import pandas as pd
import numpy as np
import re
from collections import Counter, defaultdict
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Widen the pandas display limits used when frames are printed below.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 100),
    ('display.max_rows', 100),
):
    pd.set_option(option_name, option_value)

print(" Libraries imported successfully")
print("   Pandas version: {}".format(pd.__version__))


## 2. Load Data


In [None]:
# Load the company-standardized postings and print a quick profile of them.
print(" Loading standardized dataset...\n")

df = pd.read_csv('data/processed/jobs_with_standardized_companies.csv')

banner = "=" * 70
print(banner)
print(" DATASET LOADED")
print(banner)

total_records = len(df)
earliest_posting = df['posting_date'].min()
latest_posting = df['posting_date'].max()

print("\n Dataset Overview:")
print(f"   Total records: {total_records:,}")
print(f"   Unique companies: {df['company_clean'].nunique():,}")
print(f"   Unique roles: {df['role'].nunique():,}")
print(f"   Date range: {earliest_posting} to {latest_posting}")

# NOTE(review): `~` below assumes has_ai_keywords was parsed as a boolean
# column - confirm against the upstream notebook that writes this file.
ai_flag = df['has_ai_keywords']
flagged_count = ai_flag.sum()
unflagged_count = (~ai_flag).sum()

print("\n Current AI flagging (has_ai_keywords):")
print(f"   Jobs with AI keywords: {flagged_count:,} ({flagged_count/total_records*100:.1f}%)")
print(f"   Jobs without AI keywords: {unflagged_count:,} ({unflagged_count/total_records*100:.1f}%)")


## 3. Define Categorization Keywords


In [None]:
print(" Defining categorization keyword sets...\n")

# All patterns are regular expressions that are searched against lower-cased
# text (see count_keyword_matches), so they are written in lower case.

# AI/ML signals, grouped by where they are expected to show up.
AI_ML_KEYWORDS = {
    'role_titles': [
        r'\bml\s+engineer\b',
        r'\bmachine\s+learning\s+engineer\b',
        r'\bai\s+engineer\b',
        r'\bartificial\s+intelligence\s+engineer\b',
        r'\bdata\s+scientist\b',
        r'\bresearch\s+scientist\b.*\b(ai|ml|machine learning)\b',
        r'\bnlp\s+engineer\b',
        r'\bnatural\s+language\s+processing\b',
        r'\bcomputer\s+vision\s+engineer\b',
        r'\bdeep\s+learning\s+engineer\b',
        r'\bai\s+researcher\b',
        r'\bml\s+researcher\b',
        r'\bapplied\s+scientist\b',
        r'\bai\s+specialist\b',
        r'\bml\s+specialist\b',
        r'\bai\s+architect\b',
        r'\bml\s+architect\b',
        r'\bllm\s+engineer\b',
        r'\bgenerative\s+ai\b',
    ],
    'technologies': [
        r'\btensorflow\b',
        r'\bpytorch\b',
        r'\bkeras\b',
        r'\bscikit-learn\b',
        r'\bsklearn\b',
        r'\bxgboost\b',
        r'\blightgbm\b',
        r'\bhuggingface\b',
        r'\btransformers\b.*\b(model|llm)\b',
        r'\bgpt-?[0-9]\b',
        r'\bbert\b.*\b(model|nlp)\b',
        r'\bllm\b',
        r'\blarge\s+language\s+model\b',
    ],
    'tasks': [
        r'\bmodel\s+training\b',
        r'\bmodel\s+development\b',
        r'\bmodel\s+deployment\b',
        r'\bneural\s+network\b',
        r'\bdeep\s+learning\b',
        r'\bmachine\s+learning\b',
        r'\breinforcement\s+learning\b',
        r'\bcomputer\s+vision\b',
        r'\bnatural\s+language\s+processing\b',
        r'\bsentiment\s+analysis\b',
        r'\btext\s+classification\b',
        r'\bimage\s+recognition\b',
        r'\bobject\s+detection\b',
        r'\brecommendation\s+system\b',
        r'\bpredictive\s+analytics\b',
        r'\bembedding\b',
        r'\bfine-?tuning\b.*\b(model|llm)\b',
    ]
}

# General software / infrastructure signals.
GENERAL_IT_KEYWORDS = {
    'role_titles': [
        r'\bsoftware\s+engineer\b',
        r'\bsoftware\s+developer\b',
        r'\bweb\s+developer\b',
        r'\bfrontend\s+(engineer|developer)\b',
        r'\bbackend\s+(engineer|developer)\b',
        r'\bfull\s*stack\s+(engineer|developer)\b',
        r'\bdevops\s+engineer\b',
        r'\bsite\s+reliability\s+engineer\b',
        r'\bsre\b',
        r'\bcloud\s+engineer\b',
        r'\bplatform\s+engineer\b',
        r'\binfrastructure\s+engineer\b',
        r'\bnetwork\s+engineer\b',
        r'\bsecurity\s+engineer\b',
        r'\bqa\s+engineer\b',
        r'\btest\s+engineer\b',
        r'\bmobile\s+(developer|engineer)\b',
        r'\bios\s+developer\b',
        r'\bandroid\s+developer\b',
        r'\bui/ux\s+engineer\b',
        r'\bembedded\s+engineer\b',
        r'\bsystems\s+engineer\b',
    ],
    'web_technologies': [
        r'\breact\.?js\b',
        r'\breact\b',
        r'\bangular\b',
        r'\bvue\.?js\b',
        r'\bnode\.?js\b',
        r'\bjavascript\b',
        r'\btypescript\b',
        r'\bhtml\b',
        r'\bcss\b',
        r'\bsass\b',
        r'\bredux\b',
        r'\bnext\.?js\b',
    ],
    'backend_technologies': [
        r'\bjava\b(?!script)',
        r'\bspring\s+boot\b',
        # No leading \b: a word boundary cannot precede "." after a space, so
        # the old r'\b\.net\b' only matched forms like "asp.net", never ".net".
        r'\.net\b',
        # No trailing \b: "#" followed by a space or punctuation is not a word
        # boundary, so r'\bc#\b' never matched a normally written "c#".
        r'\bc#',
        # The qualifier is required. With it optional the pattern matched "go"
        # followed by any word ("go to market", "go live"). Bare "go" is too
        # ambiguous to detect by keyword; "golang" is covered below.
        r'\bgo\s+(?:language|lang)\b',
        r'\bgolang\b',
        # Was r'\brush\b', a typo that matched the English word "rush".
        r'\brust\b',
        r'\bruby\s+on\s+rails\b',
        r'\bphp\b',
        r'\bdjango\b',
        r'\bflask\b',
        r'\bfastapi\b',
        r'\brest\s+api\b',
        r'\bgraphql\b',
        r'\bmicroservices\b',
    ],
    'devops_technologies': [
        r'\bkubernetes\b',
        r'\bdocker\b',
        r'\baws\b',
        r'\bazure\b',
        r'\bgcp\b',
        r'\bterraform\b',
        r'\bansible\b',
        r'\bjenkins\b',
        r'\bci/cd\b',
        r'\bgitlab\b',
        r'\bgithub\s+actions\b',
    ]
}

# Phrases that point at ML-on-infrastructure work (used by is_hybrid_role).
HYBRID_INDICATORS = [
    r'\bml\s+infrastructure\b',
    r'\bai\s+platform\b',
    r'\bml\s+platform\b',
    r'\bmlops\b',
    r'\bai\s+infrastructure\b',
    r'\bmodel\s+serving\b',
    r'\bml\s+system\b',
]

# Phrases treated as evidence of a non-technical posting (used by
# is_non_tech_role).
NON_TECH_KEYWORDS = [
    r'\bnurse\b',
    r'\bnursing\b',
    r'\bphysician\b',
    r'\bdoctor\b',
    r'\bradiology\b',
    r'\bmedical\b',
    r'\bhealthcare\b',
    r'\btechnologist\b.*\b(mri|ct|radiology)\b',
    r'\bsales\s+associate\b',
    r'\bretail\b',
    r'\bcashier\b',
    r'\bvalet\b',
    r'\bdriver\b',
    r'\bcustomer\s+service\b',
    r'\baccounting\b',
    r'\bfinance\b.*\b(analyst|manager)\b',
    r'\bhr\s+(specialist|manager)\b',
    r'\bhuman\s+resources\b',
    r'\bmarketing\b',
]

print(" Keyword sets defined:")
print(f"\n   AI/ML Keywords:")
print(f"      • Role titles: {len(AI_ML_KEYWORDS['role_titles'])} patterns")
print(f"      • Technologies: {len(AI_ML_KEYWORDS['technologies'])} patterns")
print(f"      • Tasks: {len(AI_ML_KEYWORDS['tasks'])} patterns")

print(f"\n   General IT Keywords:")
print(f"      • Role titles: {len(GENERAL_IT_KEYWORDS['role_titles'])} patterns")
print(f"      • Web technologies: {len(GENERAL_IT_KEYWORDS['web_technologies'])} patterns")
print(f"      • Backend technologies: {len(GENERAL_IT_KEYWORDS['backend_technologies'])} patterns")
print(f"      • DevOps technologies: {len(GENERAL_IT_KEYWORDS['devops_technologies'])} patterns")

print(f"\n   Hybrid indicators: {len(HYBRID_INDICATORS)} patterns")
print(f"   Non-tech indicators: {len(NON_TECH_KEYWORDS)} patterns")


## 4. Build Categorization Function


In [None]:
def count_keyword_matches(text, keyword_patterns):
    """Return how many of ``keyword_patterns`` occur at least once in ``text``.

    ``text`` is converted to a lower-cased string before searching, so the
    patterns are expected to be written in lower case. A pattern contributes
    at most 1 to the result regardless of how often it occurs. Missing text
    (anything ``pd.isna`` reports as missing) yields 0.
    """
    if pd.isna(text):
        return 0

    haystack = str(text).lower()
    return sum(1 for pattern in keyword_patterns if re.search(pattern, haystack))


def calculate_ai_ml_score(role, description):
    """Score how strongly a posting looks like an AI/ML role (capped at 100).

    Components:
    - 30 points per AI/ML title pattern found in ``role``.
    - Technology patterns: 10 points per weighted hit, at most 20.
    - Task patterns: 8 points per weighted hit, at most 20.

    A "weighted hit" counts a match in ``role`` as 1 and a match in
    ``description`` as a fraction (0.3 for technologies, 0.2 for tasks).
    """
    def weighted_hits(patterns, description_weight):
        # Title matches count fully; description matches are discounted.
        return (count_keyword_matches(role, patterns)
                + count_keyword_matches(description, patterns) * description_weight)

    title_points = count_keyword_matches(role, AI_ML_KEYWORDS['role_titles']) * 30
    technology_points = min(weighted_hits(AI_ML_KEYWORDS['technologies'], 0.3) * 10, 20)
    task_points = min(weighted_hits(AI_ML_KEYWORDS['tasks'], 0.2) * 8, 20)

    return min(title_points + technology_points + task_points, 100)


def calculate_it_score(role, description):
    """Score how strongly a posting looks like a General IT role (capped at 100).

    Components:
    - 25 points per General IT title pattern found in ``role``.
    - For each of the web, backend and DevOps technology families:
      8 points per weighted hit, at most 15 per family. A match in ``role``
      counts as 1 hit and a match in ``description`` as 0.3 of a hit.
    """
    total = count_keyword_matches(role, GENERAL_IT_KEYWORDS['role_titles']) * 25

    for family in ('web_technologies', 'backend_technologies', 'devops_technologies'):
        patterns = GENERAL_IT_KEYWORDS[family]
        weighted_hits = (count_keyword_matches(role, patterns)
                         + count_keyword_matches(description, patterns) * 0.3)
        total += min(weighted_hits * 8, 15)

    return min(total, 100)


def is_non_tech_role(role, description):
    """Return True if any non-tech pattern occurs in the title or description.

    The two fields are joined with a space and searched as one string, so a
    single hit anywhere in either field is enough.
    """
    searchable_text = f"{role} {description}"
    return count_keyword_matches(searchable_text, NON_TECH_KEYWORDS) > 0


def is_hybrid_role(role, description):
    """Return True if any hybrid (ML + infrastructure) indicator is present.

    Title and description are concatenated with a space and searched together
    against ``HYBRID_INDICATORS``.
    """
    indicator_hits = count_keyword_matches(f"{role} {description}", HYBRID_INDICATORS)
    return indicator_hits > 0


def categorize_job_role(role, description):
    """
    Categorize a job into one of four categories:
    - 'AI/ML': Primarily AI/Machine Learning roles
    - 'General IT': Software engineering, DevOps, web development, etc.
    - 'Hybrid': Roles with both AI/ML and IT components
    - 'Non-Tech': Non-technical roles

    Decision order:
    1. Compute the AI/ML and General IT scores.
    2. If neither score reaches its threshold and a non-tech keyword is
       present in the title or description, return 'Non-Tech'.
    3. If both scores reach their thresholds, return 'Hybrid'.
    4. If exactly one score reaches its threshold, return that category,
       upgraded to 'Hybrid' when a hybrid indicator is present and the other
       score is at least HYBRID_MIN_THRESHOLD.
    5. With only weak signals, the higher score wins (ties go to
       'General IT'); with no signal at all, return 'Non-Tech'.

    Args:
        role: Job title (may be missing/NaN).
        description: Job description text (may be missing/NaN).

    Returns:
        One of 'AI/ML', 'General IT', 'Hybrid', 'Non-Tech'.
    """
    # One AI/ML title match is worth 30 points and one General IT title match
    # 25, so these thresholds are reached by a single title match.
    AI_THRESHOLD = 30
    IT_THRESHOLD = 25
    HYBRID_MIN_THRESHOLD = 15

    ai_score = calculate_ai_ml_score(role, description)
    it_score = calculate_it_score(role, description)

    ai_is_strong = ai_score >= AI_THRESHOLD
    it_is_strong = it_score >= IT_THRESHOLD

    # The non-tech filter also searches the description, where words such as
    # "medical" (benefits), "marketing" or "driver" appear incidentally in
    # tech postings. Let it veto only when there is no strong tech signal;
    # previously it ran first and overrode even exact tech title matches.
    if not (ai_is_strong or it_is_strong) and is_non_tech_role(role, description):
        return 'Non-Tech'

    if ai_is_strong and it_is_strong:
        return 'Hybrid'

    has_hybrid_indicators = is_hybrid_role(role, description)

    if ai_is_strong:
        if has_hybrid_indicators and it_score >= HYBRID_MIN_THRESHOLD:
            return 'Hybrid'
        return 'AI/ML'

    if it_is_strong:
        if has_hybrid_indicators and ai_score >= HYBRID_MIN_THRESHOLD:
            return 'Hybrid'
        return 'General IT'

    # Weak signals only: fall back to whichever score is larger.
    if ai_score > 0 or it_score > 0:
        return 'AI/ML' if ai_score > it_score else 'General IT'

    return 'Non-Tech'


# Confirm which helpers this cell defined.
print(" Categorization functions defined:")
for function_label in (
    "count_keyword_matches()",
    "calculate_ai_ml_score()",
    "calculate_it_score()",
    "is_non_tech_role()",
    "is_hybrid_role()",
    "categorize_job_role() [main function]",
):
    print("   • " + function_label)


In [None]:
# Smoke test: run the classifier on a few hand-written (title, description)
# pairs and print the category together with both scores for inspection.
print(" Testing categorization function with sample roles...\n")

test_cases = [
    ("Machine Learning Engineer", "Build and deploy ML models using TensorFlow and PyTorch"),
    ("Senior Software Engineer", "Develop web applications using React and Node.js"),
    ("AI Platform Engineer", "Build MLOps infrastructure for model deployment"),
    ("Registered Nurse Telemetry", "Provide patient care in hospital setting"),
    ("Data Scientist", "Analyze data and build predictive models"),
    ("DevOps Engineer", "Manage Kubernetes infrastructure and CI/CD pipelines"),
    ("Retail Sales Associate", "Assist customers with purchases and inventory"),
    ("Software Engineer - AI Model Serving", "Build infrastructure for ML model deployment"),
]

divider = "=" * 70
print(divider)
print(" TEST RESULTS")
print(divider)

for sample_role, sample_description in test_cases:
    predicted_category = categorize_job_role(sample_role, sample_description)
    sample_ai_score = calculate_ai_ml_score(sample_role, sample_description)
    sample_it_score = calculate_it_score(sample_role, sample_description)

    print(
        f"\nRole: {sample_role}\n"
        f"   Category: {predicted_category}\n"
        f"   AI Score: {sample_ai_score:.1f} | IT Score: {sample_it_score:.1f}"
    )


## 5. Apply Categorization to All Jobs


In [None]:
# Classify every posting and report how the categories are distributed.
print(" Applying categorization to all job postings...\n")
separator = "=" * 70
print(separator)

from tqdm import tqdm
tqdm.pandas()  # registers DataFrame.progress_apply


def _categorize_row(row):
    """Classify one DataFrame row from its 'role' and 'description' fields."""
    return categorize_job_role(row['role'], row['description'])


df['role_category'] = df.progress_apply(_categorize_row, axis=1)

total_jobs = len(df)
print("\n Categorization complete!")
print(f"   Total jobs categorized: {total_jobs:,}")

print("\n Category Distribution:")
print(separator)

category_counts = df['role_category'].value_counts()
for category_name, job_count in category_counts.items():
    share = (job_count / total_jobs) * 100
    print(f"   {category_name:<15} {job_count:>5} jobs ({share:>5.1f}%)")

print("\n" + separator)


## 6. Validation & Quality Analysis


In [None]:
# Validation: compare the new role_category with the pre-existing
# has_ai_keywords flag, list the top companies per category, and print a few
# random examples per category for manual review.
print("="*70)
print(" CATEGORIZATION VALIDATION")
print("="*70)

print("\n1⃣  Comparison with AI Keyword Flag:")
print("   (has_ai_keywords vs role_category)")

comparison = pd.crosstab(
    df['has_ai_keywords'],
    df['role_category'],
    margins=True
)
print(comparison)

is_ai_ml = df['role_category'] == 'AI/ML'
# Explicit comparison kept on purpose: it also works if the flag column was
# not parsed as a pure boolean dtype.
ai_ml_with_flag = df[is_ai_ml & (df['has_ai_keywords'] == True)].shape[0]
ai_ml_total = df[is_ai_ml].shape[0]

if ai_ml_total > 0:
    consistency = (ai_ml_with_flag / ai_ml_total) * 100
    print(f"\n   Consistency check: {consistency:.1f}% of AI/ML roles have has_ai_keywords=True")

print("\n\n2️⃣  Top 10 Companies by Category:")

for category in ['AI/ML', 'General IT', 'Hybrid']:
    print(f"\n   {category}:")
    top_companies = df[df['role_category'] == category]['company_clean'].value_counts().head(10)
    for idx, (company, count) in enumerate(top_companies.items(), 1):
        # str() guards against non-string company values, which cannot be sliced.
        print(f"      {idx:2d}. {str(company)[:40]:<40} {count:>3} jobs")

print("\n\n3️⃣  Sample Categorizations (Random Selection):")

for category in ['AI/ML', 'General IT', 'Hybrid', 'Non-Tech']:
    # Filter once and reuse the subset for both the size check and sampling.
    category_rows = df[df['role_category'] == category]
    sample = category_rows.sample(min(3, len(category_rows)))
    print(f"\n   {category} Examples:")
    for _, row in sample.iterrows():
        # A missing role is a float NaN; slicing it directly raises TypeError.
        print(f"      • {str(row['role'])[:60]}")
        print(f"        Company: {row['company_clean']}")
        ai_score = calculate_ai_ml_score(row['role'], row['description'])
        it_score = calculate_it_score(row['role'], row['description'])
        print(f"        Scores - AI: {ai_score:.0f}, IT: {it_score:.0f}")
        print()


## 7. Category Distribution by Source & Time


In [None]:
# Break the categories down by data source and by posting month, then
# summarize tech vs non-tech volume.
rule = "=" * 70
print(rule)
print(" CATEGORY DISTRIBUTION ANALYSIS")
print(rule)

print("\n1️  Distribution by Data Source:")

# Row-normalized crosstab scaled to percentages (each source row sums to 100).
source_category = 100 * pd.crosstab(df['source'], df['role_category'], normalize='index')

print("\n   Percentage Distribution:")
print(source_category.round(1))

print("\n   Absolute Counts:")
source_category_counts = pd.crosstab(df['source'], df['role_category'])
print(source_category_counts)

print("\n\n2️⃣  Preparing for Time Series Analysis:")

# Convert posting_date in place and derive a monthly period column.
df['posting_date'] = pd.to_datetime(df['posting_date'])
df['year_month'] = df['posting_date'].dt.to_period('M')

monthly_categories = (
    df.groupby(['year_month', 'role_category'])
    .size()
    .unstack(fill_value=0)
)

first_posting = df['posting_date'].min()
last_posting = df['posting_date'].max()
print(f"\n   Date range: {first_posting} to {last_posting}")
print(f"   Total months covered: {df['year_month'].nunique()}")
print("\n   Monthly breakdown preview:")
print(monthly_categories.head(10))

print("\n\n3️  Tech Roles Summary (AI/ML + General IT + Hybrid):")

tech_category_names = ['AI/ML', 'General IT', 'Hybrid']
tech_roles = df[df['role_category'].isin(tech_category_names)]
non_tech = df[df['role_category'] == 'Non-Tech']

all_jobs = len(df)
tech_total_rows = len(tech_roles)
non_tech_total_rows = len(non_tech)

print(f"   Total tech roles: {tech_total_rows:,} ({tech_total_rows/all_jobs*100:.1f}%)")
print(f"   Total non-tech roles: {non_tech_total_rows:,} ({non_tech_total_rows/all_jobs*100:.1f}%)")

print("\n   Tech roles breakdown:")
for category in tech_category_names:
    count = (df['role_category'] == category).sum()
    if tech_total_rows > 0:
        pct_of_tech = (count / tech_total_rows) * 100
    else:
        pct_of_tech = 0
    print(f"      {category:<15} {count:>5} jobs ({pct_of_tech:>5.1f}% of tech roles)")


## 8. Save Categorized Dataset


In [None]:
import os

# Persist the categorized frame and report basic facts about the written file.
output_file = 'data/processed/categorized_jobs.csv'

df.to_csv(output_file, index=False)

file_size_mb = os.path.getsize(output_file) / 1024 / 1024
separator_line = "=" * 70

print(" CATEGORIZED DATASET SAVED")
print(separator_line)
print(f"\n   File: {output_file}")
print(f"   Size: {file_size_mb:.2f} MB")
print(f"   Records: {len(df):,}")
print(f"   Columns: {len(df.columns)}")

print("\n   Key columns:")
for column_note in (
    "role_category (NEW)",
    "year_month (NEW)",
    "company_clean",
    "has_ai_keywords",
    "posting_date",
):
    print("      • " + column_note)

print("\n Phase 2.3 Complete: Job Role Categorization")
print("\n   Dataset ready for time series analysis")
print(separator_line)


## 9. Final Summary & Documentation


In [None]:
# Final report: category breakdown, methodology sizes, data-quality counts and
# a few headline ratios. Relies on posting_date already being datetime and on
# year_month existing (both set in the distribution-analysis cell).
summary_rule = "=" * 70
role_categories = df['role_category']
processed_total = len(df)

print("\n" + summary_rule)
print(" JOB ROLE CATEGORIZATION SUMMARY")
print(summary_rule)

print("\n Categorization Results:")
print(f"   Total jobs processed: {processed_total:,}")
print("\n   Category Breakdown:")

for category in ['AI/ML', 'General IT', 'Hybrid', 'Non-Tech']:
    count = (role_categories == category).sum()
    pct = (count / processed_total) * 100
    print(f"      {category:<15} {count:>5} jobs ({pct:>5.1f}%)")

ai_pattern_total = (
    len(AI_ML_KEYWORDS['role_titles'])
    + len(AI_ML_KEYWORDS['technologies'])
    + len(AI_ML_KEYWORDS['tasks'])
)
it_pattern_total = sum(len(patterns) for patterns in GENERAL_IT_KEYWORDS.values())

print("\n Categorization Methodology:")
print("   • Keyword-based scoring system")
print(f"   • AI/ML keywords: {ai_pattern_total} patterns")
print(f"   • General IT keywords: {it_pattern_total} patterns")
print(f"   • Hybrid indicators: {len(HYBRID_INDICATORS)} patterns")
print(f"   • Non-tech filters: {len(NON_TECH_KEYWORDS)} patterns")

first_day = df['posting_date'].min().strftime('%Y-%m-%d')
last_day = df['posting_date'].max().strftime('%Y-%m-%d')

print("\n Data Quality Metrics:")
print(f"   • Null role_category values: {role_categories.isnull().sum()}")
print(f"   • Date range: {first_day} to {last_day}")
print(f"   • Months of data: {df['year_month'].nunique()}")
print(f"   • Complete records: {len(df[role_categories.notna()]):,}")

print("\n Key Insights:")

ai_ml_count = (role_categories == 'AI/ML').sum()
general_it_count = (role_categories == 'General IT').sum()
hybrid_count = (role_categories == 'Hybrid').sum()

# Skip the ratio entirely when there are no General IT roles to divide by.
if general_it_count > 0:
    ai_it_ratio = ai_ml_count / general_it_count
    print(f"   • AI/ML to General IT ratio: {ai_it_ratio:.2f}:1")

tech_total = ai_ml_count + general_it_count + hybrid_count
if tech_total > 0:
    ai_share = (ai_ml_count / tech_total) * 100
    it_share = (general_it_count / tech_total) * 100
    hybrid_share = (hybrid_count / tech_total) * 100
    for share_label, share_value in (
        ("AI/ML", ai_share),
        ("General IT", it_share),
        ("Hybrid", hybrid_share),
    ):
        print(f"   • {share_label} share of tech roles: {share_value:.1f}%")

print("\n" + summary_rule)
print(" CATEGORIZATION COMPLETE - READY FOR TIME SERIES ANALYSIS")
print(summary_rule)


In [None]:
# Show the first 20 rows of the key columns; as the last expression in the
# cell, the frame is rendered by the notebook.
print("\n Sample of categorized data:\n")
sample_cols = [
    'role',
    'company_clean',
    'role_category',
    'has_ai_keywords',
    'year_month',
    'source',
]
df.loc[:, sample_cols].head(20)
