In [18]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

#dummy data
bank_jobs_skills = {
    "Data Analyst": [["Python", "Data Analysis"], ["Excel", "SQL"], ["Power BI", "Statistics"]],
    "Credit Analyst": [["Financial Analysis", "Credit Risk"], ["Underwriting", "Excel"], ["Loan Review", "Risk Assessment"]],
    "Loan Officer": [["Customer Service", "Loan Origination"], ["Credit Reports", "Regulatory Knowledge"], ["Loan Processing", "CRM"]],
    "Compliance Analyst": [["Regulatory Compliance", "AML"], ["KYC", "Audit"], ["Policy Review", "Banking Laws"]],
    "Branch Manager": [["Leadership", "Sales"], ["Branch Operations", "Customer Experience"], ["Staff Training", "Team Management"]],
    "Bank Teller": [["Cash Handling", "Customer Service"], ["POS Systems", "Transaction Processing"], ["Fraud Detection", "Bank Procedures"]],
    "Risk Manager": [["Risk Modeling", "Market Risk"], ["Operational Risk", "Stress Testing"], ["Scenario Analysis", "Compliance"]],
    "Investment Analyst": [["Equity Research", "Valuation"], ["Portfolio Management", "Bloomberg"], ["Risk-Return Analysis", "Financial Modeling"]],
    "BI Analyst": [["Power BI", "Data Visualization"], ["Excel", "SQL"], ["Tableau", "Business Intelligence"]],
    "IT Auditor": [["Internal Controls", "IT Security"], ["Audit Tools", "Compliance Testing"], ["Access Management", "Risk Auditing"]],
    "Fraud Investigator": [["Transaction Monitoring", "Fraud Detection"], ["Forensics", "Surveillance"], ["Case Review", "Reporting"]],
    "Financial Advisor": [["Investment Strategy", "Client Communication"], ["Retirement Planning", "Wealth Management"], ["Portfolio Review", "Risk Tolerance"]],
    "Treasury Analyst": [["Cash Management", "Liquidity Planning"], ["Bank Transfers", "Forecasting"], ["Treasury Operations", "Funding"]],
    "Operations Analyst": [["Process Improvement", "Workflow Analysis"], ["Data Entry", "Reporting"], ["Bank Ops", "Compliance Checks"]],
    "Mortgage Underwriter": [["Credit Reports", "Loan-to-Value"], ["Income Verification", "DTI"], ["Appraisals", "Underwriting Guidelines"]],
    "Internal Auditor": [["Internal Controls", "Risk Assessment"], ["Audit Planning", "Compliance Testing"], ["Documentation", "Reporting"]],
    "AML Analyst": [["Anti-Money Laundering", "SAR Filing"], ["KYC Review", "Transaction Patterns"], ["Customer Due Diligence", "Risk Scoring"]],
    "Product Manager": [["Market Research", "Product Development"], ["Roadmapping", "Stakeholder Communication"], ["KPIs", "Customer Feedback"]],
    "Customer Success Manager": [["Client Retention", "Onboarding"], ["CRM", "Upselling"], ["Customer Satisfaction", "Product Adoption"]],
    "Cybersecurity Analyst": [["Intrusion Detection", "SIEM"], ["Firewalls", "Incident Response"], ["Threat Intelligence", "Endpoint Security"]],
}

#Generate 10 samples per job title
expanded_data = []
id_counter = 1
for job, skill_sets in bank_jobs_skills.items():
    for i in range(10):
        skill_set = random.choice(skill_sets)
        expanded_data.append({
            "employee_id": id_counter,
            "skills": skill_set,
            "job": job
        })
        id_counter += 1

df = pd.DataFrame(expanded_data)


#Encode skills and job labels
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['skills'])

df['job_label'], job_classes = pd.factorize(df['job'])
y = df['job_label']

#Train-test split
X_train, X_test, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.3, random_state=42)


#Remap labels for compatibility
unique_train_labels = np.unique(y_train_orig)
label_map = {old: new for new, old in enumerate(unique_train_labels)}
reverse_map = {v: k for k, v in label_map.items()}

y_train = np.array([label_map[label] for label in y_train_orig])

valid_test_mask = [label in label_map for label in y_test_orig]
X_test = X_test[valid_test_mask]
y_test_orig = y_test_orig[valid_test_mask]
y_test = np.array([label_map[label] for label in y_test_orig])

#Train the model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000))
model.fit(X_train, y_train)

#Evaluate predictions
if len(y_test) > 0:
    y_pred = model.predict(X_test)
    test_labels_present = sorted(np.unique(y_test))
    mapped_classes = [job_classes[reverse_map[i]] for i in test_labels_present]

    report = classification_report(
        y_test, y_pred,
        labels=test_labels_present,
        target_names=mapped_classes,
        zero_division=0
    )
    print("\nClassification Report:\n")
    print(report)
else:
    print("\nWarning: No valid job labels in the test set after mapping. Try increasing dataset size.\n")

#View sample data
df['job_encoded'] = y
print("\nSample Data:\n")
print(df[['employee_id', 'skills', 'job', 'job_encoded']].head(50))


Classification Report:

                          precision    recall  f1-score   support

            Data Analyst       1.00      1.00      1.00         2
          Credit Analyst       1.00      1.00      1.00         4
            Loan Officer       0.50      1.00      0.67         2
      Compliance Analyst       1.00      1.00      1.00         2
          Branch Manager       0.17      1.00      0.29         1
             Bank Teller       0.00      0.00      0.00         2
            Risk Manager       1.00      0.50      0.67         6
      Investment Analyst       1.00      0.60      0.75         5
              BI Analyst       1.00      1.00      1.00         3
              IT Auditor       1.00      1.00      1.00         4
      Fraud Investigator       1.00      1.00      1.00         1
       Financial Advisor       1.00      1.00      1.00         3
        Treasury Analyst       1.00      1.00      1.00         5
      Operations Analyst       1.00      1.00     