In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

#dummy data
bank_jobs_skills = {
    "Data Analyst": [["Python", "Data Analysis"], ["Excel", "SQL"], ["Power BI", "Statistics"]],
    "Credit Analyst": [["Financial Analysis", "Credit Risk"], ["Underwriting", "Excel"], ["Loan Review", "Risk Assessment"]],
    "Loan Officer": [["Customer Service", "Loan Origination"], ["Credit Reports", "Regulatory Knowledge"], ["Loan Processing", "CRM"]],
    "Compliance Analyst": [["Regulatory Compliance", "AML"], ["KYC", "Audit"], ["Policy Review", "Banking Laws"]],
    "Branch Manager": [["Leadership", "Sales"], ["Branch Operations", "Customer Experience"], ["Staff Training", "Team Management"]],
    "Bank Teller": [["Cash Handling", "Customer Service"], ["POS Systems", "Transaction Processing"], ["Fraud Detection", "Bank Procedures"]],
    "Risk Manager": [["Risk Modeling", "Market Risk"], ["Operational Risk", "Stress Testing"], ["Scenario Analysis", "Compliance"]],
    "Investment Analyst": [["Equity Research", "Valuation"], ["Portfolio Management", "Bloomberg"], ["Risk-Return Analysis", "Financial Modeling"]],
    "BI Analyst": [["Power BI", "Data Visualization"], ["Excel", "SQL"], ["Tableau", "Business Intelligence"]],
    "IT Auditor": [["Internal Controls", "IT Security"], ["Audit Tools", "Compliance Testing"], ["Access Management", "Risk Auditing"]],
    "Fraud Investigator": [["Transaction Monitoring", "Fraud Detection"], ["Forensics", "Surveillance"], ["Case Review", "Reporting"]],
    "Financial Advisor": [["Investment Strategy", "Client Communication"], ["Retirement Planning", "Wealth Management"], ["Portfolio Review", "Risk Tolerance"]],
    "Treasury Analyst": [["Cash Management", "Liquidity Planning"], ["Bank Transfers", "Forecasting"], ["Treasury Operations", "Funding"]],
    "Operations Analyst": [["Process Improvement", "Workflow Analysis"], ["Data Entry", "Reporting"], ["Bank Ops", "Compliance Checks"]],
    "Mortgage Underwriter": [["Credit Reports", "Loan-to-Value"], ["Income Verification", "DTI"], ["Appraisals", "Underwriting Guidelines"]],
    "Internal Auditor": [["Internal Controls", "Risk Assessment"], ["Audit Planning", "Compliance Testing"], ["Documentation", "Reporting"]],
    "AML Analyst": [["Anti-Money Laundering", "SAR Filing"], ["KYC Review", "Transaction Patterns"], ["Customer Due Diligence", "Risk Scoring"]],
    "Product Manager": [["Market Research", "Product Development"], ["Roadmapping", "Stakeholder Communication"], ["KPIs", "Customer Feedback"]],
    "Customer Success Manager": [["Client Retention", "Onboarding"], ["CRM", "Upselling"], ["Customer Satisfaction", "Product Adoption"]],
    "Cybersecurity Analyst": [["Intrusion Detection", "SIEM"], ["Firewalls", "Incident Response"], ["Threat Intelligence", "Endpoint Security"]],
}

#Generate 10 dummy samples per job title
expanded_data = []
id_counter = 1
for job, skill_sets in bank_jobs_skills.items():
    for i in range(20):
        skill_set = random.choice(skill_sets)
        expanded_data.append({
            "employee_id": id_counter,
            "skills": skill_set,
            "job": job
        })
        id_counter += 1

df = pd.DataFrame(expanded_data)


#Encode skills and job labels
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['skills'])

df['job_label'], job_classes = pd.factorize(df['job'])
y = df['job_label']

#Train-test split
X_train, X_test, y_train_orig, y_test_orig = train_test_split(X, y, test_size=0.3, random_state=42)


#Remap labels for compatibility
unique_train_labels = np.unique(y_train_orig)
label_map = {old: new for new, old in enumerate(unique_train_labels)}
reverse_map = {v: k for k, v in label_map.items()}

y_train = np.array([label_map[label] for label in y_train_orig])

valid_test_mask = [label in label_map for label in y_test_orig]
X_test = X_test[valid_test_mask]
y_test_orig = y_test_orig[valid_test_mask]
y_test = np.array([label_map[label] for label in y_test_orig])

#Train the model
model = OneVsRestClassifier(LogisticRegression(max_iter=2000))
model.fit(X_train, y_train)

#Evaluate predictions
if len(y_test) > 0:
    y_pred = model.predict(X_test)
    test_labels_present = sorted(np.unique(y_test))
    mapped_classes = [job_classes[reverse_map[i]] for i in test_labels_present]

    report = classification_report(
        y_test, y_pred,
        labels=test_labels_present,
        target_names=mapped_classes,
        zero_division=0
    )
    print("\nClassification Report:\n")
    print(report)
else:
    print("\nWarning: No valid job labels in the test set after mapping.\n")

#View sample data
df['job_encoded'] = y
print("\nSample Data:\n")
print(df[['employee_id', 'skills', 'job', 'job_encoded']].head(50))


Classification Report:

                          precision    recall  f1-score   support

            Data Analyst       0.71      0.56      0.62         9
          Credit Analyst       1.00      1.00      1.00         6
            Loan Officer       1.00      1.00      1.00         6
      Compliance Analyst       1.00      1.00      1.00        11
          Branch Manager       1.00      1.00      1.00         5
             Bank Teller       1.00      1.00      1.00         8
            Risk Manager       1.00      1.00      1.00         5
      Investment Analyst       1.00      1.00      1.00         6
              BI Analyst       0.20      0.33      0.25         3
              IT Auditor       1.00      1.00      1.00         3
      Fraud Investigator       1.00      1.00      1.00         3
       Financial Advisor       1.00      1.00      1.00         8
        Treasury Analyst       1.00      1.00      1.00         5
      Operations Analyst       1.00      1.00     

**The code creates a fake dataset where each bank job (like "Loan Officer") is paired with a set of skills. Then it converts these skill lists into numerical data, splits the data into training and testing groups, trains a logistic regression model to predict the job based on the skills, and finally prints out a report showing how well the model performs. Essentially, it simulates building and evaluating a job classification system based on employee skills.**

In [4]:
# Let's simulate data
import pandas as pd

#Create dummy employees dataset
employees_df = pd.DataFrame({
    'employee_id': [1, 2, 3, 4, 5],
    'job_title': ['Analyst', 'Analyst', 'Manager', 'Analyst', 'Manager'],
    'skills': ['Excel, SQL', 'Excel, Python', 'Leadership, Management', 'Excel, Communication', 'Management, Leadership']
})

#Create dummy courses taken dataset
dummy_courses = [
    {'employee_id': 1, 'course_id': 'C1'},
    {'employee_id': 1, 'course_id': 'C2'},
    {'employee_id': 2, 'course_id': 'C1'},
    {'employee_id': 2, 'course_id': 'C3'},
    {'employee_id': 4, 'course_id': 'C2'},
    {'employee_id': 4, 'course_id': 'C4'},
    {'employee_id': 5, 'course_id': 'C5'}
]
courses_df = pd.DataFrame(dummy_courses)

#Create dummy course catalog
dummy_catalog = [
    {'course_id': 'C1', 'course_name': 'Excel Basics', 'skills_covered': 'Excel'},
    {'course_id': 'C2', 'course_name': 'Advanced Excel', 'skills_covered': 'Advanced Excel'},
    {'course_id': 'C3', 'course_name': 'Python for Data Scientists', 'skills_covered': 'Python'},
    {'course_id': 'C4', 'course_name': 'Data Communication', 'skills_covered': 'Communication'},
    {'course_id': 'C5', 'course_name': 'Leadership 101', 'skills_covered': 'Leadership'}
]
course_catalog_df = pd.DataFrame(dummy_catalog)

#Create recommendation functions
def get_peers(employee_id, employees_df):
    job_title = employees_df.loc[employees_df['employee_id'] == employee_id, 'job_title'].values[0]
    peers = employees_df[employees_df['job_title'] == job_title]
    return peers


def get_peer_courses(peers, courses_df):
    peer_ids = peers['employee_id'].tolist()
    peer_courses = courses_df[courses_df['employee_id'].isin(peer_ids)]
    return peer_courses


def recommend_courses_skill_match(employee_id, employees_df, courses_df, course_catalog_df):
    peers = get_peers(employee_id, employees_df)
    peer_courses = get_peer_courses(peers, courses_df)
    
    employee_courses = courses_df[courses_df['employee_id'] == employee_id]['course_id'].tolist()
    # Filter courses that the employee hasn't taken yet and that are popular among peers
    recommended = peer_courses[~peer_courses['course_id'].isin(employee_courses)]
    
    # merge course catalog to get skills details for each course
    recommended = recommended.merge(course_catalog_df, on='course_id')
    
    # Obtain the employee's skill set as a Python set for easier checking
    employee_skills = set()
    for skill in employees_df.loc[employees_df['employee_id'] == employee_id, 'skills'].values[0].split(','):
        employee_skills.add(skill.strip())
    
    # Filter recommendations that cover skills the employee does not already have
    filter_func = lambda skills: any(s.strip() not in employee_skills for s in skills.split(','))
    recommended = recommended[recommended['skills_covered'].apply(filter_func)]
    
    # Count frequency of each course among peers
    course_counts = recommended['course_id'].value_counts().reset_index()
    course_counts.columns = ['course_id', 'count']
    
    # Merge with course catalog for names
    recommendations = course_counts.merge(course_catalog_df, on='course_id')
    
    # Add a column to show the skills not yet possessed by employee (covered by the course)
    def unmatched_skills(row):
        return ','.join([s.strip() for s in row['skills_covered'].split(',') if s.strip() not in employee_skills])
    
    recommendations['unmatched_skills'] = recommendations.apply(unmatched_skills, axis=1)
    
    return recommendations.sort_values(by='count', ascending=False)

# Simulate recommendations for a sample employee with id 1 using the extended function
extended_recommendations = recommend_courses_skill_match(1, employees_df, courses_df, course_catalog_df)

extended_recommendations[['course_id', 'course_name', 'count', 'unmatched_skills']]




Unnamed: 0,course_id,course_name,count,unmatched_skills
0,C3,Python for Data Scientists,1,Python
1,C4,Data Communication,1,Communication


In [4]:
import random

# Define job-skill mappings
bank_jobs_skills = {
    "Data Analyst": [["Python", "Data Analysis"], ["Excel", "SQL"], ["Power BI", "Statistics"]],
    "Credit Analyst": [["Financial Analysis", "Credit Risk"], ["Underwriting", "Excel"], ["Loan Review", "Risk Assessment"]],
    "Loan Officer": [["Customer Service", "Loan Origination"], ["Credit Reports", "Regulatory Knowledge"], ["Loan Processing", "CRM"]],
    "Compliance Analyst": [["Regulatory Compliance", "AML"], ["KYC", "Audit"], ["Policy Review", "Banking Laws"]],
    "Branch Manager": [["Leadership", "Sales"], ["Branch Operations", "Customer Experience"], ["Staff Training", "Team Management"]],
    "Bank Teller": [["Cash Handling", "Customer Service"], ["POS Systems", "Transaction Processing"], ["Fraud Detection", "Bank Procedures"]],
    "Risk Manager": [["Risk Modeling", "Market Risk"], ["Operational Risk", "Stress Testing"], ["Scenario Analysis", "Compliance"]],
    "Investment Analyst": [["Equity Research", "Valuation"], ["Portfolio Management", "Bloomberg"], ["Risk-Return Analysis", "Financial Modeling"]],
    "BI Analyst": [["Power BI", "Data Visualization"], ["Excel", "SQL"], ["Tableau", "Business Intelligence"]],
    "IT Auditor": [["Internal Controls", "IT Security"], ["Audit Tools", "Compliance Testing"], ["Access Management", "Risk Auditing"]],
    "Fraud Investigator": [["Transaction Monitoring", "Fraud Detection"], ["Forensics", "Surveillance"], ["Case Review", "Reporting"]],
    "Financial Advisor": [["Investment Strategy", "Client Communication"], ["Retirement Planning", "Wealth Management"], ["Portfolio Review", "Risk Tolerance"]],
    "Treasury Analyst": [["Cash Management", "Liquidity Planning"], ["Bank Transfers", "Forecasting"], ["Treasury Operations", "Funding"]],
    "Operations Analyst": [["Process Improvement", "Workflow Analysis"], ["Data Entry", "Reporting"], ["Bank Ops", "Compliance Checks"]],
    "Mortgage Underwriter": [["Credit Reports", "Loan-to-Value"], ["Income Verification", "DTI"], ["Appraisals", "Underwriting Guidelines"]],
    "Internal Auditor": [["Internal Controls", "Risk Assessment"], ["Audit Planning", "Compliance Testing"], ["Documentation", "Reporting"]],
    "AML Analyst": [["Anti-Money Laundering", "SAR Filing"], ["KYC Review", "Transaction Patterns"], ["Customer Due Diligence", "Risk Scoring"]],
    "Product Manager": [["Market Research", "Product Development"], ["Roadmapping", "Stakeholder Communication"], ["KPIs", "Customer Feedback"]],
    "Customer Success Manager": [["Client Retention", "Onboarding"], ["CRM", "Upselling"], ["Customer Satisfaction", "Product Adoption"]],
    "Cybersecurity Analyst": [["Intrusion Detection", "SIEM"], ["Firewalls", "Incident Response"], ["Threat Intelligence", "Endpoint Security"]],
}

# Dummy course catalog
dummy_courses = {
    "Python": "Intro to Python for Data Analysis",
    "Data Analysis": "Data Analysis Fundamentals",
    "Excel": "Mastering Excel for Business",
    "SQL": "SQL for Data Professionals",
    "Power BI": "Power BI for Beginners",
    "Statistics": "Statistics for Data Science",
    "Financial Analysis": "Financial Analysis Essentials",
    "Credit Risk": "Understanding Credit Risk",
    "Underwriting": "Loan Underwriting Basics",
    "Loan Review": "Loan Review Techniques",
    "Risk Assessment": "Risk Assessment Strategies",
    "Customer Service": "Customer Service Excellence",
    "Loan Origination": "Loan Origination Process",
    "Credit Reports": "Reading Credit Reports",
    "Regulatory Knowledge": "Banking Regulations 101",
    "Loan Processing": "Loan Processing Workflow",
    "CRM": "CRM Tools and Techniques",
    "Regulatory Compliance": "Regulatory Compliance Overview",
    "AML": "Anti-Money Laundering Training",
    "KYC": "Know Your Customer (KYC) Essentials",
    "Audit": "Introduction to Auditing",
    "Policy Review": "Policy Review and Implementation",
    "Banking Laws": "Banking Laws and Ethics",
    "Leadership": "Leadership Development",
    "Sales": "Sales Fundamentals",
    "Branch Operations": "Branch Operations Management",
    "Customer Experience": "Enhancing Customer Experience",
    "Staff Training": "Effective Staff Training",
    "Team Management": "Team Management Skills",
    "Cash Handling": "Cash Handling Procedures",
    "POS Systems": "POS System Training",
    "Transaction Processing": "Transaction Processing Basics",
    "Fraud Detection": "Fraud Detection Techniques",
    "Bank Procedures": "Banking Procedures Overview",
    "Risk Modeling": "Risk Modeling Techniques",
    "Market Risk": "Market Risk Management",
    "Operational Risk": "Managing Operational Risk",
    "Stress Testing": "Stress Testing in Finance",
    "Scenario Analysis": "Scenario Analysis Methods",
    "Compliance": "Compliance in Financial Services",
    "Equity Research": "Equity Research Fundamentals",
    "Valuation": "Valuation Techniques",
    "Portfolio Management": "Portfolio Management Strategies",
    "Bloomberg": "Using Bloomberg Terminal",
    "Risk-Return Analysis": "Risk and Return Analysis",
    "Financial Modeling": "Financial Modeling with Excel",
    "Data Visualization": "Data Visualization with Power BI",
    "Tableau": "Tableau for Data Analysis",
    "Business Intelligence": "Business Intelligence Concepts",
    "Internal Controls": "Internal Controls in IT",
    "IT Security": "IT Security Fundamentals",
    "Audit Tools": "Audit Tools and Techniques",
    "Compliance Testing": "Compliance Testing Procedures",
    "Access Management": "Access Management Systems",
    "Risk Auditing": "Risk-Based Auditing",
    "Transaction Monitoring": "Transaction Monitoring Systems",
    "Forensics": "Digital Forensics Basics",
    "Surveillance": "Surveillance Techniques",
    "Case Review": "Case Review and Documentation",
    "Reporting": "Effective Reporting Skills",
    "Investment Strategy": "Investment Strategy Planning",
    "Client Communication": "Client Communication Skills",
    "Retirement Planning": "Retirement Planning Essentials",
    "Wealth Management": "Wealth Management Strategies",
    "Portfolio Review": "Portfolio Review Techniques",
    "Risk Tolerance": "Assessing Risk Tolerance",
    "Cash Management": "Cash Management Techniques",
    "Liquidity Planning": "Liquidity Planning in Treasury",
    "Bank Transfers": "Bank Transfer Operations",
    "Forecasting": "Financial Forecasting",
    "Treasury Operations": "Treasury Operations Overview",
    "Funding": "Funding Strategies",
    "Process Improvement": "Process Improvement Tools",
    "Workflow Analysis": "Workflow Analysis Techniques",
    "Data Entry": "Accurate Data Entry",
    "Bank Ops": "Bank Operations Fundamentals",
    "Compliance Checks": "Compliance Checklists",
    "Loan-to-Value": "Understanding Loan-to-Value",
    "Income Verification": "Income Verification Methods",
    "DTI": "Debt-to-Income Ratio Analysis",
    "Appraisals": "Real Estate Appraisals",
    "Underwriting Guidelines": "Underwriting Guidelines Explained",
    "Audit Planning": "Audit Planning Process",
    "Documentation": "Documentation Best Practices",
    "Anti-Money Laundering": "AML Compliance Training",
    "SAR Filing": "Suspicious Activity Report Filing",
    "KYC Review": "KYC Review Procedures",
    "Transaction Patterns": "Analyzing Transaction Patterns",
    "Customer Due Diligence": "Customer Due Diligence Training",
    "Risk Scoring": "Risk Scoring Models",
    "Market Research": "Market Research Techniques",
    "Product Development": "Product Development Lifecycle",
    "Roadmapping": "Product Roadmapping",
    "Stakeholder Communication": "Stakeholder Communication Skills",
    "KPIs": "Key Performance Indicators",
    "Customer Feedback": "Collecting Customer Feedback",
    "Client Retention": "Client Retention Strategies",
    "Onboarding": "Customer Onboarding Process",
    "Upselling": "Upselling Techniques",
    "Customer Satisfaction": "Measuring Customer Satisfaction",
    "Product Adoption": "Driving Product Adoption",
    "Intrusion Detection": "Intrusion Detection Systems",
    "SIEM": "Security Information and Event Management",
    "Firewalls": "Firewall Configuration",
    "Incident Response": "Incident Response Planning",
    "Threat Intelligence": "Cyber Threat Intelligence",
    "Endpoint Security": "Endpoint Security Solutions"
}

# Skill gap analysis function
def recommend_skills_and_courses(target_job, user_skills):
    if target_job not in bank_jobs_skills:
        return f"Job title '{target_job}' not found in the database."

    # Flatten all skill sets for the job
    required_skills = set(skill for group in bank_jobs_skills[target_job] for skill in group)
    user_skills_set = set(user_skills)

    missing_skills = required_skills - user_skills_set

    recommendations = {}
    for skill in missing_skills:
        course = dummy_courses.get(skill, "No course available")
        recommendations[skill] = course

    return recommendations

# Example usage
target_job = "Data Analyst"
user_skills = ["Excel", "SQL"]

recommendations = recommend_skills_and_courses(target_job, user_skills)

print(f"\nTarget Job: {target_job}")
print(f"User Skills: {user_skills}")
print("\nRecommended Skills and Courses:")
for skill, course in recommendations.items():
    print(f"- {skill}: {course}")



Target Job: Data Analyst
User Skills: ['Excel', 'SQL']

Recommended Skills and Courses:
- Python: Intro to Python for Data Analysis
- Data Analysis: Data Analysis Fundamentals
- Statistics: Statistics for Data Science
- Power BI: Power BI for Beginners
