Combines each employee's skills with their job title and department into one dataset.
Uses job title and department, joined into a single string, as the text feature.
Applies TF-IDF vectorization to the text and trains a one-vs-rest Logistic Regression classifier (one binary model per skill).
Outputs accuracy and a classification report.
Predicts the skills for a new job title, then lists the skills a chosen employee is missing and recommends courses for them.

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Dummy employee roster: one (employee_id, name, job_title, department) record
# per employee. Job titles follow a fixed ten-role cycle, repeated five times.
employees = pd.DataFrame(
    data=[
        (1, 'China Mieville', 'Data Analyst', 'Analytics'),
        (2, 'Thomas Pynchon', 'Senior AI Skills Analyst', 'Analytics'),
        (3, 'Don DeLillo', 'L&D Specialist', 'Learning'),
        (4, 'Cormac McCarthy', 'AI Engineer', 'AI Strategy'),
        (5, 'Laird Barron', 'Cybersecurity Specialist', 'IT'),
        (6, 'Kelly Link', 'HR Manager', 'Human Resources'),
        (7, 'Virginia Woolf', 'Data Scientist', 'Analytics'),
        (8, 'David Wallace', 'BI Analyst', 'Analytics'),
        (9, 'Thomas Ligotti', 'Security Analyst', 'IT'),
        (10, 'Philip Roth', 'Recruitment Lead', 'Human Resources'),
        (11, 'Toni Morrison', 'Data Analyst', 'Analytics'),
        (12, 'James Joyce', 'Senior AI Skills Analyst', 'AI Strategy'),
        (13, 'William Faulkner', 'L&D Specialist', 'Learning'),
        (14, 'Willa Cather', 'AI Engineer', 'AI Strategy'),
        (15, 'Peter Straub', 'Cybersecurity Specialist', 'IT'),
        (16, 'Matthew Bartlett', 'HR Manager', 'Human Resources'),
        (17, 'Jon Padgett', 'Data Scientist', 'Analytics'),
        (18, 'Michael Cisco', 'BI Analyst', 'Analytics'),
        (19, 'Alan Smithee', 'Security Analyst', 'IT'),
        (20, 'Martin Scorsese', 'Recruitment Lead', 'Human Resources'),
        (21, 'William Shakespeare', 'Data Analyst', 'Analytics'),
        (22, 'W.B. Yeats', 'Senior AI Skills Analyst', 'AI Strategy'),
        (23, 'T.S. Eliot', 'L&D Specialist', 'Learning'),
        (24, 'Ford Maddox Ford', 'AI Engineer', 'AI Strategy'),
        (25, 'Homer Homer', 'Cybersecurity Specialist', 'IT'),
        (26, 'John Crowley', 'HR Manager', 'Human Resources'),
        (27, 'Ben Lerner', 'Data Scientist', 'Analytics'),
        (28, 'George Saunders', 'BI Analyst', 'Analytics'),
        (29, 'H.P. Lovecraft', 'Security Analyst', 'IT'),
        (30, 'Ramsay Campbell', 'Recruitment Lead', 'Human Resources'),
        (31, 'Stephen King', 'Data Analyst', 'Analytics'),
        (32, 'Joyce Carol Oates', 'Senior AI Skills Analyst', 'AI Strategy'),
        (33, 'John Berryman', 'L&D Specialist', 'Learning'),
        (34, 'John Langan', 'AI Engineer', 'AI Strategy'),
        (35, 'Shirley Jackson', 'Cybersecurity Specialist', 'IT'),
        (36, 'Mary Shelley', 'HR Manager', 'Human Resources'),
        (37, 'Thomas Hardy', 'Data Scientist', 'Analytics'),
        (38, 'D.H. Lawrence', 'BI Analyst', 'Analytics'),
        (39, 'Mark Twain', 'Security Analyst', 'IT'),
        (40, 'Jack London', 'Recruitment Lead', 'Human Resources'),
        (41, 'Charles Dickens', 'Data Analyst', 'Analytics'),
        (42, 'George Eliot', 'Senior AI Skills Analyst', 'Analytics'),
        (43, 'Nicholas Cage', 'L&D Specialist', 'Learning'),
        (44, 'P.T. Anderson', 'AI Engineer', 'AI Strategy'),
        (45, 'David Lynch', 'Cybersecurity Specialist', 'IT'),
        (46, 'Nathan Fielder', 'HR Manager', 'Human Resources'),
        (47, 'Heronymus Bosch', 'Data Scientist', 'Analytics'),
        (48, 'Salvador Dali', 'BI Analyst', 'Analytics'),
        (49, 'Pablo Picasso', 'Security Analyst', 'IT'),
        (50, 'Teddy Roosevelt', 'Recruitment Lead', 'Human Resources'),
    ],
    columns=['employee_id', 'name', 'job_title', 'department'],
)

# Skill labels, one list per employee, in the same order as the employee rows.
# The ten-entry pattern below repeats five times to give 50 entries; list(...)
# gives every employee an independent list object.
employee_skills = [
    list(skills)
    for _ in range(5)
    for skills in (
        ('Python', 'Data Visualization'),
        ('Machine Learning', 'Python'),
        ('Data Visualization',),
        ('Machine Learning', 'Python'),
        ('Cybersecurity',),
        ('Recruitment',),
        ('Machine Learning',),
        ('Data Visualization', 'Python'),
        ('Cybersecurity', 'Python'),
        ('Recruitment',),
    )
]

# Modelling table: the text feature is "<job_title> <department>" and the
# target is that employee's skill list. Rows are paired by position.
data = pd.DataFrame(
    {
        'role_department': employees['job_title'].str.cat(employees['department'], sep=" "),
        'skills': employee_skills,
    }
)

# Last expression of the cell: shows the first rows as the cell output.
data.head()

Unnamed: 0,role_department,skills
0,Data Analyst Analytics,"[Python, Data Visualization]"
1,Senior AI Skills Analyst Analytics,"[Machine Learning, Python]"
2,L&D Specialist Learning,[Data Visualization]
3,AI Engineer AI Strategy,"[Machine Learning, Python]"
4,Cybersecurity Specialist IT,[Cybersecurity]


In [5]:
#encoding
# Turn each skill list into a binary indicator row, one column per skill;
# mlb.classes_ holds the column order.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['skills'])

# Hold out 20% of the rows for evaluation. The seed is fixed so the split, and
# therefore the metrics printed by the evaluation step, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data['role_department'], y, test_size=0.2, random_state=42
)

#pipeline
# TF-IDF features from the role/department text feed a one-vs-rest wrapper
# around LogisticRegression.
model = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000)))
])

model.fit(X_train, y_train)

# Per-skill probabilities for the held-out rows (one column per skill).
y_pred_proba = model.predict_proba(X_test)

# A skill is assigned when its probability is at least `threshold`; the same
# cut-off is reused when predicting for a new job.
threshold = 0.35
y_pred = (y_pred_proba >= threshold).astype(int)

#evaluation
# With multilabel indicator targets, accuracy_score is subset accuracy: a row
# counts as correct only when every one of its skill labels matches.
subset_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", subset_accuracy)

# Per-skill precision/recall/F1; zero_division=0 reports 0 for undefined ratios.
report = classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0)
print("\nClassification Report:\n", report)

#prediction for new job
# Score a single new job title with the fitted pipeline.
new_job = ["Data Analyst"]
new_job_proba = model.predict_proba(new_job)

# Apply the same probability cut-off used for the test-set predictions.
meets_threshold = new_job_proba >= threshold
new_job_pred = meets_threshold.astype(int)

# Map the indicator row back to skill names: one tuple per input row.
predicted_skills = mlb.inverse_transform(new_job_pred)
print("\nPredicted skills for new job:", predicted_skills[0])

# Employee whose current skills are compared with the skills predicted for the new job.
employee_id = 5

# Find the employee's row position from the employee_id column instead of
# assuming "id - 1" is the position. With that arithmetic an id of 0 would
# silently pick the last employee's skills through negative indexing, and any
# reordering or filtering of `employees` would pair the wrong name and skills.
# list.index raises ValueError when the id is not present.
employee_pos = employees['employee_id'].tolist().index(employee_id)

# employee_skills is ordered like the rows of `employees`, so the same position applies.
current_skills = employee_skills[employee_pos]

#compare skills
# Predicted skills (first and only prediction row) that the employee does not have yet.
missing_skills = [skill for skill in predicted_skills[0] if skill not in current_skills]

print(f"\nEmployee: {employees['name'].iloc[employee_pos]}")
print("Current Skills:", current_skills)
print("Predicted Skills for New Job:", predicted_skills[0])
print("Missing Skills:", missing_skills)

#course recommendations
# Course catalogue keyed by skill name.
skill_to_courses = {
    'Python': [
        'Intro to Python',
        'Advanced Python Programming',
    ],
    'Data Visualization': [
        'Data Visualization with Tableau',
        'Matplotlib & Seaborn Masterclass',
    ],
    'Machine Learning': [
        'Machine Learning Basics',
        'Applied ML with Scikit-Learn',
    ],
    'Cybersecurity': [
        'Cybersecurity Fundamentals',
        'Network Security Essentials',
    ],
    'Recruitment': [
        'Recruitment Strategies',
        'HR Analytics',
    ],
}

# Flatten the courses for every missing skill, keeping the order of the
# skills; a skill with no catalogue entry contributes nothing.
recommended_courses = [
    course
    for skill in missing_skills
    for course in skill_to_courses.get(skill, [])
]

print("\nRecommended Courses for missing skills:", recommended_courses)

Accuracy: 0.7

Classification Report:
                     precision    recall  f1-score   support

     Cybersecurity       1.00      1.00      1.00         3
Data Visualization       0.25      1.00      0.40         1
  Machine Learning       1.00      1.00      1.00         4
            Python       0.57      1.00      0.73         4
       Recruitment       1.00      1.00      1.00         2

         micro avg       0.70      1.00      0.82        14
         macro avg       0.76      1.00      0.83        14
      weighted avg       0.82      1.00      0.88        14
       samples avg       0.80      1.00      0.85        14


Predicted skills for new job: ('Data Visualization', 'Python')

Employee: Laird Barron
Current Skills: ['Cybersecurity']
Predicted Skills for New Job: ('Data Visualization', 'Python')
Missing Skills: ['Data Visualization', 'Python']

Recommended Courses for missing skills: ['Data Visualization with Tableau', 'Matplotlib & Seaborn Masterclass', 'Intro to P