In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Toy employee roster used as the training corpus.
# Each record is (employee_id, name, job_title, department, location).
employees = pd.DataFrame(
    [
        (1, 'Bob Gray', 'Data Analyst', 'Analytics', 'Sandy, UT'),
        (2, 'Thomas Pynchon', 'Senior AI Skills Analyst', 'AI Strategy', 'Sandy, UT'),
        (3, 'Don DeliLlo', 'L&D Specialist', 'Learning', 'Salt Lake City, UT'),
        (4, 'Cormac McCarthy', 'AI Engineer', 'AI Strategy', 'Boston, MA'),
        (5, 'Laird Barron', 'Cybersecurity Specialist', 'IT', 'Seattle, WA'),
        (6, 'Kelly Link', 'HR Manager', 'Human Resources', 'Chicago, IL'),
        (7, 'Virginia Woolf', 'Data Scientist', 'Analytics', 'New York, NY'),
        (8, 'David Wallace', 'BI Analyst', 'Analytics', 'Denver, CO'),
        (9, 'Thomas Ligotti', 'Security Analyst', 'IT', 'Austin, TX'),
        (10, 'Philip Roth', 'Recruitment Lead', 'Human Resources', 'Phoenix, AZ'),
    ],
    columns=['employee_id', 'name', 'job_title', 'department', 'location'],
)

# Skill labels, positionally aligned with the rows of `employees`
# (entry i belongs to the employee in row i).
employee_skills = [
    ['Python', 'Data Visualization'],   # employee 1
    ['Machine Learning', 'Python'],     # employee 2
    ['Data Visualization'],             # employee 3
    ['Machine Learning', 'Python'],     # employee 4
    ['Cybersecurity'],                  # employee 5
    ['Recruitment'],                    # employee 6
    ['Python', 'Machine Learning'],     # employee 7
    ['Data Visualization', 'Python'],   # employee 8
    ['Cybersecurity', 'Python'],        # employee 9
    ['Recruitment'],                    # employee 10
]

# Modelling table: one free-text feature string per employee
# ("<job_title> <department> <location>") plus that employee's skill list.
data = pd.DataFrame({
    'features': employees['job_title'].str.cat(
        [employees['department'], employees['location']], sep=" "
    ),
    'skills': employee_skills,
})

# Encode each employee's skill list as a row of 0/1 indicators,
# one column per distinct skill (column order is given by mlb.classes_).
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(data['skills'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data['features'],
    y,
    test_size=0.3,
    random_state=42,
)

# TF-IDF over the feature text, followed by one binary
# logistic-regression model per skill (one-vs-rest).
model = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', OneVsRestClassifier(LogisticRegression(max_iter=1000))),
])

# Fit the vectorizer and the per-skill classifiers on the training rows.
model.fit(X_train, y_train)

# Decision threshold, lowered from 0.5 to 0.3: a skill is predicted
# whenever its estimated probability reaches this value.
threshold = 0.3

# Per-skill probabilities for the held-out rows, binarized at the threshold.
y_pred_proba = model.predict_proba(X_test)
y_pred = (y_pred_proba >= threshold).astype(int)

# Evaluation: subset accuracy requires a row's full label set to match exactly;
# the report breaks precision/recall/F1 down per skill.
print("Accuracy (subset):", accuracy_score(y_test, y_pred))
report = classification_report(
    y_test,
    y_pred,
    target_names=mlb.classes_,
    zero_division=0,
)
print("\nClassification Report:\n", report)

# Score an unseen job. Only a job title is supplied here, whereas the
# training text was "<job_title> <department> <location>".
new_job = ["Data Analyst"]
new_job_proba = model.predict_proba(new_job)

# Apply the same probability threshold used for evaluation.
new_job_pred = (new_job_proba >= threshold).astype(int)

# Map the 0/1 indicator row back to skill names (one tuple per input row).
predicted_skills = mlb.inverse_transform(new_job_pred)

print("\nPredicted Skills for new job:", predicted_skills[0])

# Course recommendations
# Catalogue of courses offered for each skill label.
skill_to_courses = {
    'Python': ['Intro to Python', 'Advanced Python Programming'],
    'Data Visualization': ['Data Visualization with Tableau', 'Matplotlib & Seaborn Masterclass'],
    'Machine Learning': ['Machine Learning Basics', 'Applied ML with Scikit-Learn'],
    'Cybersecurity': ['Cybersecurity Fundamentals', 'Network Security Essentials'],
    'Recruitment': ['Recruitment Strategies', 'HR Analytics'],
}

# Flatten the course lists of every predicted skill, keeping skill order;
# skills missing from the catalogue contribute no courses.
recommended_courses = [
    course
    for skill in predicted_skills[0]
    for course in skill_to_courses.get(skill, [])
]

print("\nRecommended Courses:", recommended_courses)

Accuracy (subset): 0.0

Classification Report:
                     precision    recall  f1-score   support

     Cybersecurity       0.00      0.00      0.00         1
Data Visualization       0.00      0.00      0.00         0
  Machine Learning       1.00      1.00      1.00         1
            Python       0.67      1.00      0.80         2
       Recruitment       0.00      0.00      0.00         1

         micro avg       0.43      0.60      0.50         5
         macro avg       0.33      0.40      0.36         5
      weighted avg       0.47      0.60      0.52         5
       samples avg       0.39      0.50      0.43         5


Predicted Skills for new job: ('Data Visualization', 'Python')

Recommended Courses: ['Data Visualization with Tableau', 'Matplotlib & Seaborn Masterclass', 'Intro to Python', 'Advanced Python Programming']


Combines the employee records and each employee's skill list into one dataset (no proficiency data is used in this cell).
Uses job title, department, and location as text features.
Applies TF-IDF vectorization to the text and trains a one-vs-rest Logistic Regression classifier (one binary model per skill) for multi-label prediction.
Outputs subset (exact-match) accuracy and a per-skill classification report, using a 0.3 probability threshold.
Predicts the set of skills for a new job title and recommends courses for each predicted skill.