In [1]:
# MLflow settings handed to JobPrediction when the model is initialized
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "366b3b40b46344edb0a8d00c95a3884c"

# YAML file that maps each skills group to its member skills
CLUSTERS_YAML_PATH = "../Data/Processed/3_skills_clusters.yaml"

In [2]:
import sys
from pprint import pprint

import pandas as pd
import yaml  # used directly when loading the clusters file; do not rely on the wildcard import for it

# Make the repository root importable so the local `Scripts` package resolves
sys.path.append('../')
from Scripts.JobPrediciton import *

### Predict Job probabilities

In [3]:
# Initialize model
# JobPrediction is presumably provided by the wildcard import from
# Scripts.JobPrediciton; it receives the MLflow tracking URI, the run id and
# the skills-cluster YAML path defined in the configuration cell.
model = JobPrediction(MLFLOW_TRACKING_URI, MLFLOW_RUN_ID, CLUSTERS_YAML_PATH)

In [4]:
# Skills the user already has; every prediction below starts from this list
entry_skills = ["SQL", "Python", "Scikit-learn"]

### Recommend Additional Skills

In [6]:
# Candidate skills: everything the model knows about, minus the skills already held
all_skills = pd.Series(model.get_all_skills())
new_skills = all_skills.loc[~all_skills.isin(entry_skills)].copy()
new_skills

0                 Git
1           Mercurial
2                 SVN
3                 APL
4            Assembly
            ...      
197    skills_group_5
198    skills_group_6
199    skills_group_7
200    skills_group_8
201    skills_group_9
Length: 199, dtype: object

In [7]:
# Load skills clusters (mapping of skills_group_N -> list of member skills).
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# locale default (e.g. cp1252 on Windows).
with open(CLUSTERS_YAML_PATH, "r", encoding="utf-8") as handle:
    clusters_config = yaml.safe_load(handle)
pprint(clusters_config)

{'skills_group_0': ['Hugging Face Transformers',
                    'Keras',
                    'NumPy',
                    'Pandas',
                    'Scikit-learn',
                    'TensorFlow',
                    'Torch/PyTorch',
                    'IPython/Jupyter',
                    'Spyder'],
 'skills_group_1': ['PHP',
                    'MariaDB',
                    'MySQL',
                    'OVH',
                    'Drupal',
                    'Laravel',
                    'Symfony',
                    'jQuery',
                    'PhpStorm',
                    'Webstorm'],
 'skills_group_10': ['IBM DB2',
                     'Oracle',
                     'IBM Cloud or Watson',
                     'Oracle Cloud Infrastructure'],
 'skills_group_11': ['HTML/CSS',
                     'JavaScript',
                     'TypeScript',
                     'React.js',
                     'Yarn',
                     'npm',
                     'Visual Stu

In [16]:
# Baseline job probabilities for the current skills. Computed in this cell so
# the simulation runs top-to-bottom on a fresh kernel instead of depending on
# a variable that is only assigned in a later cell.
base_predictions = model.predict_job_probabilities(entry_skills)

simulated_results = []
for skill in new_skills:
    # Job probabilities if `skill` were added to the current skills
    additional_skill_prob = model.predict_job_probabilities([skill] + entry_skills)
    # Relative uplift versus the baseline (0.10 means +10 %). Jobs whose
    # baseline probability is 0 produce inf (or NaN for 0/0) here.
    additional_skill_uplift = (additional_skill_prob - base_predictions) / base_predictions
    # The Series name becomes the row label once the list is turned into a DataFrame
    additional_skill_uplift.name = skill
    simulated_results.append(additional_skill_uplift)

In [17]:
# Stack the per-skill uplift Series into one table:
# one row per candidate skill, one column per job title
simulated_results = pd.DataFrame(data=simulated_results)
simulated_results

Unnamed: 0,Data scientist or machine learning specialist,"Engineer, data",Data or business analyst,"Developer, back-end",Database administrator,"Developer, mobile","Developer, full-stack",Cloud infrastructure engineer,"Developer, embedded applications or devices","Developer, QA or test",System administrator,Scientist,Security professional,"Developer, game or graphics","Developer, front-end",Blockchain,"Developer, desktop or enterprise applications",DevOps specialist,Academic researcher
Git,0.179572,0.002782,-1.388673e-01,0.654109,2.100855e-01,0.501414,0.184250,inf,-5.650801e-02,0.207404,-0.283728,2.545941e-01,0.232797,0.435108,0.588974,1.367865,-3.111922e-01,,0.039384
Mercurial,-0.165168,-0.310486,-2.672038e-02,0.383328,6.581483e-01,-0.326419,0.209347,inf,-6.773337e-02,0.623680,0.170945,-7.666203e-02,0.002979,0.797706,0.665235,0.584547,1.745232e-02,inf,0.031935
SVN,-0.237851,0.025122,6.943206e-02,0.092100,2.209932e-01,-0.597305,-0.098464,,3.802746e-01,0.348559,-0.028921,-9.531162e-02,-0.977758,-0.300414,-1.000000,0.015350,7.013329e-01,,-0.165841
APL,-0.169678,-0.318262,5.087400e-02,0.373371,6.536191e-01,-0.331270,0.121382,inf,2.716302e-01,0.419967,0.191358,-1.731676e-01,0.025221,1.128070,0.665235,0.584547,-1.016931e-01,,0.100633
Assembly,-0.330466,-0.331770,-2.655598e-01,0.143491,-2.144591e-01,-0.053182,-0.044873,,1.211117e+00,0.626107,-0.205833,-1.116538e-01,0.011114,1.850286,2.073243,1.007848,6.912335e-01,,0.156043
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
skills_group_5,0.000000,0.000000,0.000000e+00,0.000000,-1.417336e-16,0.000000,0.000000,,1.430302e-16,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,-1.465204e-16,,0.000000
skills_group_6,0.000000,0.000000,-1.210319e-16,0.000000,-1.417336e-16,0.000000,0.000000,,1.430302e-16,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,-1.465204e-16,,0.000000
skills_group_7,0.000000,0.000000,0.000000e+00,0.000000,-1.417336e-16,0.000000,0.000000,,1.430302e-16,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,0.000000,-1.465204e-16,,0.000000
skills_group_8,0.000000,0.000000,0.000000e+00,0.000000,-1.417336e-16,0.000000,0.000000,,1.430302e-16,0.000000,0.000000,-2.786818e-16,0.000000,0.000000,0.000000,0.000000,-1.465204e-16,,0.000000


In [18]:
# Job title whose probability the recommended skills should increase
target_job = "Data scientist or machine learning specialist"

In [19]:
# Uplift of every candidate skill for the target job, largest first
target_results = simulated_results.loc[:, target_job].sort_values(ascending=False)
target_results.head(30)

PyCharm                      0.825358
IPython/Jupyter              0.477927
Hadoop                       0.419609
Apache Spark                 0.304274
R                            0.250101
Torch/PyTorch                0.241399
TensorFlow                   0.213591
Tidyverse                    0.210361
Hugging Face Transformers    0.199043
Play Framework               0.197466
Django                       0.190382
Git                          0.179572
Apache Kafka                 0.176673
Cassandra                    0.164549
FastAPI                      0.149674
Flask                        0.128673
Keras                        0.127764
Docker                       0.126407
Spyder                       0.124296
Pulumi                       0.116398
Pandas                       0.107544
Neo4j                        0.096752
RStudio                      0.054669
Electron                     0.017742
SAS                          0.001598
skills_group_7               0.000000
skills_group

In [20]:
# Minimum relative uplift (0.10 = +10 %) a skill needs to be recommended
threshold = 0.10
recommendations = target_results.loc[target_results.gt(threshold)].index.tolist()

In [21]:
# Plain-text summary of the inputs and the resulting recommendation
print(f"Your current skills: {entry_skills}")
print(f"Your target job: {target_job}")
print(f"You might also consider learning: {recommendations}")

Your current skills: ['SQL', 'Python', 'Scikit-learn']
Your target job: Data scientist or machine learning specialist
You might also consider learning: ['PyCharm', 'IPython/Jupyter', 'Hadoop', 'Apache Spark', 'R', 'Torch/PyTorch', 'TensorFlow', 'Tidyverse', 'Hugging Face Transformers', 'Play Framework', 'Django', 'Git', 'Apache Kafka', 'Cassandra', 'FastAPI', 'Flask', 'Keras', 'Docker', 'Spyder', 'Pulumi', 'Pandas']


________________
### Test Function

In [5]:
# predict job probabilities
# Baseline job probabilities for the current `entry_skills`, displayed from
# most to least likely. `base_predictions` is also the reference used by the
# skill-uplift simulation earlier in the notebook.
base_predictions = model.predict_job_probabilities(entry_skills)
base_predictions.sort_values(ascending=False)

Data scientist or machine learning specialist    0.231368
Data or business analyst                         0.229324
Scientist                                        0.199192
Engineer, data                                   0.138588
Academic researcher                              0.121054
Developer, back-end                              0.100103
Developer, desktop or enterprise applications    0.094716
System administrator                             0.087097
Developer, full-stack                            0.067761
Database administrator                           0.048957
Developer, embedded applications or devices      0.048513
Developer, QA or test                            0.043565
Developer, mobile                                0.039067
Security professional                            0.030201
Developer, game or graphics                      0.024912
Blockchain                                       0.008207
Developer, front-end                             0.008039
Cloud infrastr

In [6]:
# Ask the model helper for the recommendation directly, using the same
# target job and 0.1 uplift threshold as the manual walkthrough
skills_recommended = model.recommend_new_skills(
    entry_skills,
    target_job='Data scientist or machine learning specialist',
    threshold=0.1,
)

In [None]:
# Display the helper's result: recommended skills with their uplift for the target job
skills_recommended

PyCharm                      0.825358
IPython/Jupyter              0.477927
Hadoop                       0.419609
Apache Spark                 0.304274
R                            0.250101
Torch/PyTorch                0.241399
TensorFlow                   0.213591
Tidyverse                    0.210361
Hugging Face Transformers    0.199043
Play Framework               0.197466
Django                       0.190382
Git                          0.179572
Apache Kafka                 0.176673
Cassandra                    0.164549
FastAPI                      0.149674
Flask                        0.128673
Keras                        0.127764
Docker                       0.126407
Spyder                       0.124296
Pulumi                       0.116398
Pandas                       0.107544
Name: Data scientist or machine learning specialist, dtype: float64