In [1]:
# Constants

# MLflow tracking store location and the id of the run to inspect
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "366b3b40b46344edb0a8d00c95a3884c"

# Pickle file names (LOG_METRICS_PKL is not read in the cells shown here)
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# YAML file holding the skills clusters
CLUSTERS_YAML_PATH = "../Data/Processed/3_skills_clusters.yaml"

In [2]:
#Load Packages
import os
import pickle
from pprint import pprint, PrettyPrinter
from urllib.parse import urlparse
from urllib.request import url2pathname

import mlflow
import pandas as pd
import sklearn
import yaml
from mlflow.tracking import MlflowClient

____

### Initialize MLflow

In [3]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

run = mlflow.get_run(MLFLOW_RUN_ID)

# Convert the run's artifact URI into a local filesystem path.
# Stripping the literal "file:///" prefix only works for Windows drive paths:
# on POSIX it removes the leading "/" (absolute path becomes relative), and it
# never decodes percent-escapes such as "%20". Parse the URI properly instead.
artifact_uri = run.info.artifact_uri
parsed_artifact_uri = urlparse(artifact_uri)
if parsed_artifact_uri.scheme == "file":
    artifact_path = url2pathname(parsed_artifact_uri.path)
else:
    # Not a file:// URI (e.g. already a plain local path): use it unchanged.
    artifact_path = artifact_uri

### Load Data and Model

In [4]:
# Load the pickled model artifact from the run's artifact directory.
# NOTE: pickle.load can execute arbitrary code; only load artifacts you trust.
model_path = os.path.join(artifact_path, LOG_MODEL_PKL)
with open(model_path, mode="rb") as model_file:
    model = pickle.load(model_file)

model

{'model_description': 'Random Forest, Tuned, multilabel, Data resampled',
 'model_details': "Pipeline(steps=[('randomforestclassifier',\n                 RandomForestClassifier(class_weight='balanced', max_depth=40,\n                                        max_features='log2',\n                                        min_samples_split=3, n_estimators=120,\n                                        n_jobs=-1, random_state=42))])",
 'model_object': Pipeline(steps=[('randomforestclassifier',
                  RandomForestClassifier(class_weight='balanced', max_depth=40,
                                         max_features='log2',
                                         min_samples_split=3, n_estimators=120,
                                         n_jobs=-1, random_state=42))])}

In [5]:
# Load the pickled data artifact from the run's artifact directory.
# NOTE: pickle.load can execute arbitrary code; only load artifacts you trust.
data_path = os.path.join(artifact_path, LOG_DATA_PKL)
with open(data_path, mode="rb") as data_file:
    data = pickle.load(data_file)

data.keys()

dict_keys(['data_path', 'training_set', 'test_indices', 'features_names', 'targets_names'])

In [6]:
# Get the classifier, feature names, and target names
classifier = model["model_object"]
features_names = pd.Series(data["features_names"])
targets_names = pd.Series(data["targets_names"])

In [7]:
# Display the classifier object taken from the loaded model artifact
classifier

____

### Load Skills Clusters

In [8]:
# Read the cluster-name -> skills mapping from the YAML file and show it
with open(CLUSTERS_YAML_PATH, mode="r") as clusters_file:
    clusters_config = yaml.safe_load(clusters_file)

pprint(clusters_config)

{'skills_group_0': ['Hugging Face Transformers',
                    'Keras',
                    'NumPy',
                    'Pandas',
                    'Scikit-learn',
                    'TensorFlow',
                    'Torch/PyTorch',
                    'IPython/Jupyter',
                    'Spyder'],
 'skills_group_1': ['PHP',
                    'MariaDB',
                    'MySQL',
                    'OVH',
                    'Drupal',
                    'Laravel',
                    'Symfony',
                    'jQuery',
                    'PhpStorm',
                    'Webstorm'],
 'skills_group_10': ['IBM DB2',
                     'Oracle',
                     'IBM Cloud or Watson',
                     'Oracle Cloud Infrastructure'],
 'skills_group_11': ['HTML/CSS',
                     'JavaScript',
                     'TypeScript',
                     'React.js',
                     'Yarn',
                     'npm',
                     'Visual Stu

In [9]:
# Flatten the {cluster: [skills]} mapping into one (cluster, skill) row per skill
molten_clusters = [
    (name, skill)
    for name, skills in clusters_config.items()
    for skill in skills
]

clusters_df = pd.DataFrame(data=molten_clusters, columns=["cluster_name", "skill"])
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,Hugging Face Transformers
1,skills_group_0,Keras
2,skills_group_0,NumPy
3,skills_group_0,Pandas
4,skills_group_0,Scikit-learn
...,...,...
159,skills_group_8,SAS
160,skills_group_9,Julia
161,skills_group_9,R
162,skills_group_9,Tidyverse


 _____

### Create a Sample Entry

In [10]:
# Example skill set used as the sample entry for the rest of the notebook
sample_skills = [
    "NumPy",
    "Pandas",
    "Python",
    "TensorFlow",
    "Torch/PyTorch",
    "Git",
    "Docker",
    "Scikit-learn",
    "SQL",
]

In [11]:
# Verify: flag, per sample skill, whether it appears among the feature names
sample_skills_series = pd.Series(sample_skills)
sample_skills_series.isin(features_names)

0    True
1    True
2    True
3    True
4    True
5    True
6    True
7    True
8    True
dtype: bool

### Recreate Cluster Features

In [12]:
# Mark which clustered skills are present in the sample entry
# (assign returns a new frame, so clusters_df itself is left untouched)
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df["skill"].isin(sample_skills)
)
sample_clusters

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group_0,Hugging Face Transformers,False
1,skills_group_0,Keras,False
2,skills_group_0,NumPy,True
3,skills_group_0,Pandas,True
4,skills_group_0,Scikit-learn,True
...,...,...,...
159,skills_group_8,SAS,False
160,skills_group_9,Julia,False
161,skills_group_9,R,False
162,skills_group_9,Tidyverse,False


In [13]:
# Count, per cluster, how many of its skills are in the sample entry
skills_by_cluster = sample_clusters.groupby("cluster_name")["sample_skills"]
cluster_features = skills_by_cluster.sum()
cluster_features

cluster_name
skills_group_0     5
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    2
skills_group_15    0
skills_group_16    0
skills_group_17    0
skills_group_18    0
skills_group_19    0
skills_group_2     0
skills_group_20    0
skills_group_21    0
skills_group_22    0
skills_group_23    0
skills_group_24    1
skills_group_25    0
skills_group_26    0
skills_group_27    0
skills_group_28    0
skills_group_29    0
skills_group_3     0
skills_group_30    0
skills_group_31    0
skills_group_32    0
skills_group_33    0
skills_group_34    0
skills_group_35    0
skills_group_36    0
skills_group_37    0
skills_group_4     0
skills_group_5     1
skills_group_6     0
skills_group_7     0
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

### Create OneHotEncoded Skills

In [14]:
# Feature names that are not cluster features, i.e. the individual skills
is_cluster_feature = features_names.isin(cluster_features.index)
skills_names = features_names[~is_cluster_feature]
skills_names

0                     Git
1               Mercurial
2                     SVN
3                     APL
4                Assembly
              ...        
159                   Vim
160         Visual Studio
161    Visual Studio Code
162              Webstorm
163                 Xcode
Length: 164, dtype: object

In [15]:
# One-hot encode the sample: 1 if the skill is in sample_skills, else 0
skill_flags = skills_names.isin(sample_skills).astype(int)
ohe_skills = pd.Series(skill_flags.tolist(), index=skills_names)
ohe_skills

Git                   1
Mercurial             0
SVN                   0
APL                   0
Assembly              0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 164, dtype: int64

### Combine Features

In [16]:
# Stack the one-hot skills and the per-cluster counts into one Series
feature_parts = [ohe_skills, cluster_features]
features = pd.concat(feature_parts)
features

Git               1
Mercurial         0
SVN               0
APL               0
Assembly          0
                 ..
skills_group_5    1
skills_group_6    0
skills_group_7    0
skills_group_8    0
skills_group_9    0
Length: 202, dtype: int64

In [17]:
# Sort columns: order the features as listed in features_names and wrap them
# in a one-row DataFrame.
# The ordered Series is rebuilt from ohe_skills and cluster_features instead of
# rebinding `features` from a Series to a DataFrame in place; that way the cell
# can be re-run without `.loc[features_names]` hitting an already-converted frame.
features_ordered = pd.concat([ohe_skills, cluster_features]).loc[features_names]
features = pd.DataFrame([features_ordered.values], columns=features_ordered.index)
features

Unnamed: 0,Git,Mercurial,SVN,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,...,skills_group_34,skills_group_35,skills_group_36,skills_group_37,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


### Job Prediction

In [18]:
# Predict class probabilities for the single-row feature frame.
# In the recorded output this is a list with one (1, 2) array per target.
predictions = classifier.predict_proba(features)
predictions

[array([[0.49871374, 0.50128626]]),
 array([[0.88151233, 0.11848767]]),
 array([[0.86751218, 0.13248782]]),
 array([[0.93592567, 0.06407433]]),
 array([[0.97730389, 0.02269611]]),
 array([[1., 0.]]),
 array([[0.99391257, 0.00608743]]),
 array([[1., 0.]]),
 array([[0.98350519, 0.01649481]]),
 array([[0.99170629, 0.00829371]]),
 array([[0.96443279, 0.03556721]]),
 array([[0.66255433, 0.33744567]]),
 array([[0.98364706, 0.01635294]]),
 array([[1., 0.]]),
 array([[0.979168, 0.020832]]),
 array([[0.99180332, 0.00819668]]),
 array([[0.96904925, 0.03095075]]),
 array([[0.98605316, 0.01394684]]),
 array([[0.59448137, 0.40551863]])]

In [19]:
# Keep, for each target, the second probability of the single sample row
# (treated as the positive-class probability), then rank targets by it
positive_probs = [target_probs[0][1] for target_probs in predictions]
job_probabilities = pd.Series(positive_probs, index=targets_names)
job_probabilities.sort_values(ascending=False)

Data scientist or machine learning specialist    0.501286
Academic researcher                              0.405519
Scientist                                        0.337446
Data or business analyst                         0.132488
Engineer, data                                   0.118488
Developer, back-end                              0.064074
System administrator                             0.035567
Developer, desktop or enterprise applications    0.030951
Database administrator                           0.022696
Developer, front-end                             0.020832
Developer, embedded applications or devices      0.016495
Security professional                            0.016353
DevOps specialist                                0.013947
Developer, QA or test                            0.008294
Blockchain                                       0.008197
Developer, full-stack                            0.006087
Cloud infrastructure engineer                    0.000000
Developer, mob