In [1]:
# MLflow tracking store (path relative to this notebook) and the ID of the run
# whose logged artifacts are loaded below.
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_RUN_ID = "a8fb8540585f471ab7bf110e5a28229a"

# File names of the pickled artifacts inside the run's artifact directory.
# NOTE(review): LOG_METRIC_PKL is not read by the cells visible in this section.
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRIC_PKL = "metrics.pkl"

# YAML file mapping each skills-cluster name to the list of skills it contains.
CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [2]:
import pandas as pd

import os 
import pickle
import yaml

import mlflow
from mlflow.tracking import MlflowClient

---
# Initialize:
## MLflow:

In [3]:
# Initialize MLflow: point the tracking URI at the local store, then create a client.
# NOTE(review): `client` is not referenced by the cells visible in this section.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

In [4]:
# Look up the logged run by its ID and keep the URI under which its artifacts are stored.
run = mlflow.get_run(MLFLOW_RUN_ID)
artifacts_path = run.info.artifact_uri

In [5]:
# Display the artifact location resolved for the run.
artifacts_path

'../models/mlruns/0/a8fb8540585f471ab7bf110e5a28229a/artifacts'

### Load artifacts

In [6]:
# Load the pickled model bundle from the run's artifact directory.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts from a trusted run.
# NOTE(review): os.path.join/open assume artifacts_path is a plain filesystem path
# (not e.g. a file:// URI) — confirm if the tracking store ever changes.
model_path = os.path.join(artifacts_path, LOG_MODEL_PKL)
with open(model_path, 'rb') as file:
    model = pickle.load(file)

In [11]:
# Inspect which entries the loaded model bundle provides.
model.keys()

dict_keys(['model_description', 'model_details', 'model_object'])

In [8]:
# Load the pickled data description from the run's artifact directory.
# NOTE(review): pickle.load can execute arbitrary code; only load artifacts from a trusted run.
data_path = os.path.join(artifacts_path, LOG_DATA_PKL)
with open(data_path, 'rb') as file:
    data = pickle.load(file)

In [9]:
# Inspect which entries the loaded data bundle provides.
data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [10]:
# Pull out what the prediction steps need from the loaded artifacts:
# the feature and target names (as pandas Series) and the model object itself.
features_names, targets_names = (
    pd.Series(data[names_key]) for names_key in ("features_names", "targets_names")
)
classifier = model["model_object"]

---
## Load skills clusters

In [22]:
# Load the skills clusters: a mapping of cluster name -> list of skills.
with open(CLUSTERS_YAML_PATH, 'r') as stream:
    cluster_config = yaml.safe_load(stream)

# Print one line per cluster. A plain loop is used on purpose: wrapping print()
# calls in a printed list comprehension would also emit a trailing list of None.
for cluster_name, cluster_skills in cluster_config.items():
    print(cluster_name, cluster_skills)

skills_group_0 ['PHP', 'SQL', 'MariaDB', 'MySQL', 'SQLite', 'Drupal', 'Laravel', 'Symfony', 'Vue.js']
skills_group_1 ['Scala', 'Cassandra', 'Couchbase', 'Apache Spark', 'Hadoop']
skills_group_10 ['Python', 'Django', 'FastAPI', 'Flask']
skills_group_11 ['Assembly', 'C', 'C++', 'Qt']
skills_group_12 ['Julia', 'Matlab', 'R', 'NumPy', 'Pandas']
skills_group_13 ['Svelte', 'Deno']
skills_group_14 ['Ruby', 'Ruby on Rails']
skills_group_15 ['APL', 'COBOL', 'Crystal', 'Delphi', 'IBM DB2']
skills_group_16 ['HTML/CSS', 'JavaScript', 'jQuery']
skills_group_17 ['Keras', 'TensorFlow', 'Torch/PyTorch']
skills_group_18 ['Bash/Shell', 'Perl']
skills_group_19 ['Objective-C', 'Swift']
skills_group_2 ['DynamoDB', 'Ansible', 'Chef', 'Flow', 'Pulumi', 'Puppet', 'Terraform']
skills_group_20 ['Elixir', 'Erlang']
skills_group_21 ['TypeScript', 'Angular', 'Angular.js', 'Cordova']
skills_group_3 ['Node.js', 'MongoDB', 'Express', 'Gatsby', 'React.js', 'React Native', 'Yarn']
skills_group_4 ['Go', 'Elasticsearch',

In [24]:
# Flatten the {cluster_name: [skills]} mapping into one (cluster_name, skill) row per skill.
molten_clusters = [
    (cluster_name, cluster_skill)
    for cluster_name in cluster_config
    for cluster_skill in cluster_config[cluster_name]
]

clusters_df = pd.DataFrame(data=molten_clusters, columns=["cluster_name", "skill"])
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,PHP
1,skills_group_0,SQL
2,skills_group_0,MariaDB
3,skills_group_0,MySQL
4,skills_group_0,SQLite
...,...,...
92,skills_group_8,Haskell
93,skills_group_8,LISP
94,skills_group_8,Rust
95,skills_group_9,Unity 3D


## Predict sample entry

In [65]:
# Skills of the sample entry to score; swap in others (e.g. "Hadoop", "Python") to experiment.
sample_skills = [
    "Unreal Engine",
    "Unity 3D",
]

In [66]:
# Verify that these skills exist as features.
# A skill that is missing from features_names matches nothing in the cluster
# and one-hot encoding steps below, so it would be silently ignored; fail
# loudly here instead of scoring an incomplete profile.
skills_known = pd.Series(sample_skills).isin(features_names)
unknown_skills = [skill for skill, known in zip(sample_skills, skills_known) if not known]
assert not unknown_skills, f"Skills not found in features_names: {unknown_skills}"
skills_known

0    True
1    True
dtype: bool

### 1. Recreate cluster features

In [67]:
# Work on a new frame (clusters_df itself is left untouched) and flag every
# (cluster, skill) row whose skill is one of the sample's skills.
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df["skill"].isin(sample_skills)
)
sample_clusters

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group_0,PHP,False
1,skills_group_0,SQL,False
2,skills_group_0,MariaDB,False
3,skills_group_0,MySQL,False
4,skills_group_0,SQLite,False
...,...,...,...
92,skills_group_8,Haskell,False
93,skills_group_8,LISP,False
94,skills_group_8,Rust,False
95,skills_group_9,Unity 3D,True


In [68]:
# Count, per cluster, how many of its skills appear in the sample
# (summing the boolean flags yields an integer count per cluster_name).
cluster_features = (
    sample_clusters["sample_skills"]
    .groupby(sample_clusters["cluster_name"])
    .sum()
)
cluster_features

cluster_name
skills_group_0     0
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    0
skills_group_15    0
skills_group_16    0
skills_group_17    0
skills_group_18    0
skills_group_19    0
skills_group_2     0
skills_group_20    0
skills_group_21    0
skills_group_3     0
skills_group_4     0
skills_group_5     0
skills_group_6     0
skills_group_7     0
skills_group_8     0
skills_group_9     2
Name: sample_skills, dtype: int64

### 2. Create OneHotEncoded skills:

In [69]:
# Keep only the individual-skill features: every entry of features_names
# that is not one of the cluster feature names.
skills_names = features_names.loc[
    lambda names: ~names.isin(cluster_features.index)
]
skills_names

0                     APL
1                Assembly
2              Bash/Shell
3                       C
4                      C#
              ...        
120                   Vim
121         Visual Studio
122    Visual Studio Code
123              Webstorm
124                 Xcode
Length: 125, dtype: object

In [70]:
# Build the one-hot vector over skills_names:
# 1 where the skill is among the sample's skills, 0 everywhere else.
ohe_skills = pd.Series(
    data=[int(is_selected) for is_selected in skills_names.isin(sample_skills)],
    index=skills_names,
)
ohe_skills

APL                   0
Assembly              0
Bash/Shell            0
C                     0
C#                    0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 125, dtype: int64

### 3. Combine features

In [71]:
# Concatenate the one-hot skill flags and the per-cluster counts into a single Series.
features = pd.concat([ohe_skills, cluster_features])

In [72]:
# Reorder the entries of the combined Series to follow features_names
# (presumably the feature order used when the model was trained — TODO confirm).
# .loc with a list-like of labels raises KeyError if any expected name is missing.
features = features.loc[features_names]
features

APL               0
Assembly          0
Bash/Shell        0
C                 0
C#                0
                 ..
skills_group_5    0
skills_group_6    0
skills_group_7    0
skills_group_8    0
skills_group_9    2
Length: 147, dtype: int64

### 4. Predict

In [73]:
# Score the single sample: shape the feature vector as one row (1 x n_features)
# and ask the classifier for per-target class probabilities.
predictions = classifier.predict_proba(features.to_numpy().reshape(1, -1))
predictions

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.0s finished


[array([[0.912, 0.088]]),
 array([[0.952, 0.048]]),
 array([[0.972, 0.028]]),
 array([[0.996, 0.004]]),
 array([[1., 0.]]),
 array([[0.9648, 0.0352]]),
 array([[0.942, 0.058]]),
 array([[0.874, 0.126]]),
 array([[0.98, 0.02]]),
 array([[0.828, 0.172]]),
 array([[0.872, 0.128]]),
 array([[0.4, 0.6]]),
 array([[0.844, 0.156]]),
 array([[0.932, 0.068]]),
 array([[0.9112, 0.0888]]),
 array([[0.94, 0.06]])]

In [74]:
# Show the probabilities sorted in descending order and indexed by target name.
# For each target, take row 0 (the only sample) and column 1 of its probability
# array — presumably the positive class; confirm against the classifier's classes_.
positive_probs = [target_probs[0][1] for target_probs in predictions]
pd.Series(data=positive_probs, index=targets_names).sort_values(ascending=False)

Developer, game or graphics                      0.6000
Developer, front-end                             0.1720
Developer, mobile                                0.1560
Developer, full-stack                            0.1280
Developer, desktop or enterprise applications    0.1260
Scientist                                        0.0888
Academic researcher                              0.0880
Engineer, data                                   0.0680
System administrator                             0.0600
Developer, back-end                              0.0580
Data or business analyst                         0.0480
Developer, QA or test                            0.0352
Data scientist or machine learning specialist    0.0280
Developer, embedded applications or devices      0.0200
Database administrator                           0.0040
DevOps specialist                                0.0000
dtype: float64