In [1]:
# --- Which MLflow run to read ---------------------------------------------
# These three values are joined into the on-disk artifact paths further down.
MLFLOW_TRACKING_URI = "../models/mlruns"
EXP_ID = "117654367148467074"
MLFLOW_RUN_ID = "d289e1d736954b17addda6c390b2f540"

# --- File names of the artifacts logged with that run ----------------------
LOG_MODEL_PKL = "model.pkl"
LOG_DATA_PKL = "data.pkl"
LOG_METRIC_PKL = "metrics.model"  # NOTE(review): not referenced by the cells visible here

# --- Skill-cluster definitions (YAML) --------------------------------------
CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [2]:
import os 
import sklearn
import pickle
import yaml
import pandas as pd
import mlflow
from mlflow.tracking import MlflowClient
from urllib.parse import urlparse

In [3]:
# Initialize MLflow: point the tracking API at the store given by MLFLOW_TRACKING_URI
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()  # NOTE(review): not used by the cells visible here

# Fetch the logged run by its id and keep the URI of its artifact location
run = mlflow.get_run(MLFLOW_RUN_ID)
artificats_path  = run.info.artifact_uri  # NOTE(review): name is misspelled ("artifacts") and is not used by the visible cells, which build artifact paths by hand


  return FileStore(store_uri, store_uri)


In [4]:
# On-disk location of the pickled model: <tracking dir>/<experiment>/<run>/artifacts/<file>
model_path = os.path.join(
    os.path.join(MLFLOW_TRACKING_URI, EXP_ID, MLFLOW_RUN_ID, "artifacts"),
    LOG_MODEL_PKL,
)

In [None]:
# Load the pickled model artifact from model_path.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load artifacts from a trusted run.
with open(model_path, "rb") as f:
    model = pickle.load(f)

# Display what was loaded (the recorded output shows a dict holding a description, details and the pipeline object)
model

{'model_description': 'Random Forest: with PCA - Basic',
 'model_details': "Pipeline(steps=[('robustscaler', RobustScaler()),\n                ('pca', PCA(n_components=0.95)),\n                ('randomforestclassifier',\n                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])",
 'model_object': Pipeline(steps=[('robustscaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95)),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])}

In [7]:
# Load data: on-disk location of the pickled data description,
# <tracking dir>/<experiment>/<run>/artifacts/<file>
data_path = os.path.join(
    os.path.join(MLFLOW_TRACKING_URI, EXP_ID, MLFLOW_RUN_ID, "artifacts"),
    LOG_DATA_PKL,
)

In [8]:
# Load the pickled data-description artifact from data_path.
# NOTE: pickle.load can execute arbitrary code while deserializing; only load artifacts from a trusted run.
with open(data_path, 'rb') as handle:
    data = pickle.load(handle)

# Show which entries the loaded dict carries
data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [9]:
# Unpack the pieces needed for inference
features_names = pd.Series(data["features_names"])  # feature labels; used later to select and order the input vector
targets_names  = pd.Series(data['targets_names'])  # target labels; used later to label the predicted probabilities
classifier = model['model_object']  # the Pipeline object stored under 'model_object' in the loaded model dict

In [10]:
# Display the unpacked pipeline object
classifier

In [11]:
# Load the skill-cluster definitions (mapping of cluster name -> list of skills).
# Read the file explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can mis-decode or reject non-ASCII text.
with open(CLUSTERS_YAML_PATH, 'r', encoding='utf-8') as stream:
    # safe_load only builds plain Python objects (no arbitrary object construction)
    clusters_config = yaml.safe_load(stream)
clusters_config

{'skills_group_0': ['Go',
  'DynamoDB',
  'PostgreSQL',
  'AWS',
  'Ansible',
  'Chef',
  'Docker',
  'Flow',
  'Git',
  'Kubernetes',
  'Pulumi',
  'Puppet',
  'Terraform'],
 'skills_group_1': ['Scala',
  'Cassandra',
  'Couchbase',
  'Elasticsearch',
  'Redis',
  'DigitalOcean',
  'Apache Spark',
  'Hadoop'],
 'skills_group_10': ['Dart',
  'Kotlin',
  'Firebase',
  'SQLite',
  'Google Cloud Platform',
  'Flutter',
  'Android Studio'],
 'skills_group_11': ['Assembly', 'C', 'C++', 'Qt'],
 'skills_group_12': ['Node.js',
  'MongoDB',
  'Heroku',
  'Express',
  'Gatsby',
  'React.js',
  'React Native',
  'Yarn'],
 'skills_group_13': ['Unity 3D', 'Unreal Engine'],
 'skills_group_14': ['Ruby', 'Ruby on Rails', 'RubyMine', 'TextMate'],
 'skills_group_15': ['Svelte', 'Deno'],
 'skills_group_16': ['Bash/Shell', 'Perl', 'Vim'],
 'skills_group_17': ['Objective-C', 'Swift', 'Xcode'],
 'skills_group_18': ['TypeScript', 'Angular', 'Angular.js', 'Cordova'],
 'skills_group_19': ['Julia', 'R', 'RStudi

In [12]:
# Reformat into a dataframe: flatten {cluster: [skills, ...]} into one
# (cluster_name, skill) pair per skill, keeping the config's order.
molten_clusters = [
    (group_name, skill)
    for group_name, group_skills in clusters_config.items()
    for skill in group_skills
]

clusters_df = pd.DataFrame(
    molten_clusters,
    columns=['cluster_name', 'skill'],
)
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,Go
1,skills_group_0,DynamoDB
2,skills_group_0,PostgreSQL
3,skills_group_0,AWS
4,skills_group_0,Ansible
...,...,...
120,skills_group_9,Haskell
121,skills_group_9,LISP
122,skills_group_9,Rust
123,skills_group_9,Emacs


In [26]:
# Predict a sample entry: the skills that will be run through the model
sample_skills = [
    "Scala",
    "Hadoop",
    "Python",
]

In [27]:
# Verify that every sample skill is a known feature name.
# The later encoding steps use `isin`, so an unknown or misspelled skill would
# otherwise be ignored silently and the prediction would be made without it.
known_skill_mask = pd.Series(sample_skills).isin(features_names)
unknown_skills = [
    skill for skill, is_known in zip(sample_skills, known_skill_mask) if not is_known
]
assert not unknown_skills, f"Skills not present in the model's features: {unknown_skills}"
known_skill_mask

0    True
1    True
2    True
dtype: bool

In [28]:
#1- Recreate cluster features: on a copy of clusters_df, flag each
# (cluster, skill) row whose skill is among the sample skills.
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df['skill'].isin(sample_skills)
)
sample_clusters

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group_0,Go,False
1,skills_group_0,DynamoDB,False
2,skills_group_0,PostgreSQL,False
3,skills_group_0,AWS,False
4,skills_group_0,Ansible,False
...,...,...,...
120,skills_group_9,Haskell,False
121,skills_group_9,LISP,False
122,skills_group_9,Rust,False
123,skills_group_9,Emacs,False


In [29]:
# Count, per cluster, how many of its skills are in the sample
# (summing the boolean flags counts the True values).
cluster_features = (
    sample_clusters
    .groupby('cluster_name')['sample_skills']
    .sum()
)
cluster_features

cluster_name
skills_group_0     0
skills_group_1     2
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    0
skills_group_15    0
skills_group_16    0
skills_group_17    0
skills_group_18    0
skills_group_19    0
skills_group_2     0
skills_group_3     0
skills_group_4     0
skills_group_5     0
skills_group_6     1
skills_group_7     0
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

In [30]:
#2- Create one-hot-encoded skills
# Keep only the feature names that are not cluster names, i.e. the individual skills.
skills_names = features_names.loc[~features_names.isin(cluster_features.index)]
skills_names

0                     APL
1                Assembly
2              Bash/Shell
3                       C
4                      C#
              ...        
120                   Vim
121         Visual Studio
122    Visual Studio Code
123              Webstorm
124                 Xcode
Length: 125, dtype: object

In [31]:
# 1 where the skill is among the sample skills, 0 otherwise, labelled by skill name
ohe_skills = pd.Series(
    data=skills_names.isin(sample_skills).astype(int).tolist(),
    index=skills_names,
)
ohe_skills

APL                   0
Assembly              0
Bash/Shell            0
C                     0
C#                    0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 125, dtype: int64

In [32]:
#3- Combine features: stack the one-hot skills and the per-cluster counts into a single Series
features = pd.concat([ohe_skills , cluster_features])

In [33]:
# Select and order the entries by features_names
# (presumably the column order used when the model was trained — TODO confirm).
# .loc raises KeyError if any name in features_names is missing from the combined Series.
features = features.loc[features_names]
features

APL               0
Assembly          0
Bash/Shell        0
C                 0
C#                0
                 ..
skills_group_5    0
skills_group_6    1
skills_group_7    0
skills_group_8    0
skills_group_9    0
Length: 145, dtype: int64

In [34]:
#4- Predict
# The feature vector is wrapped in a list so the classifier receives a single-row, 2-D input.
# The recorded output is a list of probability arrays, which the following cell pairs with targets_names.
predictions = classifier.predict_proba([features.values])
predictions

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


[array([[0.89, 0.11]]),
 array([[0.9, 0.1]]),
 array([[0.92, 0.08]]),
 array([[0.89, 0.11]]),
 array([[0.96, 0.04]]),
 array([[0.89428571, 0.10571429]]),
 array([[0.72, 0.28]]),
 array([[0.78, 0.22]]),
 array([[0.85, 0.15]]),
 array([[0.94, 0.06]]),
 array([[0.91, 0.09]]),
 array([[0.93, 0.07]]),
 array([[0.98, 0.02]]),
 array([[0.78, 0.22]]),
 array([[0.9, 0.1]]),
 array([[0.89571429, 0.10428571]])]

In [35]:
# From each per-target array, take the second-column probability of the single input row.
# NOTE(review): assumes every target has exactly two classes with the positive one second — confirm.
pos_probs = [target_probs[0][1] for target_probs in predictions]

# Label the probabilities by target and rank them, most likely first
(
    pd.Series(pos_probs, index=targets_names)
    .sort_values(ascending=False)
)

Developer, back-end                              0.280000
Developer, desktop or enterprise applications    0.220000
Engineer, data                                   0.220000
Developer, embedded applications or devices      0.150000
Academic researcher                              0.110000
Database administrator                           0.110000
Developer, QA or test                            0.105714
System administrator                             0.104286
Data or business analyst                         0.100000
Scientist                                        0.100000
Developer, full-stack                            0.090000
Data scientist or machine learning specialist    0.080000
Developer, game or graphics                      0.070000
Developer, front-end                             0.060000
DevOps specialist                                0.040000
Developer, mobile                                0.020000
dtype: float64