In [26]:
# MLflow tracking store and the run whose artifacts this notebook loads
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"
MLFLOW_RUN_ID = "a107782f5e814648af1d609b046aa929"

# File names of the pickled artifacts, joined onto the run's artifact path
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# YAML file mapping each skills cluster name to the list of skills it contains
CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [3]:
import os 
import sklearn
import pickle
import yaml

import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient

# Initialize

# 1. MLflow

In [21]:
# Initialize client and experiment: set the tracking URI first, then create the client
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

# Look up the logged run and keep the location of its artifacts.
# (The misspelled name `artificats_path` is kept because later cells reference it.)
run = mlflow.get_run(run_id=MLFLOW_RUN_ID)
artificats_path = run.info.artifact_uri

## Load model

In [22]:
# Load model
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# artifacts that come from a trusted run.
model_path = os.path.join(artificats_path, LOG_MODEL_PKL)
with open(model_path, mode="rb") as model_file:
    model = pickle.load(model_file)

model

{'model_description': 'Random Forest: with PCA - Basic',
 'model_details': "Pipeline(steps=[('robustscaler', RobustScaler()),\n                ('pca', PCA(n_components=0.95)),\n                ('randomforestclassifier',\n                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])",
 'model_object': Pipeline(steps=[('robustscaler', RobustScaler()),
                 ('pca', PCA(n_components=0.95)),
                 ('randomforestclassifier',
                  RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])}

In [25]:
# Load data pkl
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# artifacts that come from a trusted run.
data_path = os.path.join(artificats_path, LOG_DATA_PKL)
with open(data_path, mode="rb") as data_file:
    data = pickle.load(data_file)

data.keys()

dict_keys(['data_path', 'training_indices', 'test_indices', 'features_names', 'targets_names'])

In [68]:
# Unpack vars: feature/target names as Series, plus the fitted model object
features_names, targets_names = (
    pd.Series(data[key]) for key in ("features_names", "targets_names")
)
classifier = model["model_object"]

______

## Load skills clusters

In [70]:
# Load skills clusters
# Decode the file as UTF-8 explicitly so the parsed skill names do not depend
# on the platform's default locale encoding.
with open(CLUSTERS_YAML_PATH, "r", encoding="utf-8") as stream:
    clusters_config = yaml.safe_load(stream)

clusters_config

{'skills_group_0': ['Groovy',
  'Java',
  'Oracle',
  'Oracle Cloud Infrastructure',
  'Spring',
  'Atom',
  'Eclipse',
  'IntelliJ',
  'NetBeans',
  'Sublime Text'],
 'skills_group_1': ['Matlab',
  'Python',
  'Django',
  'FastAPI',
  'Flask',
  'NumPy',
  'Pandas',
  'IPython/Jupyter',
  'PyCharm'],
 'skills_group_10': ['Dart',
  'Kotlin',
  'Firebase',
  'SQLite',
  'Google Cloud Platform',
  'Flutter',
  'Android Studio'],
 'skills_group_11': ['Assembly', 'C', 'C++', 'Qt'],
 'skills_group_12': ['Scala', 'Apache Spark', 'Hadoop'],
 'skills_group_13': ['HTML/CSS', 'JavaScript', 'jQuery', 'Visual Studio Code'],
 'skills_group_14': ['Bash/Shell', 'Perl', 'Vim'],
 'skills_group_15': ['Julia', 'R', 'RStudio'],
 'skills_group_16': ['Unity 3D', 'Unreal Engine'],
 'skills_group_17': ['Objective-C', 'Swift', 'Xcode'],
 'skills_group_18': ['Svelte', 'Deno'],
 'skills_group_19': ['TypeScript', 'Angular', 'Angular.js', 'Cordova'],
 'skills_group_2': ['C#',
  'F#',
  'PowerShell',
  'VBA',
  'Mi

In [71]:
# Reformat into data frame: one (cluster_name, skill) row per skill in each cluster
molten_clusters = [
    (name, skill)
    for name, skills in clusters_config.items()
    for skill in skills
]

clusters_df = pd.DataFrame(data=molten_clusters, columns=["cluster_name", "skill"])
clusters_df

Unnamed: 0,cluster_name,skill
0,skills_group_0,Groovy
1,skills_group_0,Java
2,skills_group_0,Oracle
3,skills_group_0,Oracle Cloud Infrastructure
4,skills_group_0,Spring
...,...,...
120,skills_group_9,PostgreSQL
121,skills_group_9,AWS
122,skills_group_9,Docker
123,skills_group_9,Git


________

## Predict sample entry

In [39]:
# Skills of the sample entry to score
sample_skills = [
    "Pandas",
    "TensorFlow",
    "Torch/PyTorch",
    "Python",
    "Keras",
]

In [47]:
# Verify that every sample skill is a known feature name. An unknown skill
# would otherwise be silently ignored by the `isin` lookups in later cells,
# so fail loudly here instead of just displaying the flags.
sample_skills_known = pd.Series(sample_skills).isin(features_names)
unknown_skills = [
    skill for skill, known in zip(sample_skills, sample_skills_known) if not known
]
assert not unknown_skills, f"Skills missing from features_names: {unknown_skills}"

sample_skills_known

0    True
1    True
2    True
3    True
4    True
dtype: bool

### 1. Recreate cluster features

In [43]:
# Flag, for every (cluster, skill) row, whether the skill is one of the sample's skills.
# `assign` returns a new frame, so `clusters_df` itself is left unchanged.
sample_clusters = clusters_df.assign(
    sample_skills=clusters_df["skill"].isin(sample_skills)
)
sample_clusters

Unnamed: 0,cluster_name,skill,sample_skills
0,skills_group_0,Groovy,False
1,skills_group_0,Java,False
2,skills_group_0,Oracle,False
3,skills_group_0,Oracle Cloud Infrastructure,False
4,skills_group_0,Spring,False
...,...,...,...
120,skills_group_9,PostgreSQL,False
121,skills_group_9,AWS,False
122,skills_group_9,Docker,False
123,skills_group_9,Git,False


In [44]:
# Sum the boolean flags per cluster: the number of sample skills found in each cluster
flags_by_cluster = sample_clusters.groupby("cluster_name")["sample_skills"]
cluster_features = flags_by_cluster.sum()
cluster_features

cluster_name
skills_group_0     0
skills_group_1     2
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    0
skills_group_15    0
skills_group_16    0
skills_group_17    0
skills_group_18    0
skills_group_19    0
skills_group_2     0
skills_group_20    0
skills_group_21    0
skills_group_22    3
skills_group_3     0
skills_group_4     0
skills_group_5     0
skills_group_6     0
skills_group_7     0
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

### 2. Create one-hot encoded skills

In [51]:
# Individual skills are the feature names that are not cluster names
is_cluster_feature = features_names.isin(cluster_features.index)
skills_names = features_names[~is_cluster_feature]
skills_names

0                     APL
1                Assembly
2              Bash/Shell
3                       C
4                      C#
              ...        
120                   Vim
121         Visual Studio
122    Visual Studio Code
123              Webstorm
124                 Xcode
Length: 125, dtype: object

In [55]:
# 1 if the skill is among the sample's skills, else 0, labelled by skill name
skill_flags = skills_names.isin(sample_skills).astype(int).tolist()
ohe_skills = pd.Series(skill_flags, index=skills_names)
ohe_skills

APL                   0
Assembly              0
Bash/Shell            0
C                     0
C#                    0
                     ..
Vim                   0
Visual Studio         0
Visual Studio Code    0
Webstorm              0
Xcode                 0
Length: 125, dtype: int64

### 3. Combine features

In [61]:
# Concat the per-skill flags and the per-cluster counts into one feature Series
features = pd.concat(objs=[ohe_skills, cluster_features])

In [62]:
# Sort columns: put the features in the order listed in features_names
feature_order = features_names.tolist()
features = features.loc[feature_order]
features

APL               0
Assembly          0
Bash/Shell        0
C                 0
C#                0
                 ..
skills_group_5    0
skills_group_6    0
skills_group_7    0
skills_group_8    0
skills_group_9    0
Length: 148, dtype: int64

### 4. Predict

In [69]:
# predict_proba takes a 2-D input, so pass the sample as a single row
sample_row = features.values.reshape(1, -1)
predictions = classifier.predict_proba(sample_row)

# One entry of `predictions` per target; keep column 1 of the sample's row
# (presumably the positive-class probability — confirm against classifier.classes_).
positive_probs = [target_probs[0][1] for target_probs in predictions]

pd.Series(positive_probs, index=targets_names).sort_values(ascending=False)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


Data scientist or machine learning specialist    0.70
Academic researcher                              0.24
Scientist                                        0.19
Engineer, data                                   0.13
Data or business analyst                         0.12
Developer, back-end                              0.12
Developer, game or graphics                      0.04
Developer, embedded applications or devices      0.03
Developer, desktop or enterprise applications    0.02
Database administrator                           0.01
Developer, QA or test                            0.01
Developer, front-end                             0.01
Developer, full-stack                            0.01
DevOps specialist                                0.00
Developer, mobile                                0.00
System administrator                             0.00
dtype: float64