In [1]:
MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_RUN_ID = "a107782f5e814648af1d609b046aa929"

LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [2]:
import os 
import sklearn
import pickle
import yaml

import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient

# Initialize

# 1. Mlflow

In [3]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

run = mlflow.get_run(MLFLOW_RUN_ID)
artificats_path = run.info.artifact_uri

MlflowException: Run 'a107782f5e814648af1d609b046aa929' not found

## Load model

In [None]:
# Load model
model_path = os.path.join(artificats_path, LOG_MODEL_PKL)
with open(model_path, "rb") as f:
    model = pickle.load(f)

model

In [4]:
# Load data pkl
data_path  = os.path.join(artificats_path, LOG_DATA_PKL)
with open(data_path, 'rb') as handle:
    data = pickle.load(handle)

data.keys()

NameError: name 'artificats_path' is not defined

In [None]:
# Unpack vars
features_names = pd.Series(data["features_names"])
targets_names  = pd.Series(data['targets_names'])
classifier = model['model_object']

In [None]:
classifier

______

## Load skills Clusters

In [None]:
# Load skills clusters
with open(CLUSTERS_YAML_PATH, "r") as stream:
    clusters_config = yaml.safe_load(stream)

clusters_config

In [None]:
# Reformat into data frame
molten_clusters = [(cluster_name, cluster_skill)
                   for cluster_name, cluster_skills in clusters_config.items()
                   for cluster_skill in cluster_skills]

clusters_df = pd.DataFrame(molten_clusters, columns=["cluster_name", "skill"])
clusters_df

________

## Predict sample entry

In [None]:
sample_skills = ['Scala', 'Hadoop', 'Python']

In [None]:
# Verify
pd.Series(sample_skills).isin(features_names)

### 1. Recreate cluster features

In [None]:
sample_clusters = clusters_df.copy()
sample_clusters["sample_skills"] = sample_clusters["skill"].isin(sample_skills)
sample_clusters

In [None]:
cluster_features = sample_clusters.groupby("cluster_name")["sample_skills"].sum()
cluster_features

### 2. Create OneHotEncoded skills

In [None]:
skills_names = features_names[~features_names.isin(cluster_features.index)]
skills_names

In [None]:
ohe_skills = pd.Series(skills_names.isin(sample_skills).astype(int).tolist(), 
                       index=skills_names)
ohe_skills

### 3. Combine features

In [None]:
# Concat
features = pd.concat([ohe_skills,
                      cluster_features])

In [None]:
# Sort columns
features = features.loc[features_names]
features

### 4. Predict

In [None]:
predictions = classifier.predict_proba([features.values])
predictions

In [None]:
positive_probs = [prob[0][1] for prob in predictions]
pd.Series(positive_probs, 
          index=targets_names).sort_values(ascending=False)