In [1]:
# File names of the pickled artifacts read from the run's artifacts directory.
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"

# Identifies the MLflow run whose artifacts are loaded below.
# NOTE(review): TRACKING_URI is an absolute path on one machine; adjust it before running elsewhere.
TRACKING_URI = "file:///home/deena_gergis/iti/iti_e2e_live/notebooks/mlruns/"
EXPERIMENT_ID = "1"
RUN_ID = "4c35a9c9d26a48e8bfccfae05a63d348"

In [2]:
import os 
import sklearn
import pickle
import yaml

import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient

## Load data and model artifacts

In [3]:
# Strip the "file://" scheme to get a plain directory path, then descend into
# <experiment id>/<run id>/artifacts.
mlruns_dir = TRACKING_URI.replace("file://", "")
artifact_path = os.path.join(mlruns_dir, EXPERIMENT_ID, RUN_ID, 'artifacts')

In [4]:
# Read the pickled data artifact from the run's artifacts directory.
# NOTE: pickle.load can execute arbitrary code; only open artifacts from a trusted run.
data_path = os.path.join(artifact_path, LOG_DATA_PKL)
with open(data_path, mode='rb') as data_file:
    data_pkl = pickle.load(data_file)

In [5]:
# Read the pickled model artifact from the run's artifacts directory.
# NOTE: pickle.load can execute arbitrary code; only open artifacts from a trusted run.
model_path = os.path.join(artifact_path, LOG_MODEL_PKL)
with open(model_path, mode='rb') as model_file:
    model_pkl = pickle.load(model_file)

# The estimator itself is stored under the "model_object" key.
model = model_pkl["model_object"]

In [6]:
# Display the loaded estimator's repr to check which pipeline steps it contains.
model

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('featureunion',
                 FeatureUnion(transformer_list=[('linear_pca',
                                                 PCA(n_components=40)),
                                                ('kernel_pca',
                                                 KernelPCA(kernel='rbf',
                                                           n_components=40))])),
                ('randomforestclassifier',
                 RandomForestClassifier(random_state=0))])

## Predict sample entry 

In [7]:
# Path to the YAML file that is parsed into `clusters_config` below.
# It is a relative path, so it resolves against the notebook's working directory.
CLUSTERS_YAML_PATH = "../data/processed/features_skills_clusters_description.yaml"

In [8]:
# Echo the configured path as a quick check before the file is opened.
CLUSTERS_YAML_PATH

'../data/processed/features_skills_clusters_description.yaml'

In [9]:
# Parse the YAML file into `clusters_config` (iterated below as cluster name -> skills).
# The encoding is pinned to UTF-8 so reading does not depend on the platform's
# default locale encoding.
with open(CLUSTERS_YAML_PATH, "r", encoding="utf-8") as stream:
    clusters_config = yaml.safe_load(stream)

In [11]:
# Flatten the {cluster name: skills} mapping into one (cluster name, skill) pair per skill.
molten_clusters = [
    (name, skill)
    for name, skills in clusters_config.items()
    for skill in skills
]

# Long-format lookup table: one row per (cluster_name, skill) pair.
clusters_df = pd.DataFrame(data=molten_clusters,
                           columns=["cluster_name", "skill"])

### Recreate cluster features 

In [None]:
# Skills of the sample entry that predictions are generated for.
sample_skills = [
    'Pandas',
    'TensorFlow',
    'Torch/PyTorch',
    'Python',
    'Keras',
]

In [None]:
# Work on a copy so that adding the sample column does not modify clusters_df.
sample_clusters = clusters_df.copy(deep=True)

In [None]:
# Flag each (cluster, skill) row whose skill appears in the sample entry.
is_sample_skill = sample_clusters["skill"].isin(sample_skills)
sample_clusters["sample_skills"] = is_sample_skill

In [None]:
# Summing the boolean flags per cluster counts how many sample skills fall in each cluster.
flags_by_cluster = sample_clusters.groupby("cluster_name")["sample_skills"]
cluster_features = flags_by_cluster.sum()

In [None]:
# Show the number of matched sample skills per cluster.
cluster_features

cluster_name
skills_group_0     0
skills_group_1     0
skills_group_10    0
skills_group_11    0
skills_group_12    0
skills_group_13    0
skills_group_14    0
skills_group_15    0
skills_group_16    0
skills_group_2     0
skills_group_3     0
skills_group_4     0
skills_group_5     1
skills_group_6     0
skills_group_7     4
skills_group_8     0
skills_group_9     0
Name: sample_skills, dtype: int64

### Create one-hot encoded skills

In [None]:
# Wrap the stored feature names in a Series so they can be filtered with boolean masks.
features_names = pd.Series(data=data_pkl["features_names"])

In [32]:
# Keep only the feature names that are not cluster features.
is_cluster_feature = features_names.isin(cluster_features.index)
skills_names = features_names[~is_cluster_feature]

In [34]:
# Display the sample skill list defined earlier.
sample_skills

['Pandas', 'TensorFlow', 'Torch/PyTorch', 'Python', 'Keras']

In [33]:
# Display the feature names that are not cluster features.
skills_names

0                  Assembly
1     Bash/Shell/PowerShell
2                         C
3                        C#
4                       C++
              ...          
69                 Teraform
70            Torch/PyTorch
71                 Unity 3D
72            Unreal Engine
73                  Xamarin
Length: 74, dtype: object

In [None]:
# NOTE(review): this repeats the identical `skills_names` assignment from an earlier cell;
# it recomputes the same value and could be removed.
skills_names = features_names[~features_names.isin(cluster_features.index)]

In [35]:
# 1 if the skill is part of the sample entry, otherwise 0.
skill_flags = skills_names.isin(sample_skills).astype(int)

# The flags are passed as a plain list so they are assigned positionally to the
# skill-name index instead of being aligned on the old integer labels.
ohe_skills = pd.Series(data=skill_flags.tolist(), index=skills_names)

In [36]:
# Display the 0/1 indicator for each skill name.
ohe_skills

Assembly                 0
Bash/Shell/PowerShell    0
C                        0
C#                       0
C++                      0
                        ..
Teraform                 0
Torch/PyTorch            1
Unity 3D                 0
Unreal Engine            0
Xamarin                  0
Length: 74, dtype: int64

### Combine features 

In [37]:
# Stack the per-skill indicators and the per-cluster counts into one Series.
features = pd.concat(objs=[ohe_skills, cluster_features], axis=0)

In [38]:
# Reorder the entries to follow the feature order stored in the data pickle.
ordered_feature_names = data_pkl["features_names"]
features = features[ordered_feature_names]

In [39]:
# Display the combined feature vector, ordered as in data_pkl["features_names"].
features

Assembly                 0
Bash/Shell/PowerShell    0
C                        0
C#                       0
C++                      0
                        ..
skills_group_5           1
skills_group_6           0
skills_group_7           4
skills_group_8           0
skills_group_9           0
Length: 91, dtype: int64

### Predict

In [40]:
# predict_proba is given a single-row batch built from the feature values.
sample_batch = [features.values]
predictions = model.predict_proba(sample_batch)

# Each element of `predictions` is read at [row 0][column 1]
# (presumably the positive-class probability for one target; confirm against the model's classes_).
positive_probs = [target_probs[0][1] for target_probs in predictions]

# One probability per target name, highest first.
pd.Series(positive_probs, index=data_pkl["targets_names"]).sort_values(ascending=False)

Data scientist or machine learning specialist    0.935
Scientist                                        0.130
Academic researcher                              0.065
Data or business analyst                         0.030
Engineer, data                                   0.020
Developer, back-end                              0.010
Developer, full-stack                            0.010
Database administrator                           0.000
DevOps specialist                                0.000
Developer, QA or test                            0.000
Developer, desktop or enterprise applications    0.000
Developer, embedded applications or devices      0.000
Developer, front-end                             0.000
Developer, game or graphics                      0.000
Developer, mobile                                0.000
System administrator                             0.000
dtype: float64