In [1]:
# --- Input data ---------------------------------------------------------
# Pickled DataFrame that the notebook loads with pd.read_pickle.
DF_PATH = "../data/processed/2_cleaned_data.pkl"

# --- Column groups ------------------------------------------------------
# Not referenced by the cells visible in this notebook section.
ROLE_COLS = ["DevType"]
TECH_COLS = [
    "LanguageWorkedWith",
    "DatabaseWorkedWith",
    "WebframeWorkedWith",
    "MiscTechWorkedWith",
]

# --- MLflow tracking ----------------------------------------------------
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# --- Staging folder whose contents are uploaded as run artifacts --------
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

## Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values = False):
    """Score each label column on its own (per-label evaluation).

    For every column of ``predictions`` the same-named column of
    ``ground_truth`` is looked up and ``metric_function(truth, pred)`` is
    called on the pair. The result is expressed as a percentage
    (multiplied by 100) and rounded to two decimals.

    Parameters
    ----------
    ground_truth : DataFrame-like
        True labels; must contain every column present in ``predictions``.
    predictions : DataFrame-like
        Predicted labels, one column per label.
    metric_function : callable
        Called as ``metric_function(truth_column, predicted_column)`` and
        expected to return a number.
    sort_values : bool, default False
        When true, the returned scores are sorted in ascending order.

    Returns
    -------
    pandas.Series
        One score per column of ``predictions``, indexed by column label.
    """
    per_label = {
        label: round(
            metric_function(ground_truth[label].copy(), predictions[label].copy()) * 100,
            2,
        )
        for label in predictions.columns
    }
    scores = pd.Series(per_label.values(), index = per_label.keys())
    return scores.sort_values() if sort_values else scores

## Initialize

### Create Directories

In [4]:
# Make sure the MLflow store folder and the artifact staging folder exist;
# existing folders are left untouched and missing parents are created.
for required_dir in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(required_dir).mkdir(parents=True, exist_ok=True)

### Read Data

In [5]:
# Load the pickled DataFrame stored at DF_PATH and display it.
# The displayed frame has two-level columns (e.g. "DevType", "skills_clusters").
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df = pd.read_pickle(DF_PATH)
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skills_group_17,skills_group_18,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,4,1,0,0,0,2,2
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
5,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,2
7,0,0,0,0,0,0,1,1,0,0,...,0,0,2,0,0,0,0,1,0,1
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64416,0,1,0,0,0,0,0,0,0,0,...,0,0,0,2,2,0,0,0,0,0
64422,0,0,0,0,0,0,1,0,0,0,...,0,0,3,0,0,0,0,0,0,1
64428,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
64447,0,0,0,0,0,0,1,0,0,0,...,0,0,0,3,2,0,0,0,0,2


## Balance classes

In [6]:
# Take the columns under the top-level "DevType" label as a separate frame
# (a copy, so later changes do not touch `df`) and count how many rows are
# flagged for each role. The totals shown below are strongly imbalanced.
roles_df = df["DevType"].copy()
roles_df.sum()

Academic researcher                              1039
Data or business analyst                         1059
Data scientist or machine learning specialist    1275
Database administrator                            745
DevOps specialist                                1212
Developer, QA or test                             791
Developer, back-end                              9144
Developer, desktop or enterprise applications    2985
Developer, embedded applications or devices      1193
Developer, front-end                             5177
Developer, full-stack                            8718
Developer, game or graphics                       608
Developer, mobile                                2573
Engineer, data                                    916
Scientist                                         590
System administrator                              880
dtype: int64

In [7]:
# Resample roles: draw the same number of rows for every role.
samples_per_class = 700
resampled_roles = []
for role in roles_df:
    role_rows = roles_df.loc[roles_df[role] == 1].copy()
    # Roles with fewer rows than the target are up-sampled, which requires
    # drawing with replacement; larger roles are down-sampled without it.
    needs_upsampling = len(role_rows) < samples_per_class
    resampled_roles.append(
        role_rows.sample(samples_per_class, replace = needs_upsampling, random_state = 0)
    )
# NOTE(review): rows drawn with replacement, and rows flagged for several
# roles that are drawn once per role, end up duplicated. The row-wise
# train/test split further down can therefore place copies of the same row
# on both sides - confirm this is acceptable for the reported test scores.

In [8]:
# Stack the per-role samples into a single frame and display it.
# The original index labels are kept, so a label can appear more than once.
# NOTE: this rebinds `roles_df`; re-running the resampling cell after this
# one would resample the already-resampled frame instead of the original.
roles_df = pd.concat(resampled_roles)
roles_df

Unnamed: 0,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Scientist,System administrator
36349,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
43006,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
21202,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
18910,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
27724,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47955,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1
39349,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
27277,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
7189,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [9]:
# Pull the full rows (role columns and feature columns) for the resampled
# index labels, so features and targets stay aligned row by row.
# NOTE: this rebinds `df` to the resampled frame; reload it from DF_PATH to
# get the original data back.
df = df.loc[roles_df.index].copy()
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skills_group_17,skills_group_18,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
36349,1,0,1,0,0,0,0,0,0,0,...,0,0,2,0,0,0,3,0,0,0
43006,1,1,0,0,0,0,1,0,0,0,...,0,0,3,0,0,0,3,0,0,0
21202,1,0,0,0,0,0,0,0,0,0,...,0,0,0,3,0,0,0,0,0,0
18910,1,0,1,0,0,0,0,0,0,0,...,0,0,2,0,0,1,4,0,0,1
27724,1,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47955,0,0,0,0,1,0,0,0,0,0,...,0,0,2,0,1,0,0,1,0,0
39349,0,0,0,0,0,0,0,0,0,0,...,0,0,2,1,0,0,0,0,0,4
27277,0,0,0,0,0,0,0,0,0,0,...,0,0,2,2,0,0,2,0,1,3
7189,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1


In [10]:
# Split into train / test partitions with a fixed seed.
# Features are every column outside the top-level "DevType" group; targets
# are the "DevType" role columns.
# `level=0` states explicitly which level of the two-level column index the
# label belongs to. Dropping without `level` on such an index is presumably
# what raised the warning recorded under this cell - TODO confirm on re-run.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="DevType", level=0),
    df["DevType"],
    random_state=0,
)

  X_train,X_test,y_train,y_test = train_test_split(df.drop("DevType", axis=1), df["DevType"], random_state = 0)


## Train models

### Initialize MLflow

In [11]:
# Initialize client and experiment.
# Point MLflow at the local tracking folder, then look the experiment up by
# name and create it when it does not exist yet, so `experiment_id` is
# defined on a first run as well as on later ones.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

if exp is None:
    # No experiment with this name yet: create_experiment returns the new id.
    experiment_id = client.create_experiment(MLFLOW_EXPERIMENT_NAME)
else:
    experiment_id = exp.experiment_id


  return FileStore(store_uri, store_uri)


### 1. Logistic Regression

In [15]:
# Baseline model: standardize the features, then fit logistic regression.
# LogisticRegression predicts a single target, so it is wrapped in
# MultiOutputClassifier, which trains one independent classifier per target
# column (i.e. one per role).
# Predictions are computed in the evaluation cells, not here.
clf = make_pipeline(StandardScaler(),
                   MultiOutputClassifier(LogisticRegression()))
clf.fit(X_train.values, y_train.values)


In [16]:
# Evaluate on training set: per-role scores on the data the model was fitted on.
predictions = pd.DataFrame(clf.predict(X_train.values), columns=y_train.columns)
# One column per metric (named after the metric function), one row per role.
train_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_train, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

In [17]:
# Evaluate on the held-out test set (X_test / y_test), not the training data.
predictions = pd.DataFrame(clf.predict(X_test.values),
                            columns=y_test.columns)
# One column per metric (named after the metric function), one row per role.
test_scores = {
    score.__name__: calculate_quality(y_test, predictions, score)
    for score in [accuracy_score, precision_score, recall_score, f1_score]
}
test_scores = pd.concat(test_scores, axis=1)
# Column-wise mean: each metric averaged over all roles with equal weight.
mean_test_scores = test_scores.mean()

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [18]:
# Print the per-metric averages as plain text, then leave the per-role table
# as the last expression so it is shown as the cell's output.
print(mean_test_scores)
test_scores

accuracy_score     88.191250
precision_score    58.871875
recall_score       27.813125
f1_score           35.813125
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,87.32,63.11,17.02,26.8
Data or business analyst,90.68,62.31,27.65,38.3
Data scientist or machine learning specialist,92.46,77.54,58.95,66.98
Database administrator,90.07,38.1,2.93,5.44
DevOps specialist,91.5,68.38,32.29,43.87
"Developer, QA or test",90.5,0.0,0.0,0.0
"Developer, back-end",72.14,64.23,44.39,52.5
"Developer, desktop or enterprise applications",83.68,46.59,9.09,15.21
"Developer, embedded applications or devices",89.89,51.33,26.83,35.24
"Developer, front-end",87.25,64.75,43.02,51.69


## Log Run

### 1. Prepare

In [19]:
# Data details: where the data came from, which rows went into the train and
# test partitions, and the feature / target column names.
data_details = {
    "df_path": DF_PATH,
    "training_indices": X_train.index.tolist(),
    "test_indices": X_test.index.tolist(),
    # droplevel(0) removes the top column level, leaving the inner names.
    "feature_names": X_train.columns.droplevel(0).tolist(),
    "target_names": y_train.columns.tolist()
}
# Earlier versions of this cell wrote two misspelled keys. They are kept as
# aliases of the same list objects so anything that already reads data.pkl
# with the old spelling keeps working.
data_details["trainig_indices"] = data_details["training_indices"]
data_details["deature_names"] = data_details["feature_names"]

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [20]:
# Model: bundle a short description, the pipeline's text representation and
# the fitted pipeline object itself, then pickle the bundle into the staging
# folder.
model = dict(
    model_description="Baseline model: Logistic Regression ",
    model_details=str(clf),
    model_object=clf,
)

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), mode="wb") as model_file:
    pickle.dump(model, model_file)

In [21]:
# Per-role metric tables for the training and test partitions.
classes_metics = {
    "train_scores":train_scores,
    "test_scores":test_scores
}
# The training table used to be stored under the misspelled key "rain_scores";
# keep that key as an alias so existing readers of metrics.pkl do not break.
classes_metics["rain_scores"] = train_scores

with open(os.path.join(LOG_PATH,LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metics,output_file)

### 2. Log

In [34]:
# Start a new run in the experiment, named after the model description.
run_label = model["model_description"]
with mlflow.start_run(experiment_id=experiment_id, run_name=run_label):
    # Log pickles: upload the contents of the staging folder as run artifacts.
    mlflow.log_artifacts(LOG_PATH)
    # Track metrics: one MLflow metric per averaged test score.
    for metric_name, metric_value in mean_test_scores.items():
        mlflow.log_metric(metric_name, metric_value)