In [1]:
# Pickled DataFrame that is loaded with pd.read_pickle further down.
DF_PATH = "../data/processed/4_balanced_data.pkl"

# Column-group names (presumably top-level column labels of the DataFrame —
# confirm against the pickle). The cells visible here do not reference them.
ROLE_COLS = ['DevType']
TECH_COLS = [
    'LanguageHaveWorkedWith',
    'DatabaseHaveWorkedWith',
    'PlatformHaveWorkedWith',
    'WebframeHaveWorkedWith',
    'MiscTechHaveWorkedWith',
    'ToolsTechHaveWorkedWith',
    'NEWCollabToolsHaveWorkedWith',
]

# MLflow tracking store and the experiment all runs are filed under.
MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# Scratch directory and file names for the pickles that are written locally
# and then uploaded with mlflow.log_artifacts.
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
import pandas as pd
import numpy as np
import os
import pickle

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.ensemble import RandomForestClassifier

# Let wide frames display up to 100 columns before pandas truncates them.
pd.set_option("display.max_columns", 100)

___
# Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    """
    Score every target column separately instead of producing one overall figure.

    Input:
        ground_truth: DataFrame of true labels, one column per class
        predictions: DataFrame of predicted labels; its columns drive the loop,
                     so each of them must also exist in ground_truth
        metric_function: callable(y_true, y_pred) returning a number
        sort_values: when True, the scores are returned sorted ascending
    Output:
        a pandas Series indexed by column name, holding the metric of each
        column as a percentage rounded to two decimals
    """
    # The second parameter used to be misspelled `predictoins`, so the body
    # silently read a notebook-global `predictions` instead of its argument.
    quality_scores = {
        col: round(metric_function(ground_truth[col], predictions[col]) * 100, 2)
        for col in predictions.columns
    }

    quality_scores = pd.Series(list(quality_scores.values()),
                               index=list(quality_scores.keys()))
    if sort_values:
        quality_scores = quality_scores.sort_values()

    return quality_scores

___
# Prepare Data:
## Read data:

In [4]:
# Read data
# NOTE: unpickling can execute code embedded in the file, so DF_PATH must only
# ever point at a trusted, locally produced pickle.
df = pd.read_pickle(DF_PATH)

## Split Data to train and test:

In [5]:
# Features are every column except the 'DevType' group; targets are that group.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns=['DevType']),
    df['DevType'],
    random_state=0,
)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


---
# Train models
## Initialize MLflow

In [6]:
#Initialize client and experiment
# Point MLflow at the tracking store first; the client is created afterwards
# (presumably so that it resolves to the same store — confirm against MLflow docs).
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

In [7]:
# Create the experiment only when it is missing. An explicit lookup replaces
# the former bare `except:`, which printed "exists" for *any* failure (an
# unwritable tracking store, a typo, even KeyboardInterrupt) and hid the cause.
if mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME) is None:
    mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
else:
    print(f'Experiment "{MLFLOW_EXPERIMENT_NAME}" exists at "{mlflow.get_tracking_uri()}"')

Experiment "skills_jobs_stackoverflow" exists at "../models/mlruns"


In [8]:
# Look up the experiment record by name; `exp.experiment_id` is what the
# logging cells pass to mlflow.start_run.
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
print(exp)

<Experiment: artifact_location='../models/mlruns/0', experiment_id='0', lifecycle_stage='active', name='skills_jobs_stackoverflow', tags={}>


## 1. Vanilla Forest

In [9]:
# Baseline: scale, reduce with PCA (0.95 passed as n_components), then fit a
# random forest with default hyper-parameters.
rf_clf = make_pipeline(
    RobustScaler(),
    PCA(n_components=0.95),
    RandomForestClassifier(random_state=0, n_jobs=8, verbose=1),
)

rf_clf.fit(X_train.values, y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    9.9s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   22.1s finished


Pipeline(steps=[('robustscaler', RobustScaler()),
                ('pca', PCA(n_components=0.95)),
                ('randomforestclassifier',
                 RandomForestClassifier(n_jobs=8, random_state=0, verbose=1))])

In [77]:
# Evaluate on training set
predictions = pd.DataFrame(rf_clf.predict(X_train.values), columns=y_train.columns)

# One column of per-class scores for each metric, keyed by the metric's name
train_scores = pd.concat(
    {metric.__name__: calculate_quality(y_train, predictions, metric)
     for metric in (accuracy_score, precision_score, recall_score, f1_score)},
    axis=1,
)
mean_train_score = train_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    3.1s finished


In [78]:
# Print each metric averaged over the classes, then display the per-class table
print(mean_train_score)
train_scores

accuracy_score     99.958125
precision_score    99.873750
recall_score       99.765625
f1_score           99.818750
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,99.91,99.53,99.72,99.62
Data or business analyst,99.97,99.84,99.89,99.86
Data scientist or machine learning specialist,99.94,99.79,99.79,99.79
Database administrator,99.97,99.75,99.94,99.85
DevOps specialist,99.98,100.0,99.8,99.9
"Developer, QA or test",99.97,100.0,99.64,99.82
"Developer, back-end",99.93,99.94,99.81,99.88
"Developer, desktop or enterprise applications",99.95,99.76,99.88,99.82
"Developer, embedded applications or devices",99.96,100.0,99.52,99.76
"Developer, front-end",99.99,100.0,99.92,99.96


In [79]:
# Evaluate on test set
predictions = pd.DataFrame(rf_clf.predict(X_test.values), columns=y_test.columns)

# One column of per-class scores for each metric, keyed by the metric's name
test_scores = pd.concat(
    {metric.__name__: calculate_quality(y_test, predictions, metric)
     for metric in (accuracy_score, precision_score, recall_score, f1_score)},
    axis=1,
)
mean_test_scores = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    1.2s finished


In [80]:
# Print each metric averaged over the classes, then display the per-class table
print(mean_test_scores)
test_scores

accuracy_score     93.666250
precision_score    95.541250
recall_score       57.588125
f1_score           70.990625
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,95.27,96.57,64.21,77.13
Data or business analyst,94.65,97.36,51.55,67.41
Data scientist or machine learning specialist,95.8,93.65,71.75,81.25
Database administrator,97.07,99.49,69.2,81.63
DevOps specialist,93.75,97.23,46.33,62.76
"Developer, QA or test",97.63,99.7,70.0,82.25
"Developer, back-end",82.3,88.66,47.46,61.83
"Developer, desktop or enterprise applications",91.22,96.78,41.21,57.81
"Developer, embedded applications or devices",94.68,95.11,45.26,61.33
"Developer, front-end",91.58,92.51,40.12,55.97



## Prepare Logs

In [81]:
# Data details: which file was used, which rows went to each split, and the
# feature / target column names.
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices": X_test.index.tolist(),
                "features_names": X_train.columns.droplevel(0).tolist(),
                "targets_names": y_train.columns.tolist()}

# The scratch directory is not guaranteed to exist (e.g. on a fresh checkout);
# create it so open() below cannot fail with FileNotFoundError.
os.makedirs(LOG_PATH, exist_ok=True)
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), 'wb') as output_file:
    pickle.dump(data_details, output_file)

In [82]:
# Model: human-readable description, its repr, and the fitted pipeline itself
model = dict(model_description="Random Forest: with PCA - Basic",
             model_details=str(rf_clf),
             model_object=rf_clf)

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), 'wb') as model_file:
    pickle.dump(model, model_file)

In [83]:
# Performance details: the per-class score tables for both splits
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), 'wb') as metrics_file:
    pickle.dump(classes_metrics, metrics_file)

## Logging

In [84]:
# Start a new run, named after the model description, and track it
with mlflow.start_run(run_name=model['model_description'],
                      experiment_id=exp.experiment_id):
    # Upload everything currently in the scratch directory (the pickles)
    mlflow.log_artifacts(LOG_PATH)

    # One MLflow metric per averaged test score
    for metric_name, metric_value in mean_test_scores.items():
        mlflow.log_metric(metric_name, metric_value)

___
# Hyperparameter tuning:
**Here we use grid search to tune our parameters**

In [56]:
# Same steps as the baseline pipeline, but PCA's n_components is left unset
# here; the parameter grid below supplies the candidate values.
hpt_rf_clf = make_pipeline(
    RobustScaler(),
    PCA(),
    RandomForestClassifier(random_state=0, n_jobs=8, verbose=1),
)

In [57]:
# List every tunable parameter name of the pipeline (step__param format)
list(hpt_rf_clf.get_params())

['memory',
 'steps',
 'verbose',
 'robustscaler',
 'pca',
 'randomforestclassifier',
 'robustscaler__copy',
 'robustscaler__quantile_range',
 'robustscaler__unit_variance',
 'robustscaler__with_centering',
 'robustscaler__with_scaling',
 'pca__copy',
 'pca__iterated_power',
 'pca__n_components',
 'pca__random_state',
 'pca__svd_solver',
 'pca__tol',
 'pca__whiten',
 'randomforestclassifier__bootstrap',
 'randomforestclassifier__ccp_alpha',
 'randomforestclassifier__class_weight',
 'randomforestclassifier__criterion',
 'randomforestclassifier__max_depth',
 'randomforestclassifier__max_features',
 'randomforestclassifier__max_leaf_nodes',
 'randomforestclassifier__max_samples',
 'randomforestclassifier__min_impurity_decrease',
 'randomforestclassifier__min_samples_leaf',
 'randomforestclassifier__min_samples_split',
 'randomforestclassifier__min_weight_fraction_leaf',
 'randomforestclassifier__n_estimators',
 'randomforestclassifier__n_jobs',
 'randomforestclassifier__oob_score',
 'rando

In [58]:
# Candidate values for the grid search, keyed as <pipeline step>__<parameter>
tuned_parameters = [
    dict(
        pca__n_components=[0.7, 0.85, 0.95],
        randomforestclassifier__n_estimators=[250, 500],
        randomforestclassifier__max_depth=[3, 10, None],
    )
]

In [59]:
# Wrap the bare pipeline in a grid search. The name `hpt_rf_clf` is reused for
# the search object, so when this cell is re-run the original pipeline is
# unwrapped first instead of nesting one GridSearchCV inside another.
# No `scoring` is passed, so candidates are ranked by the estimator's own score.
base_pipeline = hpt_rf_clf.estimator if isinstance(hpt_rf_clf, GridSearchCV) else hpt_rf_clf
hpt_rf_clf = GridSearchCV(base_pipeline, tuned_parameters)
hpt_rf_clf.fit(X_train.values, y_train.values)

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    8.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   13.6s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:   16.2s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.8s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    2.4s finished
[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    5.9s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    7.3s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed: 

GridSearchCV(estimator=Pipeline(steps=[('robustscaler', RobustScaler()),
                                       ('pca', PCA()),
                                       ('randomforestclassifier',
                                        RandomForestClassifier(n_jobs=8,
                                                               random_state=0,
                                                               verbose=1))]),
             param_grid=[{'pca__n_components': [0.7, 0.85, 0.95],
                          'randomforestclassifier__max_depth': [3, 10, None],
                          'randomforestclassifier__n_estimators': [250, 500]}])

In [61]:
# Show the tuned model parameters
# (best_params_ is the parameter combination the grid search selected)
hpt_rf_clf.best_params_

{'pca__n_components': 0.7,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__n_estimators': 250}

## Evaluate the model

In [85]:
# Evaluate on training set
predictions = pd.DataFrame(hpt_rf_clf.predict(X_train.values), columns=y_train.columns)

# One column of per-class scores for each metric, keyed by the metric's name
train_scores = pd.concat(
    {metric.__name__: calculate_quality(y_train, predictions, metric)
     for metric in (accuracy_score, precision_score, recall_score, f1_score)},
    axis=1,
)
mean_train_scores = train_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    5.5s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    7.4s finished


In [86]:
# Print each metric averaged over the classes, then display the per-class table
print(mean_train_scores)
train_scores

accuracy_score     99.960000
precision_score    99.858125
recall_score       99.798750
f1_score           99.827500
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,99.91,99.53,99.72,99.62
Data or business analyst,99.97,99.89,99.84,99.86
Data scientist or machine learning specialist,99.94,99.79,99.79,99.79
Database administrator,99.97,99.69,100.0,99.85
DevOps specialist,99.98,100.0,99.8,99.9
"Developer, QA or test",99.97,99.86,99.79,99.82
"Developer, back-end",99.93,99.96,99.79,99.88
"Developer, desktop or enterprise applications",99.95,99.88,99.76,99.82
"Developer, embedded applications or devices",99.96,99.88,99.7,99.79
"Developer, front-end",99.99,100.0,99.92,99.96


In [87]:
# Evaluate on test set
predictions = pd.DataFrame(hpt_rf_clf.predict(X_test.values), columns=y_test.columns)

# One column of per-class scores for each metric, keyed by the metric's name
test_scores = pd.concat(
    {metric.__name__: calculate_quality(y_test, predictions, metric)
     for metric in (accuracy_score, precision_score, recall_score, f1_score)},
    axis=1,
)
mean_test_scores = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    2.0s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    2.7s finished


In [88]:
# Print each metric averaged over the classes, then display the per-class table
print(mean_test_scores)
test_scores

accuracy_score     93.77375
precision_score    92.35625
recall_score       60.59625
f1_score           72.47500
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,95.4,94.34,67.02,78.37
Data or business analyst,94.67,93.32,54.19,68.57
Data scientist or machine learning specialist,95.82,89.72,75.69,82.11
Database administrator,97.02,98.74,69.2,81.37
DevOps specialist,94.02,91.3,52.35,66.54
"Developer, QA or test",97.62,99.4,70.0,82.15
"Developer, back-end",82.37,82.39,52.92,64.45
"Developer, desktop or enterprise applications",91.15,94.12,42.01,58.09
"Developer, embedded applications or devices",94.9,92.03,49.55,64.42
"Developer, front-end",91.45,82.1,45.88,58.86


## Prepare logs:

In [89]:
# Data details: which file was used, which rows went to each split, and the
# feature / target column names.
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(),
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    y_train.columns.tolist()}

# The scratch directory is not guaranteed to exist (e.g. on a fresh checkout);
# create it so open() below cannot fail with FileNotFoundError.
os.makedirs(LOG_PATH, exist_ok=True)
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [90]:
# Model: human-readable description, its repr, and the fitted grid search itself
model = dict(model_description="Random Forest: with PCA + Hyperparamter tuning",
             model_details=str(hpt_rf_clf),
             model_object=hpt_rf_clf)

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as model_file:
    pickle.dump(model, model_file)

In [91]:
# Performance details: the per-class score tables for both splits
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as metrics_file:
    pickle.dump(classes_metrics, metrics_file)

## Logging:

In [92]:
# Start a new run, named after the model description, and track it
with mlflow.start_run(run_name=model["model_description"],
                      experiment_id=exp.experiment_id):
    # Upload everything currently in the scratch directory (the pickles)
    mlflow.log_artifacts(LOG_PATH)

    # One MLflow metric per averaged test score
    for metric_name, metric_value in mean_test_scores.items():
        mlflow.log_metric(metric_name, metric_value)