In [1]:
DF_PATH       = "../data/processed/2_cleaned_data.pkl"

ROLE_COLS      = ['DevType']
TECH_COLS      = ['DatabaseWorkedWith',
                  'LanguageWorkedWith',
                  # 'NEWCollabToolsWorkedWith',
                  'PlatformWorkedWith',
                  'MiscTechWorkedWith',
                  'WebframeWorkedWith']
SYST_COLS      = ['OpSys']

MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
import random
import plotly 
import os
from pathlib import Path

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

from sklearn.decomposition import PCA, KernelPCA

from sklearn import tree
from sklearn.linear_model import LogisticRegression

from matplotlib import pyplot as plt

_________

### Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores

_________

# Initialize

### Create directories

In [4]:
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

### Read data

In [5]:
# Read Data
df = pd.read_pickle(DF_PATH)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

### Balance classes 

In [6]:
# Check the total samples of roles
roles_df = df["DevType"].copy()
roles_df.sum(axis=0)

Academic researcher                               1430
Data or business analyst                          1410
Data scientist or machine learning specialist     1803
Database administrator                            1027
DevOps specialist                                 1985
Developer, QA or test                             1065
Developer, back-end                              13160
Developer, desktop or enterprise applications     4029
Developer, embedded applications or devices       1544
Developer, front-end                              7300
Developer, full-stack                            13990
Developer, game or graphics                        855
Developer, mobile                                 3633
Engineer, data                                    1344
Scientist                                          775
System administrator                              1311
dtype: int64

In [7]:
# Resample roles
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns:
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=0)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class, random_state=0) 
    
    resampled_roles.append(sub_df)

In [8]:
# Construct dfs
roles_df  = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()

In [9]:
roles_df.sum(axis=0)

Academic researcher                              2470
Data or business analyst                         2124
Data scientist or machine learning specialist    2712
Database administrator                           1864
DevOps specialist                                2164
Developer, QA or test                            1617
Developer, back-end                              6826
Developer, desktop or enterprise applications    2960
Developer, embedded applications or devices      1816
Developer, front-end                             3027
Developer, full-stack                            5845
Developer, game or graphics                      1559
Developer, mobile                                2334
Engineer, data                                   2088
Scientist                                        1915
System administrator                             2012
dtype: int64

### Split

In [10]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1), 
                                                    df["DevType"], 
                                                    random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1),


____________

# Train models

### Initialize MLflow

In [55]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = mlflow.MlflowClient()
mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
exp = mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

2022/11/10 23:07:35 INFO mlflow.tracking.fluent: Experiment with name 'skills_jobs_stackoverflow' does not exist. Creating a new experiment.


In [45]:
client.get_experiment_by_
name

<bound method MlflowClient.get_experiment_by_name of <mlflow.tracking.client.MlflowClient object at 0x7fbf66cb7ca0>>

### 1. Logistic regression

In [12]:
clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression()))

clf.fit(X_train.values, Y_train.values)
predictions =  pd.DataFrame(clf.predict(X_train.values),
                            columns=Y_train.columns)

In [13]:
# Evaluate on training set
predictions =  pd.DataFrame(clf.predict(X_train.values),
                            columns=Y_train.columns)
train_scores = {score.__name__: calculate_quality(Y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)

In [14]:
# Evaluate on test set
predictions =  pd.DataFrame(clf.predict(X_test.values), columns=Y_test.columns)
test_scores = {score.__name__: calculate_quality(Y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
print(mean_test_scores)
test_scores

accuracy_score     88.145000
precision_score    59.436250
recall_score       28.103125
f1_score           36.148750
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,88.08,60.12,16.67,26.1
Data or business analyst,90.48,69.1,29.49,41.34
Data scientist or machine learning specialist,90.96,75.89,53.16,62.52
Database administrator,90.23,48.65,3.85,7.13
DevOps specialist,90.79,66.52,29.54,40.91
"Developer, QA or test",91.42,0.0,0.0,0.0
"Developer, back-end",71.42,66.56,45.11,53.77
"Developer, desktop or enterprise applications",83.79,51.32,9.97,16.7
"Developer, embedded applications or devices",92.17,60.47,30.88,40.88
"Developer, front-end",86.67,65.65,38.31,48.39


## Log run

### 1. Prepare

In [16]:
# Data details
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(), 
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    Y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [17]:
# Model
model = {"model_description": "Baseline model: Logistic Regression ",
         "model_details": str(clf),
         "model_object": clf} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [18]:
# Performance details
classes_metrics = {"train_scores": train_scores, 
                   "test_scores":  test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

### 2. Log

In [56]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Log pickles 
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score) 
    