In [1]:
DF_PATH = "../data/processed/cleaned_data.pkl"

ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageHaveWorkedWith',
                  'DatabaseHaveWorkedWith',
                  #'PlatformHaveWorkedWith',
                  'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith',
                  'ToolsTechHaveWorkedWith',
                  'NEWCollabToolsHaveWorkedWith']

MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score
import os
import pickle
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [3]:
def calc_quality(ground_truth , predictions , metric_func , sort_values = False):
    quality_scores ={}
    for col in predictions.columns:
        role_pred = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_func(role_truth , role_pred)*100 , 2)

    quality_scores = pd.Series(quality_scores.values() , index = quality_scores.keys())

    if sort_values:
        quality_scores = quality_scores.sort_values()

    return quality_scores


In [4]:
df = pd.read_pickle(DF_PATH)
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skills_group_18,skills_group_19,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,2,0
3,0,0,0,0,0,0,0,0,0,1,...,2,0,0,0,0,0,0,0,2,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,3,1,2,3,0,1,0
8,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,3,0,2,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,6,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,3,1,0,0,1,1
83435,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,4,2,2,4,2,1
83436,0,0,1,0,1,0,0,0,0,0,...,0,0,0,4,5,1,4,0,1,0
83437,0,0,0,0,0,0,1,0,0,0,...,0,0,0,4,1,0,4,0,1,0


In [5]:
roles_df = df['DevType'].copy()
roles_df.sum(axis=0)

Academic researcher                               1708
Data or business analyst                          1658
Data scientist or machine learning specialist     2460
Database administrator                            1210
DevOps specialist                                 3056
Developer, QA or test                             1135
Developer, back-end                              17084
Developer, desktop or enterprise applications     4845
Developer, embedded applications or devices       2138
Developer, front-end                              8932
Developer, full-stack                            20655
Developer, game or graphics                        899
Developer, mobile                                 4751
Engineer, data                                    1941
Scientist                                         1046
System administrator                              2069
dtype: int64

In [6]:
#Resample roles
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns:
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()

    if len(sub_df) < samples_per_class:
        #Upsample
        sub_df = sub_df.sample(samples_per_class , replace= True , random_state =0)
    
    else:
        #downsample
        sub_df = sub_df.sample(samples_per_class , random_state =0)

    resampled_roles.append(sub_df)

In [7]:
roles_df = pd.concat(resampled_roles)
df = df.loc[roles_df.index].copy()

In [8]:
roles_df.sum(axis=0)

Academic researcher                              2280
Data or business analyst                         1965
Data scientist or machine learning specialist    2576
Database administrator                           1765
DevOps specialist                                2170
Developer, QA or test                            1514
Developer, back-end                              5710
Developer, desktop or enterprise applications    2690
Developer, embedded applications or devices      1773
Developer, front-end                             2614
Developer, full-stack                            5602
Developer, game or graphics                      1441
Developer, mobile                                2155
Engineer, data                                   2046
Scientist                                        1910
System administrator                             2110
dtype: int64

In [9]:
X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1),df["DevType"],random_state=0)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1),df["DevType"],random_state=0)


### Train models

In [10]:
#intialize mlflow client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
#mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)


  return FileStore(store_uri, store_uri)


### Vanilla Forest

In [11]:
rf_clf = make_pipeline(RobustScaler(),
                    PCA(n_components=0.95),
                    RandomForestClassifier(n_jobs=8 , verbose=1 ,random_state=0))
rf_clf.fit(X_train.values, Y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    2.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    6.1s finished


In [12]:
# Evaluate on train set
predictions =  pd.DataFrame(rf_clf.predict(X_train.values),
                            columns=Y_train.columns)
train_scores = {score.__name__: calc_quality(Y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)
print(train_scores.mean())
train_scores.sort_values("precision_score")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s


accuracy_score     99.971875
precision_score    99.920625
recall_score       99.819375
f1_score           99.869375
dtype: float64


[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Scientist,99.93,99.65,99.65,99.65
"Developer, QA or test",99.96,99.82,99.64,99.73
Data scientist or machine learning specialist,99.95,99.84,99.79,99.82
Database administrator,99.98,99.85,99.92,99.89
Academic researcher,99.96,99.88,99.77,99.83
"Developer, game or graphics",99.97,99.91,99.63,99.77
Data or business analyst,99.97,99.93,99.8,99.86
"Developer, mobile",99.99,99.94,100.0,99.97
"Developer, desktop or enterprise applications",99.98,99.95,99.9,99.93
"Developer, back-end",99.97,99.98,99.93,99.95


In [13]:
# Evaluate on test set
predictions =  pd.DataFrame(rf_clf.predict(X_test.values),
                            columns=Y_test.columns)
test_scores = {score.__name__: calc_quality(Y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
print(test_scores.mean())
test_scores.sort_values("precision_score")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


accuracy_score     92.56625
precision_score    94.34750
recall_score       49.55375
f1_score           63.86375
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",82.62,77.65,56.65,65.51
"Developer, back-end",78.71,85.08,34.96,49.56
"Developer, front-end",89.29,90.05,28.8,43.64
Data scientist or machine learning specialist,93.94,90.11,61.88,73.38
"Developer, desktop or enterprise applications",90.85,93.22,34.21,50.06
Academic researcher,94.94,94.08,60.07,73.33
"Developer, mobile",94.69,94.29,55.62,69.96
DevOps specialist,92.75,96.04,36.33,52.72
Data or business analyst,94.52,97.48,47.44,63.82
"Developer, embedded applications or devices",94.62,97.59,38.94,55.67


### Log

In [14]:
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices": X_test.index.tolist(),
                "features_names": X_train.columns.droplevel(0).tolist(),
                "targets_names": Y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [15]:
model = {"model_description": "Random Forest: with PCA - Basic",
         "model_details": str(rf_clf),
         "model_object": rf_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [16]:
classes_metrics = {"train_scores": train_scores,
                   "test_scores": test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [17]:
# Start a new run and track
with mlflow.start_run(experiment_id=exp.experiment_id,
                      run_name=model["model_description"]):
    # Log pickles
    mlflow.log_artifacts(LOG_PATH)

    # Track metrics
    for metric, score in test_scores.mean().items():
        mlflow.log_metric(metric, score)

In [18]:
hpt_rf_clf = make_pipeline(RobustScaler(),
                           PCA(),
                           RandomForestClassifier(n_jobs=8,verbose=1,
random_state=0))

In [19]:
tuned_parameters = [{
    'pca__n_components': [0.7, 0.85, 0.95],
    'randomforestclassifier__n_estimators': [250, 500],
    'randomforestclassifier__max_depth':    [3, 10, None],
}]

In [20]:
hpt_rf_clf = GridSearchCV(hpt_rf_clf, tuned_parameters)
hpt_rf_clf.fit(X_train.values, Y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    1.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    0.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    1.5s
[Parallel(n_jobs=8)]: Done 250 out of 250 | elapsed:    2.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      |

In [21]:
hpt_rf_clf.best_params_

{'pca__n_components': 0.7,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__n_estimators': 500}

In [22]:
#Evaluate on train set
predictions =  pd.DataFrame(hpt_rf_clf.predict(X_train.values),
                            columns=Y_train.columns)
train_scores = {score.__name__: calc_quality(Y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    2.1s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    2.4s finished


In [23]:
print(train_scores.mean())
train_scores

accuracy_score     99.973125
precision_score    99.940000
recall_score       99.808125
f1_score           99.873750
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,99.96,99.88,99.77,99.83
Data or business analyst,99.97,99.93,99.8,99.86
Data scientist or machine learning specialist,99.96,100.0,99.69,99.84
Database administrator,99.98,99.92,99.85,99.89
DevOps specialist,99.98,100.0,99.82,99.91
"Developer, QA or test",99.97,99.82,99.73,99.78
"Developer, back-end",99.97,100.0,99.91,99.95
"Developer, desktop or enterprise applications",99.98,99.95,99.9,99.93
"Developer, embedded applications or devices",99.97,100.0,99.71,99.85
"Developer, front-end",99.99,100.0,99.95,99.97


In [24]:
#Evaluate on test set
predictions =  pd.DataFrame(hpt_rf_clf.predict(X_test.values),
                            columns=Y_test.columns)
test_scores = {score.__name__: calc_quality(Y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.4s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:    0.5s finished


In [25]:
print(test_scores.mean())
test_scores

accuracy_score     92.735625
precision_score    90.820000
recall_score       52.969375
f1_score           65.990000
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,94.75,90.43,61.15,72.96
Data or business analyst,94.54,93.82,49.69,64.97
Data scientist or machine learning specialist,94.25,86.9,67.59,76.04
Database administrator,94.94,98.07,45.93,62.56
DevOps specialist,93.06,91.02,41.76,57.25
"Developer, QA or test",96.52,99.14,58.23,73.37
"Developer, back-end",79.25,77.5,43.18,55.46
"Developer, desktop or enterprise applications",90.92,88.19,37.17,52.3
"Developer, embedded applications or devices",94.83,91.58,44.47,59.87
"Developer, front-end",89.5,82.35,34.44,48.57


In [26]:
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(), 
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    Y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [27]:
model = {"model_description": "Random Forest: with PCA + Hyperparamter tuning",
         "model_details": str(hpt_rf_clf),
         "model_object": hpt_rf_clf} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [28]:
classes_metrics = {"train_scores": train_scores, 
                   "test_scores":  test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [29]:
with mlflow.start_run(experiment_id=exp.experiment_id, 
                      run_name=model["model_description"]):
    # Log pickles
    mlflow.log_artifacts(LOG_PATH)  
    
    # Track metrics 
    for metric, score in test_scores.mean().items():
        mlflow.log_metric(metric, score) 
    