In [1]:
DF_PATH       = "../data/processed/2_cleaned_data.pkl"

ROLE_COLS      = ['DevType']
TECH_COLS      = ['LanguageHaveWorkedWith',
                  'DatabaseHaveWorkedWith',
                  'PlatformHaveWorkedWith',
                  'WebframeHaveWorkedWith',
                  'MiscTechHaveWorkedWith',
                  'ToolsTechHaveWorkedWith',
                  'NEWCollabToolsHaveWorkedWith']

MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import os 
import pickle

import mlflow
from  mlflow.tracking import MlflowClient


from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.pipeline import make_pipeline


from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import RobustScaler

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
df = pd.read_pickle(DF_PATH)
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Blockchain,Cloud infrastructure engineer,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,Developer Advocate,Developer Experience,"Developer, QA or test",...,skills_group_21,skills_group_22,skills_group_23,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89179,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,3,8,0,1,0,0
89180,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,5,0,1,0,0
89181,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,2,0,0,0,0,0
89182,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,7,3,0,0,0,0


In [4]:
roles_df = df['DevType'].copy()
roles_df.sum(axis=0)

Academic researcher                               1279
Blockchain                                         299
Cloud infrastructure engineer                      994
Data or business analyst                           795
Data scientist or machine learning specialist     1528
Database administrator                             243
DevOps specialist                                 1319
Developer Advocate                                 200
Developer Experience                               308
Developer, QA or test                              571
Developer, back-end                              13402
Developer, desktop or enterprise applications     3765
Developer, embedded applications or devices       1807
Developer, front-end                              4974
Developer, full-stack                            24630
Developer, game or graphics                        838
Developer, mobile                                 2519
Engineer, data                                    1195
Hardware E

In [5]:
roles_df.sum(axis=0)
Sample_per_class = 1200
resampled_roles = []

for role in roles_df.columns:
    sub_df = roles_df.loc[roles_df[role] == 1].copy()

    if len(sub_df) < Sample_per_class:
        sub_df = sub_df.sample(Sample_per_class, replace=True, random_state=42)

    else:
        sub_df = sub_df.sample(Sample_per_class, random_state=42)

    resampled_roles.append(sub_df)

roles_df = pd.concat(resampled_roles)

In [6]:
df = df.loc[roles_df.index].copy()
roles_df.sum(axis = 0)


Academic researcher                              1200
Blockchain                                       1200
Cloud infrastructure engineer                    1200
Data or business analyst                         1200
Data scientist or machine learning specialist    1200
Database administrator                           1200
DevOps specialist                                1200
Developer Advocate                               1200
Developer Experience                             1200
Developer, QA or test                            1200
Developer, back-end                              1200
Developer, desktop or enterprise applications    1200
Developer, embedded applications or devices      1200
Developer, front-end                             1200
Developer, full-stack                            1200
Developer, game or graphics                      1200
Developer, mobile                                1200
Engineer, data                                   1200
Hardware Engineer           

In [7]:
X_train, X_test, y_train, y_test = train_test_split( df.drop('DevType' , axis = 1), df['DevType'], test_size=0.33, random_state=42)


  X_train, X_test, y_train, y_test = train_test_split( df.drop('DevType' , axis = 1), df['DevType'], test_size=0.33, random_state=42)


In [8]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

In [9]:
exp.experiment_id

'372234719956380473'

### Random Forest ( Vanilla Model )

In [10]:
rf_clf = make_pipeline(RobustScaler() , 
                    PCA(n_components=0.95) , 
                    RandomForestClassifier(n_jobs=8 , verbose=1 , random_state=42   ))
rf_clf.fit(X_train , y_train)


[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   12.4s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:   45.4s finished


In [11]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores

In [12]:
predictions =  pd.DataFrame(rf_clf.predict(X_train.values),columns=y_train.columns)
# Evaluate on training set
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    3.7s finished


In [13]:
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,99.98,100.0,99.51,99.75
Blockchain,99.99,99.75,100.0,99.88
Cloud infrastructure engineer,100.0,100.0,100.0,100.0
Data or business analyst,99.96,99.75,99.38,99.57
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Database administrator,99.97,99.26,100.0,99.63
DevOps specialist,99.99,100.0,99.87,99.94
Developer Advocate,100.0,100.0,100.0,100.0
Developer Experience,99.99,99.88,100.0,99.94
"Developer, QA or test",99.99,100.0,99.74,99.87


In [14]:
# Evaluate on test set
predictions =  pd.DataFrame(rf_clf.predict(X_test.values), columns=y_test.columns)
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    3.3s finished
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
test_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,95.95,0.0,0.0,0.0
Blockchain,99.71,99.72,93.01,96.25
Cloud infrastructure engineer,98.1,100.0,56.7,72.37
Data or business analyst,98.64,100.0,66.84,80.12
Data scientist or machine learning specialist,96.12,88.89,2.13,4.16
Database administrator,99.81,98.97,96.47,97.7
DevOps specialist,95.6,0.0,0.0,0.0
Developer Advocate,99.94,100.0,98.48,99.23
Developer Experience,99.73,100.0,93.28,96.52
"Developer, QA or test",98.94,100.0,75.72,86.18


In [16]:
mean_train_scores = train_scores.mean()

In [17]:
print('Mean Train Scores')
print(mean_train_scores)
print()
print('Mean Test Scores')
print(mean_test_scores)

Mean Train Scores
accuracy_score     99.988333
precision_score    99.912500
recall_score       99.846250
f1_score           99.879167
dtype: float64

Mean Test Scores
accuracy_score     97.845417
precision_score    80.934167
recall_score       48.849583
f1_score           54.991667
dtype: float64


## There Are  Overfitting !!

##### LOG

In [18]:
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(), 
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [19]:
# Model
model = {"model_description": "Random Forest: with PCA - Basic",
         "model_details": str(rf_clf),
         "model_object": rf_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [20]:
# Performance details
classes_metrics = {"train_scores": train_scores,
                   "test_scores": test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [21]:
# Start a new run and track
with mlflow.start_run(experiment_id=exp.experiment_id,
                      run_name=model["model_description"]):
    # Log pickles
    mlflow.log_artifacts(LOG_PATH)

    # Track metrics
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Hyper parameter tuning

In [22]:
hpt_rf_clf = make_pipeline(RobustScaler() , PCA() , RandomForestClassifier(n_jobs=8,
                                                  verbose=1,
                                                  random_state=42))

In [23]:
tuned_parameters = [{
    'pca__n_components':[0.5 , 0.7 , 0.85 ],
    'randomforestclassifier__n_estimators':[150 , 750] , 
    'randomforestclassifier__max_depth':[3  , 10 , None]
}]

In [24]:
hpt_rf_clf = GridSearchCV(hpt_rf_clf , tuned_parameters)
hpt_rf_clf.fit(X_train.values , y_train.values)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    3.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.8s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    3.9s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 150 out of 150 | elapsed:    0.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_job

In [25]:
print('Done')

Done


In [26]:
hpt_rf_clf.best_params_


{'pca__n_components': 0.5,
 'randomforestclassifier__max_depth': None,
 'randomforestclassifier__n_estimators': 750}

In [27]:
# Evaluate on train set
predictions =  pd.DataFrame(hpt_rf_clf.predict(X_train.values),
                            columns=y_train.columns)
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    5.6s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   13.4s
[Parallel(n_jobs=8)]: Done 750 out of 750 | elapsed:   22.0s finished


In [28]:
# Evaluate on test set
predictions =  pd.DataFrame(hpt_rf_clf.predict(X_test.values),
                            columns=y_test.columns)
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   13.4s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   15.5s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   18.3s
[Parallel(n_jobs=8)]: Done 750 out of 750 | elapsed:   21.5s finished


In [29]:
print(test_scores.mean())
test_scores

accuracy_score     97.906667
precision_score    86.920417
recall_score       51.379167
f1_score           58.047500
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,96.01,66.67,1.57,3.07
Blockchain,99.71,100.0,92.75,96.24
Cloud infrastructure engineer,98.09,98.76,57.18,72.42
Data or business analyst,98.62,98.86,67.1,79.94
Data scientist or machine learning specialist,96.43,72.29,15.96,26.14
Database administrator,99.8,98.96,96.22,97.57
DevOps specialist,95.61,100.0,0.24,0.48
Developer Advocate,99.92,99.49,98.48,98.98
Developer Experience,99.73,100.0,93.28,96.52
"Developer, QA or test",98.97,100.0,76.44,86.65


In [30]:
# Data details
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices":     X_test.index.tolist(), 
                "features_names":   X_train.columns.droplevel(0).tolist(),
                "targets_names":    y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [31]:
# Model
model = {"model_description": "Random Forest: with PCA + Hyperparamter tuning",
         "model_details": str(hpt_rf_clf),
         "model_object": hpt_rf_clf} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [32]:
# Preformance details
classes_metrics = {"train_scores": train_scores, 
                   "test_scores":  test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [None]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id, 
                      run_name=model["model_description"]):
    # Log pickles
    mlflow.log_artifacts(LOG_PATH)  
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score) 

In [None]:
mean_train_scores = train_scores.mean()


In [None]:
print('Mean Train Scores')
print(mean_train_scores)
print()
print('Mean Test Scores')
print(mean_test_scores)