In [1]:
# --- Input data ---------------------------------------------------------------
DF_PATH = "../data/processed/3_cleaned_data.pkl"

# --- Column groups (top-level labels of the DataFrame's column index) ---------
ROLE_COLS = ["DevType"]
TECH_COLS = [
    "LanguageHaveWorkedWith",
    "DatabaseHaveWorkedWith",
    "PlatformHaveWorkedWith",
    "WebframeHaveWorkedWith",
    "MiscTechHaveWorkedWith",
    "ToolsTechHaveWorkedWith",
    "NEWCollabToolsHaveWorkedWith",
]

# --- MLflow tracking ----------------------------------------------------------
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# --- Temporary artifacts pickled before being logged to MLflow ----------------
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

# --- Output: the resampled DataFrame written at the end of the notebook -------
EXPORT_PATH = "../data/processed/4_balanced_data.pkl"

In [2]:
# Load packages
import pandas as pd
import numpy as np

from pathlib import Path
import os
import pickle

import mlflow
from mlflow.tracking import MlflowClient

from sklearn.linear_model import LogisticRegression

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Raise pandas' displayed-column limit to 1000 so wide frames are not truncated in cell output
pd.options.display.max_columns=1000

___
# Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    """
    Score every class (column) separately instead of giving one overall summary.

    Input:
        ground_truth: DataFrame of true labels, one column per class
        predictions: DataFrame of predicted labels; its column names select the
            columns that are scored, so each must also exist in ground_truth
        metric_function: callable ``metric_function(y_true, y_pred)`` returning a
            fraction, e.g. a scikit-learn score function
        sort_values: if True, return the scores sorted in ascending order
    Output:
        a pandas Series indexed by class name holding each class's score as a
        percentage rounded to two decimals
    """
    # The scores are computed from the `predictions` argument itself; the parameter
    # used to be misspelled, which made the body fall back to a notebook-level
    # global of the same name.
    quality_scores = {
        col: round(metric_function(ground_truth[col], predictions[col]) * 100, 2)
        for col in predictions.columns
    }

    quality_scores = pd.Series(quality_scores, dtype=float)
    if sort_values:
        quality_scores = quality_scores.sort_values()

    return quality_scores

---
# Initialize
## Create directories

In [4]:
# Make sure the MLflow tracking directory and the temporary log directory exist.
# Path.mkdir(parents=True, exist_ok=True) also creates missing parent folders and
# does nothing when the directory is already there, so no explicit
# "if not exists" check (as os.mkdir would need) is required.
for directory in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(directory).mkdir(parents=True, exist_ok=True)

## Read data

In [5]:
# Read data
# Load the DataFrame pickled at DF_PATH, then preview five randomly chosen rows.
# No random_state is passed to sample(), so the preview differs between runs.
df = pd.read_pickle(DF_PATH)
df.sample(5)

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,LanguageHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,DatabaseHaveWorkedWith,PlatformHaveWorkedWith,PlatformHaveWorkedWith,PlatformHaveWorkedWith,PlatformHaveWorkedWith,PlatformHaveWorkedWith,PlatformHaveWorkedWith,PlatformHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,WebframeHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveW
orkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,MiscTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,ToolsTechHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,NEWCollabToolsHaveWorkedWith,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Scientist,System administrator,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,Delphi,Elixir,Erlang,F#,Go,Groovy,HTML/CSS,Haskell,Java,JavaScript,Julia,Kotlin,LISP,Matlab,Node.js,Objective-C,PHP,Perl,PowerShell,Python,R,Ruby,Rust,SQL,Scala,Swift,TypeScript,VBA,Cassandra,Couchbase,DynamoDB,Elasticsearch,Firebase,IBM DB2,MariaDB,Microsoft SQL Server,MongoDB,MySQL,Oracle,PostgreSQL,Redis,SQLite,AWS,DigitalOcean,Google Cloud Platform,Heroku,IBM Cloud or Watson,Microsoft Azure,Oracle Cloud Infrastructure,ASP.NET,ASP.NET Core,Angular,Angular.js,Django,Drupal,Express,FastAPI,Flask,Gatsby,Laravel,React.js,Ruby on Rails,Spring,Svelte,Symfony,Vue.js,jQuery,.NET Core / .NET 5,.NET Framework,Apache Spark,Cordova,Flutter,Hadoop,Keras,NumPy,Pandas,Qt,React Native,TensorFlow,Torch/PyTorch,Ansible,Chef,Deno,Docker,Flow,Git,Kubernetes,Pulumi,Puppet,Terraform,Unity 3D,Unreal Engine,Xamarin,Yarn,Android Studio,Atom,Eclipse,Emacs,IPython/Jupyter,IntelliJ,Neovim,NetBeans,Notepad++,PHPStorm,PyCharm,RStudio,Rider,RubyMine,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio Code,Webstorm,Xcode,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_13,skills_group_14,skills_group_15,skills_group_16,skills_group_17,skills_group_18,skills_group_19,skills_group_2,skills_group_20,skills_group_21,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
51600,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,1,2,2,0,0,0,0,0,1,0,0,0,0,1,2,0,0,0,0,0
64815,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,5,0,1,0,0,0,0,0,3,0,0,0,0,0,0,0,2,3,0,0,0,0
80093,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,3,0,2,0,0
12936,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,2,0,0,0,0,0
54990,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,3,0,0,0,1,0,1,2,1,6,0,0,0,0


---
## Balance classes
**We can see that classes are not balanced so we need to balance them.**

In [6]:
# Check the frequency of roles
# Work on an independent copy of the columns grouped under 'DevType' (one column
# per role), so later changes to roles_df do not touch df.
roles_df = df['DevType'].copy()
# Column-wise sum = number of rows flagged for each role, shown smallest first.
roles_df.sum(axis=0).sort_values()

Developer, game or graphics                        899
Scientist                                         1046
Developer, QA or test                             1135
Database administrator                            1210
Data or business analyst                          1658
Academic researcher                               1708
Engineer, data                                    1941
System administrator                              2069
Developer, embedded applications or devices       2138
Data scientist or machine learning specialist     2460
DevOps specialist                                 3056
Developer, mobile                                 4751
Developer, desktop or enterprise applications     4845
Developer, front-end                              8932
Developer, back-end                              17084
Developer, full-stack                            20655
dtype: int64

**Now we can do resampling to perform over and under-sampling.**

In [7]:
# Resample roles
# Draw exactly `samples_per_class` rows for every role so that rare roles are
# over-sampled and frequent roles are under-sampled.
samples_per_class = 1500
resampled_roles = []

for role_col in roles_df.columns:
    # Rows of respondents that hold this role
    role_rows = roles_df[roles_df[role_col] == 1]

    # Sampling with replacement is only needed (upsampling) when the role has
    # fewer rows than requested; otherwise rows are drawn without replacement.
    needs_upsampling = len(role_rows) < samples_per_class

    resampled_roles.append(
        role_rows.sample(samples_per_class, replace=needs_upsampling, random_state=0)
    )

**Now merge the resampled_roles. Each sub_df holds the rows sampled for one specific role (a respondent can hold several roles, so the other role columns are not necessarily zero). Concatenating them gives an almost balanced roles dataframe; we then use its index to select the matching rows of df, which also contains the skills, and this way we almost balance our dataset.**

In [8]:
# Now merge the resampled_roles
# Stack the per-role samples into one frame; its index lists the selected rows
# (labels repeat when a row was drawn more than once).
roles_df = pd.concat(resampled_roles)
# Pull the same rows, with all their columns, out of the full DataFrame.
# NOTE(review): df and roles_df are rebound here, so re-running this cell on its
# own works on the already-resampled df; re-run from the "Read data" cell instead.
df = df.loc[roles_df.index].copy()

**Note that some classes still have more samples than others: participants who work in more than one role can be sampled under each of their roles, so the same row may be repeated across different roles, resulting in some redundancy within the same role.**

In [9]:
# Rows per role after resampling, smallest first
roles_df.sum().sort_values()

Developer, game or graphics                      1798
Developer, QA or test                            1873
Database administrator                           2193
Developer, embedded applications or devices      2209
Scientist                                        2374
Data or business analyst                         2470
Engineer, data                                   2576
System administrator                             2648
Developer, mobile                                2686
DevOps specialist                                2709
Academic researcher                              2859
Data scientist or machine learning specialist    3176
Developer, front-end                             3267
Developer, desktop or enterprise applications    3405
Developer, full-stack                            6973
Developer, back-end                              7124
dtype: int64

___
## Split into training and testing

In [10]:
# Features are every column group except the 'DevType' targets. Passing level=0
# tells pandas which level of the column MultiIndex holds the label, which avoids
# the PerformanceWarning emitted when dropping from a non-lexsorted MultiIndex
# without a level; the resulting columns are the same.
# NOTE(review): the split happens after resampling, so a row that was drawn more
# than once can end up in both the training and the test set — confirm this is
# acceptable for the reported test scores.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns="DevType", level=0),
    df["DevType"],
    random_state=0,
)

  obj = obj._drop_axis(labels, axis, level=level, errors=errors)


---
## Train models
### Initialize MLflow

In [11]:
# Point MLflow at the tracking location configured in MLFLOW_TRACKING_URI,
# then create the client that is used later to look up the experiment.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()

In [12]:
# Create the experiment only when it does not exist yet.
# An explicit lookup replaces the former bare `except:`, which also swallowed
# unrelated failures (bad tracking URI, permission errors, KeyboardInterrupt)
# and reported them as "experiment exists".
if mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME) is None:
    mlflow.create_experiment(MLFLOW_EXPERIMENT_NAME)
else:
    print(f'Experiment "{MLFLOW_EXPERIMENT_NAME}" exists at "{mlflow.get_tracking_uri()}"')

Experiment "skills_jobs_stackoverflow" exists at "../models/mlruns"


In [13]:
# Fetch the experiment by name; exp.experiment_id is needed later to start the run.
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
# Print the experiment to confirm which one was resolved.
print(exp)

<Experiment: artifact_location='../models/mlruns/0', experiment_id='0', lifecycle_stage='active', name='skills_jobs_stackoverflow', tags={}>


**Now construct a baseline model, which is the simplest model we could have, so that we can compare its accuracy with more complicated models and decide whether using complicated models would give us a more sophisticated solution or not**
### 1. Logistic regression

In [14]:
# Baseline: standardise the features, then fit one independent logistic
# regression per target column (MultiOutputClassifier wraps the base estimator).
base_estimator = LogisticRegression()
clf = make_pipeline(StandardScaler(), MultiOutputClassifier(base_estimator))

# Fit on plain arrays; as the cell's last expression, the fitted pipeline is displayed.
clf.fit(X_train.values, y_train.values)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('multioutputclassifier',
                 MultiOutputClassifier(estimator=LogisticRegression()))])

In [15]:
# Evaluate on training set
metric_functions = [accuracy_score, precision_score, recall_score, f1_score]

# Predict on the training features and label the columns like the targets
predictions = pd.DataFrame(clf.predict(X_train.values), columns=y_train.columns)

# One column per metric (named after the metric function), one row per target column
train_scores = pd.concat(
    {metric.__name__: calculate_quality(y_train, predictions, metric)
     for metric in metric_functions},
    axis=1,
)

In [16]:
# Per-role training scores in percent, one column per metric
train_scores

Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,89.58,62.41,28.21,38.85
Data or business analyst,90.83,63.75,22.34,33.09
Data scientist or machine learning specialist,90.51,70.55,50.19,58.65
Database administrator,91.07,54.63,7.25,12.8
DevOps specialist,90.72,69.02,31.87,43.6
"Developer, QA or test",92.17,37.5,0.64,1.26
"Developer, back-end",74.58,61.98,35.84,45.42
"Developer, desktop or enterprise applications",86.91,61.96,17.71,27.55
"Developer, embedded applications or devices",92.31,66.46,32.42,43.58
"Developer, front-end",88.76,65.46,38.02,48.1


In [17]:
# Evaluate on test set
metric_functions = [accuracy_score, precision_score, recall_score, f1_score]

# Predict on the held-out features and label the columns like the targets
predictions = pd.DataFrame(clf.predict(X_test.values), columns=y_test.columns)

# One column per metric (named after the metric function), one row per target column
test_scores = pd.concat(
    {metric.__name__: calculate_quality(y_test, predictions, metric)
     for metric in metric_functions},
    axis=1,
)

# Average each metric over all target columns
mean_test_scores = test_scores.mean()

In [18]:
# Print the per-metric means first, then display the per-role test scores table
print(mean_test_scores)
test_scores

accuracy_score     88.757500
precision_score    65.678750
recall_score       29.420000
f1_score           37.925625
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,88.88,61.72,27.88,38.41
Data or business analyst,90.27,62.2,23.76,34.38
Data scientist or machine learning specialist,90.58,67.5,49.67,57.23
Database administrator,90.7,54.93,6.9,12.26
DevOps specialist,89.83,61.69,27.86,38.38
"Developer, QA or test",92.2,100.0,0.43,0.85
"Developer, back-end",74.33,63.68,34.93,45.12
"Developer, desktop or enterprise applications",85.72,53.94,14.84,23.28
"Developer, embedded applications or devices",92.02,64.08,32.56,43.18
"Developer, front-end",88.9,63.62,39.12,48.45


---
## Log run
**Now we save logs in mlflow directory**
### 1. Prepare

In [19]:
# Data details
data_details = {"data_path": DF_PATH,
                "training_indices": X_train.index.tolist(),
                "test_indices": X_train.index.tolist(),
                "feature_names": X_train.columns.droplevel(0).tolist(),
                "targets_names": y_train.columns.tolist()}

with open (os.path.join(LOG_PATH, LOG_DATA_PKL), 'wb') as output_file:
    pickle.dump(data_details, output_file)

In [20]:
# Model
# Bundle a human-readable description, the pipeline's string form and the fitted
# pipeline object itself, then pickle the bundle into the temporary log directory.
model = dict(
    model_description="Baseline model: Logistic Regression",
    model_details=str(clf),
    model_object=clf,
)

model_pkl_path = os.path.join(LOG_PATH, LOG_MODEL_PKL)
with open(model_pkl_path, "wb") as pkl_file:
    pickle.dump(model, pkl_file)

In [21]:
# Performance details
# Keep the per-class score tables of both splits together and pickle them into
# the temporary log directory.
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

metrics_pkl_path = os.path.join(LOG_PATH, LOG_METRICS_PKL)
with open(metrics_pkl_path, "wb") as pkl_file:
    pickle.dump(classes_metrics, pkl_file)

### 2. Log

In [22]:
# Start a new run and track
run_name = model["model_description"]

with mlflow.start_run(experiment_id=exp.experiment_id, run_name=run_name):
    # Attach every file found in the temporary log directory as run artifacts
    mlflow.log_artifacts(LOG_PATH)

    # Record the mean test-set value of each metric under the metric's name
    for metric_name, metric_value in mean_test_scores.items():
        mlflow.log_metric(metric_name, metric_value)

---
# Export data:
**Save the cleaned, balanced dataframe to a pickle file for further use**

In [23]:
# Write the resampled DataFrame (features and targets) to EXPORT_PATH
df.to_pickle(EXPORT_PATH)