In [33]:
# Constants
DATA_PATH   = "../Data/Processed/3_engineered_df.pkl"

TECH_JOBS = ['Techjobs']

CORE_COLS = ['VersionControlSystem',
             'Languages',
             'Databases',
             'Platforms',
             'WebFrameworks',
             'MiscTech',
             'ToolsTech',
             'CollabTools'
]


MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "tech_jobs_predictions"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import logging
import pickle
from pathlib import Path
import os

import mlflow
from mlflow.tracking import MlflowClient
 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import PCA

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler


from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score,ConfusionMatrixDisplay,classification_report
from sklearn.model_selection import train_test_split,cross_val_score, cross_validate,cross_val_predict, GridSearchCV, RandomizedSearchCV

from scipy.stats import randint as sp_randint

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

_____

### Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        if metric_function == accuracy_score:
            quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)
        else: 
            quality_scores[col] = round(metric_function(role_truth, role_pred,zero_division=0,average='macro') * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores

In [4]:
def calculate_scores(clf, x, y):
    y_pred = clf.predict(x)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='macro',zero_division=0.0)
    recall = recall_score(y, y_pred, average='macro',zero_division=0.0)
    f1 = f1_score(y, y_pred, average='macro',zero_division=0.0)
    
    return {'accuracy': accuracy,
            'precision':precision,
            'recall': recall,
           'f1' : f1}

In [5]:
def f1_score_multi_label(clf, x, y):
    quality_scores = {}
    predictions =  pd.DataFrame(clf.predict(x),
                            columns=y.columns)
    
    for col in predictions.columns:
        job_pred  = predictions[col].copy()
        job_truth = y[col].copy()

        quality_scores[col] = round(f1_score(job_truth, job_pred,zero_division=0,average='macro') * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    
    # train_scores = pd.concat(train_scores,axis=1)
    mean_f1_score = quality_scores.mean()
    return mean_f1_score

In [6]:
def precision_score_multi_label(clf, x, y):
    quality_scores = {}
    predictions =  pd.DataFrame(clf.predict(x),
                            columns=y.columns)
    
    for col in predictions.columns:
        job_pred  = predictions[col].copy()
        job_truth = y[col].copy()

        quality_scores[col] = round(precision_score(job_truth, job_pred,zero_division=0,average='macro') * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    
    # train_scores = pd.concat(train_scores,axis=1)
    mean_f1_score = quality_scores.mean()
    return mean_f1_score

_____

In [7]:
# Load dataset and make a copy
eng_df = pd.read_pickle(DATA_PATH)
df = eng_df.copy()

In [8]:
df

Unnamed: 0_level_0,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,VersionControlSystem,VersionControlSystem,VersionControlSystem,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters
Unnamed: 0_level_1,Data scientist or machine learning specialist,"Engineer, data",Data or business analyst,"Developer, back-end",Database administrator,"Developer, mobile","Developer, full-stack",Cloud infrastructure engineer,"Developer, embedded applications or devices","Developer, QA or test",System administrator,Scientist,Security professional,"Developer, game or graphics","Developer, front-end",Blockchain,"Developer, desktop or enterprise applications",DevOps specialist,Academic researcher,Git,Mercurial,SVN,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,Delphi,Elixir,Erlang,F#,Fortran,Go,Groovy,HTML/CSS,Haskell,Java,JavaScript,Julia,Kotlin,LISP,Lua,MATLAB,OCaml,Objective-C,PHP,Perl,PowerShell,Python,R,Ruby,Rust,SAS,SQL,Scala,Solidity,Swift,TypeScript,VBA,Cassandra,Cloud Firestore,CouchDB,Couchbase,DynamoDB,Elasticsearch,Firebase Realtime Database,IBM DB2,MariaDB,Microsoft SQL Server,MongoDB,MySQL,Neo4j,Oracle,PostgreSQL,Redis,SQLite,AWS,Colocation,DigitalOcean,Firebase,Google Cloud,Heroku,IBM Cloud or Watson,Linode,Managed Hosting,Microsoft Azure,OVH,OpenStack,Oracle Cloud Infrastructure,VMware,ASP.NET,ASP.NET Core,Angular,Angular.js,Blazor,Deno,Django,Drupal,Express,FastAPI,Fastify,Flask,Gatsby,Laravel,Next.js,Node.js,Nuxt.js,Phoenix,Play Framework,React.js,Ruby on Rails,Svelte,Symfony,Vue.js,jQuery,.NET,Apache Kafka,Apache Spark,Capacitor,Cordova,Electron,Flutter,GTK,Hadoop,Hugging Face Transformers,Ionic,Keras,NumPy,Pandas,Qt,React Native,Scikit-learn,Spring,TensorFlow,Tidyverse,Torch/PyTorch,Uno Platform,Xamarin,Ansible,Chef,Docker,Flow,Homebrew,Kubernetes,Pulumi,Puppet,Terraform,Unity 3D,Unreal Engine,Yarn,npm,Android Studio,Atom,CLion,Eclipse,Emacs,GoLand,IPython/Jupyter,IntelliJ,Nano,Neovim,NetBeans,Notepad++,PhpStorm,PyCharm,Qt Creator,"RAD Studio (Delphi, C++ Builder)",RStudio,Rider,RubyMine,Spyder,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio Code,Webstorm,Xcode,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_13,skills_group_14,skills_group_15,skills_group_16,skills_group_17,skills_group_18,skills_group_19,skills_group_2,skills_group_20,skills_group_21,skills_group_22,skills_group_23,skills_group_24,skills_group_25,skills_group_26,skills_group_27,skills_group_28,skills_group_29,skills_group_3,skills_group_30,skills_group_31,skills_group_32,skills_group_33,skills_group_34,skills_group_35,skills_group_36,skills_group_37,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,3,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,2,0,1,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
10,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,4,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,7,4,0,0,0,0
12,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,3,1,1,2,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,3,0,0,1,1,0,0,0,0
15,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,1,1,0,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,5,0,0,1,0,0,0,0,0,0,0,0,0,2,0,0,1,0,3,0,0,0,3,1,1,0,0,0,0,0,3,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73261,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,1,0,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,2,0,6,0,0,5,0,1,0,0,0,0,0,0,0,0,1,0,0,0,2,3,0,0,0,0,1,2,2,1,0,0,3,0,1,0,0
73262,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
73263,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,2,0,4,1,0,3,0,1,0,0,0,0,0,0,0,2,2,0,1,0,2,0,0,0,0,0,2,0,3,0,1,0,1,0,2,0,0
73264,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,9,0,1,4,0,0,1,0,0,0,0,0,1,0,0,0,0,3,0,0,0,1,0,0,0,0,0,2,0,1,0,0,0,2,0,1,0,0


## Create a Test Set

In [9]:
X = df.drop(df[TECH_JOBS],axis = 1).droplevel(0,axis=1).copy()
y = df[TECH_JOBS].droplevel(0,axis = 1).copy()

In [10]:
# We will split using traditional train_test_split because we are dealing with multilabel data
x_train, x_test, y_train, y_test = train_test_split(X, 
                                                    y,test_size=0.20,
                                                    random_state=42)

## Deal with Imbalanced Dataset

- **To deal with imbalance, I tried different methods to see which one performs the best**
    - **Didn't apply any modifications to the Dataset both as multilabel or as multiclass.**
    - **SMOTE by converting the Dataset to multiclass insted of mutilabel by taking rows that have only 1 value.**
    - **Using random sample method by pandas that returns random samples to try to balance the dataset.** `Best Performer`

### 1-Random Sample

In [11]:
y_train.sum(axis=0).sort_values()

Blockchain                                         288
Security professional                              293
Database administrator                             328
Scientist                                          437
Developer, game or graphics                        439
Developer, QA or test                              542
System administrator                               584
Data or business analyst                           644
Engineer, data                                     824
Academic researcher                                870
Cloud infrastructure engineer                      883
Data scientist or machine learning specialist     1034
Developer, embedded applications or devices       1051
DevOps specialist                                 1104
Developer, desktop or enterprise applications     2198
Developer, mobile                                 2391
Developer, front-end                              4001
Developer, back-end                               8733
Developer,

In [12]:
# DownSample majority classes and OverSample minority Classes of training set
samples_per_class = 600
resampled_jobs = []

for job in y_train.columns:
    sub_df = y_train.loc[y_train[job] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=42)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class, random_state=42) 
    
    resampled_jobs.append(sub_df)

In [13]:
y_train = pd.concat(resampled_jobs)
x_train = x_train.loc[y_train.index].copy()
y_train.sum(axis=0).sort_values()

Blockchain                                        542
Developer, game or graphics                       549
Database administrator                            553
Developer, QA or test                             554
Security professional                             558
System administrator                              629
Developer, embedded applications or devices       659
Engineer, data                                    667
Scientist                                         674
Data or business analyst                          690
Cloud infrastructure engineer                     693
Developer, mobile                                 700
DevOps specialist                                 770
Data scientist or machine learning specialist     785
Developer, front-end                              818
Academic researcher                               836
Developer, desktop or enterprise applications     881
Developer, full-stack                            1876
Developer, back-end         

In [15]:
# DownSample majority classes and OverSample minority Classes of training set
samples_per_class = 250
resampled_jobs = []

for job in y_test.columns:
    sub_df = y_test.loc[y_test[job] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=42)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class, random_state=42) 
    
    resampled_jobs.append(sub_df)

In [16]:
y_test = pd.concat(resampled_jobs)
x_test = x_test.loc[y_test.index].copy()
y_test.sum(axis=0).sort_values()

Blockchain                                        259
Database administrator                            271
Developer, game or graphics                       276
Security professional                             276
Developer, QA or test                             291
System administrator                              304
Data or business analyst                          311
Engineer, data                                    318
Developer, embedded applications or devices       326
Cloud infrastructure engineer                     331
Developer, mobile                                 345
Scientist                                         351
DevOps specialist                                 364
Data scientist or machine learning specialist     389
Academic researcher                               418
Developer, front-end                              425
Developer, desktop or enterprise applications     443
Developer, full-stack                             962
Developer, back-end         

### 2- MultiClass with SMOTE

### Intialize MLflow

In [17]:
# Create Directories
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

In [18]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

## Create a Baseline Model

In [19]:
# Create a Logistic Regression Classifier, Used StandardScaler because logistic Regression uses l2 regression by default
log_clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression(max_iter=1000)))
log_clf.fit(x_train, y_train)

### Evaluating the multilabel dataset

In [20]:
#create a dataframe contains the predictions to evaluate the training set
predictions =  pd.DataFrame(log_clf.predict(x_train),
                            columns=y_train.columns)

#create a dict for each type of score contains a dicts if each label and its value 
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}

#Convert the dict to dataframe
train_scores = pd.concat(train_scores,axis=1)
mean_train_scores = train_scores.mean()

In [21]:
print(mean_train_scores)
train_scores

accuracy_score     92.950000
precision_score    80.687895
recall_score       65.194211
f1_score           68.856316
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Data scientist or machine learning specialist,94.11,82.54,74.65,77.91
"Engineer, data",94.05,82.32,62.98,67.83
Data or business analyst,93.58,79.67,62.08,66.47
"Developer, back-end",81.94,74.69,63.56,65.96
Database administrator,93.94,66.45,53.09,54.26
"Developer, mobile",96.62,90.83,82.79,86.29
"Developer, full-stack",82.81,73.17,65.82,68.1
Cloud infrastructure engineer,93.66,79.17,64.68,69.05
"Developer, embedded applications or devices",94.67,83.09,69.33,74.08
"Developer, QA or test",94.24,81.36,51.14,50.78


In [22]:
#create a dataframe contains the predictions to evaluate the test set
predictions =  pd.DataFrame(log_clf.predict(x_test),
                            columns=y_test.columns)

#create a dict for each type of score contains a dicts if each label and its value
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}

#Convert the dict to dataframe
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

In [23]:
print(test_scores.mean())
test_scores.sort_values("precision_score")

accuracy_score     92.110000
precision_score    73.057368
recall_score       62.482632
f1_score           65.090000
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Database administrator,93.64,56.4,51.21,51.17
Security professional,93.62,62.49,53.27,54.43
"Developer, QA or test",93.83,63.62,50.3,49.08
Cloud infrastructure engineer,92.59,69.22,60.52,63.26
DevOps specialist,92.0,69.48,59.39,62.17
"Developer, full-stack",81.09,70.08,64.26,66.08
"Developer, desktop or enterprise applications",90.8,72.1,56.85,59.29
System administrator,93.6,72.16,55.98,58.54
"Developer, back-end",81.09,72.7,62.29,64.38
Data or business analyst,93.58,73.48,61.28,64.8


### Evaluating the multiclass Dataset

### Log Baseline Model

In [24]:
# Save the model's dataset trained on
data_details = {
#For multilabel Dataset
                "data_path": DATA_PATH,
                "training_set": x_train.index.tolist(),
                "test_indices":     x_test.index.tolist(), 
                "features_names":   x_train.columns.tolist(),
                "targets_names":    y_train.columns.tolist()
#For multiclass Dataset    
                # "x_train": x_train,
                # "x_test":x_test,
                # "y_train":y_train,
                # "y_test": y_test
}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [25]:
# save the model, model details and model's description
model = {"model_description": "Baseline model: Logistic Regression, multilabel, Data Resampled ",
         "model_details": str(log_clf),
         "model_object": log_clf} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [26]:
# save the model metrics
classes_metrics = {"train_scores": train_scores,
                    "test_scores" : test_scores} 


with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [27]:
# Start a run in the experiment and track current model
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Track pickle files
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Random Forest Model

In [28]:
#Create a random forest classifier
rf_clf = make_pipeline(#StandardScaler(),
                       #PCA(n_components=0.95),
                       RandomForestClassifier(n_jobs=-1,
                                              verbose=1,
                                              random_state=42))

rf_clf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.2s finished


### Evaluating the multilabel dataset

In [29]:
#create a dataframe contains the predictions to evaluate the training set
predictions =  pd.DataFrame(rf_clf.predict(x_train),
                            columns=y_train.columns)

#create a dict for each type of score contains a dicts if each label and its value 
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}

#Convert the dict to dataframe
train_scores = pd.concat(train_scores,axis=1)
mean_train_scores = train_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished


In [30]:
print(train_scores.mean())
train_scores.sort_values("precision_score")

accuracy_score     99.996316
precision_score    99.996316
recall_score       99.974737
f1_score           99.986316
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Engineer, data",99.99,99.99,99.93,99.96
Data or business analyst,99.99,99.99,99.93,99.96
"Developer, desktop or enterprise applications",99.99,99.99,99.94,99.97
"Developer, mobile",99.99,99.99,99.93,99.96
"Developer, front-end",99.99,99.99,99.94,99.97
Cloud infrastructure engineer,99.99,99.99,99.93,99.96
System administrator,99.99,99.99,99.92,99.96
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Blockchain,100.0,100.0,100.0,100.0
"Developer, game or graphics",100.0,100.0,100.0,100.0


In [31]:
#create a dataframe contains the predictions to evaluate the test set
predictions =  pd.DataFrame(rf_clf.predict(x_test),
                            columns=y_test.columns)
#create a dict for each type of score contains a dicts if each label and its value
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}

#Convert the dict to dataframe
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished


In [32]:
print(test_scores.mean())
test_scores.sort_values("precision_score")

accuracy_score     92.186316
precision_score    71.454211
recall_score       55.697368
f1_score           57.233684
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
System administrator,93.56,46.8,49.98,48.34
"Developer, QA or test",93.87,46.94,50.0,48.42
Security professional,94.17,47.09,49.99,48.5
Database administrator,94.27,47.15,49.99,48.53
Blockchain,94.55,47.27,50.0,48.6
Cloud infrastructure engineer,92.8,64.19,51.69,51.66
Data or business analyst,93.41,70.4,52.37,52.93
"Developer, full-stack",82.23,74.78,61.02,63.15
"Developer, embedded applications or devices",93.62,78.43,58.36,62.0
Data scientist or machine learning specialist,92.95,80.46,63.03,67.51


### Evaluating the multiclass Dataset

### Log RandomForest Model

In [103]:
# save the model, model details and model's description
model = {"model_description": "Random Forest, multilabel, Data resampled",
         "model_details": str(rf_clf),
         "model_object": rf_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [104]:
# save the model metrics
classes_metrics = {"train_scores": train_scores,
                  "test_scores" : test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [105]:
# Start a run in the experiment and track current model
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Track pickle files
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Decision Tree

In [106]:
from sklearn.tree import DecisionTreeClassifier
dec_clf = DecisionTreeClassifier(random_state=42)

dec_clf.fit(x_train,y_train)

### Evaluating the multilabel dataset

In [107]:
# Evaluate on train set
predictions =  pd.DataFrame(dec_clf.predict(x_train),
                            columns=y_train.columns)
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)
mean_train_scores = train_scores.mean()

In [108]:
print(train_scores.mean())
train_scores.sort_values("precision_score")

accuracy_score     99.997368
precision_score    99.995263
recall_score       99.990526
f1_score           99.992105
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Security professional,99.99,99.93,100.0,99.96
"Developer, desktop or enterprise applications",99.99,99.99,99.97,99.98
"Developer, back-end",99.99,99.99,99.99,99.99
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Blockchain,99.99,100.0,99.93,99.96
"Developer, front-end",100.0,100.0,100.0,100.0
"Developer, game or graphics",99.99,100.0,99.93,99.96
Scientist,100.0,100.0,100.0,100.0
System administrator,100.0,100.0,100.0,100.0
"Developer, QA or test",100.0,100.0,100.0,100.0


In [109]:
# Evaluate on test set
predictions =  pd.DataFrame(dec_clf.predict(x_test),
                            columns=y_test.columns)
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

In [110]:
print(test_scores.mean())
test_scores.sort_values("precision_score")

accuracy_score     85.354211
precision_score    60.556316
recall_score       60.264737
f1_score           60.346842
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Security professional,89.49,50.81,50.53,50.49
"Developer, QA or test",87.96,52.09,51.92,52.0
System administrator,84.84,54.33,53.97,54.13
Database administrator,87.46,54.75,54.0,54.31
"Developer, desktop or enterprise applications",81.7,56.62,55.5,55.93
"Engineer, data",86.16,56.88,56.16,56.48
"Developer, back-end",64.75,57.47,58.26,57.66
Data or business analyst,87.23,58.34,57.78,58.04
DevOps specialist,85.11,60.7,60.19,60.43
Cloud infrastructure engineer,85.81,61.18,60.55,60.85


### Log Decision Tree

In [111]:
# Model
model = {"model_description": "Decision Tree, multilabel, Data resampled",
         "model_details": str(dec_clf),
         "model_object": dec_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [112]:
# Performance details
classes_metrics = {"train_scores": train_scores,
                  "test_scores" : test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [113]:
# Start a run in the experiment and track current model
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Track pickle files
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

### Evaluating the multiclass Dataset

## Gradient Boost

In [114]:
base_estimator = GradientBoostingClassifier(random_state=42)

gd_clf = OneVsRestClassifier(base_estimator)
gd_clf.fit(x_train,y_train)

### Evaluating the multilabal Dataset

In [115]:
# Evaluate on train set
predictions =  pd.DataFrame(gd_clf.predict(x_train),
                            columns=y_train.columns)
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)
mean_train_scores = train_scores.mean()

In [116]:
print(train_scores.mean())
train_scores.sort_values("precision_score")

accuracy_score     91.822105
precision_score    87.011053
recall_score       66.669474
f1_score           70.760000
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",77.23,75.63,65.77,67.39
"Developer, full-stack",82.31,78.43,75.02,76.38
"Developer, desktop or enterprise applications",89.68,83.05,63.0,67.23
DevOps specialist,91.38,84.21,65.54,70.37
Cloud infrastructure engineer,92.41,84.65,66.68,71.73
Data scientist or machine learning specialist,93.41,84.67,78.57,81.22
"Developer, front-end",91.16,85.1,67.41,72.33
Scientist,93.87,85.78,69.93,75.11
Data or business analyst,93.24,85.87,64.57,69.84
Academic researcher,92.75,86.11,71.39,76.32


In [117]:
# Evaluate on test set
predictions =  pd.DataFrame(gd_clf.predict(x_test),
                            columns=y_test.columns)
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

In [118]:
print(test_scores.mean())
test_scores.sort_values("precision_score")

accuracy_score     90.427368
precision_score    75.041053
recall_score       62.416842
f1_score           64.994737
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",92.86,46.49,49.93,48.15
Security professional,93.05,61.58,50.32,48.95
Database administrator,91.77,67.01,51.71,51.41
"Developer, back-end",76.18,70.09,61.77,62.97
"Developer, full-stack",78.63,72.78,70.53,71.46
Data or business analyst,91.65,73.67,58.35,61.39
"Developer, desktop or enterprise applications",87.93,75.18,58.97,61.67
Cloud infrastructure engineer,90.6,76.1,63.3,66.93
DevOps specialist,90.3,76.98,61.96,65.63
Data scientist or machine learning specialist,91.09,77.03,73.04,74.81


### Log Gradient Boost

In [119]:
# Model
model = {"model_description": "Gradient Boost, multilabel, Data resampled",
         "model_details": str(gd_clf),
         "model_object": gd_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [120]:
# Performance details
classes_metrics = {"train_scores": train_scores,
                  "test_scores" : test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [121]:
# Start a run in the experiment and track current model
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Track pickle files
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Cat Boost

In [122]:
cat_clf = CatBoostClassifier(loss_function='MultiLogloss',
                            eval_metric='HammingLoss',
                            verbose=1)
cat_clf.fit(x_train, y_train)

Learning rate set to 0.029123
0:	learn: 0.1101801	total: 143ms	remaining: 2m 22s
1:	learn: 0.1110388	total: 292ms	remaining: 2m 25s
2:	learn: 0.1110295	total: 429ms	remaining: 2m 22s
3:	learn: 0.1111404	total: 578ms	remaining: 2m 24s
4:	learn: 0.1111911	total: 724ms	remaining: 2m 24s
5:	learn: 0.1112004	total: 873ms	remaining: 2m 24s
6:	learn: 0.1111357	total: 1.03s	remaining: 2m 26s
7:	learn: 0.1111450	total: 1.19s	remaining: 2m 27s
8:	learn: 0.1112050	total: 1.34s	remaining: 2m 27s
9:	learn: 0.1111958	total: 1.52s	remaining: 2m 31s
10:	learn: 0.1111773	total: 1.68s	remaining: 2m 30s
11:	learn: 0.1111588	total: 1.85s	remaining: 2m 32s
12:	learn: 0.1111588	total: 2.03s	remaining: 2m 34s
13:	learn: 0.1111404	total: 2.19s	remaining: 2m 34s
14:	learn: 0.1111127	total: 2.35s	remaining: 2m 34s
15:	learn: 0.1111404	total: 2.49s	remaining: 2m 33s
16:	learn: 0.1111357	total: 2.63s	remaining: 2m 32s
17:	learn: 0.1111450	total: 2.78s	remaining: 2m 31s
18:	learn: 0.1111542	total: 2.97s	remaining:

<catboost.core.CatBoostClassifier at 0x2af2fd1ed10>

In [123]:
#create a dataframe contains the predictions to evaluate the training set
predictions =  pd.DataFrame(cat_clf.predict(x_train),
                            columns=y_train.columns)

#create a dict for each type of score contains a dicts if each label and its value 
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}

#Convert the dict to dataframe
train_scores = pd.concat(train_scores,axis=1)
mean_train_scores = train_scores.mean()

In [124]:
print(train_scores.mean())
train_scores.sort_values("precision_score")

accuracy_score     94.313158
precision_score    93.374211
recall_score       75.748421
f1_score           80.946316
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",85.43,86.07,78.15,80.69
"Developer, full-stack",88.75,87.25,83.56,85.13
"Developer, desktop or enterprise applications",91.89,90.73,70.1,75.83
"Developer, front-end",93.52,91.01,76.03,81.31
Data scientist or machine learning specialist,95.83,91.82,85.41,88.27
Scientist,95.75,92.18,78.87,84.01
Academic researcher,95.03,92.73,79.67,84.69
"Developer, embedded applications or devices",95.51,92.9,75.98,81.95
DevOps specialist,94.28,92.98,76.41,82.16
Cloud infrastructure engineer,95.24,93.36,78.53,84.04


In [125]:
#create a dataframe contains the predictions to evaluate the test set
predictions =  pd.DataFrame(cat_clf.predict(x_test),
                            columns=y_test.columns)
#create a dict for each type of score contains a dicts if each label and its value
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}

#Convert the dict to dataframe
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

In [126]:
print(test_scores.mean())
test_scores.sort_values("precision_score")

accuracy_score     90.594737
precision_score    75.947895
recall_score       61.707368
f1_score           64.355263
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",92.96,46.49,49.99,48.18
Security professional,93.02,46.56,49.94,48.19
Database administrator,91.86,68.23,50.77,49.54
"Developer, back-end",77.23,71.42,64.92,66.48
"Developer, full-stack",78.67,72.82,70.62,71.53
Cloud infrastructure engineer,90.63,76.38,63.17,66.84
Data or business analyst,91.93,76.93,58.14,61.36
DevOps specialist,90.49,77.99,62.64,66.49
"Developer, desktop or enterprise applications",88.25,79.02,58.18,60.78
Data scientist or machine learning specialist,91.56,79.31,71.42,74.53


### Log Cat Boost

In [127]:
# Model
model = {"model_description": "Cat Boost, multilabel, Data resampled",
         "model_details": str(cat_clf),
         "model_object": cat_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [128]:
# Performance details
classes_metrics = {"train_scores": train_scores,
                  "test_scores" : test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [129]:
# Start a run in the experiment and track current model
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Track pickle files
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Hyperparameter Tuning

In [130]:
list(rf_clf.get_params().keys())

['memory',
 'steps',
 'verbose',
 'randomforestclassifier',
 'randomforestclassifier__bootstrap',
 'randomforestclassifier__ccp_alpha',
 'randomforestclassifier__class_weight',
 'randomforestclassifier__criterion',
 'randomforestclassifier__max_depth',
 'randomforestclassifier__max_features',
 'randomforestclassifier__max_leaf_nodes',
 'randomforestclassifier__max_samples',
 'randomforestclassifier__min_impurity_decrease',
 'randomforestclassifier__min_samples_leaf',
 'randomforestclassifier__min_samples_split',
 'randomforestclassifier__min_weight_fraction_leaf',
 'randomforestclassifier__n_estimators',
 'randomforestclassifier__n_jobs',
 'randomforestclassifier__oob_score',
 'randomforestclassifier__random_state',
 'randomforestclassifier__verbose',
 'randomforestclassifier__warm_start']

In [17]:
    # {'randomforestclassifier__n_estimators':[80,100,120],
    #  'randomforestclassifier__max_depth':[30,40,45,50],
    #  'randomforestclassifier__min_samples_split':[2,4,6],
    #  'randomforestclassifier__min_samples_leaf':[1,3,5],
    #  'randomforestclassifier__class_weight':['balanced',None],
    #  'randomforestclassifier__max_features':['sqrt',None],
    #  'randomforestclassifier__verbose': [0],
rf_param_grid = [
    {'randomforestclassifier__n_estimators':[100],
     'randomforestclassifier__max_depth':[45],
     'randomforestclassifier__min_samples_split':[2],
     # 'randomforestclassifier__min_samples_leaf':[1,3,5],
     'randomforestclassifier__class_weight':['balanced'],
     'randomforestclassifier__max_features':['sqrt'],
     'randomforestclassifier__verbose': [0],
    }]
    

rf_clf_grid_search = GridSearchCV(rf_clf, rf_param_grid,cv= 3, scoring=precision_score_multi_label, return_train_score=True,refit=True,verbose=2)

rf_clf_grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 1 candidates, totalling 3 fits
[CV] END randomforestclassifier__class_weight=balanced, randomforestclassifier__max_depth=45, randomforestclassifier__max_features=sqrt, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=100, randomforestclassifier__verbose=0; total time=   5.8s
[CV] END randomforestclassifier__class_weight=balanced, randomforestclassifier__max_depth=45, randomforestclassifier__max_features=sqrt, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=100, randomforestclassifier__verbose=0; total time=   2.8s
[CV] END randomforestclassifier__class_weight=balanced, randomforestclassifier__max_depth=45, randomforestclassifier__max_features=sqrt, randomforestclassifier__min_samples_split=2, randomforestclassifier__n_estimators=100, randomforestclassifier__verbose=0; total time=   2.7s


In [133]:
rf_clf_grid_search.best_params_

{'randomforestclassifier__class_weight': 'balanced',
 'randomforestclassifier__max_depth': 45,
 'randomforestclassifier__max_features': 'sqrt',
 'randomforestclassifier__min_samples_split': 2,
 'randomforestclassifier__n_estimators': 100,
 'randomforestclassifier__verbose': 0}

In [18]:
#create a dataframe contains the predictions to evaluate the test set
predictions =  pd.DataFrame(rf_clf_grid_search.best_estimator_.predict(x_test),
                            columns=y_test.columns)
#create a dict for each type of score contains a dicts if each label and its value
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}

#Convert the dict to dataframe
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

In [19]:
print(test_scores.mean())
test_scores.sort_values("precision_score")

accuracy_score     89.995789
precision_score    74.420000
recall_score       55.937368
f1_score           57.100000
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
System administrator,90.42,45.23,49.97,47.48
"Developer, QA or test",92.98,46.49,50.0,48.18
Security professional,93.09,46.56,49.98,48.21
Blockchain,93.58,46.79,50.0,48.34
"Developer, back-end",75.04,69.18,57.34,56.94
"Developer, full-stack",77.56,72.49,64.1,65.73
Data or business analyst,91.46,72.97,52.44,52.58
Cloud infrastructure engineer,89.93,76.48,53.69,54.36
Database administrator,91.91,79.3,50.21,48.32
"Developer, embedded applications or devices",92.91,80.36,57.03,60.17


### Log RandomForest hyperparameters

In [20]:
# save the model, model details and model's description
model = {"model_description": "Random Forest, Tuned, multilabel, Data resampled",
         "model_details": str(rf_clf_grid_search.best_estimator_),
         "model_object": rf_clf_grid_search.best_estimator_}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [24]:
# save the model metrics
classes_metrics = {"train_scores": train_scores,
                  "test_scores" : test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [25]:
# Start a run in the experiment and track current model
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Track pickle files
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Voting Classifier

In [47]:
log_clf = make_pipeline(StandardScaler(),
                    LogisticRegression(max_iter=1000))
gd_clf = GradientBoostingClassifier(random_state=42)

rf_clf = RandomForestClassifier(max_depth=45,n_jobs=-1,
                                max_features= 'sqrt',min_samples_split= 2,
                                n_estimators = 100,verbose=0,
                                random_state=42)
# cat_clf = CatBoostClassifier(loss_function='MultiLogloss',
#                             eval_metric='HammingLoss',
#                             verbose=1)

voting_clf = MultiOutputClassifier(VotingClassifier(
    estimators=[
        ('log', OneVsRestClassifier(log_clf)),
        ('rf', rf_clf),
        ('gd', OneVsRestClassifier(gd_clf)),
        # ('cat', cat_clf)
    ]
,voting = "soft"))


voting_clf.fit(x_train,y_train)

In [53]:
#create a dataframe contains the predictions to evaluate the test set
predictions =  pd.DataFrame(voting_clf.predict(x_test),
                            columns=y_test.columns)
#create a dict for each type of score contains a dicts if each label and its value
test_scores = {score.__name__: calculate_quality(y_test, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}

#Convert the dict to dataframe
test_scores = pd.concat(test_scores,axis=1)
mean_test_scores = test_scores.mean()

In [54]:
print(test_scores.mean())
test_scores.sort_values("precision_score")

accuracy_score     88.887895
precision_score    74.033158
recall_score       61.726316
f1_score           64.030526
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",92.42,46.21,50.0,48.03
Security professional,93.07,46.58,49.95,48.21
Database administrator,89.21,63.46,50.59,48.57
"Developer, back-end",69.68,67.91,64.83,65.25
System administrator,89.04,71.57,53.81,54.44
Cloud infrastructure engineer,89.04,73.37,61.79,64.89
"Developer, full-stack",77.63,75.28,74.5,74.85
DevOps specialist,88.23,75.38,60.17,63.19
"Developer, front-end",85.33,76.53,63.57,66.62
Data or business analyst,90.67,77.53,57.99,60.97


### Log voting classifier

In [51]:
# save the model, model details and model's description
model = {"model_description": "Voting Classifier",
         "model_details": str(voting_clf),
         "model_object": voting_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [55]:
# save the model metrics
classes_metrics = {"train_scores": train_scores,
                  "test_scores" : test_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [56]:
# Start a run in the experiment and track current model
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Track pickle files
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Retrieve Runs and best model

In [26]:
runs = mlflow.search_runs([exp.experiment_id])
runs[['run_id','tags.mlflow.runName','metrics.precision_score','metrics.recall_score','metrics.accuracy_score','metrics.f1_score']]

Unnamed: 0,run_id,tags.mlflow.runName,metrics.precision_score,metrics.recall_score,metrics.accuracy_score,metrics.f1_score
0,14f2d561654142feaf32a7a1d47c1eab,"Random Forest, Tuned, multilabel, Data resampled",74.42,55.937368,89.995789,57.1
1,4a979e5999d744638e97e8dbd3fd6c6d,"Cat Boost, multilabel, Data resampled",75.947895,61.707368,90.594737,64.355263
2,9cffb6ca850445df8a7c0b0c6030668b,"Gradient Boost, multilabel, Data resampled",75.041053,62.416842,90.427368,64.994737
3,83457e0863644b509a68dc0fa535edad,"Decision Tree, multilabel, Data resampled",60.556316,60.264737,85.354211,60.346842
4,fab87b6f6c6d4474ba178b073563d788,"Random Forest, multilabel, Data resampled",78.516842,56.937895,90.108947,58.417368
5,92737765318b4ab1850ca61b7b7cb2ac,"Baseline model: Logistic Regression, multilabe...",72.584737,62.111053,90.004211,64.568421
6,7a2197b6f50342c7af17753b04ea0851,Voting Classifier,74.033158,61.726316,88.887895,64.030526
7,e8096d83b33b4a11843016021e153901,"Random Forest, Tuned, multilabel, Data resampled",75.008947,57.132632,88.493684,58.079474
8,c606d229be8b41f5a6917e7e94ae07f5,"Cat Boost, multilabel, Data resampled",76.108421,62.047368,89.145789,64.588947
9,88b17d3ec37f462fa282494d11b3fe86,"Gradient Boost, multilabel, Data resampled",73.109474,62.439474,88.781053,64.650526


In [58]:
best_run= runs.sort_values('metrics.precision_score',ascending=False).iloc[0]

In [59]:
best_run

run_id                                      c606d229be8b41f5a6917e7e94ae07f5
experiment_id                                             302056893493878655
status                                                              FINISHED
artifact_uri               file:///C:/Users/Ali/Desktop/DS Projects/Tech ...
start_time                                  2024-01-27 10:24:16.822000+00:00
end_time                                    2024-01-27 10:24:16.910000+00:00
metrics.precision_score                                            76.108421
metrics.accuracy_score                                             89.145789
metrics.f1_score                                                   64.588947
metrics.recall_score                                               62.047368
tags.mlflow.runName                    Cat Boost, multilabel, Data resampled
tags.mlflow.user                                                         Ali
tags.mlflow.source.name    C:\Users\Ali\mambaforge-pypy3\envs\env1\Lib\si...

In [50]:
artifact_path = best_run["artifact_uri"].replace("file:///", "")

In [51]:
model_pkl = os.path.join(artifact_path, LOG_MODEL_PKL)
with open(model_pkl, "rb") as f:
    model = pickle.load(f)

model['model_object']

<catboost.core.CatBoostClassifier at 0x1c43698fd10>