In [1]:
# Constants

# Pickled, feature-engineered DataFrame read at the start of the notebook
DATA_PATH = "../Data/Processed/3_engineered_df.pkl"

# Top-level column group used as the prediction targets
TECH_JOBS = ["Techjobs"]

# Top-level column groups of the skill features
# (defined for reference; not used by the cells in this notebook)
CORE_COLS = [
    "VersionControlSystem",
    "Languages",
    "Databases",
    "Platforms",
    "WebFrameworks",
    "MiscTech",
    "ToolsTech",
    "CollabTools",
]

# MLflow tracking store and experiment name
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "tech_jobs_predictions"

# Scratch folder and file names for the pickles that are logged as run artifacts
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import logging
import pickle
from pathlib import Path
import os

import mlflow
from mlflow.tracking import MlflowClient
 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler


from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score,ConfusionMatrixDisplay,classification_report
from sklearn.model_selection import train_test_split,cross_val_score, cross_validate,cross_val_predict, GridSearchCV

# Raise pandas' display limits to 1000 rows and 1000 columns
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

_____

### Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    """Score each prediction column against the ground-truth column of the same name.

    Parameters
    ----------
    ground_truth : DataFrame-like
        True labels; must contain every column present in ``predictions``.
    predictions : DataFrame-like
        Predicted labels, one column per target.
    metric_function : callable
        ``accuracy_score`` is called as ``metric(truth, pred)``; any other
        metric is additionally given ``zero_division=0`` and ``average='macro'``.
    sort_values : bool, default False
        When True, the returned scores are sorted in ascending order.

    Returns
    -------
    pandas.Series
        One score per column, expressed as a percentage rounded to two decimals.
    """
    per_label = {}
    for label in predictions.columns:
        predicted = predictions[label].copy()
        actual = ground_truth[label].copy()
        # Accuracy takes no averaging options; the other metrics are macro-averaged
        if metric_function == accuracy_score:
            raw_score = metric_function(actual, predicted)
        else:
            raw_score = metric_function(actual, predicted, zero_division=0, average='macro')
        per_label[label] = round(raw_score * 100, 2)

    result = pd.Series(per_label.values(), index=per_label.keys())
    return result.sort_values() if sort_values else result

In [4]:
def calculate_scores(clf, x, y):
    """Predict with ``clf`` on ``x`` and score the predictions against ``y``.

    Parameters
    ----------
    clf : estimator
        Fitted object exposing ``predict``.
    x : array-like
        Features passed to ``clf.predict``.
    y : array-like
        True labels.

    Returns
    -------
    dict
        Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and ``'f1'``.
        Precision, recall and F1 are macro-averaged with ``zero_division=0.0``.
    """
    y_pred = clf.predict(x)
    macro_options = {"average": 'macro', "zero_division": 0.0}

    scores = {'accuracy': accuracy_score(y, y_pred)}
    macro_metrics = (('precision', precision_score),
                     ('recall', recall_score),
                     ('f1', f1_score))
    for name, metric in macro_metrics:
        scores[name] = metric(y, y_pred, **macro_options)
    return scores

_____

In [5]:
# Read the engineered dataset and work on a deep copy so the loaded frame stays untouched.
# NOTE: read_pickle unpickles the file, so DATA_PATH must point to a trusted file.
eng_df = pd.read_pickle(DATA_PATH)
df = eng_df.copy(deep=True)

In [6]:
# Display the loaded frame to inspect its two-level column header
df

Unnamed: 0_level_0,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,VersionControlSystem,VersionControlSystem,VersionControlSystem,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,Skills
_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters
Unnamed: 0_level_1,Data scientist or machine learning specialist,"Engineer, data",Data or business analyst,"Developer, back-end",Database administrator,"Developer, mobile","Developer, full-stack",Cloud infrastructure engineer,"Developer, embedded applications or devices","Developer, QA or test",System administrator,Scientist,Security professional,"Developer, game or graphics","Developer, front-end",Blockchain,"Developer, desktop or enterprise applications",DevOps specialist,Academic researcher,Git,Mercurial,SVN,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,Delphi,Elixir,Erlang,F#,Fortran,Go,Groovy,HTML/CSS,Haskell,Java,JavaScript,Julia,Kotlin,LISP,Lua,MATLAB,OCaml,Objective-C,PHP,Perl,PowerShell,Python,R,Ruby,Rust,SAS,SQL,Scala,Solidity,Swift,TypeScript,VBA,Cassandra,Cloud Firestore,CouchDB,Couchbase,DynamoDB,Elasticsearch,Firebase Realtime Database,IBM DB2,MariaDB,Microsoft SQL Server,MongoDB,MySQL,Neo4j,Oracle,PostgreSQL,Redis,SQLite,AWS,Colocation,DigitalOcean,Firebase,Google Cloud,Heroku,IBM Cloud or Watson,Linode,Managed Hosting,Microsoft Azure,OVH,OpenStack,Oracle Cloud Infrastructure,VMware,ASP.NET,ASP.NET Core,Angular,Angular.js,Blazor,Deno,Django,Drupal,Express,FastAPI,Fastify,Flask,Gatsby,Laravel,Next.js,Node.js,Nuxt.js,Phoenix,Play Framework,React.js,Ruby on Rails,Svelte,Symfony,Vue.js,jQuery,.NET,Apache Kafka,Apache Spark,Capacitor,Cordova,Electron,Flutter,GTK,Hadoop,Hugging Face Transformers,Ionic,Keras,NumPy,Pandas,Qt,React Native,Scikit-learn,Spring,TensorFlow,Tidyverse,Torch/PyTorch,Uno Platform,Xamarin,Ansible,Chef,Docker,Flow,Homebrew,Kubernetes,Pulumi,Puppet,Terraform,Unity 3D,Unreal Engine,Yarn,npm,Android Studio,Atom,CLion,Eclipse,Emacs,GoLand,IPython/Jupyter,IntelliJ,Nano,Neovim,NetBeans,Notepad++,PhpStorm,PyCharm,Qt Creator,"RAD Studio (Delphi, C++ Builder)",RStudio,Rider,RubyMine,Spyder,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio 
Code,Webstorm,Xcode,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_13,skills_group_14,skills_group_15,skills_group_16,skills_group_17,skills_group_18,skills_group_19,skills_group_2,skills_group_20,skills_group_21,skills_group_22,skills_group_23,skills_group_24,skills_group_25,skills_group_26,skills_group_27,skills_group_28,skills_group_29,skills_group_3,skills_group_30,skills_group_31,skills_group_32,skills_group_33,skills_group_34,skills_group_35,skills_group_36,skills_group_37,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,3,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,2,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,3,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,2,0,1,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
10,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,4,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,7,4,0,0,0,0
12,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,3,1,1,2,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,3,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
73263,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,2,0,4,1,0,3,0,1,0,0,0,0,0,0,0,2,2,0,1,0,2,0,0,0,0,0,2,0,3,0,1,0,1,0,2,0,0
73264,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,9,0,1,4,0,0,1,0,0,0,0,0,1,0,0,0,0,3,0,0,0,1,0,0,0,0,0,2,0,1,0,0,0,2,0,1,0,0
73265,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,1,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,2,4,0,5,0,0,1,0,0,0,1,2,0,1,0,0,0,1,1,1,0,1,0,2,0,1,0,0,0,1,0,0,7,3,0,1,0,0


## Create a Test Set

In [7]:
# Features: every column group except the target group. The group is removed by
# its label on the top header level (instead of passing a whole DataFrame as the
# labels to drop), then the top level is discarded so columns keep only their
# second-level names.
X = df.drop(columns=TECH_JOBS, level=0).droplevel(0, axis=1).copy()

# Targets: the columns of the target group, flattened the same way
y = df[TECH_JOBS].droplevel(0, axis=1).copy()

In [8]:
# The targets are multilabel, so a plain random train_test_split is used:
# 80% of the rows for training, 20% held out for testing, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

## Deal with Imbalanced Dataset

- **To deal with the class imbalance, I tried several methods to see which one performs best:**
    - **Applying no modifications to the dataset, either as multilabel or as multiclass.**
    - **SMOTE, after converting the dataset to multiclass instead of multilabel by keeping only rows that have exactly one label.**
    - **Random resampling with pandas' `sample` method to try to balance the dataset.** `Best Performer`

### 1-Random Sample

In [9]:
# Draw the same number of positive training rows for every job label:
# jobs with fewer positives than the target are sampled with replacement
# (upsampling), all others without replacement (downsampling).
samples_per_class = 600
resampled_jobs = []

for job in y_train.columns:
    sub_df = y_train.loc[y_train[job] == 1].copy()
    # Replacement is only needed when there are not enough rows to draw from
    sub_df = sub_df.sample(
        samples_per_class,
        replace=len(sub_df) < samples_per_class,
        random_state=42,
    )
    resampled_jobs.append(sub_df)

In [10]:
# Stack the per-job samples into the new training targets. The same row label can
# appear several times (drawn for more than one job, or repeated by upsampling).
y_train = pd.concat(resampled_jobs)

# Pick the matching feature rows. Duplicate labels are dropped from x_train first,
# so re-running this cell (when x_train already holds the resampled rows) does not
# multiply rows through the label lookup. On the first run this is a no-op.
x_train = x_train.loc[~x_train.index.duplicated(keep="first")].loc[y_train.index].copy()
assert x_train.index.equals(y_train.index), "features and targets are misaligned"

# Number of positive rows per job after resampling
y_train.sum(axis=0).sort_values()

Blockchain                                        751
Developer, game or graphics                       769
Security professional                             840
Developer, QA or test                             841
Developer, embedded applications or devices      1035
Scientist                                        1060
Data or business analyst                         1142
Database administrator                           1152
Developer, mobile                                1250
Engineer, data                                   1261
Academic researcher                              1284
System administrator                             1314
Data scientist or machine learning specialist    1373
Cloud infrastructure engineer                    1429
DevOps specialist                                1563
Developer, desktop or enterprise applications    1815
Developer, front-end                             1966
Developer, full-stack                            3829
Developer, back-end         

### 2- MultiClass with SMOTE

### Initialize MLflow

In [11]:
# Make sure the MLflow store and the scratch logging folder exist
for directory in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(directory).mkdir(parents=True, exist_ok=True)

In [12]:
# Initialize client and experiment
# The tracking URI is set before the client is created
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
# Select the experiment by name, then fetch its entity;
# exp.experiment_id is used when the runs are started further down
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

## Create a Baseline Model

In [103]:
# Baseline: a logistic regression wrapped in MultiOutputClassifier for the multilabel targets.
# Features are standardized first because LogisticRegression applies
# L2 regularization by default.
log_clf = make_pipeline(
    StandardScaler(),
    MultiOutputClassifier(LogisticRegression(max_iter=1000)),
)
log_clf.fit(x_train, y_train)

### Evaluating the multilabel dataset

In [104]:
# Training-set predictions as a DataFrame with one column per job label
predictions = pd.DataFrame(log_clf.predict(x_train), columns=y_train.columns)

# Per-job scores for every metric, keyed by the metric function's name and
# joined side by side (rows: jobs, columns: metrics)
train_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_train, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

# Average of each metric over all jobs
mean_train_scores = train_scores.mean()

In [105]:
# Metric averages over all jobs, followed by the per-job table
print(mean_train_scores)
train_scores

accuracy_score     89.204211
precision_score    78.707368
recall_score       64.196842
f1_score           67.116842
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Data scientist or machine learning specialist,91.63,82.31,74.34,77.54
"Engineer, data",89.57,75.04,58.2,60.96
Data or business analyst,91.11,78.62,61.45,65.33
"Developer, back-end",71.34,69.42,66.39,66.99
Database administrator,90.32,76.14,55.32,57.08
"Developer, mobile",93.56,86.54,77.65,81.28
"Developer, full-stack",77.3,74.74,72.98,73.67
Cloud infrastructure engineer,89.59,78.4,66.59,70.3
"Developer, embedded applications or devices",92.22,80.28,63.19,67.57
"Developer, QA or test",92.68,82.07,50.58,49.27


In [106]:
# Test-set predictions as a DataFrame with one column per job label
predictions = pd.DataFrame(log_clf.predict(x_test), columns=y_test.columns)

# Per-job scores for every metric, keyed by the metric function's name and
# joined side by side (rows: jobs, columns: metrics)
test_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_test, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

# Average of each metric over all jobs
mean_test_scores = test_scores.mean()

In [107]:
# Metric averages over all jobs, then the per-job table ordered by precision (lowest first)
print(test_scores.mean())
test_scores.sort_values(by="precision_score")

accuracy_score     91.528421
precision_score    68.189474
recall_score       62.951053
f1_score           64.251579
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Database administrator,95.46,57.93,52.19,52.98
Security professional,98.14,58.3,51.83,52.77
System administrator,94.55,62.0,55.37,57.12
Blockchain,97.36,62.45,65.04,63.61
Cloud infrastructure engineer,92.18,63.47,63.76,63.62
"Developer, QA or test",96.92,63.5,50.49,50.24
"Developer, back-end",65.85,64.41,63.45,63.62
"Developer, game or graphics",96.96,65.45,66.19,65.81
Scientist,97.07,67.45,64.38,65.75
DevOps specialist,92.22,67.84,62.76,64.75


### Evaluating the multiclass Dataset

### Log Baseline Model

In [18]:
# Record which data the model was trained and evaluated on
data_details = {
    # Multilabel dataset: source file, row labels of both splits, column names
    "data_path": DATA_PATH,
    "training_set": x_train.index.tolist(),
    "test_indices": x_test.index.tolist(),
    "features_names": x_train.columns.tolist(),
    "targets_names": y_train.columns.tolist(),
    # Multiclass dataset (disabled): would store the frames themselves
    # "x_train": x_train,
    # "x_test":x_test,
    # "y_train":y_train,
    # "y_test": y_test
}

# Write the details to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_DATA_PKL).open("wb") as output_file:
    pickle.dump(data_details, output_file)

In [19]:
# Bundle the fitted baseline with a description and its printed configuration
model = {
    "model_description": "Baseline model: Logistic Regression, multilabel, train set only resampled ",
    "model_details": str(log_clf),
    "model_object": log_clf,
}

# Write the bundle to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_MODEL_PKL).open("wb") as output_file:
    pickle.dump(model, output_file)

In [20]:
# Keep the per-job score tables of both splits together
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

# Write them to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_METRICS_PKL).open("wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [21]:
# Start a run in the experiment and track the current model;
# the run is named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload every file currently in LOG_PATH as run artifacts
    mlflow.log_artifacts(LOG_PATH)
    
    # Log each averaged test-set score under its metric name
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Random Forest Model

In [108]:
# Random forest on the raw features.
# RobustScaler() and PCA(n_components=0.95) are disabled as preprocessing steps.
rf_clf = make_pipeline(
    RandomForestClassifier(n_jobs=-1, verbose=1, random_state=42),
)

rf_clf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.8s finished


### Evaluating the multilabel dataset

In [109]:
# Training-set predictions as a DataFrame with one column per job label
predictions = pd.DataFrame(rf_clf.predict(x_train), columns=y_train.columns)

# Per-job scores for every metric, keyed by the metric function's name and
# joined side by side (rows: jobs, columns: metrics)
train_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_train, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

# Average of each metric over all jobs
mean_train_scores = train_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.7s finished


In [110]:
# Metric averages over all jobs, then the per-job table ordered by precision (lowest first)
print(train_scores.mean())
train_scores.sort_values(by="precision_score")

accuracy_score     99.992105
precision_score    99.985263
recall_score       99.981579
f1_score           99.983684
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, embedded applications or devices",99.98,99.9,99.99,99.95
"Developer, desktop or enterprise applications",99.98,99.94,99.99,99.97
"Developer, mobile",99.99,99.96,100.0,99.98
System administrator,99.98,99.96,99.96,99.96
"Developer, back-end",99.99,99.99,99.99,99.99
"Developer, front-end",99.98,99.99,99.95,99.97
Cloud infrastructure engineer,99.99,99.99,99.97,99.98
DevOps specialist,99.99,99.99,99.97,99.98
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Blockchain,100.0,100.0,100.0,100.0


In [111]:
# Test-set predictions as a DataFrame with one column per job label
predictions = pd.DataFrame(rf_clf.predict(x_test), columns=y_test.columns)

# Per-job scores for every metric, keyed by the metric function's name and
# joined side by side (rows: jobs, columns: metrics)
test_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_test, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

# Average of each metric over all jobs
mean_test_scores = test_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.5s finished


In [112]:
# Metric averages over all jobs, then the per-job table ordered by precision (lowest first)
print(test_scores.mean())
test_scores.sort_values(by="precision_score")

accuracy_score     92.037368
precision_score    69.524211
recall_score       56.355789
f1_score           57.737368
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
System administrator,95.42,47.71,50.0,48.83
Database administrator,96.0,48.05,49.95,48.98
"Developer, QA or test",96.96,48.48,50.0,49.23
Blockchain,98.33,49.17,50.0,49.58
Security professional,98.35,49.18,49.99,49.58
"Developer, back-end",65.75,64.72,61.87,61.68
Cloud infrastructure engineer,94.29,70.31,53.47,54.94
Scientist,97.55,72.21,55.88,58.99
"Developer, full-stack",72.62,72.39,72.04,72.15
"Developer, front-end",80.93,73.67,61.33,63.27


### Evaluating the multiclass Dataset

### Log RandomForest Model

In [89]:
# Bundle the fitted random forest with a description and its printed configuration
model = {
    "model_description": "Random Forest, multilabel,RTS, train set only resampled",
    "model_details": str(rf_clf),
    "model_object": rf_clf,
}

# Write the bundle to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_MODEL_PKL).open("wb") as output_file:
    pickle.dump(model, output_file)

In [90]:
# Keep the per-job score tables of both splits together
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

# Write them to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_METRICS_PKL).open("wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [91]:
# Start a run in the experiment and track the current model;
# the run is named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload every file currently in LOG_PATH as run artifacts.
    # NOTE(review): LOG_PATH is shared by all models and this section does not rewrite
    # data.pkl, so the file left by the baseline section is uploaded again - confirm intended.
    mlflow.log_artifacts(LOG_PATH)
    
    # Log each averaged test-set score under its metric name
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Decision Tree

In [30]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with a fixed seed, fitted on all job labels at once
dec_clf = DecisionTreeClassifier(random_state=42)
dec_clf.fit(x_train, y_train)

### Evaluating the multilabel dataset

In [31]:
# Evaluate on train set: predictions as a DataFrame with one column per job label
predictions = pd.DataFrame(dec_clf.predict(x_train), columns=y_train.columns)

# Per-job scores for every metric, keyed by the metric function's name and
# joined side by side (rows: jobs, columns: metrics)
train_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_train, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

# Average of each metric over all jobs
mean_train_scores = train_scores.mean()

In [32]:
# Metric averages over all jobs, then the per-job table ordered by precision (lowest first)
print(train_scores.mean())
train_scores.sort_values(by="precision_score")

accuracy_score     99.994737
precision_score    99.986316
recall_score       99.991053
f1_score           99.988947
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, embedded applications or devices",99.98,99.9,99.99,99.95
"Developer, desktop or enterprise applications",99.98,99.94,99.99,99.97
"Developer, mobile",99.99,99.96,100.0,99.98
System administrator,99.99,99.96,100.0,99.98
"Developer, back-end",99.99,99.99,99.99,99.99
"Developer, front-end",99.99,99.99,99.97,99.98
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Blockchain,100.0,100.0,100.0,100.0
"Developer, game or graphics",100.0,100.0,100.0,100.0
Security professional,99.99,100.0,99.94,99.97


In [33]:
# Evaluate on test set: predictions as a DataFrame with one column per job label
predictions = pd.DataFrame(dec_clf.predict(x_test), columns=y_test.columns)

# Per-job scores for every metric, keyed by the metric function's name and
# joined side by side (rows: jobs, columns: metrics)
test_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_test, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

# Average of each metric over all jobs
mean_test_scores = test_scores.mean()

In [34]:
# Metric averages over all jobs, then the per-job table ordered by precision (lowest first)
print(test_scores.mean())
test_scores.sort_values(by="precision_score")

accuracy_score     85.720526
precision_score    56.673684
recall_score       59.945789
f1_score           57.505263
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",89.02,49.88,49.68,49.16
Security professional,92.95,50.53,51.78,50.22
System administrator,87.36,51.88,53.73,52.0
Database administrator,88.07,51.95,54.44,52.06
Blockchain,94.34,53.3,59.07,54.42
Data or business analyst,91.26,54.55,57.71,55.5
Cloud infrastructure engineer,85.67,54.68,59.33,55.52
"Engineer, data",89.58,55.15,59.96,56.35
"Developer, game or graphics",93.63,55.19,62.35,56.9
DevOps specialist,84.57,55.47,59.45,56.38


### Log Decision Tree

In [35]:
# Model: bundle the fitted decision tree with a description and its printed configuration
model = {
    "model_description": "Decision Tree, multilabel, train set only resampled",
    "model_details": str(dec_clf),
    "model_object": dec_clf,
}

# Write the bundle to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_MODEL_PKL).open("wb") as output_file:
    pickle.dump(model, output_file)

In [36]:
# Performance details: per-job score tables of both splits
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

# Write them to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_METRICS_PKL).open("wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [37]:
# Start a run in the experiment and track the current model;
# the run is named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload every file currently in LOG_PATH as run artifacts.
    # NOTE(review): LOG_PATH is shared by all models and this section does not rewrite
    # data.pkl, so the file left by the baseline section is uploaded again - confirm intended.
    mlflow.log_artifacts(LOG_PATH)
    
    # Log each averaged test-set score under its metric name
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

### Evaluating the multiclass Dataset

### Log Decision Tree

In [35]:
# Model: bundle the fitted decision tree with a description and its printed configuration.
# NOTE(review): this cell repeats the earlier "Log Decision Tree" model cell verbatim -
# confirm whether the duplicate is intended.
model = {
    "model_description": "Decision Tree, multilabel, train set only resampled",
    "model_details": str(dec_clf),
    "model_object": dec_clf,
}

# Write the bundle to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_MODEL_PKL).open("wb") as output_file:
    pickle.dump(model, output_file)

In [36]:
# Performance details: per-job score tables of both splits.
# NOTE(review): this cell repeats the earlier "Log Decision Tree" metrics cell verbatim -
# confirm whether the duplicate is intended.
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

# Write them to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_METRICS_PKL).open("wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [37]:
# Start a run in the experiment and track the current model;
# the run is named after the model description.
# NOTE(review): this cell repeats the earlier Decision Tree run cell verbatim; executing
# both starts two runs with the same name and metrics - confirm whether that is intended.
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload every file currently in LOG_PATH as run artifacts
    mlflow.log_artifacts(LOG_PATH)
    
    # Log each averaged test-set score under its metric name
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Gradient Boost

In [14]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier

# Gradient boosting wrapped in OneVsRestClassifier to fit the multilabel targets
base_estimator = GradientBoostingClassifier(random_state=42)
gd_clf = OneVsRestClassifier(base_estimator)
gd_clf.fit(x_train, y_train)

### Evaluating the multilabel Dataset

In [15]:
# Evaluate on train set: predictions as a DataFrame with one column per job label
predictions = pd.DataFrame(gd_clf.predict(x_train), columns=y_train.columns)

# Per-job scores for every metric, keyed by the metric function's name and
# joined side by side (rows: jobs, columns: metrics)
train_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_train, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

# Average of each metric over all jobs
mean_train_scores = train_scores.mean()

In [16]:
# Metric averages over all jobs, then the per-job table ordered by precision (lowest first)
print(train_scores.mean())
train_scores.sort_values(by="precision_score")

accuracy_score     90.250526
precision_score    85.678421
recall_score       66.398947
f1_score           69.923684
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",74.01,73.12,68.86,69.71
"Developer, full-stack",79.57,77.27,76.16,76.64
"Developer, desktop or enterprise applications",86.86,80.85,62.55,66.04
"Developer, front-end",86.81,81.15,66.64,70.41
DevOps specialist,89.58,83.09,66.79,71.21
Cloud infrastructure engineer,90.77,83.2,69.1,73.54
Data or business analyst,91.77,84.11,62.63,67.26
Data scientist or machine learning specialist,92.83,84.76,79.21,81.64
"Engineer, data",90.8,85.08,61.08,65.29
System administrator,90.28,85.44,60.06,63.89


In [17]:
# Evaluate on test set: predictions as a DataFrame with one column per job label
predictions = pd.DataFrame(gd_clf.predict(x_test), columns=y_test.columns)

# Per-job scores for every metric, keyed by the metric function's name and
# joined side by side (rows: jobs, columns: metrics)
test_scores = pd.concat(
    {
        score.__name__: calculate_quality(y_test, predictions, score)
        for score in (accuracy_score, precision_score, recall_score, f1_score)
    },
    axis=1,
)

# Average of each metric over all jobs
mean_test_scores = test_scores.mean()

In [18]:
# Metric averages over all jobs, then the per-job table ordered by precision (lowest first)
print(test_scores.mean())
test_scores.sort_values(by="precision_score")

accuracy_score     91.885263
precision_score    68.851053
recall_score       62.860000
f1_score           64.204211
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",96.92,48.48,49.98,49.22
Database administrator,95.73,54.59,50.6,50.38
Security professional,98.34,61.69,50.31,50.22
Blockchain,97.4,62.6,64.75,63.58
Cloud infrastructure engineer,92.82,65.26,63.66,64.4
System administrator,95.09,65.65,54.08,55.88
"Developer, back-end",66.99,65.7,64.41,64.61
"Developer, game or graphics",97.25,68.57,69.18,68.87
Scientist,97.22,69.34,65.77,67.36
DevOps specialist,92.73,70.06,62.24,64.96


### Log Gradient Boost

In [29]:
# Model: bundle the fitted gradient boosting classifier with a description and its printed configuration
model = {
    "model_description": "Gradient Boost, multilabel, train set only resampled",
    "model_details": str(gd_clf),
    "model_object": gd_clf,
}

# Write the bundle to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_MODEL_PKL).open("wb") as output_file:
    pickle.dump(model, output_file)

In [30]:
# Performance details: per-job score tables of both splits
classes_metrics = dict(train_scores=train_scores, test_scores=test_scores)

# Write them to the scratch folder that is later logged as run artifacts
with Path(LOG_PATH, LOG_METRICS_PKL).open("wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [31]:
# Start a run in the experiment and track the current model;
# the run is named after the model description
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Upload every file currently in LOG_PATH as run artifacts.
    # NOTE(review): LOG_PATH is shared by all models and this section does not rewrite
    # data.pkl, so the file left by the baseline section is uploaded again - confirm intended.
    mlflow.log_artifacts(LOG_PATH)
    
    # Log each averaged test-set score under its metric name
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)

## Retrieve Runs and best model

In [32]:
# Fetch every run of the experiment and show its name next to the logged metrics
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])
runs[[
    'run_id',
    'tags.mlflow.runName',
    'metrics.precision_score',
    'metrics.recall_score',
    'metrics.accuracy_score',
    'metrics.f1_score',
]]

Unnamed: 0,run_id,tags.mlflow.runName,metrics.precision_score,metrics.recall_score,metrics.accuracy_score,metrics.f1_score
0,ca436ce1cf414d70a37def489b271b43,"Gradient Boost, multilabel, train set only res...",68.851053,62.86,91.885263,64.204211
1,810bd3168f9c410bad286736050dad53,"Gradient Boost, multilabel, train set only res...",68.851053,62.86,91.885263,64.204211
2,fda885201a804af69e60e9767077dce9,"Random Forest, multilabel,PCA, STD, train set ...",70.031579,54.205263,91.721579,54.839474
3,a263dfe374444828a28d35548822f6bd,"Random Forest, multilabel,STD, train set only ...",70.889474,57.63,92.128947,59.386842
4,954ee705b92349dbb7c112cc9db6ca74,"Decision Tree, multilabel, train set only resa...",56.673684,59.945789,85.720526,57.505263
5,5ec495d9f1e244df9ab35367bb39ac97,"Random Forest, multilabel, train set only resa...",71.314737,57.677895,92.143158,59.455789
6,90340e7d68d04d92b62fa52ab8ce4172,"Baseline model: Logistic Regression, multilabe...",68.189474,62.951053,91.528421,64.251579


In [41]:
# The best run is the one with the highest mean test precision
best_run = (
    runs
    .sort_values(by='metrics.precision_score', ascending=False)
    .iloc[0]
)

In [42]:
# Inspect the selected run's metadata and metrics
best_run

run_id                                      5ec495d9f1e244df9ab35367bb39ac97
experiment_id                                             302056893493878655
status                                                              FINISHED
artifact_uri               file:///C:/Users/Ali/Desktop/DS Projects/Tech ...
start_time                                  2024-01-23 23:29:56.061000+00:00
end_time                                    2024-01-23 23:29:56.375000+00:00
metrics.f1_score                                                   59.455789
metrics.precision_score                                            71.314737
metrics.recall_score                                               57.677895
metrics.accuracy_score                                             92.143158
tags.mlflow.user                                                         Ali
tags.mlflow.runName        Random Forest, multilabel, train set only resa...
tags.mlflow.source.type                                                LOCAL

In [43]:
# Turn the run's artifact URI into a local filesystem path.
# Stripping the "file:///" prefix only works for Windows drive paths: on POSIX it
# removes the leading "/" (leaving a relative path), and percent-escapes stay encoded.
# url2pathname performs the platform-specific conversion instead.
from urllib.parse import urlparse
from urllib.request import url2pathname

artifact_uri = best_run["artifact_uri"]
if artifact_uri.startswith("file:"):
    artifact_path = url2pathname(urlparse(artifact_uri).path)
else:
    # Not a file URI: keep the value unchanged, as before
    artifact_path = artifact_uri

In [44]:
# Load the model bundle that was logged with the best run.
# NOTE: pickle.load can execute code stored in the file - only load artifacts
# written by this project.
model_pkl = os.path.join(artifact_path, LOG_MODEL_PKL)
with open(model_pkl, mode="rb") as f:
    model = pickle.load(f)

# Show the fitted estimator stored in the bundle
model["model_object"]