In [1]:
# Constants
DATA_PATH   = "../Data/Processed/3_engineered_df.pkl"

TECH_JOBS = ['Techjobs']

CORE_COLS = ['VersionControlSystem',
             'Languages',
             'Databases',
             'Platforms',
             'WebFrameworks',
             'MiscTech',
             'ToolsTech',
             'CollabTools'
]


MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "tech_jobs_predictions"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import logging
import pickle
from pathlib import Path
from collections import Counter
import os

import mlflow
from mlflow.tracking import MlflowClient
 
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.decomposition import PCA

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler


from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score,ConfusionMatrixDisplay,classification_report
from sklearn.model_selection import train_test_split,cross_val_score, cross_validate,cross_val_predict, GridSearchCV, RandomizedSearchCV

from scipy.stats import randint as sp_randint

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

_____

### Functions

In [3]:
# Functions to calculate metric functions across all columns in a multi-label dataset
def calculate_metrics(clf, x, y, metrics=[accuracy_score, precision_score, recall_score, f1_score]):
    #create a dataframe contains the predictions 
    predictions =  pd.DataFrame(clf.predict(x),
                                columns=y.columns)
    
    #create a dict for each type of metric contains a dicts of each label and its value 
    final_scores = {metric.__name__: predictions_per_col(predictions, y, metric) 
            for metric in metrics}
    
    #Convert the dict to dataframe
    final_scores = pd.concat(final_scores,axis=1)
    mean_final_scores = final_scores.mean()
    
    return final_scores, mean_final_scores

def predictions_per_col(predictions, y, metric_function):
    metric_scores = {}
    for col in predictions.columns:
        truth = y[col].copy()
        pred  = predictions[col].copy()
        
        metric_scores[col] = calculate_metric(truth, pred, metric_function)

    metric_scores = pd.Series(metric_scores.values(), index=metric_scores.keys())
    
    return metric_scores

def calculate_metric(truth, pred, metric_function):
    if metric_function == accuracy_score:
        metric_score = round(metric_function(truth, pred) * 100, 2)
    else: 
        metric_score = round(metric_function(truth, pred,zero_division=0,average='macro') * 100, 2)
    return metric_score

In [4]:
# Function that calculate all metrics for a cross_validate function for multiclass classification
def calculate_scores(clf, x, y):
    y_pred = clf.predict(x)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='macro',zero_division=0.0)
    recall = recall_score(y, y_pred, average='macro',zero_division=0.0)
    f1 = f1_score(y, y_pred, average='macro',zero_division=0.0)
    
    return {'accuracy': accuracy,
            'precision':precision,
            'recall': recall,
           'f1' : f1}

# Function that calculate all metrics for a cross_validate function for multi-label classification
def calculate_scores_multi_label(clf, x, y, metrics=[accuracy_score, precision_score, recall_score, f1_score]):
    #create a dataframe contains the predictions 
    predictions =  pd.DataFrame(clf.predict(x),
                                columns=y.columns)
    
    #create a dict for each type of metric contains a dicts of each label and its value 
    final_scores = {metric.__name__: predictions_per_col(predictions, y, metric) 
            for metric in metrics}
    
    #Convert the dict to dataframe
    final_scores = pd.concat(final_scores,axis=1)
    mean_final_scores = final_scores.mean()
    
    return {'accuracy': mean_final_scores[0],
            'precision':mean_final_scores[1],
            'recall': mean_final_scores[2],
           'f1' : mean_final_scores[3]}

In [5]:
# Functions responisble for Grid_search wtih multi_label dataset
def f1_score_multi_label(clf, x, y):
    quality_scores = {}
    predictions =  pd.DataFrame(clf.predict(x),
                            columns=y.columns)
    
    for col in predictions.columns:
        job_pred  = predictions[col].copy()
        job_truth = y[col].copy()

        quality_scores[col] = round(f1_score(job_truth, job_pred,zero_division=0,average='macro') * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    
    # train_scores = pd.concat(train_scores,axis=1)
    mean_f1_score = quality_scores.mean()
    return mean_f1_score

def precision_score_multi_label(clf, x, y):
    quality_scores = {}
    predictions =  pd.DataFrame(clf.predict(x),
                            columns=y.columns)
    
    for col in predictions.columns:
        job_pred  = predictions[col].copy()
        job_truth = y[col].copy()

        quality_scores[col] = round(precision_score(job_truth, job_pred,zero_division=0,average='macro') * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    
    # train_scores = pd.concat(train_scores,axis=1)
    mean_f1_score = quality_scores.mean()
    return mean_f1_score

In [6]:
# Function to log Data, Model, Metrics and Track models.
def log_data(x_train,y_train,x_test,y_test):
    # Save the model's dataset trained on
    data_details = {
    #For multilabel Dataset
                    "data_path": DATA_PATH,
                    "training_set": x_train.index.tolist(),
                    "test_indices":     x_test.index.tolist(), 
                    "features_names":   x_train.columns.tolist(),
                    "targets_names":    y_train.columns.tolist()
    #For multiclass Dataset    
                    # "x_train": x_train,
                    # "x_test":x_test,
                    # "y_train":y_train,
                    # "y_test": y_test
    }

    with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
        pickle.dump(data_details, output_file)
        
        
def log_model(clf,model_description=''):
    # save the model, model details and model's description
    model = {"model_description": model_description,
             "model_details": str(clf),
             "model_object": clf} 

    with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
        pickle.dump(model, output_file)
        
    return model
        
def log_metrics(train_scores, test_scores):
    # save the model metrics
    classes_metrics = {"train_scores": train_scores,
                        "test_scores" : test_scores} 


    with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
        pickle.dump(classes_metrics, output_file)

def track_model(model, scores):
    # Start a run in the experiment and track current model
    with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
        # Track pickle files
        mlflow.log_artifacts(LOG_PATH)

        # Track metrics 
        for metric, score in scores.items():
            mlflow.log_metric(metric, score)

_____

### Load Dataset

In [7]:
# Load dataset and make a copy
eng_df = pd.read_pickle(DATA_PATH)
df = eng_df.copy()

In [8]:
df

Unnamed: 0_level_0,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,VersionControlSystem,VersionControlSystem,VersionControlSystem,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters
Unnamed: 0_level_1,Data scientist or machine learning specialist,"Engineer, data",Data or business analyst,"Developer, back-end",Database administrator,"Developer, mobile","Developer, full-stack",Cloud infrastructure engineer,"Developer, embedded applications or devices","Developer, QA or test",System administrator,Scientist,Security professional,"Developer, game or graphics","Developer, front-end",Blockchain,"Developer, desktop or enterprise applications",DevOps specialist,Academic researcher,Git,Mercurial,SVN,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,Delphi,Elixir,Erlang,F#,Fortran,Go,Groovy,HTML/CSS,Haskell,Java,JavaScript,Julia,Kotlin,LISP,Lua,MATLAB,OCaml,Objective-C,PHP,Perl,PowerShell,Python,R,Ruby,Rust,SAS,SQL,Scala,Solidity,Swift,TypeScript,VBA,Cassandra,Cloud Firestore,CouchDB,Couchbase,DynamoDB,Elasticsearch,Firebase Realtime Database,IBM DB2,MariaDB,Microsoft SQL Server,MongoDB,MySQL,Neo4j,Oracle,PostgreSQL,Redis,SQLite,AWS,Colocation,DigitalOcean,Firebase,Google Cloud,Heroku,IBM Cloud or Watson,Linode,Managed Hosting,Microsoft Azure,OVH,OpenStack,Oracle Cloud Infrastructure,VMware,ASP.NET,ASP.NET Core,Angular,Angular.js,Blazor,Deno,Django,Drupal,Express,FastAPI,Fastify,Flask,Gatsby,Laravel,Next.js,Node.js,Nuxt.js,Phoenix,Play Framework,React.js,Ruby on Rails,Svelte,Symfony,Vue.js,jQuery,.NET,Apache Kafka,Apache Spark,Capacitor,Cordova,Electron,Flutter,GTK,Hadoop,Hugging Face Transformers,Ionic,Keras,NumPy,Pandas,Qt,React Native,Scikit-learn,Spring,TensorFlow,Tidyverse,Torch/PyTorch,Uno Platform,Xamarin,Ansible,Chef,Docker,Flow,Homebrew,Kubernetes,Pulumi,Puppet,Terraform,Unity 3D,Unreal Engine,Yarn,npm,Android Studio,Atom,CLion,Eclipse,Emacs,GoLand,IPython/Jupyter,IntelliJ,Nano,Neovim,NetBeans,Notepad++,PhpStorm,PyCharm,Qt Creator,"RAD Studio (Delphi, C++ Builder)",RStudio,Rider,RubyMine,Spyder,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio Code,Webstorm,Xcode,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_13,skills_group_14,skills_group_15,skills_group_16,skills_group_17,skills_group_18,skills_group_19,skills_group_2,skills_group_20,skills_group_21,skills_group_22,skills_group_23,skills_group_24,skills_group_25,skills_group_26,skills_group_27,skills_group_28,skills_group_29,skills_group_3,skills_group_30,skills_group_31,skills_group_32,skills_group_33,skills_group_34,skills_group_35,skills_group_36,skills_group_37,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,3,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,2,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,3,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,2,0,1,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
10,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,4,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,7,4,0,0,0,0
12,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,3,1,1,2,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,3,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
73263,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,2,0,4,1,0,3,0,1,0,0,0,0,0,0,0,2,2,0,1,0,2,0,0,0,0,0,2,0,3,0,1,0,1,0,2,0,0
73264,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,9,0,1,4,0,0,1,0,0,0,0,0,1,0,0,0,0,3,0,0,0,1,0,0,0,0,0,2,0,1,0,0,0,2,0,1,0,0
73265,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,1,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,2,4,0,5,0,0,1,0,0,0,1,2,0,1,0,0,0,1,1,1,0,1,0,2,0,1,0,0,0,1,0,0,7,3,0,1,0,0


In [9]:
X = df.drop(df[TECH_JOBS],axis = 1).droplevel(0,axis=1).copy()
y = df[TECH_JOBS]

## Deal with Imbalanced Dataset

- **To deal with imbalance, I tried different methods to see which one performs the best**
    - **SMOTE by converting the Dataset to multiclass instead of mutilabel by taking rows that have only 1 value.**
    - **Using random sample method by pandas that returns random samples to try to balance the dataset.** `Best Performer`

### 1.Random Sampling

In [10]:
# Create a dataframes contains rows with only one job per row
one_job_df = y[y['Techjobs'].sum(axis=1) ==1].droplevel(0,axis=1)
# Create a dataframes contains rows with only multiple job per row
multi_job_df = y[y['Techjobs'].sum(axis=1) !=1].droplevel(0,axis=1)

In [11]:
# Get a list of labels sorted in ascending order
jobs = one_job_df.sum(axis=0).sort_values().index
jobs

Index(['Blockchain', 'Database administrator', 'Scientist',
       'Security professional', 'Cloud infrastructure engineer',
       'Developer, game or graphics', 'System administrator',
       'DevOps specialist', 'Data or business analyst',
       'Developer, QA or test', 'Engineer, data', 'Academic researcher',
       'Data scientist or machine learning specialist',
       'Developer, embedded applications or devices',
       'Developer, desktop or enterprise applications', 'Developer, mobile',
       'Developer, front-end', 'Developer, back-end', 'Developer, full-stack'],
      dtype='object')

In [12]:
# oversample the minority classes from the multi_job dataframe
# downsample the majority classes if they are above the threshold number.
samples_per_class = 800
resampled_jobs = []

for job in jobs:
    #get the one job rows of this class
    sub_df = one_job_df.loc[one_job_df[job] == 1].copy()
    
    # if no. of sub_df < threshold
    if len(sub_df) < samples_per_class:
        # get multi_job rows of this class 
        temp_df = multi_job_df.loc[multi_job_df[job] ==1].copy()
        # oversample no. of rows from multi_job df
        no_rows = min(samples_per_class - len(sub_df), len(temp_df))
        temp_df = temp_df.sample(no_rows,random_state=42)
        # merge both dfs
        sub_df = pd.concat([sub_df,temp_df])
        
    else:
        #if no. of sub_df > threshold, down sample this class
        sub_df = sub_df.sample(samples_per_class, random_state=42) 
    
    resampled_jobs.append(sub_df)

In [13]:
y = pd.concat(resampled_jobs)
X = X.loc[y.index].copy()
y.sum(axis=0).sort_values()

Blockchain                                        651
Security professional                             728
Developer, game or graphics                       870
Developer, QA or test                             928
Database administrator                           1056
Developer, embedded applications or devices      1060
Scientist                                        1104
Data or business analyst                         1116
Developer, mobile                                1120
Developer, front-end                             1174
Engineer, data                                   1194
System administrator                             1248
Cloud infrastructure engineer                    1294
DevOps specialist                                1385
Data scientist or machine learning specialist    1425
Academic researcher                              1451
Developer, desktop or enterprise applications    1465
Developer, full-stack                            2707
Developer, back-end         

In [14]:
len(y)

14761

### 2- MultiClass with SMOTE

## Create a Test set

In [15]:
# Convert dfs to arrays
y_array = y.values
x_array = X.values

In [16]:
# Split the dataset using stratified split using iterative_train_test_split
from skmultilearn.model_selection import iterative_train_test_split
x_train, y_train, x_test, y_test = iterative_train_test_split(x_array, y_array, test_size = 0.2)

In [17]:
#Convert back to dfs
y_train = pd.DataFrame(y_train,columns=y.columns)
y_test = pd.DataFrame(y_test,columns=y.columns)
x_train = pd.DataFrame(x_train,columns=X.columns)
x_test = pd.DataFrame(x_test,columns=X.columns)

## Intialize MLflow

In [18]:
# Create Directories
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

In [19]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

## Create a Baseline Model

In [20]:
# Create a Logistic Regression Classifier, Used StandardScaler because logistic Regression uses l2 regression by default
log_clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression(max_iter=1000)))
log_clf.fit(x_train, y_train)

### Evaluating the multilabel dataset

In [21]:
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_score= calculate_metrics(log_clf,x_train,y_train, metrics)

print(mean_train_score)
train_scores.sort_values("precision_score")

accuracy_score     92.453158
precision_score    79.076316
recall_score       63.610000
f1_score           66.825263
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",83.03,71.44,62.03,64.28
Database administrator,92.88,72.26,52.03,52.18
"Developer, QA or test",93.74,72.91,50.82,50.08
"Developer, back-end",82.55,74.26,61.84,64.16
Cloud infrastructure engineer,91.79,75.04,61.65,65.23
Security professional,95.13,76.57,51.82,52.3
Data or business analyst,92.98,76.82,60.05,63.84
"Engineer, data",92.56,77.69,59.61,63.34
System administrator,92.17,78.02,58.06,61.35
"Developer, desktop or enterprise applications",91.24,78.87,61.46,65.38


In [22]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
log_clf_scores = cross_validate(log_clf,x_train,y_train, cv=3, scoring =calculate_scores_multi_label)

In [23]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(log_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 90.5,
 'test_precision': 66.03,
 'test_recall': 61.0,
 'test_f1': 58.93}

### Evaluating the multiclass Dataset

### Log Baseline Model

In [24]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(log_clf,'Baseline model: Logistic Regression, multilabel, Data Resampled ')
# Log the model's train and test scores
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow
track_model(model,validation_scores)

## Random Forest Model

In [66]:
#Create a random forest classifier
rf_clf = make_pipeline(#StandardScaler(),
                       #PCA(n_components=0.95),
                       RandomForestClassifier(n_jobs=-1,
                                              verbose=1,
                                              random_state=42))

rf_clf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.8s finished


### Evaluating the multilabel dataset

In [34]:
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_scores= calculate_metrics(rf_clf,x_train,y_train, metrics)

print(mean_train_scores)
train_scores.sort_values("precision_score")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished


accuracy_score     99.990526
precision_score    99.994737
recall_score       99.952105
f1_score           99.972632
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, desktop or enterprise applications",99.97,99.95,99.91,99.93
"Developer, full-stack",99.97,99.98,99.93,99.96
"Developer, back-end",99.99,99.99,99.98,99.99
Database administrator,99.97,99.99,99.82,99.9
"Developer, game or graphics",99.98,99.99,99.86,99.92
Data scientist or machine learning specialist,99.99,100.0,99.96,99.98
Blockchain,99.99,100.0,99.9,99.95
"Developer, front-end",100.0,100.0,100.0,100.0
Security professional,99.99,100.0,99.91,99.95
Scientist,100.0,100.0,100.0,100.0


In [35]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
rf_clf_scores = cross_validate(rf_clf,x_train,y_train, cv=3, scoring =calculate_scores_multi_label)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    4.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.0s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  52 tasks      | elapsed:    1.2s
[Parallel(n_jobs=-1)]:

In [36]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(rf_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 91.8,
 'test_precision': 80.84,
 'test_recall': 66.31,
 'test_f1': 65.28}

### Evaluating the multiclass Dataset

### Log RandomForest Model

In [37]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(rf_clf,'Random Forest, multilabel, Data resampled')
# Log the model's train and test scores
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow
track_model(model,validation_scores)

In [29]:
from sklearn.tree import DecisionTreeClassifier
dec_clf = DecisionTreeClassifier(random_state=42)

dec_clf.fit(x_train,y_train)

### Evaluating the multilabel dataset

In [30]:
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_scores= calculate_metrics(dec_clf,x_train,y_train, metrics)

print(mean_train_scores)
train_scores.sort_values("precision_score")

accuracy_score     99.997368
precision_score    99.995263
recall_score       99.990526
f1_score           99.992105
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Security professional,99.99,99.93,100.0,99.96
"Developer, desktop or enterprise applications",99.99,99.99,99.97,99.98
"Developer, back-end",99.99,99.99,99.99,99.99
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Blockchain,99.99,100.0,99.93,99.96
"Developer, front-end",100.0,100.0,100.0,100.0
"Developer, game or graphics",99.99,100.0,99.93,99.96
Scientist,100.0,100.0,100.0,100.0
System administrator,100.0,100.0,100.0,100.0
"Developer, QA or test",100.0,100.0,100.0,100.0


In [31]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
dec_clf_scores = cross_validate(dec_clf,x_train,y_train, cv=5, scoring =calculate_scores_multi_label)

In [32]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(dec_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 87.78,
 'test_precision': 66.54,
 'test_recall': 76.8,
 'test_f1': 66.7}

### Log Decision Tree

In [33]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(dec_clf,'Decision Tree, multilabel, Data resampled')
# Log the model's train and test scores
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow 
track_model(model,validation_scores)

### Evaluating the multiclass Dataset

## Gradient Boost

In [97]:
# Train a Gradient Boosting Classifier
gd_clf = MultiOutputClassifier(estimator=GradientBoostingClassifier(n_estimators=100,max_depth=3,verbose=1,random_state=42))
gd_clf.fit(x_train,y_train)

      Iter       Train Loss   Remaining Time 
         1           0.5670            9.88s
         2           0.5287            7.92s
         3           0.4992            8.61s
         4           0.4773            8.34s
         5           0.4587            7.87s
         6           0.4440            7.78s
         7           0.4310            7.68s
         8           0.4199            7.42s
         9           0.4112            7.31s
        10           0.4025            7.13s
        20           0.3591            6.46s
        30           0.3427            5.60s
        40           0.3302            4.90s
        50           0.3228            4.10s
        60           0.3168            3.25s
        70           0.3107            2.44s
        80           0.3048            1.65s
        90           0.3001            0.82s
       100           0.2954            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.5326            6.79s
        

### Evaluating the multilabal Dataset

In [39]:
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_scores= calculate_metrics(gd_clf,x_train,y_train, metrics)

print(mean_train_scores)
train_scores.sort_values("precision_score")

accuracy_score     92.770000
precision_score    85.544737
recall_score       63.216842
f1_score           66.781053
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",84.49,76.57,63.3,66.2
Cloud infrastructure engineer,92.24,79.55,61.07,65.15
"Developer, back-end",83.18,79.76,60.36,62.52
Data or business analyst,93.08,79.79,58.21,61.78
Scientist,93.74,82.11,63.72,68.54
DevOps specialist,92.44,82.32,65.44,70.16
"Developer, desktop or enterprise applications",91.64,83.18,61.53,65.88
Academic researcher,92.34,83.71,66.12,71.02
Data scientist or machine learning specialist,93.5,83.76,75.21,78.68
"Engineer, data",92.99,83.95,59.65,63.85


In [49]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
gd_clf_scores = cross_validate(gd_clf,x_train,y_train, cv=3, scoring =calculate_scores_multi_label)

      Iter       Train Loss   Remaining Time 
         1           0.6960            0.63s
         2           0.6482            0.54s
         3           0.6153            0.36s
         4           0.5918            0.61s
         5           0.5677            0.60s
         6           0.5453            0.59s
         7           0.5297            0.59s
         8           0.5150            0.64s
         9           0.5040            0.64s
        10           0.4935            0.59s
        20           0.4391            0.52s
        30           0.4088            0.40s
        40           0.3938            0.34s
        50           0.3837            0.29s
        60           0.3762            0.22s
        70           0.3696            0.17s
        80           0.3635            0.11s
        90           0.3575            0.06s
       100           0.3523            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.6483            0.00s
        

In [50]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(gd_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 90.98,
 'test_precision': 68.63,
 'test_recall': 59.93,
 'test_f1': 58.44}

### Log Gradient Boost

In [42]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(gd_clf,'Gradient Boost, multilabel, Data resampled')
# Log the model's train and test scores
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow 
track_model(model,validation_scores)

## Cat Boost

In [70]:
cat_clf = CatBoostClassifier(loss_function='MultiLogloss',
                            eval_metric='HammingLoss',
                            verbose=1)
cat_clf.fit(x_train, y_train)

Learning rate set to 0.029592
0:	learn: 0.0878972	total: 141ms	remaining: 2m 20s
1:	learn: 0.0886043	total: 295ms	remaining: 2m 27s
2:	learn: 0.0885909	total: 429ms	remaining: 2m 22s
3:	learn: 0.0886399	total: 526ms	remaining: 2m 10s
4:	learn: 0.0886399	total: 662ms	remaining: 2m 11s
5:	learn: 0.0886354	total: 793ms	remaining: 2m 11s
6:	learn: 0.0886221	total: 923ms	remaining: 2m 10s
7:	learn: 0.0886354	total: 1.07s	remaining: 2m 12s
8:	learn: 0.0886399	total: 1.2s	remaining: 2m 12s
9:	learn: 0.0886399	total: 1.33s	remaining: 2m 12s
10:	learn: 0.0886399	total: 1.48s	remaining: 2m 12s
11:	learn: 0.0886399	total: 1.62s	remaining: 2m 13s
12:	learn: 0.0886399	total: 1.75s	remaining: 2m 13s
13:	learn: 0.0886354	total: 1.89s	remaining: 2m 12s
14:	learn: 0.0886399	total: 2.02s	remaining: 2m 12s
15:	learn: 0.0886399	total: 2.16s	remaining: 2m 12s
16:	learn: 0.0886399	total: 2.32s	remaining: 2m 14s
17:	learn: 0.0886399	total: 2.46s	remaining: 2m 13s
18:	learn: 0.0886399	total: 2.59s	remaining: 

<catboost.core.CatBoostClassifier at 0x21c1c8ca5d0>

### Evaluate Cat Boost

In [51]:
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_scores= calculate_metrics(cat_clf,x_train,y_train, metrics)

print(mean_train_scores)
train_scores.sort_values("precision_score")

accuracy_score     94.848421
precision_score    93.167368
recall_score       72.825789
f1_score           78.231579
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",87.9,88.01,71.84,76.32
"Developer, full-stack",90.39,88.94,76.98,81.18
"Developer, desktop or enterprise applications",92.91,89.36,66.79,72.53
"Developer, embedded applications or devices",95.88,90.02,76.01,81.28
Academic researcher,94.62,91.05,76.1,81.5
Scientist,95.67,91.75,74.24,80.27
DevOps specialist,94.84,91.86,75.42,81.18
Cloud infrastructure engineer,94.82,92.02,73.01,79.18
"Developer, front-end",96.29,92.23,80.61,85.31
Data scientist or machine learning specialist,96.34,92.44,85.48,88.56


In [52]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
cat_clf_scores = cross_validate(cat_clf,x_train,y_train, cv=3, scoring =calculate_scores_multi_label)

Learning rate set to 0.024888
0:	learn: 0.0744313	total: 131ms	remaining: 2m 11s
1:	learn: 0.0745047	total: 266ms	remaining: 2m 12s
2:	learn: 0.0745247	total: 399ms	remaining: 2m 12s
3:	learn: 0.0748649	total: 499ms	remaining: 2m 4s
4:	learn: 0.0755186	total: 630ms	remaining: 2m 5s
5:	learn: 0.0755186	total: 755ms	remaining: 2m 5s
6:	learn: 0.0755053	total: 916ms	remaining: 2m 9s
7:	learn: 0.0755186	total: 1.08s	remaining: 2m 13s
8:	learn: 0.0754853	total: 1.22s	remaining: 2m 14s
9:	learn: 0.0755186	total: 1.35s	remaining: 2m 13s
10:	learn: 0.0755186	total: 1.48s	remaining: 2m 13s
11:	learn: 0.0755186	total: 1.62s	remaining: 2m 13s
12:	learn: 0.0755186	total: 1.76s	remaining: 2m 13s
13:	learn: 0.0755186	total: 1.91s	remaining: 2m 14s
14:	learn: 0.0755186	total: 2.05s	remaining: 2m 14s
15:	learn: 0.0755186	total: 2.24s	remaining: 2m 17s
16:	learn: 0.0755186	total: 2.42s	remaining: 2m 20s
17:	learn: 0.0755186	total: 2.61s	remaining: 2m 22s
18:	learn: 0.0755120	total: 2.84s	remaining: 2m 

In [53]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(cat_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 91.19,
 'test_precision': 73.24,
 'test_recall': 62.05,
 'test_f1': 60.59}

### Log Cat Boost

In [54]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(cat_clf,'Cat Boost, multilabel, Data resampled')
# Log the model's train and test scores
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow 
track_model(model,validation_scores)

## Hyperparameter Tuning Random Forest

In [41]:
list(rf_clf.get_params().keys())

['memory',
 'steps',
 'verbose',
 'randomforestclassifier',
 'randomforestclassifier__bootstrap',
 'randomforestclassifier__ccp_alpha',
 'randomforestclassifier__class_weight',
 'randomforestclassifier__criterion',
 'randomforestclassifier__max_depth',
 'randomforestclassifier__max_features',
 'randomforestclassifier__max_leaf_nodes',
 'randomforestclassifier__max_samples',
 'randomforestclassifier__min_impurity_decrease',
 'randomforestclassifier__min_samples_leaf',
 'randomforestclassifier__min_samples_split',
 'randomforestclassifier__min_weight_fraction_leaf',
 'randomforestclassifier__n_estimators',
 'randomforestclassifier__n_jobs',
 'randomforestclassifier__oob_score',
 'randomforestclassifier__random_state',
 'randomforestclassifier__verbose',
 'randomforestclassifier__warm_start']

In [76]:

rf_param_grid = [
    {'randomforestclassifier__n_estimators':[80,90,100,110,120,130],
     'randomforestclassifier__max_depth':[30,35,40,45,50],
     'randomforestclassifier__min_samples_split':[2,3,4,5,6,7,8],
     'randomforestclassifier__min_samples_leaf':[1,3,5,7,9],
     'randomforestclassifier__class_weight':['balanced',None],
     'randomforestclassifier__max_features':['sqrt','log2'],
     'randomforestclassifier__verbose': [0],
    }]
    

rf_clf_grid_search = RandomizedSearchCV(rf_clf, rf_param_grid,cv= 3,n_iter=100, scoring=precision_score_multi_label, return_train_score=True,refit=True,verbose=2)

rf_clf_grid_search.fit(x_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END randomforestclassifier__class_weight=None, randomforestclassifier__max_depth=50, randomforestclassifier__max_features=log2, randomforestclassifier__min_samples_leaf=9, randomforestclassifier__min_samples_split=6, randomforestclassifier__n_estimators=100, randomforestclassifier__verbose=0; total time=   4.4s
[CV] END randomforestclassifier__class_weight=None, randomforestclassifier__max_depth=50, randomforestclassifier__max_features=log2, randomforestclassifier__min_samples_leaf=9, randomforestclassifier__min_samples_split=6, randomforestclassifier__n_estimators=100, randomforestclassifier__verbose=0; total time=   1.3s
[CV] END randomforestclassifier__class_weight=None, randomforestclassifier__max_depth=50, randomforestclassifier__max_features=log2, randomforestclassifier__min_samples_leaf=9, randomforestclassifier__min_samples_split=6, randomforestclassifier__n_estimators=100, randomforestclassifier__verbose=0; to

In [78]:
rf_clf_grid_search.best_estimator_

In [82]:
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_scores= calculate_metrics(rf_clf_grid_search.best_estimator_,x_train,y_train, metrics)

print(mean_train_scores)
train_scores.sort_values("precision_score")

accuracy_score     99.104211
precision_score    99.457368
recall_score       94.907368
f1_score           96.981579
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, desktop or enterprise applications",97.63,98.61,88.09,92.55
"Developer, back-end",98.0,98.74,94.99,96.73
"Developer, QA or test",98.32,99.12,86.59,91.81
"Developer, full-stack",98.78,99.27,96.68,97.91
Academic researcher,98.81,99.3,93.97,96.44
"Engineer, data",98.84,99.38,92.83,95.82
"Developer, front-end",98.98,99.39,93.61,96.28
Security professional,99.26,99.42,92.6,95.73
"Developer, embedded applications or devices",99.01,99.47,93.1,96.03
System administrator,99.07,99.5,94.49,96.83


In [84]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
rf_clf_scores = cross_validate(rf_clf_grid_search.best_estimator_,x_train,y_train, cv=3, scoring =calculate_scores_multi_label)

In [85]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(rf_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 91.97,
 'test_precision': 84.98,
 'test_recall': 65.35,
 'test_f1': 65.8}

In [87]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(rf_clf_grid_search.best_estimator_,'Random Forest, Tuned, multilabel, Data resampled')
# Log the model's train and test scores
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow 
track_model(model,validation_scores)

## Voting Classifier

In [20]:
log_clf = make_pipeline(StandardScaler(),
                    LogisticRegression(max_iter=1000))

gd_clf = GradientBoostingClassifier(n_estimators=100,
                                    max_depth=3,
                                    random_state=42,
                                    verbose=0)

rf_clf = RandomForestClassifier(class_weight='balanced', max_depth=40,
                       max_features='log2', min_samples_split=3,
                       n_estimators=120, n_jobs=-1, random_state=42)

cat_clf = CatBoostClassifier(verbose=0)

voting_clf = MultiOutputClassifier(VotingClassifier(
    estimators=[
        # ('log', log_clf),
        ('rf', rf_clf),
        ('gd', gd_clf),
        ('cat', cat_clf)
    ] 
,voting = "soft"))


voting_clf.fit(x_train,y_train)

### Evaluate Voting Classifier

In [62]:
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_scores= calculate_metrics(voting_clf,x_train,y_train, metrics)

print(mean_train_scores)
train_scores.sort_values("precision_score")

accuracy_score     96.398947
precision_score    97.438421
recall_score       79.192632
f1_score           85.015263
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",91.78,94.81,79.59,84.63
"Developer, desktop or enterprise applications",95.01,95.57,75.74,82.28
"Developer, full-stack",94.63,96.53,85.59,89.88
Academic researcher,96.65,97.05,83.67,88.98
DevOps specialist,96.27,97.07,80.62,86.73
"Developer, front-end",97.46,97.14,85.04,90.03
"Engineer, data",95.93,97.19,75.1,82.2
"Developer, QA or test",94.93,97.43,59.57,64.75
System administrator,95.54,97.68,73.55,80.83
Database administrator,95.49,97.68,68.4,75.72


In [56]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
voting_clf_scores = cross_validate(voting_clf,x_train,y_train, cv=3, scoring =calculate_scores_multi_label)

In [57]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(voting_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 91.43,
 'test_precision': 74.62,
 'test_recall': 63.04,
 'test_f1': 61.57}

### Log voting classifier

In [58]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(voting_clf,'Voting Classifier')
# Log the model's train and test scores
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow 
track_model(model,validation_scores)

## Retrieve Runs

In [25]:
runs = mlflow.search_runs([exp.experiment_id])
runs[['run_id','tags.mlflow.runName','metrics.test_precision','metrics.test_recall','metrics.test_f1','metrics.test_accuracy']]

Unnamed: 0,run_id,tags.mlflow.runName,metrics.test_precision,metrics.test_recall,metrics.test_f1,metrics.test_accuracy
0,d6010e9458eb4fd5aab149b76df44e1a,"Baseline model: Logistic Regression, multilabe...",66.03,61.0,58.93,90.5
1,366b3b40b46344edb0a8d00c95a3884c,"Random Forest, Tuned, multilabel, Data resampled",84.98,65.35,65.8,91.97
2,d9a069fe4f2a490cad7b4d811be222bb,Voting Classifier,74.62,63.04,61.57,91.43
3,352deeb893cd40ef8fd55216a1b3004d,"Cat Boost, multilabel, Data resampled",73.24,62.05,60.59,91.19
4,5b6f02d285554e1ebd30a9c07435f817,"Gradient Boost, multilabel, Data resampled",68.63,59.93,58.44,90.98
5,f33f70dd5d0c48d48136e8d3e89a3271,"Random Forest, multilabel, Data resampled",80.84,66.31,65.28,91.8
6,ffeeca910ef342959dc27b9a1edb024c,"Baseline model: Logistic Regression, multilabe...",65.89,60.99,58.87,90.47


## Evaluate best models on test set

In [137]:
# Evaluate Random Forest on the test set
metrics=[accuracy_score, precision_score, recall_score, f1_score]
test_scores, mean_test_scores= calculate_metrics(rf_clf_grid_search.best_estimator_,x_test,y_test, metrics)

print(mean_test_scores)
test_scores.sort_values("precision_score")

accuracy_score     92.931579
precision_score    91.562105
recall_score       62.690526
f1_score           67.615789
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, full-stack",83.05,77.65,56.23,56.73
"Developer, desktop or enterprise applications",90.94,83.58,56.75,59.41
"Developer, back-end",82.71,85.23,57.8,58.74
Data scientist or machine learning specialist,93.03,85.37,69.69,74.72
"Developer, front-end",92.93,86.16,57.9,61.57
Scientist,94.77,88.11,69.75,75.5
Academic researcher,93.37,88.83,70.08,75.76
"Developer, game or graphics",95.35,89.75,63.34,69.23
"Developer, embedded applications or devices",93.95,90.65,59.56,64.25
DevOps specialist,92.89,93.82,63.1,68.73


In [98]:
# Evaluate Gradient Boost on the test set
metrics=[accuracy_score, precision_score, recall_score, f1_score]
test_scores, mean_test_scores= calculate_metrics(gd_clf,x_test,y_test, metrics)

print(mean_test_scores)
test_scores.sort_values("precision_score")

accuracy_score     92.235789
precision_score    79.986842
recall_score       62.434737
f1_score           65.455789
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Database administrator,92.75,69.56,51.29,50.8
"Developer, QA or test",93.64,71.87,50.75,49.92
Scientist,92.45,72.19,63.3,66.33
"Developer, full-stack",83.6,73.65,62.93,65.47
"Developer, back-end",82.13,75.78,59.03,60.59
Cloud infrastructure engineer,91.7,75.99,59.19,62.58
"Developer, desktop or enterprise applications",90.74,76.81,59.06,62.33
Data scientist or machine learning specialist,91.73,76.82,72.88,74.64
"Engineer, data",92.45,78.01,58.72,62.25
System administrator,91.97,78.58,56.26,58.85


In [72]:
# Evaluate Cat Boost on the Test set
metrics=[accuracy_score, precision_score, recall_score, f1_score]
test_scores, mean_test_scores= calculate_metrics(cat_clf,x_test,y_test, metrics)

print(mean_test_scores)
test_scores.sort_values("precision_score")

accuracy_score     92.601579
precision_score    82.148947
recall_score       63.180000
f1_score           66.665789
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, QA or test",93.64,46.82,50.0,48.36
"Developer, full-stack",84.04,74.63,64.41,67.12
Scientist,92.86,74.85,63.1,66.73
"Developer, back-end",83.15,77.32,62.47,65.08
"Developer, desktop or enterprise applications",90.81,77.94,58.65,61.87
Data scientist or machine learning specialist,92.65,80.06,74.96,77.19
Cloud infrastructure engineer,92.28,80.98,61.08,65.29
Data or business analyst,93.23,82.79,58.9,62.87
DevOps specialist,92.52,83.73,65.48,70.41
Academic researcher,92.72,83.73,69.87,74.52


In [21]:
# Evaluate the Voting Classifier on the Test set
metrics=[accuracy_score, precision_score, recall_score, f1_score]
test_scores, mean_test_scores= calculate_metrics(voting_clf,x_test,y_test, metrics)

print(mean_test_scores)
test_scores.sort_values("precision_score")

accuracy_score     92.765789
precision_score    86.688947
recall_score       63.718947
f1_score           67.487895
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",82.47,75.69,60.71,62.82
"Developer, full-stack",84.22,76.0,63.23,66.05
"Developer, desktop or enterprise applications",91.08,79.45,60.16,63.89
Data scientist or machine learning specialist,92.89,80.9,75.56,77.9
Scientist,93.78,81.05,66.51,71.11
"Developer, game or graphics",95.42,83.75,68.76,73.8
Academic researcher,93.03,84.58,71.43,76.03
Cloud infrastructure engineer,92.69,85.45,61.82,66.59
Data or business analyst,93.68,86.41,61.4,66.33
"Engineer, data",92.93,86.96,58.6,62.57


## Retrieve Best Run Random Forest

In [154]:
best_run= runs[runs['run_id'] == '366b3b40b46344edb0a8d00c95a3884c'].iloc[0]

In [155]:
best_run

run_id                                      366b3b40b46344edb0a8d00c95a3884c
experiment_id                                             996200319116358272
status                                                              FINISHED
artifact_uri               file:///C:/Users/Ali/Desktop/DS Projects/Tech ...
start_time                                  2024-01-31 11:31:20.708000+00:00
end_time                                    2024-01-31 11:31:34.513000+00:00
metrics.test_recall                                                    65.35
metrics.test_f1                                                         65.8
metrics.test_precision                                                 84.98
metrics.test_accuracy                                                  91.97
tags.mlflow.runName         Random Forest, Tuned, multilabel, Data resampled
tags.mlflow.user                                                         Ali
tags.mlflow.source.name    C:\Users\Ali\mambaforge-pypy3\envs\env1\Lib\si...

In [156]:
artifact_path = best_run["artifact_uri"].replace("file:///", "")

In [157]:
model_pkl = os.path.join(artifact_path, LOG_MODEL_PKL)
with open(model_pkl, "rb") as f:
    model = pickle.load(f)

model['model_object']