In [1]:
# Constants
# Pickled, feature-engineered DataFrame loaded by this notebook with pd.read_pickle.
DATA_PATH   = "../Data/Processed/3_engineered_df.pkl"

# Top-level column group(s) holding the target labels; used to split df into X and y.
TECH_JOBS = ['Techjobs']

# Top-level column groups that hold the technology features.
# NOTE(review): not referenced in the visible part of the notebook — confirm it is still needed.
CORE_COLS = ['VersionControlSystem',
             'Languages',
             'Databases',
             'Platforms',
             'WebFrameworks',
             'MiscTech',
             'ToolsTech',
             'CollabTools'
]


# MLflow tracking location and experiment name (passed to mlflow.set_tracking_uri / mlflow.set_experiment).
MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "tech_jobs_predictions"

# Scratch directory and file names for the pickles written by log_data / log_model / log_metrics;
# track_model uploads the whole directory as run artifacts.
LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
# Load packages
import logging
import os
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from mlflow.tracking import MlflowClient
from scipy.stats import randint as sp_randint

from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score,ConfusionMatrixDisplay,classification_report
from sklearn.model_selection import train_test_split,cross_val_score, cross_validate,cross_val_predict, GridSearchCV, RandomizedSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler,RobustScaler
from sklearn.tree import DecisionTreeClassifier

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

_____

### Functions

In [3]:
# Helpers that evaluate metric functions column by column on a multi-label dataset
def calculate_metric(truth, pred, metric_function):
    """Score a single label column and return the result as a percentage.

    Parameters
    ----------
    truth : array-like
        Ground-truth values for one label.
    pred : array-like
        Predicted values for the same label.
    metric_function : callable
        A scikit-learn style metric. ``accuracy_score`` is called with only
        ``(truth, pred)``; any other metric is called macro-averaged with
        ``zero_division=0``.

    Returns
    -------
    float
        The metric multiplied by 100 and rounded to two decimals.
    """
    if metric_function != accuracy_score:
        # Every metric except accuracy is given the averaging / zero-division options.
        raw_score = metric_function(truth, pred, zero_division=0, average='macro')
    else:
        raw_score = metric_function(truth, pred)
    return round(raw_score * 100, 2)

def predictions_per_col(predictions, y, metric_function):
    """Evaluate one metric separately for every label column.

    Parameters
    ----------
    predictions : pandas.DataFrame
        Predicted labels, one column per label.
    y : pandas.DataFrame
        Ground-truth labels; must contain every column of ``predictions``.
    metric_function : callable
        Metric forwarded to ``calculate_metric``.

    Returns
    -------
    pandas.Series
        Percentage score per label, indexed by column name.
    """
    per_label = {
        label: calculate_metric(y[label].copy(), predictions[label].copy(), metric_function)
        for label in predictions.columns
    }
    return pd.Series(per_label.values(), index=per_label.keys())

def calculate_metrics(clf, x, y, metrics=[accuracy_score, precision_score, recall_score, f1_score]):
    """Predict with ``clf`` and score every label column with every metric.

    Parameters
    ----------
    clf : fitted estimator
        Anything exposing ``predict(x)`` that returns one column per label.
    x : array-like or pandas.DataFrame
        Features to predict on.
    y : pandas.DataFrame
        Ground-truth labels; its columns name the prediction columns.
    metrics : list of callables
        Metric functions; each ``__name__`` becomes a column of the result.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        Per-label scores (rows = labels, columns = metric names) and the mean
        of each metric across labels.
    """
    # Label the raw prediction matrix with the target column names.
    predicted = pd.DataFrame(clf.predict(x), columns=y.columns)

    # One Series of per-label scores for each metric, keyed by the metric's name.
    per_metric = {}
    for metric in metrics:
        per_metric[metric.__name__] = predictions_per_col(predicted, y, metric)

    # Side-by-side table, plus its column means.
    final_scores = pd.concat(per_metric, axis=1)
    return final_scores, final_scores.mean()

In [4]:
# Scoring callables for cross_validate: calculate_scores for multiclass targets, calculate_scores_multi_label for multi-label targets
def calculate_scores(clf, x, y):
    """Return accuracy and macro-averaged precision / recall / F1 for ``clf`` on ``(x, y)``.

    The signature ``(estimator, X, y)`` and the dict return value match what a
    callable scorer is expected to look like, so it can be passed as ``scoring``.
    Scores are raw fractions (not percentages); undefined precision/recall/F1
    values are reported as 0.0.
    """
    predicted = clf.predict(x)
    macro_options = {'average': 'macro', 'zero_division': 0.0}

    return {'accuracy': accuracy_score(y, predicted),
            'precision': precision_score(y, predicted, **macro_options),
            'recall': recall_score(y, predicted, **macro_options),
            'f1': f1_score(y, predicted, **macro_options)}

def calculate_scores_multi_label(clf, x, y, metrics=[accuracy_score, precision_score, recall_score, f1_score]):
    """Multi-label scoring callable: mean per-label score for each metric.

    Used as the ``scoring`` argument of ``cross_validate``; the keys of the
    returned dict become the ``test_<key>`` entries of its result.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict(x)`` returning one column per label.
    x : array-like or pandas.DataFrame
        Features to predict on.
    y : pandas.DataFrame
        Ground-truth labels.
    metrics : list of callables
        Metric functions. The result keys are assigned BY POSITION, so the
        list must stay in the order accuracy, precision, recall, f1.

    Returns
    -------
    dict
        ``{'accuracy', 'precision', 'recall', 'f1'}`` mapped to the mean
        percentage score across all labels.
    """
    # Reuse the shared implementation instead of duplicating it here.
    _, mean_final_scores = calculate_metrics(clf, x, y, metrics)

    # The Series is indexed by metric *names*; use .iloc for positional access,
    # because integer keys via [] on a labelled Series are deprecated in pandas.
    return {'accuracy': mean_final_scores.iloc[0],
            'precision': mean_final_scores.iloc[1],
            'recall': mean_final_scores.iloc[2],
            'f1': mean_final_scores.iloc[3]}

In [5]:
# Scoring functions intended for grid search with a multi-label dataset
def f1_score_multi_label(clf, x, y):
    """Mean macro-averaged F1 (as a percentage) across all label columns.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict(x)`` returning one column per label.
    x : array-like or pandas.DataFrame
        Features to predict on.
    y : pandas.DataFrame
        Ground-truth labels; its columns name the prediction columns.

    Returns
    -------
    float
        Average over labels of the per-label F1, each already scaled to
        0-100 and rounded to two decimals.
    """
    predictions = pd.DataFrame(clf.predict(x), columns=y.columns)

    # predictions_per_col applies the same per-column rule that used to be
    # written out by hand here (macro average, zero_division=0, x100, round 2).
    return predictions_per_col(predictions, y, f1_score).mean()

def precision_score_multi_label(clf, x, y):
    """Mean macro-averaged precision (as a percentage) across all label columns.

    Parameters
    ----------
    clf : fitted estimator
        Must expose ``predict(x)`` returning one column per label.
    x : array-like or pandas.DataFrame
        Features to predict on.
    y : pandas.DataFrame
        Ground-truth labels; its columns name the prediction columns.

    Returns
    -------
    float
        Average over labels of the per-label precision, each already scaled
        to 0-100 and rounded to two decimals.
    """
    predictions = pd.DataFrame(clf.predict(x), columns=y.columns)

    # Delegate the per-column scoring to the shared helper; the previous
    # hand-written loop stored this precision value in a variable named
    # after F1, which was misleading.
    return predictions_per_col(predictions, y, precision_score).mean()

In [6]:
def log_data(x_train,y_train,x_test,y_test):
    """Pickle a description of the dataset the model was trained on.

    Only references are stored (source path, row index labels, column names),
    not the data itself. ``y_test`` is accepted but not used. The file is
    written to ``LOG_PATH/LOG_DATA_PKL``; the directory must already exist.

    A multiclass variant that stored the four frames themselves under
    "x_train" / "x_test" / "y_train" / "y_test" was previously kept here as
    commented-out code.
    """
    target_file = os.path.join(LOG_PATH, LOG_DATA_PKL)

    # Multi-label variant: record where the data came from and which rows/columns were used.
    data_details = {"data_path": DATA_PATH}
    data_details["training_set"] = x_train.index.tolist()
    data_details["test_indices"] = x_test.index.tolist()
    data_details["features_names"] = x_train.columns.tolist()
    data_details["targets_names"] = y_train.columns.tolist()

    with open(target_file, "wb") as output_file:
        pickle.dump(data_details, output_file)
        
        
def log_model(clf,model_description=''):
    """Pickle the estimator together with its description and return that bundle.

    Parameters
    ----------
    clf : estimator
        Model object to persist.
    model_description : str
        Free-text label for the model.

    Returns
    -------
    dict
        ``{"model_description", "model_details", "model_object"}`` — the same
        dict that is written to ``LOG_PATH/LOG_MODEL_PKL``.
    """
    model = {}
    model["model_description"] = model_description
    model["model_details"] = str(clf)   # printable summary of the estimator
    model["model_object"] = clf

    destination = os.path.join(LOG_PATH, LOG_MODEL_PKL)
    with open(destination, "wb") as output_file:
        pickle.dump(model, output_file)

    return model
        
def log_metrics(train_scores, test_scores):
    """Pickle the two score objects to ``LOG_PATH/LOG_METRICS_PKL``.

    The objects are stored unchanged under the keys "train_scores" and
    "test_scores".
    """
    destination = os.path.join(LOG_PATH, LOG_METRICS_PKL)
    classes_metrics = {"train_scores": train_scores}
    classes_metrics["test_scores"] = test_scores

    with open(destination, "wb") as output_file:
        pickle.dump(classes_metrics, output_file)

def track_model(model, scores):
    """Record the current model as one MLflow run.

    Parameters
    ----------
    model : dict
        Bundle returned by ``log_model``; its "model_description" is used as
        the run name.
    scores : dict
        Metric name -> numeric value; each entry is logged as an MLflow metric.

    Notes
    -----
    Reads the module-level ``exp`` (the MLflow experiment), which is created
    in a later cell, so that cell must have run before this function is called.
    Everything currently in ``LOG_PATH`` is uploaded as the run's artifacts.
    """
    experiment_id = exp.experiment_id
    run_name = model["model_description"]

    with mlflow.start_run(experiment_id=experiment_id, run_name=run_name):
        # Upload the pickles written by log_data / log_model / log_metrics.
        mlflow.log_artifacts(LOG_PATH)

        # One MLflow metric per entry of `scores`.
        for metric_name, metric_value in scores.items():
            mlflow.log_metric(metric_name, metric_value)

_____

In [7]:
# Load dataset and make a copy
# Read the pickled frame from DATA_PATH; unpickling executes code, so the file must be trusted.
eng_df = pd.read_pickle(DATA_PATH)
# Work on a copy so the freshly loaded frame stays untouched.
df = eng_df.copy()

In [8]:
df

Unnamed: 0_level_0,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,VersionControlSystem,VersionControlSystem,VersionControlSystem,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,Skills
_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters
Unnamed: 0_level_1,Data scientist or machine learning specialist,"Engineer, data",Data or business analyst,"Developer, back-end",Database administrator,"Developer, mobile","Developer, full-stack",Cloud infrastructure engineer,"Developer, embedded applications or devices","Developer, QA or test",System administrator,Scientist,Security professional,"Developer, game or graphics","Developer, front-end",Blockchain,"Developer, desktop or enterprise applications",DevOps specialist,Academic researcher,Git,Mercurial,SVN,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,Delphi,Elixir,Erlang,F#,Fortran,Go,Groovy,HTML/CSS,Haskell,Java,JavaScript,Julia,Kotlin,LISP,Lua,MATLAB,OCaml,Objective-C,PHP,Perl,PowerShell,Python,R,Ruby,Rust,SAS,SQL,Scala,Solidity,Swift,TypeScript,VBA,Cassandra,Cloud Firestore,CouchDB,Couchbase,DynamoDB,Elasticsearch,Firebase Realtime Database,IBM DB2,MariaDB,Microsoft SQL Server,MongoDB,MySQL,Neo4j,Oracle,PostgreSQL,Redis,SQLite,AWS,Colocation,DigitalOcean,Firebase,Google Cloud,Heroku,IBM Cloud or Watson,Linode,Managed Hosting,Microsoft Azure,OVH,OpenStack,Oracle Cloud Infrastructure,VMware,ASP.NET,ASP.NET Core,Angular,Angular.js,Blazor,Deno,Django,Drupal,Express,FastAPI,Fastify,Flask,Gatsby,Laravel,Next.js,Node.js,Nuxt.js,Phoenix,Play Framework,React.js,Ruby on Rails,Svelte,Symfony,Vue.js,jQuery,.NET,Apache Kafka,Apache Spark,Capacitor,Cordova,Electron,Flutter,GTK,Hadoop,Hugging Face Transformers,Ionic,Keras,NumPy,Pandas,Qt,React Native,Scikit-learn,Spring,TensorFlow,Tidyverse,Torch/PyTorch,Uno Platform,Xamarin,Ansible,Chef,Docker,Flow,Homebrew,Kubernetes,Pulumi,Puppet,Terraform,Unity 3D,Unreal Engine,Yarn,npm,Android Studio,Atom,CLion,Eclipse,Emacs,GoLand,IPython/Jupyter,IntelliJ,Nano,Neovim,NetBeans,Notepad++,PhpStorm,PyCharm,Qt Creator,"RAD Studio (Delphi, C++ Builder)",RStudio,Rider,RubyMine,Spyder,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio 
Code,Webstorm,Xcode,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_13,skills_group_14,skills_group_15,skills_group_16,skills_group_17,skills_group_18,skills_group_19,skills_group_2,skills_group_20,skills_group_21,skills_group_22,skills_group_23,skills_group_24,skills_group_25,skills_group_26,skills_group_27,skills_group_28,skills_group_29,skills_group_3,skills_group_30,skills_group_31,skills_group_32,skills_group_33,skills_group_34,skills_group_35,skills_group_36,skills_group_37,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,3,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,2,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,3,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,2,0,1,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
10,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,4,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,7,4,0,0,0,0
12,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,3,1,1,2,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,3,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
73263,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,2,0,4,1,0,3,0,1,0,0,0,0,0,0,0,2,2,0,1,0,2,0,0,0,0,0,2,0,3,0,1,0,1,0,2,0,0
73264,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,9,0,1,4,0,0,1,0,0,0,0,0,1,0,0,0,0,3,0,0,0,1,0,0,0,0,0,2,0,1,0,0,0,2,0,1,0,0
73265,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,1,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,2,4,0,5,0,0,1,0,0,0,1,2,0,1,0,0,0,1,1,1,0,1,0,2,0,1,0,0,0,1,0,0,7,3,0,1,0,0


## Create a Test Set

In [9]:
# Features: every column group except the targets. Dropping by label on the outer
# column level states the intent directly (previously a whole DataFrame was passed
# as the labels to drop). The outer level is then removed so columns are addressed
# by their plain names.
X = df.drop(columns=TECH_JOBS, level=0).droplevel(0, axis=1).copy()
# Targets: one indicator column per job title.
y = df[TECH_JOBS].droplevel(0, axis=1).copy()

In [10]:
# Plain random 80/20 split (no stratification) because the targets are multi-label;
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, 
                                                    y,test_size=0.20,
                                                    random_state=42)

## Deal with Imbalanced Dataset

- **To deal with the imbalance, I tried different methods to see which one performs best:**
    - **No modification: use the dataset as-is, both as multilabel and as multiclass.**
    - **SMOTE, after converting the dataset from multilabel to multiclass by keeping only the rows that have exactly one label.**
    - **Random sampling with the pandas `sample` method to rebalance the dataset.** `Best Performer`

### 1-Random Sample

In [11]:
y_train.sum(axis=0).sort_values()

Blockchain                                         447
Security professional                              485
Developer, game or graphics                        640
Scientist                                          736
Developer, QA or test                              807
Database administrator                             831
Data or business analyst                          1009
System administrator                              1170
Academic researcher                               1243
Engineer, data                                    1329
Developer, embedded applications or devices       1484
Data scientist or machine learning specialist     1546
Cloud infrastructure engineer                     1655
DevOps specialist                                 1985
Developer, mobile                                 2994
Developer, desktop or enterprise applications     3452
Developer, front-end                              6392
Developer, back-end                              12785
Developer,

In [12]:
# Rebalance the training targets: draw exactly `samples_per_class` rows for every job,
# sampling with replacement when a job has fewer positive rows than that (upsampling)
# and without replacement otherwise (downsampling).
# NOTE(review): upsampling duplicates rows, and a row with several jobs can be drawn once
# per job, so identical rows may end up in different cross-validation folds later on.
samples_per_class = 600
resampled_jobs = []

for job in y_train.columns:
    # Rows in which this job is set
    sub_df = y_train.loc[y_train[job] == 1]
    sub_df = sub_df.sample(samples_per_class,
                           replace=len(sub_df) < samples_per_class,
                           random_state=42)
    resampled_jobs.append(sub_df)

In [13]:
# Stack the per-job samples into the new training targets.
# NOTE: this rebinds y_train / x_train, so re-running the resampling cells without
# re-running the train/test split would resample the already-resampled data.
y_train = pd.concat(resampled_jobs)
# Pull the matching feature rows by index label (repeated labels yield repeated rows).
x_train = x_train.loc[y_train.index].copy()
# Positive count per job after resampling
y_train.sum(axis=0).sort_values()

Blockchain                                        700
Developer, game or graphics                       747
Security professional                             757
Developer, QA or test                             770
Database administrator                            888
Developer, embedded applications or devices       960
Scientist                                         980
Data or business analyst                          982
Developer, mobile                                1035
Engineer, data                                   1036
System administrator                             1061
Cloud infrastructure engineer                    1134
Academic researcher                              1219
Data scientist or machine learning specialist    1227
DevOps specialist                                1265
Developer, front-end                             1365
Developer, desktop or enterprise applications    1445
Developer, full-stack                            3117
Developer, back-end         

In [14]:
# Rebalance the test targets the same way as the training targets: draw exactly
# `samples_per_class` rows for every job, with replacement only when a job has
# fewer positive rows than that.
# NOTE(review): this changes the label distribution of the held-out set and can
# duplicate test rows — confirm that evaluating on a resampled test set is intended.
samples_per_class = 300
resampled_jobs = []

for job in y_test.columns:
    # Rows in which this job is set
    sub_df = y_test.loc[y_test[job] == 1]
    sub_df = sub_df.sample(samples_per_class,
                           replace=len(sub_df) < samples_per_class,
                           random_state=42)
    resampled_jobs.append(sub_df)

In [15]:
# Stack the per-job samples into the new test targets.
# NOTE: this rebinds y_test / x_test, so the cell is not safe to re-run on its own.
y_test = pd.concat(resampled_jobs)
# Pull the matching feature rows by index label (repeated labels yield repeated rows).
x_test = x_test.loc[y_test.index].copy()
# Positive count per job after resampling
y_test.sum(axis=0).sort_values()

Blockchain                                        366
Developer, game or graphics                       372
Security professional                             392
Developer, QA or test                             400
Developer, embedded applications or devices       437
Database administrator                            462
Scientist                                         470
Data or business analyst                          491
Developer, mobile                                 505
Engineer, data                                    527
System administrator                              543
Cloud infrastructure engineer                     593
Data scientist or machine learning specialist     609
DevOps specialist                                 616
Academic researcher                               629
Developer, front-end                              680
Developer, desktop or enterprise applications     742
Developer, full-stack                            1531
Developer, back-end         

### 2- MultiClass with SMOTE

### Initialize MLflow

In [16]:
# Create Directories
# Both the MLflow tracking folder and the scratch folder for pickles are created
# (including parents); existing folders are left as they are.
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

In [17]:
# Initialize client and experiment
# Point MLflow at the local tracking folder and select the experiment by name.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
# `exp` is read as a global by track_model, so this cell must run before any model is tracked.
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

## Create a Baseline Model

In [19]:
# Baseline: logistic regression wrapped in MultiOutputClassifier for the multi-label targets.
# Features are standardized first because LogisticRegression applies L2 regularization by default.
log_clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression(max_iter=1000)))
log_clf.fit(x_train, y_train)

### Evaluating the multilabel dataset

In [20]:
# Metric functions evaluated per label column on the training data.
# NOTE(review): this rebinds the name `metrics`, shadowing the `sklearn.metrics` module imported at the top.
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_score= calculate_metrics(log_clf,x_train,y_train, metrics)

# Mean of each metric across labels, then the per-label table ordered by precision
print(mean_train_score)
train_scores.sort_values("precision_score")

accuracy_score     90.765263
precision_score    78.838947
recall_score       63.910526
f1_score           67.053684
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",75.0,70.7,64.08,65.27
Database administrator,92.24,71.95,53.31,54.29
Security professional,93.38,72.33,53.81,55.33
"Developer, full-stack",79.43,74.45,70.84,72.18
System administrator,91.23,76.57,57.36,60.19
"Engineer, data",91.65,77.65,59.35,62.88
"Developer, desktop or enterprise applications",88.72,77.82,60.56,63.87
Data or business analyst,92.31,78.67,61.53,65.59
DevOps specialist,90.57,78.91,64.57,68.58
"Developer, embedded applications or devices",92.67,78.92,63.98,68.23


In [21]:
# Cross-validate the baseline model (5 folds) on accuracy, precision, recall and f1.
# The scoring callable returns a dict, which yields the 'test_<name>' entries read in the next cell.
log_clf_scores = cross_validate(log_clf,x_train,y_train, cv=5, scoring =calculate_scores_multi_label)

In [22]:
# Average each cross-validation score over the folds, rounded to two decimals
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(log_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 89.21,
 'test_precision': 67.04,
 'test_recall': 61.54,
 'test_f1': 61.12}

### Evaluating the multiclass Dataset

### Log Baseline Model

In [23]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(log_clf,'Baseline model: Logistic Regression, multilabel, Data Resampled ')
# Log the model's train scores and cross-validation scores
# NOTE(review): the cross-validation means are stored under the "test_scores" key of the pickle.
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow
track_model(model,validation_scores)

## Random Forest Model

In [24]:
#Create a random forest classifier
# The forest is fitted on the multi-label targets directly; the pipeline currently wraps
# only the forest because the scaling and PCA steps are commented out.
rf_clf = make_pipeline(#StandardScaler(),
                       #PCA(n_components=0.95),
                       RandomForestClassifier(n_jobs=-1,
                                              verbose=1,
                                              random_state=42))

rf_clf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    3.1s finished


### Evaluating the multilabel dataset

In [25]:
# Metric functions evaluated per label column on the training data.
# NOTE(review): this rebinds the name `metrics`, shadowing the `sklearn.metrics` module imported at the top.
metrics=[accuracy_score, precision_score, recall_score, f1_score]
train_scores, mean_train_scores= calculate_metrics(rf_clf,x_train,y_train, metrics)

# Mean of each metric across labels, then the per-label table ordered by precision
print(mean_train_scores)
train_scores.sort_values("precision_score")

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished


accuracy_score     99.994737
precision_score    99.993684
recall_score       99.980000
f1_score           99.985789
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Security professional,99.99,99.93,100.0,99.96
"Developer, desktop or enterprise applications",99.98,99.96,99.96,99.96
"Developer, back-end",99.99,99.99,99.99,99.99
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Blockchain,99.99,100.0,99.93,99.96
"Developer, front-end",99.99,100.0,99.96,99.98
"Developer, game or graphics",99.99,100.0,99.93,99.96
Scientist,99.99,100.0,99.95,99.97
System administrator,100.0,100.0,100.0,100.0
"Developer, QA or test",99.99,100.0,99.94,99.97


In [26]:
# Cross-validate the random forest model (5 folds) on accuracy, precision, recall and f1
rf_clf_scores = cross_validate(rf_clf,x_train,y_train, cv=5, scoring =calculate_scores_multi_label)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    2.5s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]:

In [27]:
# Average each cross-validation score of the random forest over the folds, rounded to two decimals
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(rf_clf_scores[score].mean(), 2)
validation_scores

{'test_accuracy': 92.15,
 'test_precision': 89.16,
 'test_recall': 76.29,
 'test_f1': 78.92}

### Evaluating the multiclass Dataset

### Log RandomForest Model

In [28]:
# Log the model's dataset train and test indices
log_data(x_train,y_train,x_test,y_test)
# Log the model, model description
model = log_model(rf_clf,'Random Forest, multilabel, Data resampled')
# Log the model's train scores and cross-validation scores
# NOTE(review): the cross-validation means are stored under the "test_scores" key of the pickle.
log_metrics(train_scores, validation_scores)
# track the model artifacts, validation scores with mlflow
track_model(model,validation_scores)

## Decision Tree

In [29]:
# NOTE(review): this import lives mid-notebook; it belongs in the "Load packages"
# cell at the top so a fresh Restart & Run All exposes every dependency up front.
from sklearn.tree import DecisionTreeClassifier
# A single tree fitted directly on y_train (no MultiOutputClassifier wrapper is
# used here); random_state pins the tie-breaking so reruns give the same tree.
dec_clf = DecisionTreeClassifier(random_state=42)

# Last expression of the cell: fits in place and displays the fitted estimator.
dec_clf.fit(x_train,y_train)

### Evaluating the multilabel dataset

In [30]:
# Per-label and mean scores of the decision tree on the data it was trained on,
# so they only show how well the tree fits the training split.
# NOTE: the name `metrics` shadows the `sklearn.metrics` module imported at the top.
metrics = [
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]
train_scores, mean_train_scores = calculate_metrics(dec_clf, x_train, y_train, metrics)

print(mean_train_scores)
train_scores.sort_values(by="precision_score")

accuracy_score     99.997368
precision_score    99.995263
recall_score       99.990526
f1_score           99.992105
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Security professional,99.99,99.93,100.0,99.96
"Developer, desktop or enterprise applications",99.99,99.99,99.97,99.98
"Developer, back-end",99.99,99.99,99.99,99.99
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Blockchain,99.99,100.0,99.93,99.96
"Developer, front-end",100.0,100.0,100.0,100.0
"Developer, game or graphics",99.99,100.0,99.93,99.96
Scientist,100.0,100.0,100.0,100.0
System administrator,100.0,100.0,100.0,100.0
"Developer, QA or test",100.0,100.0,100.0,100.0


In [31]:
# 5-fold cross-validation of the decision tree on the training split, scored
# with the same multi-label scorer as the other models.
dec_clf_scores = cross_validate(
    estimator=dec_clf,
    X=x_train,
    y=y_train,
    cv=5,
    scoring=calculate_scores_multi_label,
)

In [32]:
# Average every cross-validation metric over the folds, rounded to 2 decimals.
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {
    score_key: round(dec_clf_scores[score_key].mean(), 2)
    for score_key in scores
}
validation_scores

{'test_accuracy': 87.78,
 'test_precision': 66.54,
 'test_recall': 76.8,
 'test_f1': 66.7}

### Log Decision Tree

In [33]:
# Record the Decision Tree experiment with the logging helpers defined earlier
# in the notebook (presumably they write under LOG_PATH — confirm in the helpers).
# `train_scores` and `validation_scores` come from the two Decision Tree
# evaluation cells directly above.

# Log the train/test split this model was built on
log_data(x_train,y_train,x_test,y_test)
# Log the model with its description; the returned value is passed to track_model
model = log_model(dec_clf,'Decision Tree, multilabel, Data resampled')
# Log the train scores alongside the cross-validated scores
log_metrics(train_scores, validation_scores)
# Track the logged model and its validation scores with mlflow
track_model(model,validation_scores)

### Evaluating the multiclass Dataset

## Gradient Boost

In [34]:
# Gradient boosting predicts a single target, so MultiOutputClassifier wraps it
# and fits one booster per label column of y_train.
gd_clf = MultiOutputClassifier(
    estimator=GradientBoostingClassifier(
        n_estimators=100,
        max_depth=3,
        max_features='sqrt',
        verbose=1,  # prints the training-loss table of every booster
        random_state=42,
    )
)
gd_clf.fit(x_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           0.6418            0.72s
         2           0.6020            0.78s
         3           0.5766            0.74s
         4           0.5588            0.64s
         5           0.5391            0.51s
         6           0.5209            0.62s
         7           0.5080            0.63s
         8           0.4951            0.60s
         9           0.4837            0.58s
        10           0.4746            0.60s
        20           0.4277            0.51s
        30           0.4032            0.44s
        40           0.3901            0.39s
        50           0.3808            0.33s
        60           0.3738            0.26s
        70           0.3675            0.20s
        80           0.3624            0.13s
        90           0.3572            0.07s
       100           0.3524            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.5917            0.60s
        

### Evaluating the multilabel Dataset

In [35]:
# Per-label and mean scores of the gradient boosting model on the training split.
# NOTE: the name `metrics` shadows the `sklearn.metrics` module imported at the top.
metrics = [
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
]
train_scores, mean_train_scores = calculate_metrics(gd_clf, x_train, y_train, metrics)

print(mean_train_scores)
train_scores.sort_values(by="precision_score")

accuracy_score     91.189474
precision_score    85.123684
recall_score       63.941053
f1_score           67.456842
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, back-end",75.72,73.64,63.12,64.21
"Developer, full-stack",80.58,76.12,72.36,73.78
"Developer, desktop or enterprise applications",88.96,80.59,60.07,63.45
Cloud infrastructure engineer,91.92,81.88,65.24,69.83
DevOps specialist,90.84,82.0,63.47,67.86
Data scientist or machine learning specialist,92.59,82.49,75.49,78.42
"Developer, front-end",90.68,83.61,65.62,70.28
Data or business analyst,92.64,84.01,60.6,64.99
Scientist,93.45,84.08,67.8,72.82
"Engineer, data",92.24,84.14,60.42,64.69


In [36]:
# 5-fold cross-validation of the gradient boosting model on the training split,
# scored with the same multi-label scorer as the other models.
gd_clf_scores = cross_validate(
    estimator=gd_clf,
    X=x_train,
    y=y_train,
    cv=5,
    scoring=calculate_scores_multi_label,
)

      Iter       Train Loss   Remaining Time 
         1           0.3265            0.00s
         2           0.3053            0.53s
         3           0.2928            0.48s
         4           0.2838            0.48s
         5           0.2749            0.56s
         6           0.2675            0.54s
         7           0.2612            0.53s
         8           0.2553            0.52s
         9           0.2515            0.51s
        10           0.2470            0.50s
        20           0.2218            0.40s
        30           0.2080            0.36s
        40           0.2003            0.32s
        50           0.1953            0.26s
        60           0.1907            0.21s
        70           0.1860            0.15s
        80           0.1822            0.10s
        90           0.1784            0.05s
       100           0.1751            0.00s
      Iter       Train Loss   Remaining Time 
         1           0.2232            0.37s
        

In [37]:
# Average every cross-validation metric over the folds, rounded to 2 decimals.
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {
    score_key: round(gd_clf_scores[score_key].mean(), 2)
    for score_key in scores
}
validation_scores

{'test_accuracy': 89.61,
 'test_precision': 71.08,
 'test_recall': 60.74,
 'test_f1': 60.94}

### Log Gradient Boost

In [38]:
# Record the Gradient Boost experiment with the logging helpers defined earlier
# in the notebook (presumably they write under LOG_PATH — confirm in the helpers).
# `train_scores` and `validation_scores` come from the Gradient Boost
# evaluation cells directly above.

# Log the train/test split this model was built on
log_data(x_train,y_train,x_test,y_test)
# Log the model with its description; the returned value is passed to track_model
model = log_model(gd_clf,'Gradient Boost, multilabel, Data resampled')
# Log the train scores alongside the cross-validated scores
log_metrics(train_scores, validation_scores)
# Track the logged model and its validation scores with mlflow
track_model(model,validation_scores)

## Cat Boost

In [39]:
# CatBoost is trained on all label columns at once via the MultiLogloss
# objective; HammingLoss is the value reported in the per-iteration log.
cat_clf = CatBoostClassifier(
    verbose=1,
    eval_metric='HammingLoss',
    loss_function='MultiLogloss',
)
cat_clf.fit(x_train, y_train)

Learning rate set to 0.029123
0:	learn: 0.1101801	total: 267ms	remaining: 4m 26s
1:	learn: 0.1110388	total: 404ms	remaining: 3m 21s
2:	learn: 0.1110295	total: 539ms	remaining: 2m 59s
3:	learn: 0.1111404	total: 683ms	remaining: 2m 50s
4:	learn: 0.1111911	total: 830ms	remaining: 2m 45s
5:	learn: 0.1112004	total: 974ms	remaining: 2m 41s
6:	learn: 0.1111357	total: 1.13s	remaining: 2m 39s
7:	learn: 0.1111450	total: 1.32s	remaining: 2m 43s
8:	learn: 0.1112050	total: 1.47s	remaining: 2m 42s
9:	learn: 0.1111958	total: 1.62s	remaining: 2m 40s
10:	learn: 0.1111773	total: 1.77s	remaining: 2m 39s
11:	learn: 0.1111588	total: 1.92s	remaining: 2m 37s
12:	learn: 0.1111588	total: 2.06s	remaining: 2m 36s
13:	learn: 0.1111404	total: 2.19s	remaining: 2m 34s
14:	learn: 0.1111127	total: 2.32s	remaining: 2m 32s
15:	learn: 0.1111404	total: 2.44s	remaining: 2m 30s
16:	learn: 0.1111357	total: 2.57s	remaining: 2m 28s
17:	learn: 0.1111450	total: 2.7s	remaining: 2m 27s
18:	learn: 0.1111542	total: 2.83s	remaining: 

<catboost.core.CatBoostClassifier at 0x2a52795b790>

### Evaluate Cat Boost

In [40]:
# 5-fold cross-validation of the CatBoost model on the training split, scored
# with the same multi-label scorer as the other models.
cat_clf_scores = cross_validate(
    estimator=cat_clf,
    X=x_train,
    y=y_train,
    cv=5,
    scoring=calculate_scores_multi_label,
)

Learning rate set to 0.026476
0:	learn: 0.1098107	total: 131ms	remaining: 2m 11s
1:	learn: 0.1106994	total: 260ms	remaining: 2m 9s
2:	learn: 0.1109245	total: 385ms	remaining: 2m 7s
3:	learn: 0.1113689	total: 484ms	remaining: 2m
4:	learn: 0.1118594	total: 613ms	remaining: 2m 1s
5:	learn: 0.1118536	total: 754ms	remaining: 2m 4s
6:	learn: 0.1117671	total: 891ms	remaining: 2m 6s
7:	learn: 0.1118190	total: 1.02s	remaining: 2m 6s
8:	learn: 0.1119171	total: 1.15s	remaining: 2m 6s
9:	learn: 0.1118306	total: 1.28s	remaining: 2m 6s
10:	learn: 0.1118363	total: 1.41s	remaining: 2m 6s
11:	learn: 0.1117382	total: 1.54s	remaining: 2m 6s
12:	learn: 0.1116921	total: 1.68s	remaining: 2m 7s
13:	learn: 0.1114843	total: 1.81s	remaining: 2m 7s
14:	learn: 0.1116170	total: 1.94s	remaining: 2m 7s
15:	learn: 0.1115362	total: 2.06s	remaining: 2m 7s
16:	learn: 0.1112823	total: 2.19s	remaining: 2m 6s
17:	learn: 0.1113689	total: 2.33s	remaining: 2m 6s
18:	learn: 0.1112246	total: 2.45s	remaining: 2m 6s
19:	learn: 0.

In [41]:
# Average every cross-validation metric over the folds, rounded to 2 decimals.
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {
    score_key: round(cat_clf_scores[score_key].mean(), 2)
    for score_key in scores
}
validation_scores

{'test_accuracy': 90.48,
 'test_precision': 78.54,
 'test_recall': 65.63,
 'test_f1': 66.89}

### Log Cat Boost

In [42]:
# The Cat Boost section has no train-score cell, so `train_scores` would still
# hold the Gradient Boost values at this point and be logged under the wrong
# model. Compute the train scores for `cat_clf` before logging.
train_scores, mean_train_scores = calculate_metrics(
    cat_clf,
    x_train,
    y_train,
    [accuracy_score, precision_score, recall_score, f1_score],
)

# Log the train/test split this model was built on
log_data(x_train,y_train,x_test,y_test)
# Log the model with its description; the returned value is passed to track_model
model = log_model(cat_clf,'Cat Boost, multilabel, Data resampled')
# Log the Cat Boost train scores alongside its cross-validated scores
log_metrics(train_scores, validation_scores)
# Track the logged model and its validation scores with mlflow
track_model(model,validation_scores)

## Hyperparameter Tuning

In [41]:
# Names of the tunable parameters of `rf_clf`; these are the keys a search
# grid for this estimator has to use.
list(rf_clf.get_params())

['memory',
 'steps',
 'verbose',
 'randomforestclassifier',
 'randomforestclassifier__bootstrap',
 'randomforestclassifier__ccp_alpha',
 'randomforestclassifier__class_weight',
 'randomforestclassifier__criterion',
 'randomforestclassifier__max_depth',
 'randomforestclassifier__max_features',
 'randomforestclassifier__max_leaf_nodes',
 'randomforestclassifier__max_samples',
 'randomforestclassifier__min_impurity_decrease',
 'randomforestclassifier__min_samples_leaf',
 'randomforestclassifier__min_samples_split',
 'randomforestclassifier__min_weight_fraction_leaf',
 'randomforestclassifier__n_estimators',
 'randomforestclassifier__n_jobs',
 'randomforestclassifier__oob_score',
 'randomforestclassifier__random_state',
 'randomforestclassifier__verbose',
 'randomforestclassifier__warm_start']

In [None]:

# Search space for the random forest step of the `rf_clf` pipeline. The
# `randomforestclassifier__` prefix matches the parameter names listed by
# rf_clf.get_params() in the previous cell.
rf_param_grid = [
    {'randomforestclassifier__n_estimators':[80,90,100,110,120,130],
     'randomforestclassifier__max_depth':[30,35,40,45,50],
     'randomforestclassifier__min_samples_split':[2,3,4,5,6,7,8],
     'randomforestclassifier__min_samples_leaf':[1,3,5,7,9],
     'randomforestclassifier__class_weight':['balanced',None],
     'randomforestclassifier__max_features':['sqrt','log2'],
     'randomforestclassifier__verbose': [0],
    }]

# Randomized (not exhaustive) search: 100 sampled candidates, 3-fold CV each,
# ranked by the multi-label precision scorer. random_state pins which
# candidates are sampled so the search is reproducible, like the models above
# that are seeded with 42.
rf_clf_grid_search = RandomizedSearchCV(
    rf_clf,
    rf_param_grid,
    cv=3,
    n_iter=100,
    scoring=precision_score_multi_label,
    return_train_score=True,
    refit=True,  # refit the best candidate on all of x_train -> best_estimator_
    verbose=2,
    random_state=42,
)

rf_clf_grid_search.fit(x_train, y_train)

In [None]:
# `train_scores` and `validation_scores` still belong to whichever model was
# evaluated last, so logging them here would attach another model's numbers to
# the tuned forest. Recompute both for the tuned estimator, using the same
# 5-fold protocol as the other models so the MLflow runs stay comparable.
tuned_rf_clf = rf_clf_grid_search.best_estimator_

train_scores, mean_train_scores = calculate_metrics(
    tuned_rf_clf,
    x_train,
    y_train,
    [accuracy_score, precision_score, recall_score, f1_score],
)

tuned_rf_clf_scores = cross_validate(
    tuned_rf_clf, x_train, y_train, cv=5, scoring=calculate_scores_multi_label
)
validation_scores = {
    score_key: round(tuned_rf_clf_scores[score_key].mean(), 2)
    for score_key in ['test_accuracy', 'test_precision', 'test_recall', 'test_f1']
}

# Log the train/test split this model was built on
log_data(x_train,y_train,x_test,y_test)
# Log the model with its description; the returned value is passed to track_model
model = log_model(tuned_rf_clf,'Random Forest, Tuned, multilabel, Data resampled')
# Log the tuned model's train scores alongside its cross-validated scores
log_metrics(train_scores, validation_scores)
# Track the logged model and its validation scores with mlflow
track_model(model,validation_scores)

## Voting Classifier

In [43]:
# Base learners of the ensemble. They get their own `voting_*` names so the
# multilabel models `rf_clf`, `gd_clf` and `cat_clf` from the sections above
# are not replaced by bare, unfitted single-output estimators (later cells
# still call those names expecting the earlier models).

# Logistic regression candidate; currently left out of the ensemble below.
log_clf = make_pipeline(StandardScaler(),
                    LogisticRegression(max_iter=1000))

voting_gd = GradientBoostingClassifier(n_estimators=100,
                                       max_depth=3,
                                       max_features='sqrt',
                                       random_state=42,
                                       verbose=0)

voting_rf = RandomForestClassifier(n_jobs=-1,
                                   n_estimators = 100,
                                   max_depth=50,
                                   min_samples_split=2,
                                   max_features='sqrt',
                                   min_samples_leaf=1,
                                   verbose=0,
                                   random_state=42)

voting_cat = CatBoostClassifier(verbose=0)

# Soft voting averages the predicted class probabilities of the base learners.
# MultiOutputClassifier fits one such voting ensemble per label column.
voting_clf = MultiOutputClassifier(VotingClassifier(
    estimators=[
        # ('log', log_clf),
        ('rf', voting_rf),
        ('gd', voting_gd),
        ('cat', voting_cat)
    ]
,voting = "soft"))


voting_clf.fit(x_train,y_train)

### Evaluate Voting Classifier

In [44]:
# 5-fold cross-validation of the voting ensemble on the training split, scored
# with the same multi-label scorer as the other models.
voting_clf_scores = cross_validate(
    estimator=voting_clf,
    X=x_train,
    y=y_train,
    cv=5,
    scoring=calculate_scores_multi_label,
)

In [45]:
# Average every cross-validation metric over the folds, rounded to 2 decimals.
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {
    score_key: round(voting_clf_scores[score_key].mean(), 2)
    for score_key in scores
}
validation_scores

{'test_accuracy': 91.03,
 'test_precision': 82.38,
 'test_recall': 68.35,
 'test_f1': 70.04}

### Log voting classifier

In [46]:
# The Voting Classifier section has no train-score cell, so `train_scores`
# would still belong to a previously evaluated model and be logged under the
# wrong run. Compute the train scores for `voting_clf` before logging.
train_scores, mean_train_scores = calculate_metrics(
    voting_clf,
    x_train,
    y_train,
    [accuracy_score, precision_score, recall_score, f1_score],
)

# Log the train/test split this model was built on
log_data(x_train,y_train,x_test,y_test)
# Log the model with its description; the returned value is passed to track_model
model = log_model(voting_clf,'Voting Classifier')
# Log the ensemble's train scores alongside its cross-validated scores
log_metrics(train_scores, validation_scores)
# Track the logged model and its validation scores with mlflow
track_model(model,validation_scores)

## Precision/Recall trade-off

## Error analysis

## Evaluate on test set

In [None]:
# Final check on the held-out split. The original cell scored x_train/y_train,
# which is not a test-set evaluation, and used `gd_clf`, which the Voting
# Classifier cell rebinds to an unfitted single-output estimator.
# `voting_clf` is the last multilabel model fitted on x_train above; swap in
# whichever model is finally selected.
# NOTE: the name `metrics` shadows the `sklearn.metrics` module imported at the top.
metrics=[accuracy_score, precision_score, recall_score, f1_score]
test_scores, mean_test_scores = calculate_metrics(voting_clf, x_test, y_test, metrics)

print(mean_test_scores)
test_scores.sort_values("precision_score")

## Retrieve Runs and best model

In [18]:
# Pull every run of the experiment into a DataFrame and show, per run, its
# name and the four logged validation metrics.
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])
runs[
    [
        'run_id',
        'tags.mlflow.runName',
        'metrics.test_precision',
        'metrics.test_recall',
        'metrics.test_f1',
        'metrics.test_accuracy',
    ]
]

Unnamed: 0,run_id,tags.mlflow.runName,metrics.test_precision,metrics.test_recall,metrics.test_f1,metrics.test_accuracy
0,b6aaed9a2e3d498fae2f1f2e653ea3f1,Voting Classifier,82.38,68.35,70.04,91.03
1,7c9c703026bf4599a1511aac4ed069b4,"Cat Boost, multilabel, Data resampled",78.54,65.63,66.89,90.48
2,5e9a2bdf773e4eaba816db1031f65b83,"Gradient Boost, multilabel, Data resampled",71.08,60.74,60.94,89.61
3,dcdc4868f2544bd8860fb1fbdc057933,"Decision Tree, multilabel, Data resampled",66.54,76.8,66.7,87.78
4,465c94b53d7040648b0e701c656fd424,"Random Forest, multilabel, Data resampled",89.16,76.29,78.92,92.15
5,e84c07550d2549c6ad9a232fb24a5054,"Baseline model: Logistic Regression, multilabe...",67.04,61.54,61.12,89.21


In [19]:
# The best run is the one with the highest cross-validated precision:
# sort descending on that metric and take the first row.
runs_by_precision = runs.sort_values(by='metrics.test_precision', ascending=False)
best_run = runs_by_precision.iloc[0]

In [20]:
# Display the selected run (a single row of `runs`) with its id, artifact URI and metrics.
best_run

run_id                                      465c94b53d7040648b0e701c656fd424
experiment_id                                             996200319116358272
status                                                              FINISHED
artifact_uri               file:///C:/Users/Ali/Desktop/DS Projects/Tech ...
start_time                                  2024-01-28 20:19:46.125000+00:00
end_time                                    2024-01-28 20:19:46.394000+00:00
metrics.test_accuracy                                                  92.15
metrics.test_recall                                                    76.29
metrics.test_f1                                                        78.92
metrics.test_precision                                                 89.16
tags.mlflow.user                                                         Ali
tags.mlflow.source.type                                                LOCAL
tags.mlflow.source.name    C:\Users\Ali\mambaforge-pypy3\envs\env1\Lib\si...

In [51]:
# Turn the run's artifact URI into a local filesystem path.
# Stripping the literal "file:///" prefix only works for Windows drive paths:
# on POSIX it drops the leading "/" and yields a relative path, and it leaves
# percent-escapes (e.g. %20) undecoded. url2pathname handles both platforms.
from urllib.parse import urlparse
from urllib.request import url2pathname

artifact_uri = best_run["artifact_uri"]
if artifact_uri.startswith("file:"):
    artifact_path = url2pathname(urlparse(artifact_uri).path)
else:
    # Not a file:// URI: keep the value as stored, as the original code did.
    artifact_path = artifact_uri

In [52]:
# Load the pickled model record that was logged for the best run.
# NOTE: pickle.load executes arbitrary code from the file; only load
# artifacts produced by this project.
model_pkl = os.path.join(artifact_path, LOG_MODEL_PKL)
with open(model_pkl, mode="rb") as model_file:
    model = pickle.load(model_file)

# The record is indexed by key; 'model_object' is the stored estimator entry.
model['model_object']