In [1]:
# Constants
DATA_PATH   = "../Data/Processed/3_engineered_df.pkl"

TECH_JOBS = ['Techjobs']

CORE_COLS = ['VersionControlSystem',
             'Languages',
             'Databases',
             'Platforms',
             'WebFrameworks',
             'MiscTech',
             'ToolsTech',
             'CollabTools'
]


MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = "tech_jobs_predictions"

LOG_PATH = "../models/temp/"
LOG_DATA_PKL    =  "data.pkl"
LOG_MODEL_PKL   =  "model.pkl"
LOG_METRICS_PKL =  "metrics.pkl"

In [2]:
# Load packages
import pandas as pd 
import numpy as np
import logging
import pickle
from pathlib import Path
import os

import mlflow
from mlflow.tracking import MlflowClient


from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


from sklearn import metrics
from sklearn.metrics import auc, accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV

pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000

_____

### Functions

In [3]:
def calculate_quality(ground_truth, predictions, metric_function, sort_values=False):
    quality_scores = {}
    for col in predictions.columns:
        role_pred  = predictions[col].copy()
        role_truth = ground_truth[col].copy()
        if metric_function == accuracy_score:
            quality_scores[col] = round(metric_function(role_truth, role_pred) * 100, 2)
        else: 
            quality_scores[col] = round(metric_function(role_truth, role_pred,zero_division=0) * 100, 2)
        
    quality_scores = pd.Series(quality_scores.values(), index=quality_scores.keys())
    if sort_values:
        quality_scores = quality_scores.sort_values()
    
    return quality_scores

In [4]:
def calculate_scores(clf, x, y):
    y_pred = clf.predict(x)
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='macro',zero_division=0.0)
    recall = recall_score(y, y_pred, average='macro',zero_division=0.0)
    f1 = f1_score(y, y_pred, average='macro',zero_division=0.0)
    
    return {'accuracy': accuracy,
            'precision':precision,
            'recall': recall,
           'f1' : f1}

_____

In [5]:
# Load dataset and make a copy
eng_df = pd.read_pickle(DATA_PATH)
df = eng_df.copy()

In [6]:
df

Unnamed: 0_level_0,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,Techjobs,VersionControlSystem,VersionControlSystem,VersionControlSystem,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Languages,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Databases,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,Platforms,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,WebFrameworks,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,MiscTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,ToolsTech,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,CollabTools,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters,Skills_Clusters
Unnamed: 0_level_1,Data scientist or machine learning specialist,"Engineer, data",Data or business analyst,"Developer, back-end",Database administrator,"Developer, mobile","Developer, full-stack",Cloud infrastructure engineer,"Developer, embedded applications or devices","Developer, QA or test",System administrator,Scientist,Security professional,"Developer, game or graphics","Developer, front-end",Blockchain,"Developer, desktop or enterprise applications",DevOps specialist,Academic researcher,Git,Mercurial,SVN,APL,Assembly,Bash/Shell,C,C#,C++,COBOL,Clojure,Crystal,Dart,Delphi,Elixir,Erlang,F#,Fortran,Go,Groovy,HTML/CSS,Haskell,Java,JavaScript,Julia,Kotlin,LISP,Lua,MATLAB,OCaml,Objective-C,PHP,Perl,PowerShell,Python,R,Ruby,Rust,SAS,SQL,Scala,Solidity,Swift,TypeScript,VBA,Cassandra,Cloud Firestore,CouchDB,Couchbase,DynamoDB,Elasticsearch,Firebase Realtime Database,IBM DB2,MariaDB,Microsoft SQL Server,MongoDB,MySQL,Neo4j,Oracle,PostgreSQL,Redis,SQLite,AWS,Colocation,DigitalOcean,Firebase,Google Cloud,Heroku,IBM Cloud or Watson,Linode,Managed Hosting,Microsoft Azure,OVH,OpenStack,Oracle Cloud Infrastructure,VMware,ASP.NET,ASP.NET Core,Angular,Angular.js,Blazor,Deno,Django,Drupal,Express,FastAPI,Fastify,Flask,Gatsby,Laravel,Next.js,Node.js,Nuxt.js,Phoenix,Play Framework,React.js,Ruby on Rails,Svelte,Symfony,Vue.js,jQuery,.NET,Apache Kafka,Apache Spark,Capacitor,Cordova,Electron,Flutter,GTK,Hadoop,Hugging Face Transformers,Ionic,Keras,NumPy,Pandas,Qt,React Native,Scikit-learn,Spring,TensorFlow,Tidyverse,Torch/PyTorch,Uno Platform,Xamarin,Ansible,Chef,Docker,Flow,Homebrew,Kubernetes,Pulumi,Puppet,Terraform,Unity 3D,Unreal Engine,Yarn,npm,Android Studio,Atom,CLion,Eclipse,Emacs,GoLand,IPython/Jupyter,IntelliJ,Nano,Neovim,NetBeans,Notepad++,PhpStorm,PyCharm,Qt Creator,"RAD Studio (Delphi, C++ Builder)",RStudio,Rider,RubyMine,Spyder,Sublime Text,TextMate,Vim,Visual Studio,Visual Studio Code,Webstorm,Xcode,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_13,skills_group_14,skills_group_15,skills_group_16,skills_group_17,skills_group_18,skills_group_19,skills_group_2,skills_group_20,skills_group_21,skills_group_22,skills_group_23,skills_group_24,skills_group_25,skills_group_26,skills_group_27,skills_group_28,skills_group_29,skills_group_3,skills_group_30,skills_group_31,skills_group_32,skills_group_33,skills_group_34,skills_group_35,skills_group_36,skills_group_37,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,3,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,2,0,0,0,3,2,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,3,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,3,0,0,0,0
9,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,1,2,0,1,0,0,0,0,0,0,0,2,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
10,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,4,0,0,3,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,7,4,0,0,0,0
12,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,3,1,1,2,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,3,0,0,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73262,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,3,0,3,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
73263,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,1,2,0,4,1,0,3,0,1,0,0,0,0,0,0,0,2,2,0,1,0,2,0,0,0,0,0,2,0,3,0,1,0,1,0,2,0,0
73264,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,9,0,1,4,0,0,1,0,0,0,0,0,1,0,0,0,0,3,0,0,0,1,0,0,0,0,0,2,0,1,0,0,0,2,0,1,0,0
73265,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,1,1,1,0,0,0,1,1,1,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0,2,4,0,5,0,0,1,0,0,0,1,2,0,1,0,0,0,1,1,1,0,1,0,2,0,1,0,0,0,1,0,0,7,3,0,1,0,0


### Deal with Imbalanced Dataset

### Create a Test Set

In [7]:
X = df.drop(df[TECH_JOBS],axis = 1).droplevel(0,axis=1).copy()
y = df[TECH_JOBS].droplevel(0,axis = 1).copy()

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, 
                                                    y,test_size=0.20,
                                                    random_state=42)

### Intialize MLflow

In [9]:
# Create Directories
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

In [10]:
# Initialize client and experiment
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)

### Create a Baseline Model

In [15]:
log_clf = make_pipeline(StandardScaler(),
                    MultiOutputClassifier(LogisticRegression(max_iter=1000)))
log_clf.fit(x_train, y_train.values)

In [16]:
predictions =  pd.DataFrame(log_clf.predict(x_train),
                            columns=y_train.columns)

In [17]:
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)
mean_train_scores = train_scores.mean()

In [18]:
print(mean_train_scores)
train_scores

accuracy_score     92.404737
precision_score    59.281579
recall_score       22.577368
f1_score           29.498421
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Data scientist or machine learning specialist,96.28,68.27,44.71,54.03
"Engineer, data",95.56,56.5,13.29,21.52
Data or business analyst,96.6,59.35,12.54,20.7
"Developer, back-end",69.24,65.89,51.61,57.88
Database administrator,96.24,45.0,0.65,1.27
"Developer, mobile",93.98,77.65,55.58,64.79
"Developer, full-stack",72.77,69.01,70.52,69.76
Cloud infrastructure engineer,94.06,57.29,14.41,23.03
"Developer, embedded applications or devices",95.77,65.41,22.71,33.71
"Developer, QA or test",96.92,0.0,0.0,0.0


In [104]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
log_clf_scores = cross_validate(log_clf,x_train,y_train, cv=4, scoring =calculate_scores)

In [105]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(log_clf_scores[score].mean()* 100, 2)
validation_scores

{'test_accuracy': 20.05,
 'test_precision': 53.14,
 'test_recall': 21.86,
 'test_f1': 28.35}

### Log Baseline Model

In [94]:
# Save the model's dataset trained on
data_details = {"data_path": DATA_PATH,
                "training_indices": x_train.index.tolist(),
                "test_indices":     x_test.index.tolist(), 
                "features_names":   x_train.columns.tolist(),
                "targets_names":    y_train.columns.tolist()}

with open(os.path.join(LOG_PATH, LOG_DATA_PKL), "wb") as output_file:
    pickle.dump(data_details, output_file)

In [95]:
# save the model, model details and model's description
model = {"model_description": "Baseline model: Logistic Regression ",
         "model_details": str(log_clf),
         "model_object": log_clf} 

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [97]:
# save the model metrics
classes_metrics = {"train_scores": train_scores,
                    "validation_scores" : validation_scores} 


with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

### Random Forest

In [11]:
rf_clf = make_pipeline(
                       # PCA(n_components=0.95),
                       RandomForestClassifier(n_jobs=-1,
                                              verbose=1,
                                              random_state=42))

rf_clf.fit(x_train, y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   10.8s finished


In [12]:
# Evaluate on train set
predictions =  pd.DataFrame(rf_clf.predict(x_train),
                            columns=y_train.columns)
train_scores = {score.__name__: calculate_quality(y_train, predictions, score) 
                for score in [accuracy_score, precision_score, recall_score, f1_score]}
train_scores = pd.concat(train_scores,axis=1)
mean_train_scores = train_scores.mean()

[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    1.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    3.2s finished


In [13]:
print(train_scores.mean())
train_scores.sort_values("precision_score")

accuracy_score     99.991579
precision_score    99.988947
recall_score       99.845789
f1_score           99.916842
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
"Developer, embedded applications or devices",99.98,99.89,99.72,99.8
"Developer, desktop or enterprise applications",99.98,99.93,99.87,99.9
"Developer, back-end",99.98,99.99,99.96,99.97
"Developer, front-end",99.99,99.99,99.96,99.98
"Developer, full-stack",99.99,99.99,99.99,99.99
Data scientist or machine learning specialist,100.0,100.0,100.0,100.0
Blockchain,99.99,100.0,99.66,99.83
"Developer, game or graphics",99.99,100.0,99.5,99.75
Security professional,99.99,100.0,99.38,99.69
Scientist,99.99,100.0,99.78,99.89


In [14]:
# cross_validate the baseline model on the accuracy, precision, recall and f1
rf_clf_scores = cross_validate(rf_clf,x_train,y_train, cv=4, scoring =calculate_scores,verbose=1)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   12.1s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    9.7s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.2s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Pa

In [16]:
scores = ['test_accuracy', 'test_precision','test_recall', 'test_f1']
validation_scores = {}
for score in scores:
    validation_scores[score] = round(rf_clf_scores[score].mean()* 100, 2)
validation_scores

{'test_accuracy': 24.3,
 'test_precision': 53.42,
 'test_recall': 14.46,
 'test_f1': 19.11}

### Log RandomForest Model

In [17]:
# Model
model = {"model_description": "Random Forest",
         "model_details": str(rf_clf),
         "model_object": rf_clf}

with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), "wb") as output_file:
    pickle.dump(model, output_file)

In [18]:
# Performance details
classes_metrics = {"train_scores": train_scores}

with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

In [19]:
# Start a new run in the experiment and track 
with mlflow.start_run(experiment_id=exp.experiment_id, run_name=model["model_description"]):
    # Track pickle files
    mlflow.log_artifacts(LOG_PATH)
    
    # Track metrics 
    for metric, score in validation_scores.items():
        mlflow.log_metric(metric, score)

In [20]:
runs = mlflow.search_runs([exp.experiment_id])
runs 

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.test_f1,metrics.test_accuracy,metrics.test_precision,metrics.test_recall,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.source.type,tags.mlflow.runName
0,897ce09f88ba4e79aad3ca5e770aeed7,615117982098744657,FINISHED,file:///C:/Users/Ali/Desktop/DS Projects/Tech ...,2024-01-21 22:06:08.904000+00:00,2024-01-21 22:06:10.076000+00:00,19.11,24.3,53.42,14.46,C:\Users\Ali\mambaforge-pypy3\envs\env1\Lib\si...,Ali,LOCAL,Random Forest
1,98d66457f6234f23ab3a232fe1d3c132,615117982098744657,FINISHED,file:///C:/Users/Ali/Desktop/DS Projects/Tech ...,2024-01-21 21:57:42.633000+00:00,2024-01-21 21:57:42.708000+00:00,28.35,20.05,53.14,21.86,C:\Users\Ali\mambaforge-pypy3\envs\env1\Lib\si...,Ali,LOCAL,Baseline model: Logistic Regression


### Playground

In [7]:
one_job_df = eng_df[jobs_df.sum(axis=1) ==1]

In [8]:
len(one_job_df)

23154

In [9]:
one_job_df[TECH_JOBS].sum(axis =0).sort_values()

Techjobs  Blockchain                                         56
          Database administrator                             62
          Scientist                                         127
          Security professional                             146
          Cloud infrastructure engineer                     170
          Developer, game or graphics                       215
          System administrator                              215
          DevOps specialist                                 278
          Data or business analyst                          315
          Developer, QA or test                             351
          Engineer, data                                    370
          Academic researcher                               440
          Data scientist or machine learning specialist     582
          Developer, embedded applications or devices       659
          Developer, desktop or enterprise applications     869
          Developer, mobile             

In [None]:
# apply smote to 1 job df
x_train = one_job_df.drop(one_job_df[TECH_JOBS],axis = 1).droplevel(0,axis=1).copy()
y_train = one_job_df[TECH_JOBS].droplevel(0,axis = 1).copy()

In [None]:
#convert labels to array or series because smote doesn't accept dataframe as labels
y_train_1 = y_train[(y_train['Developer, mobile'] == 1) | (y_train['Developer, front-end'] == 1)|(y_train['Developer, back-end'] == 1)|(y_train['Developer, full-stack'] == 1)]
y_train_2 = y_train[~((y_train['Developer, mobile'] == 1) | (y_train['Developer, front-end'] == 1)|(y_train['Developer, back-end'] == 1)|(y_train['Developer, full-stack'] == 1))]

In [None]:
y_train_1_array = y_train_1.to_numpy()
y_train_2_array = y_train_2.to_numpy()

In [None]:
y_train_array = y_train.to_numpy()

In [None]:
y_train

Unnamed: 0,Data scientist or machine learning specialist,"Engineer, data",Data or business analyst,"Developer, back-end",Database administrator,"Developer, mobile","Developer, full-stack",Cloud infrastructure engineer,"Developer, embedded applications or devices","Developer, QA or test",System administrator,Scientist,Security professional,"Developer, game or graphics","Developer, front-end",Blockchain,"Developer, desktop or enterprise applications",DevOps specialist,Academic researcher
3,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
12,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
15,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
17,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73260,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
73261,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
73262,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
73263,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
over = SMOTE(random_state=42, k_neighbors=5)
under= RandomUnderSampler(random_state=42,sampling_strategy={3:1000,5:1000,6:1000,14:1000})
x_under, y_under= under.fit_resample(x_train, y_train_array)
x_over, y_over = over.fit_resample(x_under, y_under)

In [None]:
y_train_under= pd.DataFrame(y_under, index= x_under.index,columns = y_train.columns)

In [None]:
y_train_under.sum(axis=0).sort_values()

Blockchain                                         56
Database administrator                             62
Scientist                                         127
Security professional                             146
Cloud infrastructure engineer                     170
Developer, game or graphics                       215
System administrator                              215
DevOps specialist                                 278
Data or business analyst                          315
Developer, QA or test                             351
Engineer, data                                    370
Academic researcher                               440
Data scientist or machine learning specialist     582
Developer, embedded applications or devices       659
Developer, desktop or enterprise applications     869
Developer, full-stack                            1000
Developer, mobile                                1000
Developer, front-end                             1000
Developer, back-end         

In [None]:
y_train_over = pd.DataFrame(y_over, index= x_over.index,columns = y_train.columns)

In [None]:
y_train_over.sum(axis = 0)

Data scientist or machine learning specialist    1000
Engineer, data                                   1000
Data or business analyst                         1000
Developer, back-end                              1000
Database administrator                           1000
Developer, mobile                                1000
Developer, full-stack                            1000
Cloud infrastructure engineer                    1000
Developer, embedded applications or devices      1000
Developer, QA or test                            1000
System administrator                             1000
Scientist                                        1000
Security professional                            1000
Developer, game or graphics                      1000
Developer, front-end                             1000
Blockchain                                       1000
Developer, desktop or enterprise applications    1000
DevOps specialist                                1000
Academic researcher         

In [None]:
len(y_train_over)

19000

In [33]:
from sklearn.model_selection import StratifiedKFold

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


for train_index, test_index in skf.split(X, y):
    x_train, x_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

ValueError: Supported target types are: ('binary', 'multiclass'). Got 'multilabel-indicator' instead.

##### play 2

In [45]:
# DownSample majority classes and OverSample minority Classes
samples_per_class = 1000
resampled_jobs = []

for job in jobs_df.columns:
    sub_df = jobs_df.loc[jobs_df[job] == 1].copy()
    
    if len(sub_df) < samples_per_class:
        # Upsample
        sub_df = sub_df.sample(samples_per_class, replace=True, random_state=42)
    else:
        # Downsample
        sub_df = sub_df.sample(samples_per_class, random_state=42) 
    
    resampled_jobs.append(sub_df)

In [46]:
jobs_df = pd.concat(resampled_jobs)
jobs_df.sum(axis=0).sort_values()

Blockchain                                       1277
Developer, game or graphics                      1331
Security professional                            1363
Developer, QA or test                            1416
Developer, embedded applications or devices      1725
Scientist                                        1753
Data or business analyst                         1859
Database administrator                           1928
Engineer, data                                   2011
Developer, mobile                                2111
Academic researcher                              2162
System administrator                             2168
Data scientist or machine learning specialist    2240
Cloud infrastructure engineer                    2317
DevOps specialist                                2531
Developer, desktop or enterprise applications    3021
Developer, front-end                             3290
Developer, full-stack                            6509
Developer, back-end         

In [47]:
len(jobs_df)

19000

In [48]:
jobs_df[jobs_df.sum(axis = 1) ==1].sum(axis=0).sort_values()

Database administrator                            42
Cloud infrastructure engineer                     50
Blockchain                                        78
DevOps specialist                                 82
System administrator                              92
Scientist                                        115
Developer, desktop or enterprise applications    158
Security professional                            183
Engineer, data                                   186
Data or business analyst                         195
Developer, game or graphics                      212
Developer, QA or test                            239
Developer, front-end                             250
Data scientist or machine learning specialist    250
Academic researcher                              254
Developer, embedded applications or devices      305
Developer, back-end                              325
Developer, mobile                                357
Developer, full-stack                         

In [49]:
jobs_df =df['Techjobs'].copy()
jobs_df.sum(axis=0).sort_values()

Blockchain                                         736
Security professional                              801
Developer, game or graphics                       1009
Scientist                                         1122
Developer, QA or test                             1425
Data or business analyst                          1666
Database administrator                            1755
Academic researcher                               1819
Engineer, data                                    2091
System administrator                              2099
Developer, embedded applications or devices       2185
Data scientist or machine learning specialist     2268
Cloud infrastructure engineer                     2813
DevOps specialist                                 3350
Developer, mobile                                 4613
Developer, desktop or enterprise applications     5734
Developer, front-end                             10172
Developer, back-end                              19001
Developer,

In [43]:
len(jobs_df)

46408

In [44]:
jobs_df[jobs_df.sum(axis = 1) ==1].sum(axis=0).sort_values()

Blockchain                                         56
Database administrator                             62
Scientist                                         127
Security professional                             146
Cloud infrastructure engineer                     170
Developer, game or graphics                       215
System administrator                              215
DevOps specialist                                 278
Data or business analyst                          315
Developer, QA or test                             351
Engineer, data                                    370
Academic researcher                               440
Data scientist or machine learning specialist     582
Developer, embedded applications or devices       659
Developer, desktop or enterprise applications     869
Developer, mobile                                1633
Developer, front-end                             2818
Developer, back-end                              5918
Developer, full-stack       

In [30]:
balanced_df = df.loc[jobs_df.index].copy()

In [32]:
X = balanced_df.drop(balanced_df[TECH_JOBS],axis = 1).droplevel(0,axis=1).copy()
y = balanced_df[TECH_JOBS].droplevel(0,axis = 1).copy()

In [38]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, 
                                                    y,test_size=0.20,
                                                    random_state=42)

In [39]:
len(X_train)

21280

#### Retrieve best model

In [42]:
best_run = runs.iloc[1]

In [43]:
best_run

run_id                                      3a5f0d8dc28143a7b2f6721048fd4d6e
experiment_id                                             615117982098744657
status                                                              FINISHED
artifact_uri               file:///C:/Users/Ali/Desktop/DS Projects/Tech ...
start_time                                  2024-01-21 12:56:11.753000+00:00
end_time                                    2024-01-21 12:56:11.881000+00:00
metrics.accuracy_score                                             92.404737
metrics.precision_score                                            59.281579
metrics.recall_score                                               22.577368
metrics.f1_score                                                   29.498421
tags.mlflow.runName                     Baseline model: Logistic Regression 
tags.mlflow.source.name    C:\Users\Ali\mambaforge-pypy3\envs\env1\Lib\si...
tags.mlflow.source.type                                                LOCAL

In [52]:
artifact_path = best_run["artifact_uri"].replace("file:///", "")

In [57]:
model_pkl = os.path.join(artifact_path, LOG_MODEL_PKL)
with open(model_pkl, "rb") as f:
    model = pickle.load(f)

model['model_object']