In [1]:
# Location of the pickled DataFrame this notebook loads.
DF_PATH = "../data/processed/cleaned_data.pkl"

# Column-group names. Presumably the role and technology question groups of
# the survey -- the cells visible here do not reference either list directly.
ROLE_COLS = ["DevType"]
TECH_COLS = [
    "LanguageHaveWorkedWith",
    "DatabaseHaveWorkedWith",
    "PlatformHaveWorkedWith",
    "WebframeHaveWorkedWith",
    "MiscTechHaveWorkedWith",
    "ToolsTechHaveWorkedWith",
    "NEWCollabToolsHaveWorkedWith",
]

# MLflow tracking store (a local directory) and the experiment runs go under.
MLFLOW_TRACKING_URI = "../models/mlruns"
MLFLOW_EXPERIMENT_NAME = "skills_jobs_stackoverflow"

# Staging directory and file names for the pickles that are written locally
# and then uploaded as run artifacts.
LOG_PATH = "../models/temp/"
LOG_DATA_PKL = "data.pkl"
LOG_MODEL_PKL = "model.pkl"
LOG_METRICS_PKL = "metrics.pkl"

In [2]:
import pandas as pd

In [3]:
def calc_quality(ground_truth , predictions , metric_func , sort_values = False):
    """Score each predicted column against the same-named ground-truth column.

    Parameters
    ----------
    ground_truth : DataFrame
        True labels; must contain every column name found in ``predictions``.
    predictions : DataFrame
        Predicted labels; its columns decide which columns are scored.
    metric_func : callable
        Called as ``metric_func(truth_column, predicted_column)``. The result
        is multiplied by 100 and rounded to two decimals.
    sort_values : bool, default False
        When truthy, the returned scores are sorted in ascending order.

    Returns
    -------
    pandas.Series
        One score per column of ``predictions``, indexed by column name.
    """
    # Copies are handed to the metric so it cannot modify the caller's frames.
    per_column = {
        name: round(
            metric_func(ground_truth[name].copy(), predictions[name].copy()) * 100,
            2,
        )
        for name in predictions.columns
    }

    scores = pd.Series(list(per_column.values()), index=list(per_column.keys()))
    return scores.sort_values() if sort_values else scores


In [4]:
#create directories
from pathlib import Path

# Both the MLflow store and the artifact staging folder must exist before use;
# exist_ok=True makes re-running this cell harmless.
for required_dir in (MLFLOW_TRACKING_URI, LOG_PATH):
    Path(required_dir).mkdir(parents=True, exist_ok=True)

In [5]:
# Load the pickled DataFrame from DF_PATH and display it as the cell output.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle(DF_PATH)
df

Unnamed: 0_level_0,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,DevType,...,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters,skills_clusters
Unnamed: 0_level_1,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end",...,skills_group_18,skills_group_19,skills_group_2,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,4,2,0
3,0,0,0,0,0,0,0,0,0,1,...,2,0,0,0,0,0,0,0,2,0
4,0,0,0,0,0,0,0,1,0,0,...,0,0,0,3,1,2,3,0,1,0
8,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,3,0,2,0
9,0,0,1,0,0,0,0,0,0,0,...,0,0,0,6,0,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83434,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,3,1,0,0,1,1
83435,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,4,2,2,4,2,1
83436,0,0,1,0,1,0,0,0,0,0,...,0,0,0,4,5,1,4,0,1,0
83437,0,0,0,0,0,0,1,0,0,0,...,0,0,0,4,1,0,4,0,1,0


### Balance classes

In [6]:
# Independent copy of the "DevType" column group (one column per role).
roles_df = df['DevType'].copy()
# Column-wise totals; the preview suggests 0/1 indicators, in which case this
# is the number of respondents per role -- confirm against the data.
roles_df.sum(axis=0)

Academic researcher                               1708
Data or business analyst                          1658
Data scientist or machine learning specialist     2460
Database administrator                            1210
DevOps specialist                                 3056
Developer, QA or test                             1135
Developer, back-end                              17084
Developer, desktop or enterprise applications     4845
Developer, embedded applications or devices       2138
Developer, front-end                              8932
Developer, full-stack                            20655
Developer, game or graphics                        899
Developer, mobile                                 4751
Engineer, data                                    1941
Scientist                                         1046
System administrator                              2069
dtype: int64

In [7]:
#Resample roles
samples_per_class = 1200
resampled_roles = []

for role_col in roles_df.columns:
    # Rows flagged with this role.
    sub_df = roles_df.loc[roles_df[role_col] == 1].copy()

    # Roles with fewer than `samples_per_class` rows are upsampled (drawn with
    # replacement); all others are downsampled without replacement.
    sub_df = sub_df.sample(
        samples_per_class,
        replace=len(sub_df) < samples_per_class,
        random_state=0,
    )

    resampled_roles.append(sub_df)

In [8]:
# Stack the per-role samples into one frame. A row label can occur more than
# once here (sampling with replacement, or a row drawn for several roles).
roles_df = pd.concat(resampled_roles)
# Re-select the full rows by those labels so df lines up with roles_df.
# NOTE(review): roles_df and df are rebound here, so re-running the resampling
# cell afterwards would sample from already-resampled data.
df = df.loc[roles_df.index].copy()

In [9]:
# Per-role column totals after resampling, for comparison with the totals shown earlier.
roles_df.sum(axis=0)

Academic researcher                              2280
Data or business analyst                         1965
Data scientist or machine learning specialist    2576
Database administrator                           1765
DevOps specialist                                2170
Developer, QA or test                            1514
Developer, back-end                              5710
Developer, desktop or enterprise applications    2690
Developer, embedded applications or devices      1773
Developer, front-end                             2614
Developer, full-stack                            5602
Developer, game or graphics                      1441
Developer, mobile                                2155
Engineer, data                                   2046
Scientist                                        1910
System administrator                             2110
dtype: int64

In [10]:
from sklearn.model_selection import train_test_split

# Features are every column group except "DevType"; targets are the per-role
# columns under "DevType".
# level=0 names the column level the label lives on. Without it, pandas warns
# when dropping a label from non-lexsorted MultiIndex columns (the warning
# echoed under this cell points at this statement); the frame returned is the same.
# NOTE(review): minority roles were sampled with replace=True before this
# split, so copies of one row can end up in both the train and test partitions
# and make the held-out scores optimistic -- consider splitting before resampling.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop("DevType", axis=1, level=0),
    df["DevType"],
    random_state=0,
)

  X_train, X_test, Y_train, Y_test = train_test_split(df.drop("DevType", axis=1),df["DevType"],random_state=0)


### Train models

In [11]:
# Initialize the MLflow client and experiment
import mlflow
from mlflow.tracking import MlflowClient
# Point MLflow at the tracking location before the client is created.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient()
# Make the experiment active; the log output below shows it being created
# when it does not exist yet.
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)
# Keep the experiment record; its experiment_id is used when starting the run.
exp = client.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)


  return FileStore(store_uri, store_uri)
2025/12/12 01:18:33 INFO mlflow.tracking.fluent: Experiment with name 'skills_jobs_stackoverflow' does not exist. Creating a new experiment.


### Logistic Regression

In [12]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline, FeatureUnion

# Baseline: standardize the features, then fit one default LogisticRegression
# per target column through MultiOutputClassifier.
feature_scaler = StandardScaler()
per_role_classifier = MultiOutputClassifier(LogisticRegression())
clf = make_pipeline(feature_scaler, per_role_classifier)

# Fit on the underlying arrays (column labels are not passed to the estimator).
clf.fit(X_train.values, Y_train.values)


In [13]:
#Evaluate on training set
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score

# Predictions get the target column names so calc_quality can pair them by name.
predictions = pd.DataFrame(clf.predict(X_train.values), columns=Y_train.columns)

# One column of per-role scores for each metric, keyed by the metric's function name.
train_scores = pd.concat(
    {
        score.__name__: calc_quality(Y_train, predictions, score)
        for score in [accuracy_score, precision_score, recall_score, f1_score]
    },
    axis=1,
)

# Mean of each metric across roles, followed by the per-role table.
print(train_scores.mean())
train_scores

accuracy_score     89.066250
precision_score    65.106250
recall_score       30.365000
f1_score           39.291875
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,89.25,61.4,27.49,37.98
Data or business analyst,90.69,62.96,22.22,32.85
Data scientist or machine learning specialist,90.6,71.1,50.16,58.82
Database administrator,90.94,56.16,6.2,11.16
DevOps specialist,90.44,67.02,31.3,42.67
"Developer, QA or test",92.24,56.25,0.8,1.59
"Developer, back-end",74.33,61.8,35.35,44.98
"Developer, desktop or enterprise applications",86.88,62.98,18.61,28.73
"Developer, embedded applications or devices",92.17,66.29,34.34,45.24
"Developer, front-end",88.97,65.81,36.14,46.66


In [14]:
#Evaluate on test set
# Predictions get the target column names so calc_quality can pair them by name.
predictions = pd.DataFrame(clf.predict(X_test.values), columns=Y_test.columns)

# One column of per-role scores for each metric, keyed by the metric's function name.
test_scores = pd.concat(
    {
        score.__name__: calc_quality(Y_test, predictions, score)
        for score in [accuracy_score, precision_score, recall_score, f1_score]
    },
    axis=1,
)

# Mean of each metric across roles, followed by the per-role table.
print(test_scores.mean())
test_scores

accuracy_score     88.937500
precision_score    62.178125
recall_score       29.886875
f1_score           38.608125
dtype: float64


Unnamed: 0,accuracy_score,precision_score,recall_score,f1_score
Academic researcher,90.06,64.91,30.94,41.9
Data or business analyst,91.12,65.99,26.58,37.9
Data scientist or machine learning specialist,89.96,67.58,49.23,56.96
Database administrator,90.9,56.1,5.2,9.52
DevOps specialist,90.23,65.55,25.66,36.88
"Developer, QA or test",91.69,25.0,0.51,0.99
"Developer, back-end",74.4,62.64,35.72,45.5
"Developer, desktop or enterprise applications",87.52,61.58,18.2,28.09
"Developer, embedded applications or devices",92.62,64.62,32.93,43.63
"Developer, front-end",88.4,67.91,36.76,47.7


### Log run

### 1-Prepare

In [15]:
import os
import pickle

# Where the data description is staged before being uploaded with the run.
data_pkl_path = os.path.join(LOG_PATH, LOG_DATA_PKL)

# Everything needed to rebuild the split: source file, the row labels of each
# partition, and the feature / target column names.
data_details = {
    "data_path": DF_PATH,
    "training_indicies": X_train.index.tolist(),
    "test_indicies": X_test.index.tolist(),
    # Drop the first column level so only the remaining level's names are stored.
    "feature_names": X_train.columns.droplevel(0).tolist(),
    "target_names": Y_train.columns.tolist(),
}

with open(data_pkl_path, 'wb') as output_file:
    pickle.dump(data_details, output_file)

In [16]:
# Where the model bundle is staged before being uploaded with the run.
model_pkl_path = os.path.join(LOG_PATH, LOG_MODEL_PKL)

# Bundle the fitted pipeline with a description (later reused as the run name)
# and its string representation.
model = {
    "model_description": "Baseline model: Logistic Regression ",
    "model_details": str(clf),
    "model_object": clf,
}

with open(model_pkl_path, 'wb') as output_file:
    pickle.dump(model, output_file)

In [17]:
# Where the per-role score tables are staged before being uploaded with the run.
metrics_pkl_path = os.path.join(LOG_PATH, LOG_METRICS_PKL)

# Per-role score tables for both partitions.
classes_metrics = {
    "train_scores": train_scores,
    "test_scores": test_scores,
}

with open(metrics_pkl_path, "wb") as output_file:
    pickle.dump(classes_metrics, output_file)

### Log

In [18]:
#Start a new run and track
run_name = model['model_description']

with mlflow.start_run(experiment_id=exp.experiment_id, run_name=run_name):
    # Upload every file currently in the staging directory as run artifacts.
    mlflow.log_artifacts(LOG_PATH)

    # Log one value per metric: its mean over all roles on the test set.
    mean_test_scores = test_scores.mean()
    for metric, score in mean_test_scores.items():
        mlflow.log_metric(metric, score)