In [3]:
import os
import pickle

import matplotlib.pyplot as plt
import mlflow
import numpy as np
import pandas as pd
import plotly.express as px
from mlflow.tracking import MlflowClient
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Local directory used both as the MLflow tracking URI and as the folder created for it.
MLFLOW_TRACKING_URI = '../models/mlruns'
# Name of the MLflow experiment the baseline run is logged under.
EXPERIMENT_NAME = 'skills_jobs_matching'
# Destination of the pickled baseline pipeline (also logged to MLflow as an artifact).
EXPORT_MODEL_PATH = '../models/baseline_model.pkl'
# Input: pickled cleaned DataFrame read at the start of the notebook.
CLEANED_DF_PATH = '../data/processed/1_cleaned_df.pkl'
# Output: pickled DataFrame produced by the per-role resampling step.
SAMPLED_DF_PATH = '../data/processed/1_sampled_df.pkl'


In [4]:
# Make sure the local MLflow tracking directory exists before anything logs to it.
# exist_ok=True keeps this cell safe to re-run. (`os` is imported in the import cell.)
os.makedirs(MLFLOW_TRACKING_URI, exist_ok=True)



In [5]:
# Load the cleaned dataset. The cells below treat its first 24 columns as skill
# features and the remaining columns as job-role labels.
# NOTE: read_pickle can execute arbitrary code while unpickling; only load files from a trusted source.
df = pd.read_pickle(CLEANED_DF_PATH)

In [6]:
# Positional column layout of the cleaned frame: the first 24 columns are the
# skill-group features, every column after them is a job-role label.
skills_columns = df.columns[:24].tolist()
jobs_columns = df.columns[24:].tolist()
display(skills_columns, jobs_columns)

['skills_group_0',
 'skills_group_1',
 'skills_group_10',
 'skills_group_11',
 'skills_group_12',
 'skills_group_13',
 'skills_group_14',
 'skills_group_15',
 'skills_group_16',
 'skills_group_17',
 'skills_group_18',
 'skills_group_19',
 'skills_group_2',
 'skills_group_20',
 'skills_group_21',
 'skills_group_22',
 'skills_group_23',
 'skills_group_3',
 'skills_group_4',
 'skills_group_5',
 'skills_group_6',
 'skills_group_7',
 'skills_group_8',
 'skills_group_9']

['Academic researcher',
 'Data or business analyst',
 'Data scientist or machine learning specialist',
 'Database administrator',
 'DevOps specialist',
 'Developer, QA or test',
 'Developer, back-end',
 'Developer, desktop or enterprise applications',
 'Developer, embedded applications or devices',
 'Developer, front-end',
 'Developer, full-stack',
 'Developer, game or graphics',
 'Developer, mobile',
 'Engineer, data',
 'Scientist',
 'System administrator']

In [7]:
# Feature matrix (skill groups) and multi-label target (job roles).
X, y = df[skills_columns], df[jobs_columns]

### Handling Imbalance

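The job-role labels are heavily imbalanced (see the counts below), so each role is resampled to a fixed number of rows.

Reference: https://www.kaggle.com/code/rafjaa/resampling-strategies-for-imbalanced-datasets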

In [8]:
# Number of positive rows per job role, most frequent first, to expose the imbalance.
role_counts = y.sum()
role_counts.sort_values(ascending=False)

Developer, full-stack                            20655
Developer, back-end                              17084
Developer, front-end                              8932
Developer, desktop or enterprise applications     4845
Developer, mobile                                 4751
DevOps specialist                                 3056
Data scientist or machine learning specialist     2460
Developer, embedded applications or devices       2138
System administrator                              2069
Engineer, data                                    1941
Academic researcher                               1708
Data or business analyst                          1658
Database administrator                            1210
Developer, QA or test                             1135
Scientist                                         1046
Developer, game or graphics                        899
dtype: int64

In [9]:
# Draw exactly `samples_per_class` rows for every job role.
# Roles with fewer positive rows are sampled with replacement (upsampling);
# roles with enough rows are sampled without replacement (downsampling).
# NOTE(review): rows are duplicated here before the train/test split further
# down, so identical rows can land in both splits — confirm this is acceptable.
samples_per_class = 1200
resampled_roles = []
for role in y.columns:
    role_mask = df[role] == 1
    # Replacement is only needed when the role cannot supply enough distinct rows.
    needs_replacement = int(role_mask.sum()) < samples_per_class
    role_df = df[role_mask].sample(
        samples_per_class,
        replace=needs_replacement,
        random_state=42,
    )
    resampled_roles.append(role_df)


In [10]:
# Stack the per-role samples row-wise, then recount the labels. Counts exceed
# 1200 because a sampled row can carry several job-role labels at once.
resampled_df = pd.concat(resampled_roles)
resampled_df[y.columns].sum().sort_values(ascending=False)

Developer, back-end                              5748
Developer, full-stack                            5599
Developer, desktop or enterprise applications    2660
Developer, front-end                             2590
Data scientist or machine learning specialist    2521
Academic researcher                              2296
DevOps specialist                                2162
Developer, mobile                                2134
System administrator                             2113
Engineer, data                                   2012
Data or business analyst                         1964
Scientist                                        1911
Database administrator                           1776
Developer, embedded applications or devices      1769
Developer, QA or test                            1504
Developer, game or graphics                      1443
dtype: int64

In [11]:
# Persist the resampled frame as a pickle file at SAMPLED_DF_PATH.
resampled_df.to_pickle(SAMPLED_DF_PATH)

---
# Training Baseline Model: Logistic Regression

#### 1. Splitting data

In [None]:
# Same feature/target column split as X and y, applied to the resampled frame.
X_resampled = resampled_df[X.columns]
y_resampled = resampled_df[y.columns]

In [32]:

# Hold out 30% of the resampled rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.3,
    random_state=42,
)


#### 2. Initializing MLflow

In [None]:
# Point MLflow at the local tracking directory defined in the config cell.
# This is set before the experiment and run calls further down.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


In [56]:

# Baseline: standardize the skill features, then fit one independent
# LogisticRegression per job-role column via MultiOutputClassifier.
baseline_steps = [
    ('scaler', StandardScaler()),
    ('classifier', MultiOutputClassifier(LogisticRegression())),
]
lg_model = Pipeline(steps=baseline_steps)
lg_model.fit(X_train, y_train)


In [73]:
# Per-role precision / recall / f1 on the training split, kept as a list of
# report lines so it can be pickled alongside the test report later.
from sklearn.metrics import classification_report

y_train_pred = lg_model.predict(X_train)
training_report_text = classification_report(
    y_train,
    y_train_pred,
    target_names=y_train.columns,
    zero_division=0,
)
training_report = str(training_report_text).split('\n')
training_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.55      0.15      0.24      1624',
 '                     Data or business analyst       0.53      0.10      0.16      1382',
 'Data scientist or machine learning specialist       0.68      0.45      0.54      1781',
 '                       Database administrator       0.44      0.01      0.02      1252',
 '                            DevOps specialist       0.61      0.19      0.29      1525',
 '                        Developer, QA or test       0.00      0.00      0.00      1059',
 '                          Developer, back-end       0.57      0.26      0.36      4029',
 'Developer, desktop or enterprise applications       0.55      0.04      0.07      1831',
 '  Developer, embedded applications or devices       0.59      0.20      0.30      1251',
 '                         Developer, front-end       0.65      0.28      0.39      1

In [74]:
# Per-role precision / recall / f1 on the held-out split, stored as a list of
# report lines (same shape as `training_report`).
# The report is built from `y_test_pred`, the predictions computed on the line
# above. The name `y_pred` is not defined anywhere in this notebook, so using it
# fails on a fresh kernel or scores stale predictions left over in memory.
y_test_pred = lg_model.predict(X_test)
test_report = str(classification_report(y_test, y_test_pred, target_names=y_train.columns, zero_division=0)).split('\n')
test_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.55      0.14      0.22       672',
 '                     Data or business analyst       0.45      0.09      0.14       582',
 'Data scientist or machine learning specialist       0.65      0.46      0.54       740',
 '                       Database administrator       0.61      0.02      0.04       524',
 '                            DevOps specialist       0.66      0.18      0.28       637',
 '                        Developer, QA or test       0.00      0.00      0.00       445',
 '                          Developer, back-end       0.54      0.24      0.33      1719',
 'Developer, desktop or enterprise applications       0.50      0.03      0.06       829',
 '  Developer, embedded applications or devices       0.62      0.20      0.31       518',
 '                         Developer, front-end       0.59      0.27      0.37       

##### Export model

In [None]:
# Serialize the fitted pipeline to EXPORT_MODEL_PATH; the same file is logged
# to MLflow as a run artifact further down.
with open(EXPORT_MODEL_PATH, 'wb') as handle:
    pickle.dump(lg_model, handle)


### Track with MLflow


In [64]:
# Initialize client and experiment
client = MlflowClient()
mlflow.set_experiment(EXPERIMENT_NAME)
exp = client.get_experiment_by_name(EXPERIMENT_NAME)


2025/04/04 20:31:23 INFO mlflow.tracking.fluent: Experiment with name 'skills_jobs_matching' does not exist. Creating a new experiment.


In [66]:
# Directory that receives the pickled data / model / metrics logs written below.
LOG_PATH = '../models/logs'

In [67]:
# Create the LOG_PATH directory; exist_ok=True keeps re-runs from failing when it already exists.
os.makedirs(LOG_PATH, exist_ok=True)

In [68]:
# File names (joined onto LOG_PATH) for the three pickled log records.
LOG_DATA_PKL = 'data.pkl'        # data source, split indices, column names
LOG_MODEL_PKL = 'model.pkl'      # fitted pipeline plus its description
LOG_METRICS_PKL = 'metrics.pkl'  # training / testing classification reports

In [70]:
# Record the data behind this model: the source file, the index labels that
# ended up in each split, and the feature / target column names.
# NOTE(review): the 'teseting_indices' key looks like a typo of "testing"; it is
# kept unchanged because readers of this pickle may look it up by that exact
# name — confirm before renaming.
data_log_file = os.path.join(LOG_PATH, LOG_DATA_PKL)
data_details = {
    'data_path': CLEANED_DF_PATH,
    'training_indices': X_train.index.to_list(),
    'teseting_indices': X_test.index.to_list(),
    'features_names': X.columns.to_list(),
    'target_names': y.columns.to_list(),
}
with open(data_log_file, 'wb') as data_log:
    pickle.dump(data_details, data_log)

In [72]:
# Bundle the fitted pipeline with a short description and its printed form,
# then pickle the bundle into the log directory.
model_log_file = os.path.join(LOG_PATH, LOG_MODEL_PKL)
model = {
    'description': "Baseline model , Regression Model",
    'model_object': lg_model,
    'model_detailes': str(lg_model),
}
with open(model_log_file, 'wb') as model_log:
    pickle.dump(model, model_log)

In [75]:
# Store both classification reports (lists of text lines) in one pickle.
metrics_log_file = os.path.join(LOG_PATH, LOG_METRICS_PKL)
metrics_details = {
    'training_details': training_report,
    'testing_details': test_report,
}
with open(metrics_log_file, 'wb') as metrics_log:
    pickle.dump(metrics_details, metrics_log)

In [83]:
# Open a new run in the experiment, attach the exported model file, and log
# the macro-averaged precision on the held-out split.
with mlflow.start_run(experiment_id=exp.experiment_id):
    mlflow.log_artifact(EXPORT_MODEL_PATH)
    macro_precision = precision_score(
        y_test,
        y_test_pred,
        average='macro',
        zero_division=0,
    )
    mlflow.log_metric("precision", macro_precision)


In [84]:
# Fetch every run of this experiment as a DataFrame (one row per run).
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])


In [85]:
# Display the runs table: one row per run, with its status, timing, logged metrics and tags.
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.precision,tags.mlflow.runName,tags.mlflow.source.name,tags.mlflow.source.type,tags.mlflow.user
0,156da497b9d1457c960ed997f2cd6da8,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 18:49:38.308000+00:00,2025-04-04 18:49:38.344000+00:00,0.552371,efficient-gnu-96,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,LOCAL,Abdelhakiem
1,26b1d56d10d5443a9305dad31070c567,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 18:43:57.322000+00:00,2025-04-04 18:43:57.483000+00:00,,persistent-colt-336,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,LOCAL,Abdelhakiem


In [None]:
# Pick the run with the HIGHEST logged precision.
# Sorting must be descending: the default ascending order puts the lowest
# precision first, so `.iloc[0]` would select the worst run. Runs without a
# precision metric (NaN) stay at the bottom because sort_values places NaN last
# by default.
best_run = runs.sort_values('metrics.precision', ascending=False).iloc[0]
best_run

In [None]:
# # Load model
# artifact_path = best_run["artifact_uri"].replace("file://", "")
# model_pkl = os.path.join(artifact_path, MODEL_PATH)
# with open(model_pkl, "rb") as f:
#     model = pickle.load(f)
    
# model
