In [1]:
import pandas as pd
import numpy as np
import mlflow
import pickle
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from mlflow.tracking import MlflowClient
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
MLFLOW_TRACKING_URI = '../models/mlruns'
EXPERIMENT_NAME = 'skills_jobs_matching'
EXPORT_MODEL_PATH = '../models/rf_model.pkl'
DF_PATH = '../data/processed/1_sampled_df.pkl'
LOG_PATH = '../models/logs'
LOG_DATA_PKL    =  "xgb_data.pkl"
LOG_MODEL_PKL   =  "xgb_model.pkl"
LOG_METRICS_PKL =  "xgb_metrics.pkl"

In [2]:
df = pd.read_pickle(DF_PATH)
X = df.iloc[:,:25]
y = df.iloc[:,25:]

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.3, random_state=42)

In [4]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)


### Random Forest Pipelining and Training

From EDA we noticed colinearity between features, in this pipeline we handle via PCA. But we first need to scale the features. 

In [15]:
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

xgb_pipeline = Pipeline(steps=[
    ('scaler', RobustScaler()),
    ('pca', PCA(n_components=0.95)), 
    ('classifier', XGBClassifier(
        n_estimators=500,
        learning_rate=0.01,
        n_jobs=-1, 
        random_state=42
        
    ))
])
xgb_pipeline.fit(X_train, y_train)

In [16]:
from sklearn.metrics import classification_report
y_train_pred = xgb_pipeline.predict(X_train)
training_report = str(classification_report(y_train, y_train_pred, target_names=y_train.columns, zero_division=0)).split('\n')
training_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.90      0.50      0.64      1624',
 '                     Data or business analyst       0.94      0.34      0.50      1382',
 'Data scientist or machine learning specialist       0.89      0.71      0.79      1781',
 '                       Database administrator       0.96      0.13      0.23      1252',
 '                            DevOps specialist       0.90      0.36      0.52      1525',
 '                        Developer, QA or test       1.00      0.10      0.18      1059',
 '                          Developer, back-end       0.86      0.48      0.62      4029',
 'Developer, desktop or enterprise applications       0.94      0.25      0.39      1831',
 '  Developer, embedded applications or devices       0.90      0.46      0.61      1251',
 '                         Developer, front-end       0.88      0.44      0.58      1

In [17]:
from sklearn.metrics import classification_report
y_test_pred = xgb_pipeline.predict(X_test)
testing_report = str(classification_report(y_test, y_test_pred, target_names=y_train.columns, zero_division=0)).split('\n')
testing_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.79      0.34      0.47       672',
 '                     Data or business analyst       0.67      0.17      0.27       582',
 'Data scientist or machine learning specialist       0.75      0.56      0.64       740',
 '                       Database administrator       0.72      0.02      0.05       524',
 '                            DevOps specialist       0.77      0.22      0.34       637',
 '                        Developer, QA or test       1.00      0.03      0.07       445',
 '                          Developer, back-end       0.66      0.32      0.43      1719',
 'Developer, desktop or enterprise applications       0.72      0.12      0.20       829',
 '  Developer, embedded applications or devices       0.69      0.23      0.35       518',
 '                         Developer, front-end       0.68      0.28      0.39       

We observe a strong sign of overfitting. Let's fine tune our model.

### Fine Tunining

In [18]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint


In [21]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform, randint

param_dist = {
    'classifier__n_estimators': randint(200, 500),
    'classifier__learning_rate': uniform(0.01, 0.3),  # Range: [0.01, 0.31]
    'classifier__max_depth': randint(3, 10),
    'classifier__subsample': uniform(0.6, 0.4),       # Range: [0.6, 1.0]
    'classifier__reg_alpha': uniform(0.0, 1.0),       # L1 regularization
    'pca__n_components': [0.80, 0.90],
}

# Set up random search
random_search = RandomizedSearchCV(
    estimator=xgb_pipeline,
    param_distributions=param_dist,
    n_iter=10,                  # Try 30 random combinations
    scoring='precision',         # or 'f1', 'roc_auc' depending on task
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit the search
random_search.fit(X_train, y_train)


Fitting 3 folds for each of 10 candidates, totalling 30 fits




In [22]:
random_search.best_params_

{'classifier__learning_rate': np.float64(0.12236203565420874),
 'classifier__max_depth': 7,
 'classifier__n_estimators': 470,
 'classifier__reg_alpha': np.float64(0.7319939418114051),
 'classifier__subsample': np.float64(0.8394633936788146),
 'pca__n_components': 0.8}

In [27]:
xgb_pipeline = random_search.best_estimator_
xgb_pipeline

In [28]:
y_train_pred = xgb_pipeline.predict(X_train)
training_report = str(classification_report(y_train, y_train_pred, target_names=y_train.columns, zero_division=0)).split('\n')
training_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.99      0.99      0.99      1624',
 '                     Data or business analyst       1.00      0.99      0.99      1382',
 'Data scientist or machine learning specialist       1.00      0.99      1.00      1781',
 '                       Database administrator       1.00      0.99      1.00      1252',
 '                            DevOps specialist       1.00      0.99      1.00      1525',
 '                        Developer, QA or test       1.00      0.99      0.99      1059',
 '                          Developer, back-end       1.00      0.99      1.00      4029',
 'Developer, desktop or enterprise applications       1.00      0.99      0.99      1831',
 '  Developer, embedded applications or devices       0.99      0.99      0.99      1251',
 '                         Developer, front-end       1.00      0.99      1.00      1

In [30]:
y_test_pred = xgb_pipeline.predict(X_test)
testing_report = str(classification_report(y_test, y_test_pred, target_names=y_train.columns, zero_division=0)).split('\n')
testing_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.82      0.56      0.66       672',
 '                     Data or business analyst       0.78      0.49      0.61       582',
 'Data scientist or machine learning specialist       0.80      0.68      0.73       740',
 '                       Database administrator       0.87      0.46      0.60       524',
 '                            DevOps specialist       0.82      0.38      0.52       637',
 '                        Developer, QA or test       0.96      0.58      0.72       445',
 '                          Developer, back-end       0.66      0.51      0.57      1719',
 'Developer, desktop or enterprise applications       0.74      0.36      0.48       829',
 '  Developer, embedded applications or devices       0.78      0.48      0.60       518',
 '                         Developer, front-end       0.68      0.42      0.52       

#### Export Model

In [31]:
with open(EXPORT_MODEL_PATH, 'wb') as handle:
    pickle.dump(xgb_pipeline, handle)


### Track with Mlflow


In [32]:
# Initialize client and experiment
client = MlflowClient()
mlflow.set_experiment(EXPERIMENT_NAME)
exp = client.get_experiment_by_name(EXPERIMENT_NAME)


In [33]:
import os
data_details = {
    'data_path': DF_PATH,
    'training_indices': X_train.index.tolist(),
    'teseting_indices': X_test.index.tolist(),
    'features_names': X.columns.tolist(),
    'target_names': y.columns.tolist(),
}
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), 'wb') as lg_file:
    pickle.dump(data_details, lg_file)

In [34]:
model = {
    'description': "Xgboost",
    'model_object': xgb_pipeline,
    'model_detailes': str(xgb_pipeline),
}
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), 'wb') as lg_file:
    pickle.dump(model, lg_file)

In [35]:
metrics_details = {
    'training_details': training_report,
    'testing_details': testing_report
}
with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), 'wb') as lg_file:
    pickle.dump(metrics_details, lg_file)

In [36]:
# Start a new run and track 
with mlflow.start_run(experiment_id=exp.experiment_id):
    mlflow.log_artifact(EXPORT_MODEL_PATH)   
    mlflow.log_metric("precision",precision_score(y_test, y_test_pred, average='macro', zero_division=0) ) 


In [37]:
runs = mlflow.search_runs([exp.experiment_id])
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.precision,tags.mlflow.source.type,tags.mlflow.user,tags.mlflow.runName,tags.mlflow.source.name
0,6ddaf4004caf4b8281f56d2021842706,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 21:27:23.831000+00:00,2025-04-04 21:27:23.885000+00:00,0.806944,LOCAL,Abdelhakiem,skillful-yak-438,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...
1,d762556a043e4f3bb98e11eb849289bb,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 20:39:13.618000+00:00,2025-04-04 20:39:20.763000+00:00,0.888575,LOCAL,Abdelhakiem,mysterious-snake-200,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...
2,156da497b9d1457c960ed997f2cd6da8,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 18:49:38.308000+00:00,2025-04-04 18:49:38.344000+00:00,0.552371,LOCAL,Abdelhakiem,efficient-gnu-96,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...
3,26b1d56d10d5443a9305dad31070c567,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 18:43:57.322000+00:00,2025-04-04 18:43:57.483000+00:00,,LOCAL,Abdelhakiem,persistent-colt-336,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...
