In [1]:
import pandas as pd
import numpy as np
import mlflow
import pickle
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from mlflow.tracking import MlflowClient
from sklearn.metrics import precision_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler
# --- Experiment tracking ---
MLFLOW_TRACKING_URI = "../models/mlruns"
EXPERIMENT_NAME = "skills_jobs_matching"

# --- Input data and exported model ---
DF_PATH = "../data/processed/1_sampled_df.pkl"
EXPORT_MODEL_PATH = "../models/rf_model.pkl"

# --- Local staging directory and the pickle files written into it ---
LOG_PATH = "../models/temp"
LOG_DATA_PKL = "rf_data.pkl"
LOG_MODEL_PKL = "rf_model.pkl"
LOG_METRICS_PKL = "rf_metrics.pkl"

In [2]:
import os

# Make sure the local staging directory exists before any file is written into it;
# exist_ok=True makes the cell safe to re-run.
os.makedirs(name=LOG_PATH, exist_ok=True)

In [3]:
# Load the pickled dataframe, then split it by column position:
# the first 24 columns become the feature matrix X, the remaining ones the targets y.
df = pd.read_pickle(DF_PATH)
X, y = df.iloc[:, :24], df.iloc[:, 24:]

In [4]:
# Preview the first five feature rows.
X.head(n=5)

Unnamed: 0,skills_group_0,skills_group_1,skills_group_10,skills_group_11,skills_group_12,skills_group_13,skills_group_14,skills_group_15,skills_group_16,skills_group_17,...,skills_group_21,skills_group_22,skills_group_23,skills_group_3,skills_group_4,skills_group_5,skills_group_6,skills_group_7,skills_group_8,skills_group_9
25185,0,0,0,0,0,0,0,0,1,1,...,0,0,0,1,0,1,0,0,1,0
67789,3,0,0,0,0,0,0,0,3,2,...,0,0,0,4,0,0,1,0,0,0
68754,1,0,0,0,0,0,0,1,0,0,...,0,0,0,3,0,1,0,0,1,0
4402,1,0,0,0,0,0,0,0,2,1,...,0,1,0,6,1,0,1,0,1,0
19499,0,1,3,1,0,4,1,0,4,4,...,1,1,0,1,0,2,0,1,0,0


In [5]:
# Preview the first five target rows (one indicator column per job role).
y.head(n=5)

Unnamed: 0,Academic researcher,Data or business analyst,Data scientist or machine learning specialist,Database administrator,DevOps specialist,"Developer, QA or test","Developer, back-end","Developer, desktop or enterprise applications","Developer, embedded applications or devices","Developer, front-end","Developer, full-stack","Developer, game or graphics","Developer, mobile","Engineer, data",Scientist,System administrator
25185,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
67789,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
68754,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4402,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0
19499,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Point MLflow at the tracking store configured at the top of the notebook.
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)


### Random Forest Pipelining and Training

During EDA we noticed collinearity between features; in this pipeline we handle it with PCA. Because PCA is sensitive to feature scale, we first need to scale the features.

In [8]:
from sklearn.decomposition import PCA

# Baseline pipeline: scale the features, project them with PCA, then fit a random forest.
rf_pipeline = Pipeline(
    [
        ("scaler", RobustScaler()),
        # retain 95% variance
        ("pca", PCA(n_components=0.95)),
        ("classifier", RandomForestClassifier(n_jobs=-1, random_state=42)),
    ]
)
rf_pipeline.fit(X_train, y_train)

In [9]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the training split, kept as a list of text lines.
y_train_pred = rf_pipeline.predict(X_train)
training_report = classification_report(
    y_train,
    y_train_pred,
    target_names=y_train.columns,
    zero_division=0,
).split("\n")
training_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.99      0.99      0.99      1624',
 '                     Data or business analyst       1.00      0.99      0.99      1382',
 'Data scientist or machine learning specialist       1.00      0.99      1.00      1781',
 '                       Database administrator       1.00      0.99      0.99      1252',
 '                            DevOps specialist       1.00      0.99      1.00      1525',
 '                        Developer, QA or test       1.00      0.99      0.99      1059',
 '                          Developer, back-end       1.00      0.99      1.00      4029',
 'Developer, desktop or enterprise applications       0.99      0.99      0.99      1831',
 '  Developer, embedded applications or devices       0.99      0.98      0.99      1251',
 '                         Developer, front-end       1.00      0.99      0.99      1

In [10]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the held-out split, kept as a list of text lines.
# The class labels are taken from y_train's columns, as in the training report.
y_test_pred = rf_pipeline.predict(X_test)
testing_report = classification_report(
    y_test,
    y_test_pred,
    target_names=y_train.columns,
    zero_division=0,
).split("\n")
testing_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.92      0.54      0.68       672',
 '                     Data or business analyst       0.93      0.45      0.61       582',
 'Data scientist or machine learning specialist       0.85      0.60      0.70       740',
 '                       Database administrator       0.97      0.44      0.60       524',
 '                            DevOps specialist       0.99      0.32      0.48       637',
 '                        Developer, QA or test       0.97      0.58      0.72       445',
 '                          Developer, back-end       0.77      0.41      0.53      1719',
 'Developer, desktop or enterprise applications       0.88      0.31      0.46       829',
 '  Developer, embedded applications or devices       0.86      0.41      0.55       518',
 '                         Developer, front-end       0.83      0.30      0.44       

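We observe a strong sign of overfitting: for the classes shown, F1 is about 0.99–1.00 on the training set but only about 0.44–0.72 on the test set, with recall dropping the most. Let's fine-tune our model.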

### Fine-Tuning

In [9]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search over forest size, tree depth and the PCA variance threshold.
# 2 x 3 x 3 = 18 candidates, each cross-validated (5 folds by default in current
# scikit-learn) and followed by one final refit of the best candidate.
param_grid = {
    'classifier__n_estimators': [200, 500],
    'classifier__max_depth': [5, 10, None],
    'pca__n_components': [0.7, 0.85, 0.95],
}

# The forest inside `rf_pipeline` is already built with n_jobs=-1, so the search
# itself is left sequential: requesting n_jobs=-1 at both levels makes every
# search worker start its own full set of forest workers and oversubscribes the CPU.
# No `scoring` is passed, so candidates are ranked by the pipeline's own `score`
# method (for multi-label targets this is subset accuracy).
grid_search = GridSearchCV(
    rf_pipeline,
    param_grid=param_grid,
    verbose=1,  # print progress; this cell is long-running
)
grid_search.fit(X_train, y_train)


KeyboardInterrupt: 

In [11]:
from sklearn.decomposition import PCA

# Manually configured variant: PCA keeps 70% of the variance and the forest uses 500 trees.
rf_pipeline = Pipeline(
    [
        ("scaler", RobustScaler()),
        ("pca", PCA(n_components=0.7)),
        (
            "classifier",
            RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=42),
        ),
    ]
)
rf_pipeline.fit(X_train, y_train)

In [12]:
# Show the hyper-parameter combination selected by the grid search.
# NOTE(review): requires `grid_search` to have been fitted in the current kernel
# session; running this cell without it raises NameError.
grid_search.best_params_

NameError: name 'grid_search' is not defined

In [None]:
# Make the grid search's best estimator the working pipeline and display it.
# NOTE(review): this replaces the manually configured pipeline fitted earlier and
# requires `grid_search` to have been fitted in the current kernel session.
rf_pipeline = grid_search.best_estimator_
rf_pipeline

In [13]:
# Re-evaluate the current pipeline on the training split (list of report lines).
y_train_pred = rf_pipeline.predict(X_train)
training_report = classification_report(
    y_train,
    y_train_pred,
    target_names=y_train.columns,
    zero_division=0,
).split("\n")
training_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.99      0.99      0.99      1624',
 '                     Data or business analyst       1.00      0.99      0.99      1382',
 'Data scientist or machine learning specialist       1.00      0.99      1.00      1781',
 '                       Database administrator       1.00      0.99      1.00      1252',
 '                            DevOps specialist       1.00      0.99      1.00      1525',
 '                        Developer, QA or test       1.00      0.99      0.99      1059',
 '                          Developer, back-end       1.00      0.99      1.00      4029',
 'Developer, desktop or enterprise applications       1.00      0.99      0.99      1831',
 '  Developer, embedded applications or devices       0.99      0.98      0.99      1251',
 '                         Developer, front-end       1.00      0.99      0.99      1

In [14]:
# Re-evaluate the current pipeline on the held-out split (list of report lines).
# The class labels are taken from y_train's columns, as in the training report.
y_test_pred = rf_pipeline.predict(X_test)
testing_report = classification_report(
    y_test,
    y_test_pred,
    target_names=y_train.columns,
    zero_division=0,
).split("\n")
testing_report

['                                               precision    recall  f1-score   support',
 '',
 '                          Academic researcher       0.88      0.53      0.67       672',
 '                     Data or business analyst       0.89      0.45      0.60       582',
 'Data scientist or machine learning specialist       0.84      0.64      0.73       740',
 '                       Database administrator       0.97      0.44      0.60       524',
 '                            DevOps specialist       0.94      0.32      0.48       637',
 '                        Developer, QA or test       0.97      0.58      0.72       445',
 '                          Developer, back-end       0.73      0.44      0.55      1719',
 'Developer, desktop or enterprise applications       0.86      0.32      0.47       829',
 '  Developer, embedded applications or devices       0.85      0.40      0.55       518',
 '                         Developer, front-end       0.79      0.33      0.46       

#### Export Model

In [15]:
# Serialize the fitted pipeline to the export path with pickle.
with open(EXPORT_MODEL_PATH, mode="wb") as model_file:
    pickle.dump(rf_pipeline, model_file)


### Track with MLflow


In [16]:
# Initialize client and experiment
# Create a tracking client, make the experiment the active one,
# then fetch its record so later cells can use its id.
client = MlflowClient()
mlflow.set_experiment(experiment_name=EXPERIMENT_NAME)
exp = client.get_experiment_by_name(name=EXPERIMENT_NAME)


In [17]:
import os

# Provenance of the data behind this run: source file, the row labels that went
# into each split, and the feature/target column names.
# NOTE(review): the "teseting_indices" key is misspelled; it is kept unchanged
# because it is part of the pickled payload. Confirm no reader relies on it before renaming.
data_details = {
    "data_path": DF_PATH,
    "training_indices": X_train.index.tolist(),
    "teseting_indices": X_test.index.tolist(),
    "features_names": X.columns.tolist(),
    "target_names": y.columns.tolist(),
}

# Stage the dictionary as a pickle inside the local log directory.
with open(os.path.join(LOG_PATH, LOG_DATA_PKL), mode="wb") as lg_file:
    pickle.dump(data_details, lg_file)

In [18]:
# Bundle the fitted pipeline with a short description and its string representation.
# NOTE(review): the "model_detailes" key is misspelled; it is kept unchanged
# because it is part of the pickled payload. Confirm no reader relies on it before renaming.
model = {
    "description": "Random Forest Classifier with PCA",
    "model_object": rf_pipeline,
    "model_detailes": str(rf_pipeline),
}

# Stage the bundle as a pickle inside the local log directory.
with open(os.path.join(LOG_PATH, LOG_MODEL_PKL), mode="wb") as lg_file:
    pickle.dump(model, lg_file)

In [19]:
# Collect the most recent training and testing classification reports (lists of text lines).
metrics_details = {
    "training_details": training_report,
    "testing_details": testing_report,
}

# Stage the reports as a pickle inside the local log directory.
with open(os.path.join(LOG_PATH, LOG_METRICS_PKL), mode="wb") as lg_file:
    pickle.dump(metrics_details, lg_file)

In [20]:
# Open a new run in the experiment, attach the staged log directory as an artifact,
# and record the macro-averaged precision on the test split.
with mlflow.start_run(experiment_id=exp.experiment_id):
    mlflow.log_artifact(LOG_PATH)
    macro_precision = precision_score(
        y_test,
        y_test_pred,
        average="macro",
        zero_division=0,
    )
    mlflow.log_metric("precision", macro_precision)


In [21]:
# List every run recorded under this experiment as a DataFrame.
runs = mlflow.search_runs(experiment_ids=[exp.experiment_id])
runs

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.precision,tags.mlflow.source.type,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.runName
0,812636a81de04341b6d3ff99dd1037a6,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-07 21:40:04.696000+00:00,2025-04-07 21:40:15.006000+00:00,0.881547,LOCAL,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,Abdelhakiem,resilient-lynx-809
1,a7718bdc0c484cadaf4bddabcf284ba9,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-07 21:22:06.248000+00:00,2025-04-07 21:22:14.094000+00:00,0.903895,LOCAL,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,Abdelhakiem,bedecked-goat-587
2,790a3bc5454540509f833ecea3c4f26a,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-07 19:59:33.605000+00:00,2025-04-07 19:59:41.937000+00:00,0.888575,LOCAL,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,Abdelhakiem,marvelous-ant-662
3,6ddaf4004caf4b8281f56d2021842706,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 21:27:23.831000+00:00,2025-04-04 21:27:23.885000+00:00,0.806944,LOCAL,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,Abdelhakiem,skillful-yak-438
4,d762556a043e4f3bb98e11eb849289bb,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 20:39:13.618000+00:00,2025-04-04 20:39:20.763000+00:00,0.888575,LOCAL,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,Abdelhakiem,mysterious-snake-200
5,156da497b9d1457c960ed997f2cd6da8,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 18:49:38.308000+00:00,2025-04-04 18:49:38.344000+00:00,0.552371,LOCAL,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,Abdelhakiem,efficient-gnu-96
6,26b1d56d10d5443a9305dad31070c567,344207504652916142,FINISHED,file:c:/Users/Abdelhakiem/Documents/CodingLand...,2025-04-04 18:43:57.322000+00:00,2025-04-04 18:43:57.483000+00:00,,LOCAL,C:\Users\Abdelhakiem\AppData\Roaming\Python\Py...,Abdelhakiem,persistent-colt-336
