In [1]:
import json
import os
import pickle

import mlflow
import pandas as pd
from mlflow.tracking import MlflowClient


In [2]:
# MLflow tracking store and the run whose artifacts this notebook loads.
MLFLOW_TRACKING_URI = '../models/mlruns'
MLFLOW_EXPERIMENT_NAME = 'skills_jobs_matching'
MLFLOW_RUN_ID = '812636a81de04341b6d3ff99dd1037a6'

# Log directory and the file names of the pickled artifacts.
LOG_PATH = '../models/logs'
LOG_DATA_PKL = 'rf_data.pkl'
LOG_MODEL_PKL = 'rf_model.pkl'
LOG_METRICS_PKL = 'rf_metrics.pkl'

# Loading Model and Data

MLflow Setup


In [3]:
# Point MLflow at the tracking store configured in MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
# NOTE(review): `client` is not used by any cell visible in this notebook — confirm before removing.
client = MlflowClient()
# Look up the run by ID and keep the URI of its artifact directory.
run = mlflow.get_run(MLFLOW_RUN_ID)
artifact_path = run.info.artifact_uri

In [4]:
artifact_path

'file:c:/Users/Abdelhakiem/Documents/CodingLandscape/side_projects/job-skill-matcher/notebooks/../models/mlruns/344207504652916142/812636a81de04341b6d3ff99dd1037a6/artifacts'

In [5]:
# Build the artifact location from the notebook configuration and the run
# metadata fetched above instead of repeating the experiment/run IDs by hand,
# so changing MLFLOW_RUN_ID also changes which model file is loaded.
model_path = os.path.join(
    MLFLOW_TRACKING_URI, run.info.experiment_id, MLFLOW_RUN_ID,
    'artifacts', 'temp', LOG_MODEL_PKL,
)
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# artifacts that come from a trusted training run.
with open(model_path, "rb") as f:
    model = pickle.load(f)
model

{'description': 'Random Forest Classifier with PCA',
 'model_object': Pipeline(steps=[('scaler', RobustScaler()), ('pca', PCA(n_components=0.7)),
                 ('classifier',
                  RandomForestClassifier(n_estimators=500, n_jobs=-1,
                                         random_state=42))]),
 'model_detailes': "Pipeline(steps=[('scaler', RobustScaler()), ('pca', PCA(n_components=0.7)),\n                ('classifier',\n                 RandomForestClassifier(n_estimators=500, n_jobs=-1,\n                                        random_state=42))])"}

In [6]:
# Build the artifact location from the notebook configuration and the run
# metadata fetched above instead of repeating the experiment/run IDs by hand,
# so changing MLFLOW_RUN_ID also changes which data description is loaded.
data_path = os.path.join(
    MLFLOW_TRACKING_URI, run.info.experiment_id, MLFLOW_RUN_ID,
    'artifacts', 'temp', LOG_DATA_PKL,
)
# SECURITY: pickle.load can execute arbitrary code while unpickling; only load
# artifacts that come from a trusted training run.
with open(data_path, "rb") as f:
    data = pickle.load(f)
data.keys()

dict_keys(['data_path', 'training_indices', 'teseting_indices', 'features_names', 'target_names'])

In [7]:
# Pull the pipeline object out of the loaded model dict and wrap the stored
# feature / target name lists as Series for the prediction helpers below.
classifier = model['model_object']
features_names = pd.Series(data['features_names'])
targets_names = pd.Series(data['target_names'])

In [8]:
# Display the feature (skills-cluster) names built in the previous cell rather
# than constructing an identical Series a second time.
features_names

0      skills_group_0
1      skills_group_1
2     skills_group_10
3     skills_group_11
4     skills_group_12
5     skills_group_13
6     skills_group_14
7     skills_group_15
8     skills_group_16
9     skills_group_17
10    skills_group_18
11    skills_group_19
12     skills_group_2
13    skills_group_20
14    skills_group_21
15    skills_group_22
16    skills_group_23
17     skills_group_3
18     skills_group_4
19     skills_group_5
20     skills_group_6
21     skills_group_7
22     skills_group_8
23     skills_group_9
dtype: object

In [9]:
# Show the target labels; make_predictions uses this Series as the index of its result.
targets_names

0                               Academic researcher
1                          Data or business analyst
2     Data scientist or machine learning specialist
3                            Database administrator
4                                 DevOps specialist
5                             Developer, QA or test
6                               Developer, back-end
7     Developer, desktop or enterprise applications
8       Developer, embedded applications or devices
9                              Developer, front-end
10                            Developer, full-stack
11                      Developer, game or graphics
12                                Developer, mobile
13                                   Engineer, data
14                                        Scientist
15                             System administrator
dtype: object

---
## Load Skills Clusters

In [10]:
# Load the cluster -> skills mapping. The encoding is passed explicitly because
# open() otherwise falls back to the platform locale encoding, which is not
# necessarily UTF-8.
SKILLS_CLUSTERS_PATH = '../data/processed/skills_clusters.json'
with open(SKILLS_CLUSTERS_PATH, 'r', encoding='utf-8') as f:
    # Wrap the parsed JSON object directly so `skills_clusters` is only ever
    # bound to a Series (never first a dict, then a Series).
    skills_clusters = pd.Series(json.load(f))

In [11]:
# Flatten the cluster -> skills mapping into one (cluster_id, skill) row per skill.
skills_clusters_df = pd.DataFrame(
    [
        (cluster_id, skill_name)
        for cluster_id, cluster_skills in skills_clusters.items()
        for skill_name in cluster_skills
    ],
    columns=['cluster_id', 'skill'],
)
skills_clusters_df

Unnamed: 0,cluster_id,skill
0,skills_group_0,Qt
1,skills_group_0,APL
2,skills_group_0,COBOL
3,skills_group_0,Clojure
4,skills_group_0,Crystal
...,...,...
120,skills_group_9,Django
121,skills_group_9,Oracle Cloud Infrastructure
122,skills_group_9,Oracle
123,skills_group_9,SQLite


---
### Predict Sample Entry

In [12]:

def get_skills_clusters(sample_skills, skills_clusters_df, features_names):
    """
    Count, per cluster, how many known skills appear in ``sample_skills``.

    Args:
        sample_skills: List-like (e.g. list or Series) of skill names.
        skills_clusters_df: DataFrame with a 'skill' column and a
            'cluster_id' column, one row per (cluster, skill) pair.
        features_names: Cluster IDs that define the order and length of the
            returned counts.

    Returns:
        List of counts aligned with ``features_names``; clusters without any
        matching skill get 0. Counting is done over the rows of
        ``skills_clusters_df``, so repeating a skill in ``sample_skills``
        does not increase its count.
    """
    # Boolean mask over the mapping table: True where the row's skill was requested.
    is_requested = skills_clusters_df['skill'].isin(sample_skills)
    matched_cluster_ids = skills_clusters_df.loc[is_requested, 'cluster_id']

    # Tally matches per cluster, then align to the full feature list (0 when absent).
    counts_by_cluster = matched_cluster_ids.value_counts()
    aligned_counts = counts_by_cluster.reindex(features_names, fill_value=0)

    return aligned_counts.tolist()

def make_predictions(input_data, model, features_names, targets_names, skills_clusters_df):
    """
    Score every target label for a single set of skills.

    Args:
        input_data: List-like of skill names describing one sample.
        model: Object exposing ``predict_proba``.
        features_names: Cluster IDs used as the feature columns.
        targets_names: Labels used as the index of the returned Series.
        skills_clusters_df: DataFrame mapping skills to clusters, passed on
            to ``get_skills_clusters``.

    Returns:
        Series indexed by ``targets_names`` holding element ``[0][1]`` of each
        item returned by ``model.predict_proba``, sorted in descending order.
    """
    # Encode the skills as a single-row frame of per-cluster counts.
    cluster_counts = get_skills_clusters(input_data, skills_clusters_df, features_names)
    feature_row = pd.DataFrame([cluster_counts], columns=features_names)

    per_target_probas = model.predict_proba(feature_row)

    # NOTE(review): assumes predict_proba yields one array per target with
    # rows = samples and column 1 = the positive class — confirm for the model in use.
    positive_probs = []
    for target_proba in per_target_probas:
        positive_probs.append(target_proba[0][1])

    scores = pd.Series(positive_probs, index=targets_names)
    return scores.sort_values(ascending=False)


    



In [13]:
sample_skills = pd.Series(['Scala', 'Python', 'Keras', 'Python'])

# Run the prediction only when every sample skill appears in the cluster table.
if sample_skills.isin(skills_clusters_df['skill']).all():
    predictions = make_predictions(
        sample_skills, classifier, features_names, targets_names, skills_clusters_df
    )
    print("Predicted labels:\n", predictions)
else:
    print('Invalid skills provided.')

Predicted labels:
 Academic researcher                              0.560942
Scientist                                        0.505415
Data or business analyst                         0.344776
Data scientist or machine learning specialist    0.334833
Engineer, data                                   0.076000
Developer, back-end                              0.031710
Developer, QA or test                            0.022000
Developer, embedded applications or devices      0.020333
Developer, desktop or enterprise applications    0.017167
Database administrator                           0.010286
DevOps specialist                                0.008000
Developer, full-stack                            0.006000
System administrator                             0.005667
Developer, front-end                             0.004000
Developer, game or graphics                      0.002000
Developer, mobile                                0.002000
dtype: float64


In [None]:
import sys
import os

# Step 1: Make the sibling ../scripts directory importable (resolved relative
# to the current working directory) without adding it to sys.path twice.
scripts_path = os.path.abspath(os.path.join(os.getcwd(), '..', 'scripts'))
if scripts_path not in sys.path:
    sys.path.append(scripts_path)

# Step 2: Import the prediction entry point from scripts/predictor.py.
# NOTE(review): the output recorded for this cell is an ImportError saying
# `predict_job_probabilities` cannot be imported from `predictor` — confirm
# which name predictor.py actually defines.
from predictor import predict_job_probabilities



ImportError: cannot import name 'predict_job_probabilities' from 'predictor' (c:\Users\Abdelhakiem\Documents\CodingLandscape\side_projects\job-skill-matcher\scripts\predictor.py)

In [None]:
# Call the imported helper with the same skills as the in-notebook sample.
# NOTE(review): only runs if the import in the previous cell succeeds.
predict_job_probabilities(['Scala','Python','Keras','Python'])
