In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from pathlib import Path

# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option('display.max_columns', None)

### IMPORT DATA

In [2]:
# Location of the raw postings CSV, relative to the notebook's working directory.
RAW_DATASET_PATH = "data/postings.csv"
# Load the complete raw dataset; `jobs_data` is derived from it further down.
raw_data = pd.read_csv(RAW_DATASET_PATH)

In [3]:
# List the available columns before choosing the subset kept for modelling.
raw_data.columns

Index(['job_id', 'company_name', 'title', 'description', 'max_salary',
       'pay_period', 'location', 'company_id', 'views', 'med_salary',
       'min_salary', 'formatted_work_type', 'applies', 'original_listed_time',
       'remote_allowed', 'job_posting_url', 'application_url',
       'application_type', 'expiry', 'closed_time',
       'formatted_experience_level', 'skills_desc', 'listed_time',
       'posting_domain', 'sponsored', 'work_type', 'currency',
       'compensation_type', 'normalized_salary', 'zip_code', 'fips'],
      dtype='object')

### EDA

In [4]:
from ydata_profiling import ProfileReport

In [5]:
# Default destination of the HTML profiling report.
PROFILE_REPORT_PATH="eda_job_postings_dataset.html"

def generate_profile_report(df: pd.DataFrame, path: Path = PROFILE_REPORT_PATH) -> None:
    """Profile ``df`` with ydata-profiling, show the report inline and save it as HTML.

    Args:
        df: DataFrame to profile.
        path: Where the HTML report is written (a ``str`` or ``Path``).
            Defaults to ``PROFILE_REPORT_PATH``.
    """
    profile = ProfileReport(df, title="LinkedIn Job Postings")
    profile.to_notebook_iframe()
    # Honor the `path` argument rather than always writing to the module-level default.
    profile.to_file(path)

# generate_profile_report(raw_data)

### DATA PREPROCESSING

In [40]:
import nltk
from nltk.corpus import stopwords
from sentence_transformers import SentenceTransformer
import re
import json
import pickle
import os

##### Keep only a subset of the columns for our needs

In [41]:
# Columns retained from the raw postings: free-text fields, salary fields and
# a few descriptors of the posting (location, remote flag, work type, currency).
COLUMNS_TO_KEEP = [
    'company_name',
    'title',
    'description',
    'pay_period',
    'max_salary',
    'med_salary',
    'min_salary',
    'location',
    'remote_allowed',
    'work_type',
    'currency',
]
jobs_data = raw_data.loc[:, COLUMNS_TO_KEEP]

##### Create a new yearly salary column from existing salary columns to standardize this metric across the dataset

In [8]:
salary_columns: list[str] = ["max_salary","med_salary","min_salary"]
salary_period_type_column: str = "pay_period"

def convert_to_yearly_salary(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with salaries annualised and a ``standardized_salary`` column.

    Rows whose pay period is HOURLY, WEEKLY, BIWEEKLY or MONTHLY have their
    min/med/max salary multiplied by the number of such periods in a year;
    rows with any other pay period are left untouched.
    ``standardized_salary`` is the median salary when present, otherwise the
    mean of the min and max salary. The helper column ``avg_min_max`` is kept
    in the returned frame. The input frame is not modified.
    """
    yearly = df.copy()

    # Number of pay periods per year (2080 = 40 hours x 52 weeks).
    periods_per_year = {"HOURLY": 2080, "WEEKLY": 52, "BIWEEKLY": 26, "MONTHLY": 12}
    for period_name, factor in periods_per_year.items():
        in_period = yearly[salary_period_type_column] == period_name
        yearly.loc[in_period, salary_columns] = yearly.loc[in_period, salary_columns] * factor

    yearly["standardized_salary"] = yearly["med_salary"]
    yearly["avg_min_max"] = (yearly["max_salary"] + yearly["min_salary"]) / 2

    # Fall back to the min/max midpoint wherever no median salary was given.
    lacks_median = yearly["standardized_salary"].isna()
    yearly.loc[lacks_median, "standardized_salary"] = yearly.loc[lacks_median, "avg_min_max"]

    return yearly

In [9]:
# Annualise salaries, keep rows with a known non-negative standardized salary
# (rows where it is NaN fail the comparison and are dropped), then remove the
# raw salary columns that are no longer needed.
jobs_data = (
    convert_to_yearly_salary(jobs_data)
    .loc[lambda frame: frame["standardized_salary"] >= 0]
    .drop(columns=["max_salary", "med_salary", "min_salary", "pay_period", "avg_min_max"])
)

##### Filter data on usd currency

In [10]:
# value_counts: 
# USD    36058
# EUR        6
# CAD        3
# BBD        2
# AUD        2
# GBP        2

# Only a handful of postings are not in USD, so they are dropped instead of
# being converted; the now-constant currency column is removed as well.
jobs_data = (
    jobs_data
    .loc[lambda frame: frame["currency"].eq("USD")]
    .drop(columns="currency")
)

##### Focus on full_time, contract, part_time work types

In [11]:
# value_counts: 
# FULL_TIME     29119
# CONTRACT       3848
# PART_TIME      2304
# TEMPORARY       394
# INTERNSHIP      247
# OTHER           138
# VOLUNTEER         8

# Keep the three most frequent work types only.
jobs_data = jobs_data.loc[
    lambda frame: frame["work_type"].isin(["FULL_TIME", "CONTRACT", "PART_TIME"])
]

##### Plot standardized salaries

In [12]:
# One horizontal box per work type, showing the distribution of the standardized yearly salary.
px.box(jobs_data,x="standardized_salary",y="work_type")

##### Concatenate information-rich text columns

In [13]:
COLUMNS_TO_CONCATENATE = ['company_name', 'title', 'description']

# Prepare the text columns: strip the title and replace missing values by "-"
# so that every row can be joined into a single string.
text_columns = (
    jobs_data[COLUMNS_TO_CONCATENATE]
    .assign(title=lambda frame: frame["title"].str.strip())
    .fillna("-")
)

# Add the space-joined text as a new column and drop the source columns.
jobs_data = (
    jobs_data
    .assign(augmented_description=text_columns.agg(' '.join, axis=1))
    .drop(columns=COLUMNS_TO_CONCATENATE)
)

##### Convert the 'remote_allowed' column to a binary (0/1) integer column, treating missing values as 0

In [14]:
# Missing values are treated as "not remote" (0); the result is stored as int.
jobs_data["remote_allowed"] = jobs_data["remote_allowed"].fillna(0).astype(int)

In [15]:
# One-hot encode the work type: `work_type` is replaced by one indicator column per remaining category.
jobs_data = pd.get_dummies(jobs_data,columns=["work_type"])

##### The location column has the form `city, state`. To avoid too many distinct values, we keep only the state. Because the state column still contains many distinct values, each state is then replaced by its frequency.

In [16]:
# Split "city, state" on commas; column 0 is the first part, column 1 the second.
location_parts = jobs_data["location"].str.split(',', expand=True)

# When there is no second part (no comma in the location), use the first part instead.
state_or_single_part = location_parts[1].fillna(location_parts[0])
jobs_data["state"] = state_or_single_part.str.strip()

# Apply the renaming table stored next to the notebook to the extracted values.
with open('location_renaming_mapping.json', 'r') as mapping_file:
    location_renaming_mapping = json.load(mapping_file)

jobs_data["state"] = jobs_data["state"].replace(location_renaming_mapping)
jobs_data = jobs_data.drop(columns=["location"])

##### Remove stopwords in augmented description

In [17]:
# Fetch the NLTK stopword corpus (the log below shows it is skipped when already up to date).
nltk.download('stopwords')
# A set gives fast membership tests when filtering words in `clean_description`.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anthonybiel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# Distribution of description lengths (in characters) before cleaning.
px.histogram(jobs_data['augmented_description'].str.len())

In [19]:
def clean_description(text):
    """Normalise a description: strip punctuation, lowercase, remove English stopwords.

    Every non-word character is replaced by a space, the text is lowercased,
    and the remaining whitespace-separated words that are not in
    ``stop_words`` are joined back with single spaces.
    """
    lowered = re.sub(r'\W', ' ', text).lower()
    kept_words = (word for word in lowered.split() if word not in stop_words)
    return ' '.join(kept_words)

jobs_data['augmented_description'] = jobs_data['augmented_description'].map(clean_description)

In [20]:
# Distribution of description lengths (in characters) after cleaning.
px.histogram(jobs_data['augmented_description'].str.len())

### Embed augmented description

In [21]:
def save_file(data, filename):
    """Pickle ``data`` into ``filename``, overwriting the file if it exists."""
    with open(filename, 'wb') as sink:
        sink.write(pickle.dumps(data))

def load_file(filename):
    """Return the object pickled in ``filename``.

    Unpickling can execute arbitrary code, so only use this on files this
    notebook wrote itself.
    """
    with open(filename, 'rb') as source:
        payload = pickle.load(source)
    return payload

In [22]:
model = SentenceTransformer("all-MiniLM-L6-v2")
augmented_description_embeddings_path = Path("augmented_description_embeddings.pkl")

# Encode the descriptions only when no cached embeddings exist on disk.
if not os.path.exists(augmented_description_embeddings_path):
    augmented_description_list = jobs_data["augmented_description"].to_list()
    augmented_description_embeddings = model.encode(augmented_description_list)
    # "augmented_description" is deliberately kept here: the next cell drops it
    # right before joining the embeddings. Dropping it in both cells made that
    # cell fail with a KeyError on the first (uncached) run.
    # Save through the path variable so the existence check and the write can
    # never point at different files.
    save_file(augmented_description_embeddings, augmented_description_embeddings_path)

In [23]:
# Replace the text column by its cached sentence embeddings (one column per dimension).
if os.path.exists(augmented_description_embeddings_path):
    augmented_description_embeddings = load_file(augmented_description_embeddings_path)
    columns_embeddings_df = [str(c) for c in range(augmented_description_embeddings.shape[1])]

    jobs_data = jobs_data.reset_index(drop=True)
    # errors="ignore" makes the cell tolerant of the text column having been
    # dropped already, and removing previously attached embedding columns
    # keeps the cell safe to re-run.
    jobs_data = jobs_data.drop(columns=["augmented_description", *columns_embeddings_df], errors="ignore")

    # Embeddings are matched to rows by position. A stale cache with a different
    # row count would otherwise be silently truncated by the inner join below.
    if len(augmented_description_embeddings) != len(jobs_data):
        raise ValueError(
            f"Cached embeddings have {len(augmented_description_embeddings)} rows but jobs_data has "
            f"{len(jobs_data)}; delete {augmented_description_embeddings_path} to recompute them."
        )

    jobs_data = pd.concat([jobs_data,pd.DataFrame(augmented_description_embeddings, columns=columns_embeddings_df)],axis=1,join='inner')

### Pipeline

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import (
    HistGradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neural_network import MLPRegressor

from scipy.stats import randint, uniform, loguniform

import mlflow
from mlflow.models import infer_signature

from tqdm import tqdm

# Set TOKENIZERS_PARALLELISM=false for this process.
# NOTE(review): presumably to avoid the Hugging Face tokenizers parallelism/fork warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

##### Train/Test split

In [25]:
# Features: every column except the target.
X = jobs_data.drop(columns=["standardized_salary"])
# Target: the standardized yearly salary.
y = jobs_data["standardized_salary"]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8,random_state=42)

##### Transformers

In [26]:
class FrequencyEncoder(BaseEstimator, TransformerMixin):
    """Encode a categorical Series by how often each value occurs in the fitted data.

    ``fit`` counts the occurrences of every value; ``transform`` replaces each
    value by its count. Values that were not seen during ``fit`` (including
    missing values, which ``value_counts`` does not count) are encoded as 1.
    """

    def __init__(self):
        # value -> absolute count, filled in by fit()
        self.freq_map = {}

    def fit(self, X: pd.Series, y=None):
        """Learn the absolute frequency of each value of ``X``. ``y`` is ignored."""
        # Compute state frequency on training set
        self.freq_map = X.value_counts(normalize=False).to_dict()
        return self

    def transform(self, X: pd.Series):
        """Return a one-column DataFrame with the fitted frequency of each value of ``X``."""
        # Encode the states according to the frequencies calculated in the training set.
        # If a state was not in the training set, we assign it a frequency of 1.
        # Vectorized lookup: unknown values map to NaN and are then filled with 1,
        # instead of looping over every row to extend a copy of the mapping.
        encoded = X.map(self.freq_map).fillna(1).astype("int64")
        return pd.DataFrame(encoded)

    def set_output(self, transform="pandas"):
        """Record the requested output container and return ``self``.

        ``transform`` already returns a DataFrame, so the value is only stored.
        """
        self._transform_output = transform
        return self
    
def frequency_transformer() -> ColumnTransformer:
    """Build a ColumnTransformer that frequency-encodes ``state`` and passes other columns through.

    Output feature names are not prefixed with the transformer name.
    """
    state_encoding = ('frequency_encoding', FrequencyEncoder(), 'state')
    return ColumnTransformer(
        transformers=[state_encoding],
        remainder='passthrough',
        verbose_feature_names_out=False,
    )

# Opt in to pandas' future behaviour of not silently downcasting results.
# NOTE(review): presumably set to silence a FutureWarning raised elsewhere in the notebook — confirm which call needs it.
pd.set_option('future.no_silent_downcasting', True)

##### Estimators

In [27]:
# Tracking URI of the Mlflow server.
# Tracking URI of the MLflow server. The MLFLOW_TRACKING_URI environment
# variable takes precedence so the notebook can target another server without
# a code change; otherwise the local development server is used.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:8080")

# This command sets the tracking URI for the current session
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

In [28]:
def make_pipeline_with_model(model: BaseEstimator) -> Pipeline:
    """Wrap ``model`` in a pipeline that frequency-encodes the state column first.

    The pipeline is configured to emit pandas output from its transformers.
    The estimator step is named ``model``, so its hyperparameters are
    addressed as ``model__<name>``.
    """
    steps = [
        ('frequency_tranformer', frequency_transformer()),
        ('model', model),
    ]
    model_pipeline = Pipeline(steps)
    model_pipeline.set_output(transform="pandas")
    return model_pipeline


def model_selection(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    models: list[tuple[BaseEstimator, dict]],
    k_fold:int,
    experiment_name: str,
    run_name: str
) -> None:
    """Grid-search each candidate model and log the result to MLflow.

    For every ``(estimator, param_grid)`` pair a pipeline is built with
    ``make_pipeline_with_model``, tuned with ``GridSearchCV`` (scored by
    negative RMSE, refit on the best parameters) and logged in its own
    MLflow run together with the best parameters, the cross-validated RMSE
    and the fitted search object.

    Args:
        X_train: Training features.
        y_train: Training target.
        models: ``(estimator, param_grid)`` pairs; grid keys use the
            ``model__`` prefix of the pipeline step.
        k_fold: Number of cross-validation folds.
        experiment_name: MLflow experiment to log into.
        run_name: Prefix of each run name; the estimator class name is appended.
    """
    # Number of training rows stored with the model as input example.
    input_example_rows = 5

    mlflow.set_experiment(experiment_name)
    for model, param in models:
        model_name = model.__class__.__name__

        with mlflow.start_run(run_name=run_name+" "+model_name):
            pipeline = make_pipeline_with_model(model)
            search = GridSearchCV(pipeline,param_grid=param,cv=k_fold,scoring="neg_root_mean_squared_error",refit=True)
            search.fit(X_train,y_train)

            # Record which hyperparameters won, not only their score.
            if search.best_params_:
                mlflow.log_params(search.best_params_)
            # best_score_ is the negated RMSE, hence the sign flip.
            mlflow.log_metric("rmse",-search.best_score_)

            mlflow.sklearn.log_model(
                sk_model=search,
                artifact_path=model_name,
                signature=infer_signature(X_train, search.predict(X_train)),
                # A few rows are enough as an example; logging the whole training
                # set would store a copy of the data with every model.
                input_example=X_train.head(input_example_rows),
                registered_model_name=model_name)

##### Estimators -> Baseline

In [29]:
# Baseline: a random forest with default hyperparameters (empty grid).
# The seed is fixed so the baseline score is reproducible between runs.
hyperparameter_grid_baseline = [(RandomForestRegressor(random_state=42), {})]

In [30]:
# model_selection(X_train,y_train,hyperparameter_grid_baseline,10,"Model selection","Baseline")

##### Estimators -> Model selection

##### Estimators -> Model selection -> Refining the search space using RandomizedSearchCV

In [46]:
# Search space for the randomized search: one (estimator, distributions) pair
# per candidate model. Keys use the `model__` prefix of the pipeline step.
# Because several entries are scipy distributions, this structure is meant for
# RandomizedSearchCV, not for a plain grid search.
# Estimators that take a seed get a fixed random_state so results are reproducible.
exhaustive_hyperparameter_grid = [
    (MLPRegressor(random_state=42), {
        'model__hidden_layer_sizes': [(50,50),(50,100),(100,50),(100,100)],
        'model__learning_rate_init': loguniform(1e-4, 1),
        'model__learning_rate': ["constant", "adaptive"],
        'model__alpha': loguniform(1e-4, 1),
    }),
    (Ridge(), {
        'model__alpha': uniform(loc=0,scale=3),
    }),
    (KNeighborsRegressor(), {
        'model__n_neighbors': randint(5,30),  # Number of neighbors to use
        # 'model__weights': ['uniform', 'distance'],  # Weight function used in prediction
        # 'model__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],  # Algorithm used to compute the nearest neighbors
        # 'model__p': [1, 2]  # Power parameter for the Minkowski distance
    }),
    (DecisionTreeRegressor(random_state=42), {
        # 'model__criterion': ['squared_error', 'friedman_mse', 'absolute_error'],  # The function to measure the quality of a split
        'model__max_depth': randint(5,50),  # Maximum depth of the tree
        'model__min_samples_split': randint(5,20),  # Minimum number of samples required to split a node
        'model__min_samples_leaf': randint(5,50),  # Minimum number of samples required to be at a leaf node
    }),
    # (RandomForestRegressor(), {
    #     'model__n_estimators': randint(50,250),  # Number of trees in the forest
    #     'model__max_depth': randint(5,50),  # Maximum depth of the tree
    #     'model__min_samples_split': randint(5,20),  # Minimum number of samples required to split a node
    #     'model__min_samples_leaf': randint(5,50),  # Minimum number of samples required to be at a leaf node
    #     # 'model__bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
    # }),
    (HistGradientBoostingRegressor(random_state=42), {
        'model__learning_rate': loguniform(1e-4, 1),  # Shrinks the contribution of each tree
        'model__max_iter': randint(5,50),  # The number of boosting iterations
        'model__max_leaf_nodes': randint(5,200),  # Maximum number of leaves for each tree
        'model__min_samples_leaf': randint(5,50),  # Minimum number of samples per leaf
    }),
]

In [32]:
# cv_results: list[(str, pd.DataFrame)] = []

# # Use tqdm to track progress
# for mp in tqdm(exhaustive_hyperparameter_grid, desc="Hyperparameter Tuning"):
#     model, param = mp

#     pipeline = make_pipeline_with_model(model)
#     random_search = RandomizedSearchCV(pipeline, param, n_iter=10, cv=10, scoring="neg_root_mean_squared_error", n_jobs=-1)
#     random_search.fit(X_train, y_train)

#     cv_results.append((model.__class__.__name__, random_search.cv_results_))

In [35]:
cv_results_path = Path("cv_results.pkl")

# Load the cached search results when available; otherwise run the randomized
# search and cache it. Without the compute branch, a fresh kernel with no
# cache file fails with a NameError because `cv_results` is never defined.
if os.path.exists(cv_results_path):
    cv_results = load_file(cv_results_path)
else:
    # One (estimator class name, cv_results_) entry per candidate, in grid order.
    cv_results = []
    for candidate, distributions in tqdm(exhaustive_hyperparameter_grid, desc="Hyperparameter Tuning"):
        search_pipeline = make_pipeline_with_model(candidate)
        random_search = RandomizedSearchCV(
            search_pipeline,
            distributions,
            n_iter=10,
            cv=10,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1,
            random_state=42,  # reproducible sampling of the distributions
        )
        random_search.fit(X_train, y_train)
        cv_results.append((candidate.__class__.__name__, random_search.cv_results_))
    save_file(cv_results,cv_results_path)

In [39]:
# Entry 3 follows the order of `exhaustive_hyperparameter_grid`, i.e. the DecisionTreeRegressor search
# (the param_model__max_depth / min_samples_* columns below match it); [1] selects its results table.
# Rows are sorted from best to worst rank.
pd.DataFrame(cv_results[3][1]).sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__max_depth,param_model__min_samples_leaf,param_model__min_samples_split,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
45,37.747917,1.549364,0.013738,0.002571,49,47,6,"{'model__max_depth': 49, 'model__min_samples_l...",-10130630.0,-10549900.0,-896859.2,-878436.7,-930374.5,-3586436.0,-2583016.0,-6819271.0,-5512901.0,-2497166.0,-4438499.0,3493333.0,1
22,22.481269,1.056023,0.007444,0.002829,39,47,13,"{'model__max_depth': 39, 'model__min_samples_l...",-10130630.0,-10549900.0,-896859.2,-878436.7,-930374.5,-3586436.0,-2583016.0,-6819271.0,-5512901.0,-2497166.0,-4438499.0,3493333.0,1
19,9.664395,0.107256,0.006826,0.00254,9,45,16,"{'model__max_depth': 9, 'model__min_samples_le...",-10130170.0,-10549400.0,-989157.0,-846973.2,-908376.5,-3580758.0,-2555057.0,-6816576.0,-5489153.0,-2550118.0,-4441573.0,3487192.0,3
31,8.689336,0.22248,0.011685,0.00309,5,39,19,"{'model__max_depth': 5, 'model__min_samples_le...",-10114160.0,-10542580.0,-945003.1,-892566.9,-975091.2,-3599037.0,-2534594.0,-6820096.0,-5519962.0,-2477343.0,-4442044.0,3482290.0,4
17,19.716298,0.945918,0.007519,0.002741,47,46,16,"{'model__max_depth': 47, 'model__min_samples_l...",-10128470.0,-10551260.0,-988157.1,-865231.2,-962585.8,-3583290.0,-2555761.0,-6815199.0,-5507512.0,-2505285.0,-4446276.0,3482830.0,5
5,19.614292,0.868982,0.006058,0.001395,28,45,15,"{'model__max_depth': 28, 'model__min_samples_l...",-10130340.0,-10549960.0,-995925.1,-893501.1,-966065.3,-3582586.0,-2563806.0,-6818250.0,-5493151.0,-2554643.0,-4454823.0,3475514.0,6
40,42.51374,1.914375,0.015871,0.0062,25,43,13,"{'model__max_depth': 25, 'model__min_samples_l...",-10134240.0,-10551250.0,-1033472.0,-915327.9,-966893.2,-3585771.0,-2548790.0,-6823384.0,-5513750.0,-2533574.0,-4460645.0,3473212.0,7
41,15.010957,0.381361,0.012772,0.003436,7,34,10,"{'model__max_depth': 7, 'model__min_samples_le...",-10114730.0,-10545430.0,-1078375.0,-921434.6,-1200497.0,-3587659.0,-2553414.0,-6825665.0,-5494083.0,-2555200.0,-4487648.0,3439068.0,8
43,23.357389,0.524431,0.010845,0.001064,12,41,15,"{'model__max_depth': 12, 'model__min_samples_l...",-10114860.0,-10549780.0,-1074260.0,-1006102.0,-1101340.0,-3614088.0,-2557667.0,-6824511.0,-5526996.0,-2528299.0,-4489790.0,3442709.0,9
46,35.232212,0.741592,0.011454,0.001633,21,39,5,"{'model__max_depth': 21, 'model__min_samples_l...",-10117120.0,-10551640.0,-1029596.0,-1054192.0,-1080819.0,-3622444.0,-2573352.0,-6831435.0,-5533757.0,-2530971.0,-4492533.0,3444493.0,10


In [None]:
# `make_pipeline_with_model` is defined once, in the "Estimators" section above.
# It is not redefined here: a second identical definition would silently
# shadow the first and let the two copies drift apart.

# Sanity check: fit every candidate with its default hyperparameters (the
# search distributions in `param` are intentionally unused) and print the RMSE
# measured on the training set itself. This is an optimistic figure, not a
# generalisation estimate.
for model, param in exhaustive_hyperparameter_grid:
    pipeline = make_pipeline_with_model(model)
    fitted_pipeline = pipeline.fit(X_train,y_train)

    print(model.__class__.__name__)
    print(root_mean_squared_error(y_train,fitted_pipeline.predict(X_train)))