In [26]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

Import the data prepared in the previous steps (2018 survey only, full-time employment, outliers removed), which is used for all machine learning models. The target is the hourly rate, and the predictors are economic sector (nace), company size (esize_class), gender, age class (age_class), profession (lpk), and education. All of these are categorical variables. The only numerical variable is experience in years.

In [27]:
# Load the data set prepared for the ML models and preview its first rows.
data = pd.read_csv(filepath_or_buffer='../Data/LT_DU_data_for_ML.csv')
data.head(n=5)

Unnamed: 0,nace,esize_class,gender,age_class,lpk,education,experience,target
0,C,1_49,M,40-49,p721,G2,13,8.2
1,C,1_49,F,40-49,p334,G2,0,2.51
2,M,50_249,F,40-49,p522,G2,18,2.19
3,M,50_249,F,40-49,p522,G2,12,2.19
4,M,50_249,F,14-29,p522,G2,0,2.19


One fifth of the records is held out for testing the models.

In [28]:
# Separate the predictors from the `target` column.
X = data.drop(columns=['target'])
y = data['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

print(
    'Dataset lengths:',
    'train', len(y_train),
    ', test', len(y_test),
)

Dataset lengths: train 26114 , test 6529


# Model and predictions

## Initial Gradient Boosting model

The initial Gradient Boosting model is used to test the pipeline and to estimate the baseline accuracy. The numerical feature is scaled with [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html) and the categorical features are encoded using [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html).

In [33]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import GradientBoostingRegressor

# Column groups used by the preprocessing step. Columns not listed here are
# dropped by ColumnTransformer (its default `remainder='drop'`).
categorical_features = ['nace', 'esize_class', 'gender', 'age_class', 'lpk', 'education']
numeric_features = ['experience']

# Standardize the single numerical feature.
numeric_preprocessor = Pipeline(steps=[("scaler", StandardScaler())])

# One-hot encode the categorical features. With handle_unknown="ignore", a
# category that was not seen during fit is encoded as all zeros instead of
# raising an error at prediction time.
categorical_preprocessor = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

# sparse_threshold=0 makes the transformer always return a dense array.
preprocessor = ColumnTransformer(
    [
        ("categorical", categorical_preprocessor, categorical_features),
        ("numerical", numeric_preprocessor, numeric_features)
    ],
    sparse_threshold=0
)

# Baseline model with default hyperparameters. The estimator is seeded so that
# re-running the notebook reproduces the same baseline scores.
model = Pipeline([('prep', preprocessor), ('regr', GradientBoostingRegressor(random_state=0))])
model.fit(X=X_train, y=y_train)

Helper function to print the RMSE and R2 of model predictions.

In [30]:
def print_model_rmse_r2(model, X_train, y_train, X_test, y_test):
    """
    Print the prediction RMSE and R2 of a fitted model for the train and
    test datasets. Both metrics are rounded to 3 decimals.

        Parameters:
        model - fitted model to evaluate (must provide ``predict``)
        X_train - train features
        y_train - train target
        X_test - test features
        y_test - test target

        Returns:
        None - the metrics are only printed
    """
    def _rmse_r2(X, y):
        # Predict once per dataset and derive both metrics from that prediction.
        y_pred = model.predict(X)
        rmse = np.round(np.sqrt(mean_squared_error(y, y_pred)), 3)
        # np.round is used instead of the `.round()` method because r2_score
        # may return a plain Python float, which has no `.round()` method.
        r2 = np.round(r2_score(y, y_pred), 3)
        return rmse, r2

    rmse_train, r2_train = _rmse_r2(X_train, y_train)
    rmse_test, r2_test = _rmse_r2(X_test, y_test)

    print('Train: RMSE=', rmse_train, ' R2=', r2_train,
        '\nTest: RMSE=', rmse_test, ' R2=', r2_test)

RMSE and R2 of the initial model predictions for the train and test datasets.

In [34]:
# Report train/test RMSE and R2 for the model bound to `model`.
print_model_rmse_r2(
    model=model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Train: RMSE= 1.765  R2= 0.519 
Test: RMSE= 1.774  R2= 0.512


## Randomized Search CV with Hist Gradient Boosting

We used Randomized Search cross-validation to tune the hyperparameters of the Hist version of the GB model. The Hist version is used to speed up the search.

In [35]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.ensemble import HistGradientBoostingRegressor
# 5-fold cross-validation repeated 3 times (15 fits per candidate); seeded so
# the folds are the same on every run.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# The regressor is seeded as well: its random_state controls the subsampling
# used for binning and the train/validation split drawn when early stopping
# is active.
model = Pipeline([('prep', preprocessor), ('regr', HistGradientBoostingRegressor(random_state=0))])

# Candidate hyperparameter values; the `regr__` prefix routes each one to the
# 'regr' step of the pipeline.
grid = {
        'regr__learning_rate': np.linspace(0.05,  0.2,  4),
        'regr__max_iter': np.arange(100, 500, 50, dtype=int),
        'regr__min_samples_leaf': np.arange(15, 35, 5, dtype=int),
        'regr__max_depth':np.arange(5, 15, dtype=int)
        }

# random_state fixes which 15 parameter combinations are sampled, so the
# search (and therefore the selected best model) is reproducible.
search = RandomizedSearchCV(model, grid, scoring='neg_root_mean_squared_error', n_iter = 15, cv=cv, n_jobs=-1, random_state=0)
results = search.fit(X=X_train,y=y_train)

# Show the five best candidates (parameters, mean/std CV score, rank),
# transposed so that each candidate is a column.
results_pd = pd.DataFrame(results.cv_results_)
results_pd.filter(regex='rank|regr|mean_test_score|std_test_score',axis=1).sort_values('rank_test_score').head().T

Unnamed: 0,13,12,6,11,3
param_regr__min_samples_leaf,15.0,15.0,15.0,25.0,15.0
param_regr__max_iter,300.0,450.0,400.0,250.0,350.0
param_regr__max_depth,9.0,10.0,8.0,12.0,12.0
param_regr__learning_rate,0.1,0.1,0.1,0.1,0.1
mean_test_score,-1.682245,-1.683671,-1.6842,-1.684213,-1.684442
std_test_score,0.030208,0.030645,0.03215,0.031057,0.032401
rank_test_score,1.0,2.0,3.0,4.0,5.0


In [36]:
# `results` is the fitted search object returned by `search.fit`, so the
# pipeline refit with the best parameters can be read from `search` directly.
best_model = search.best_estimator_
best_model

In [37]:
# Report train/test RMSE and R2 for the tuned pipeline.
print_model_rmse_r2(
    model=best_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

Train: RMSE= 1.547  R2= 0.63 
Test: RMSE= 1.654  R2= 0.576


Timing note: the search with 15 iterations took about 3 min. The variability of parameter values is small among the five best iterations, and the performance gain over the untuned initial model is significant. The model with the parameters from the best iteration will be used for further analysis.

## Feature importance 

Reference: [scikit-learn Gradient Boosting regression example](https://scikit-learn.org/stable/auto_examples/ensemble/plot_gradient_boosting_regression.html#sphx-glr-auto-examples-ensemble-plot-gradient-boosting-regression-py)

# Model (Pipeline) Serialization

In [59]:
import os
import joblib

# Persist the tuned pipeline (preprocessing + regressor) to disk.
MODEL_PATH = './Models/GBR_model.joblib'

# Create the target directory if it is missing; joblib.dump does not create it
# and would otherwise fail (for example on a fresh checkout).
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(best_model, MODEL_PATH)

['./Models/LM_model.joblib']