# Training, experiments

---

In [18]:
import os
import sys
from pathlib import Path
from matplotlib import pyplot as plt

import polars as pl

sys.path.append(os.path.abspath(".."))

project_root = Path.cwd().parent
data_csv_path = project_root / "data" / "raw" / "job_market.csv"

df = pl.read_csv(str(data_csv_path))

In [19]:
from src.data.preprocessing import preprocess

df = preprocess(df, "job_market")

In [20]:
from src.data.feature_engineering import build_features

df = build_features(df, "job_market")

Expr.map_elements is significantly slower than the native expressions API.
Only use if you absolutely CANNOT implement your logic otherwise.
Replace this expression...
  - pl.col("experience_required").map_elements(np.log1p)
with this one instead:
  + pl.col("experience_required").log1p()

  .map_elements(np.log1p)


In [21]:
df.head(5)

salary_mean,job_type_Full time,job_type_Full-time,job_type_Internship,job_type_Part-time,job_type_Remote,job_type_Unknown,job_type_Working student,job_type_berufseinstieg,job_type_berufserfahren,job_type_manager,job_type_professional / experienced,category_HR,category_Helpdesk,category_Marketing and Communication,category_Media Planning,category_Process Engineering,category_Recruitment and Selection,category_Remote,category_SAP/ERP Consulting,category_Social Media Manager,category_Software Development,category_Technology,category_Unknown,job_title_mte,company_mte,location_mte,backend_skills,frontend_skills,db_skills,ml_skills,infra_skills,tools_skills,skill_count,experience_sq,experience_log
f64,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,u8,f64,f64,f64,i8,i8,i8,i8,i8,i8,i8,f64,f64
202953.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,168046.888889,130487.73913,155055.5,0,1,0,0,1,3,5,49.0,2.079442
200362.5,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,161118.0,130429.275,153501.153846,1,1,0,0,1,2,5,81.0,2.302585
197964.5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,161118.0,130487.73913,155055.5,1,1,1,1,3,0,7,64.0,2.197225
196812.5,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,161961.5,130487.73913,155055.5,2,1,0,1,2,1,7,121.0,2.484907
196292.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,168046.888889,128877.0,153501.153846,2,0,0,1,0,0,3,121.0,2.484907


In [22]:
df_pd = df.to_pandas()

In [23]:
target = "salary_mean"

X = df_pd.drop(columns=[target])
y = df_pd[target]

y.mean()

np.float64(121207.47773279352)

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

---

## LightGBM

In [35]:
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

def find_best_lgb(X_train, y_train):
    lgb_model = lgb.LGBMRegressor()

    param_grid = {
        "n_estimators": [300, 600],
        "learning_rate": [0.05],
        "num_leaves": [31, 64, 128],
        "min_data_in_leaf": [5, 20]
    }

    gs_lgb = GridSearchCV(
        lgb_model,
        param_grid,
        cv=5,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1
    )

    gs_lgb.fit(X_train, y_train)

    print("Best params:", gs_lgb.best_params_)
    print("Best CV RMSE:", -gs_lgb.best_score_)
    
    return gs_lgb

In [36]:
gs_lgb = find_best_lgb(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 121
[LightGBM] [Info] Number of data points in the train set: 197, number of used features: 17
[LightGBM] [Info] Start training from score 121705.269036
Best params: {'learning_rate': 0.05, 'min_data_in_leaf': 20, 'n_estimators': 600, 'num_leaves': 31}
Best CV RMSE: 6228.893473278769


In [None]:
df_big = df_pd.sample(n=1000, replace=True, random_state=42)

target = "salary_mean"

X_big = df_big.drop(columns=[target])
y_big = df_big[target]

y_big.mean()

X_train_big, X_test_big, y_train_big, y_test_big = train_test_split(
    X_big, y_big, test_size=0.2, random_state=42
)

In [38]:
gs_lgb_big = find_best_lgb(X_train_big, y_train_big)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000055 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 151
[LightGBM] [Info] Number of data points in the train set: 800, number of used features: 26
[LightGBM] [Info] Start training from score 119360.955000
Best params: {'learning_rate': 0.05, 'min_data_in_leaf': 5, 'n_estimators': 600, 'num_leaves': 31}
Best CV RMSE: 1287.0609831096867


In [39]:
best_lgb = gs_lgb.best_estimator_

y_pred_lgb = best_lgb.predict(X_test)

rmse_lgb = np.sqrt(mean_squared_error(y_test, y_pred_lgb))
r2_lgb = r2_score(y_test, y_pred_lgb)

print("RMSE:", rmse_lgb)
print("R2:", r2_lgb)

RMSE: 7924.24459478586
R2: 0.9474870570848563


In [42]:
from sklearn.model_selection import cross_val_score

cv_rmse_lgb = cross_val_score(
    best_lgb, X, y,
    cv=7,
    scoring="neg_root_mean_squared_error"
)

print("CV RMSE mean:", -cv_rmse_lgb.mean())

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 124
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 18
[LightGBM] [Info] Start training from score 113031.933649
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000116 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 125
[LightGBM] [Info] Number of data points in the train set: 211, number of used features: 17
[LightGBM] [Info] Start training from score 116672.488152
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000068 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 123
[LightGBM] [Info] Number of data points in the train set: 212, number of used features: 17
[LightGBM] [Info] Start tr

---

## Linear Regression

In [43]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

ridge_pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", Ridge())
])

param_grid_ridge = {
    "model__alpha": [0.1, 1.0, 10.0, 100.0]
}

gs_ridge = GridSearchCV(
    ridge_pipe,
    param_grid_ridge,
    cv=5,
    scoring="neg_root_mean_squared_error"
)

gs_ridge.fit(X_train, y_train)

print("Best params:", gs_ridge.best_params_)
print("Best CV RMSE:", -gs_ridge.best_score_)

Best params: {'model__alpha': 10.0}
Best CV RMSE: 10332.05867105262


In [44]:
best_ridge = gs_ridge.best_estimator_

y_pred_ridge = best_ridge.predict(X_test)

rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred_ridge))
r2_ridge = r2_score(y_test, y_pred_ridge)

print("RMSE:", rmse_ridge)
print("R2:", r2_ridge)

RMSE: 10887.62299763422
R2: 0.9008673395630158


In [45]:
cv_rmse_ridge = cross_val_score(
    best_ridge, X, y,
    cv=5,
    scoring="neg_root_mean_squared_error"
)

print("CV RMSE mean:", -cv_rmse_ridge.mean())

CV RMSE mean: 16371.226525701797
