# Imports

In [1]:
import polars as pl
import matplotlib.pyplot as plt
import xgboost as xgb
import lightgbm as lgb

import sklearn
from sklearn.model_selection import KFold, cross_val_score, RandomizedSearchCV
from sklearn.metrics import root_mean_squared_error
from sklearn.ensemble import RandomForestRegressor

from sklearn.experimental import enable_iterative_imputer
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    OrdinalEncoder,
    OneHotEncoder,
    StandardScaler,
    FunctionTransformer,
)
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.impute import IterativeImputer, SimpleImputer

import scipy.stats as scpt

import numpy as np


In [2]:
np.random.seed(42)

# Reading data

In [3]:
train_data_base = pl.read_csv("data/train.csv")
test_data_base = pl.read_csv("data/test.csv")


In [4]:
def transform_datetime_to_int(df: pl.DataFrame):
    return df.with_columns(
        pl.col("PolicyStart").str.to_datetime().dt.date().cast(pl.Int64),
    ).with_columns(
        pl.col(pl.String).str.to_lowercase(),
    )


# Model 2

Previous modeling processes were deleted to not clutter the notebook. (and there wasn't anything smart).

Iterative linear regression imputation, outlier cleaning, basic depth estimation + n_estimators estimation

## Prerpocessing

In [5]:
train_data = transform_datetime_to_int(train_data_base)
test_data = transform_datetime_to_int(test_data_base).with_columns(
    pl.lit(-1.0).alias("Premium")
)

I unify all the data for iterative imputation, so the linear regression estimator it uses has more "context" for imputation. I know that it may lead to data leakage, but it yieled in higher score :=) so I decided to leave it here.

In [6]:
whole_data = pl.concat([train_data, test_data])

In [10]:
target_feature = "Premium"

categorical_features = [
    "PlanType",
    "PropertyType",
    "MaritalStatus",
    "JobRole",
    "Feedback",
    "ResidenceType",
    "Smoking",
    "Sex",
    "ExerciseFreq",
    "EducationLevel",
]

numeric_features = [
    col_name
    for col_name in whole_data.columns
    if col_name not in categorical_features
    and col_name != target_feature
    and col_name != "RowId"
]

In [11]:
def one_hot_encoding(data: pl.DataFrame, features_to_encode: list[str]) -> pl.DataFrame:
    return data.to_dummies(features_to_encode, drop_first=True)

In [12]:
whole_data_encoded = one_hot_encoding(whole_data, categorical_features)

In [13]:
encoded_categorical = [
    col_name
    for col_name in whole_data_encoded.columns
    if col_name not in numeric_features
    and col_name != target_feature
    and col_name != "RowId"
]

In [15]:
old_col_names = whole_data_encoded.drop("Premium", "RowId").columns

In [16]:
imp = IterativeImputer(random_state=42)
imp.fit(whole_data_encoded.drop("RowId", "Premium"))



0,1,2
,estimator,
,missing_values,
,sample_posterior,False
,max_iter,10
,tol,0.001
,n_nearest_features,
,initial_strategy,'mean'
,fill_value,
,imputation_order,'ascending'
,skip_complete,False


In [17]:
whole_data_imputed_raw = pl.from_numpy(
    imp.transform(whole_data_encoded.drop("Premium", "RowId"))
)

In [18]:
new_col_names = whole_data_imputed_raw.columns

name_mapping = {
    new_name: old_name for old_name, new_name in zip(old_col_names, new_col_names)
}
whole_data_imputed = whole_data_imputed_raw.rename(name_mapping)

In [20]:
whole_data_imputed = pl.concat(
    [
        whole_data.select(pl.col("RowId")),
        whole_data_imputed,
        whole_data.select(pl.col("Premium")),
    ],
    how="horizontal",
)

Split the imputed data.

In [22]:
train_data_imputed = whole_data_imputed.filter(
    pl.col("RowId").le(train_data_base["RowId"].max())
)
test_data_imputed = whole_data_imputed.filter(
    pl.col("RowId").ge(test_data_base["RowId"].min())
)

Clean outliers, just throw out 1% of data that is located on "borders".

In [23]:
train_data_imputed = train_data_imputed.filter(
    (
        pl.col("CreditScore").quantile(0.005).le(pl.col("CreditScore"))
        & pl.col("CreditScore").le(pl.col("CreditScore").quantile(0.995))
    )
    & pl.col("RiskRating").le(pl.col("RiskRating").quantile(0.99))
    & pl.col("EarningsBracket").le(pl.col("EarningsBracket").quantile(0.99))
    & (
        pl.col("HealthIndex").quantile(0.005).le(pl.col("HealthIndex"))
        & pl.col("HealthIndex").le(pl.col("HealthIndex").quantile(0.995))
    )
    & (
        pl.col("VehicleAge").quantile(0.005).le(pl.col("VehicleAge"))
        & pl.col("VehicleAge").le(pl.col("VehicleAge").quantile(0.995))
    )
    & (
        pl.col("PolicyTenure").quantile(0.005).le(pl.col("PolicyTenure"))
        & pl.col("PolicyTenure").le(pl.col("PolicyTenure").quantile(0.995))
    )
    & pl.col("VitalityMetric").le(pl.col("VitalityMetric").quantile(0.99))
    & pl.col("AssetMaturity").le(pl.col("AssetMaturity").quantile(0.99))
    & (
        pl.col("EngagementLevel").quantile(0.005).le(pl.col("EngagementLevel"))
        & pl.col("EngagementLevel").le(pl.col("EngagementLevel").quantile(0.995))
    )
    & pl.col("AnnualIncome").le(pl.col("AnnualIncome").quantile(0.99))
)

In [24]:
train_statistics = train_data_imputed.select(
    *(
        pl.col(numeric_col_name).mean().name.suffix("Mean")
        for numeric_col_name in numeric_features
    ),
    *(
        pl.col(numeric_col_name).std().name.suffix("Std")
        for numeric_col_name in numeric_features
    ),
)

Normalize data. Folks in internet say it makes performance better.

In [25]:
train_data_imputed_normalized = train_data_imputed.with_columns(
    *(
        (pl.col(numeric_col_name) - train_statistics[f"{numeric_col_name}Mean"])
        / train_statistics[f"{numeric_col_name}Std"]
        for numeric_col_name in numeric_features
    ),
)

In [26]:
train_data_imputed_normalized.describe()

statistic,RowId,PlanType_basic,PlanType_comprehensive,PlanType_null,CreditScore,RiskRating,PolicyStart,PropertyType_apartment,PropertyType_condo,PropertyType_null,MaritalStatus_married,MaritalStatus_null,MaritalStatus_single,JobRole_employed,JobRole_self-employed,JobRole_unemployed,Feedback_average,Feedback_good,Feedback_null,ResidenceType_null,ResidenceType_rural,ResidenceType_suburban,EarningsBracket,HealthIndex,VehicleAge,Smoking_no,Smoking_null,CustomerAge,PrevClaims,Sex_male,Sex_null,PolicyTenure,VitalityMetric,AssetMaturity,EngagementLevel,BaselineIndex,Dependents,ExerciseFreq_monthly,ExerciseFreq_null,ExerciseFreq_rarely,ExerciseFreq_weekly,EducationLevel_bachelor's,EducationLevel_high school,EducationLevel_null,EducationLevel_phd,AnnualIncome,Premium
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0,683986.0
"""null_count""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",359829.015246,0.325836,0.32665,0.019857,1.2358e-16,-1.6413e-17,1.0614e-15,0.326745,0.326249,0.0199,0.321087,0.035166,0.322739,0.230629,0.230265,0.226271,0.308476,0.301247,0.08325,0.020035,0.327621,0.327622,1.6721e-16,-1.611e-16,-1.6235e-16,0.488754,0.020158,-5.726e-16,4.307e-17,0.492022,0.020095,-4.334e-17,4.3743e-16,-2.4117e-16,-1.4793e-17,-2.226e-16,1.1824e-16,0.244669,0.01997,0.244713,0.250591,0.247471,0.236514,0.019916,0.247527,-3.451e-17,1101.351207
"""std""",207653.779705,0.468687,0.468988,0.139509,1.0,1.0,1.0,0.469023,0.46884,0.139655,0.466894,0.184199,0.467524,0.421236,0.421003,0.418417,0.461864,0.4588,0.27626,0.140122,0.469346,0.469347,1.0,1.0,1.0,0.499874,0.140542,1.0,1.0,0.499937,0.140327,1.0,1.0,1.0,1.0,1.0,1.0,0.429891,0.139896,0.429917,0.433354,0.431543,0.424941,0.13971,0.431576,1.0,863.537222
"""min""",0.0,0.0,0.0,0.0,-2.092114,-2.182072,-1.753676,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.064488,-1.880257,-1.659677,0.0,0.0,-1.749171,-1.237746,0.0,0.0,-1.559807,-2.080797,-2.191798,-2.718345,-1.758369,-1.509266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.092239,20.0
"""25%""",180241.0,0.0,0.0,0.0,-0.775102,-0.774979,-0.84832,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.785979,-0.770394,-0.787851,0.0,0.0,-0.842916,-0.162013,0.0,0.0,-0.778227,-0.773438,-0.844842,-0.681449,-0.852168,-0.75789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.789502,514.0
"""50%""",359681.0,0.0,0.0,0.0,0.035246,0.034364,-0.00188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.230694,-0.002776,0.083974,0.0,0.0,-0.003064,-0.000448,0.0,0.0,0.003353,-0.023827,0.014548,-0.001463,-0.000351,-0.006515,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.229065,871.0
"""75%""",539546.0,1.0,1.0,0.0,0.793797,0.795896,0.822957,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.397588,0.696031,0.82919,1.0,0.0,0.818553,0.054833,1.0,0.0,0.784932,0.689416,0.8496,0.680152,0.852391,0.744861,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.406441,1506.0
"""max""",720000.0,1.0,1.0,1.0,1.830135,1.814866,1.830436,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.643941,2.291885,1.653259,1.0,1.0,1.724808,8.66064,1.0,1.0,1.566512,2.297881,1.903979,2.703969,1.759255,1.496237,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3.672061,4997.0


In [27]:
X = train_data_imputed_normalized.drop("RowId", "Premium")
y = train_data_imputed_normalized.select(pl.col("Premium"))

## Modeling

### Boosted trees
I tried different parameters for models, here I left only the best ones.

In [41]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)


params = {
    "max_depth": 10,
    "n_estimators": 20,
    "learning_rate": 0.1,
    "random_state": 42,
    "n_jobs": 10,
}

In [42]:
rmse = cross_val_score(
    xgb.XGBRegressor(**params),
    X=X,
    y=y,
    scoring="neg_root_mean_squared_error",
    cv=cv,
)

print("-" * 50)
print(-rmse.mean())
print("-" * 50)


--------------------------------------------------
840.9740112304687
--------------------------------------------------


### Random forest

In [44]:
random_forest = RandomForestRegressor(
    n_estimators=20,
    random_state=42,
    n_jobs=10,
    max_depth=10,
)

In [45]:
rmse = cross_val_score(
    random_forest,
    X=X.to_pandas(),
    y=y.to_numpy().ravel(),
    scoring="neg_root_mean_squared_error",
    cv=cv,
)

print("-" * 50)
print(-rmse.mean())
print("-" * 50)


--------------------------------------------------
840.2420093721688
--------------------------------------------------


---
PCA for this data

I tried to throw out some features, because I found that there are 100% correlated features in this data. However, folks in the internet say that Ensemble tree methods manage correlations just fine.

In [48]:
pca = PCA(n_components=40)
pca.fit(X)

0,1,2
,n_components,40
,copy,True
,whiten,False
,svd_solver,'auto'
,tol,0.0
,iterated_power,'auto'
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,


In [49]:
X_pca = pca.transform(X)

In [50]:
rmse = cross_val_score(
    random_forest,
    X=X_pca,
    y=y.to_numpy().ravel(),
    scoring="neg_root_mean_squared_error",
    cv=cv,
)

print("-" * 50)
print(-rmse.mean())
print("-" * 50)


--------------------------------------------------
854.8487250926219
--------------------------------------------------


Indeed it does not yield good results.

---

# Model 3

Better imputation, better categorical data encoding. Trying other models.
This this the end result that yielded the best result on the kaggle leaderboard for me. I also decided to use sklearn API more, for convenience of life (pipelines, column transformers, etc.).

In [51]:
train_data = transform_datetime_to_int(train_data_base)
test_data = transform_datetime_to_int(test_data_base).with_columns(
    pl.lit(-1.0).alias("Premium")
)

Here I concatenate whole data just to get all spectre of missing values for categorical features.

In [52]:
whole_data = pl.concat([train_data, test_data], how="vertical")

In [53]:
target_feature = "Premium"

categorical_features = [
    "PlanType",
    "PropertyType",
    "MaritalStatus",
    "JobRole",
    "Feedback",
    "ResidenceType",
    "Smoking",
    "Sex",
    "ExerciseFreq",
    "EducationLevel",
]

numeric_features = [
    col_name
    for col_name in whole_data.columns
    if col_name not in categorical_features
    and col_name != target_feature
    and col_name != "RowId"
]

For better imputation of categorical features I should investigate whether there are ordinal features. 

In [54]:
for cat_feature in categorical_features:
    print(whole_data[cat_feature].unique())

shape: (4,)
Series: 'PlanType' [str]
[
	null
	"comprehensive"
	"basic"
	"premium"
]
shape: (4,)
Series: 'PropertyType' [str]
[
	"house"
	"apartment"
	null
	"condo"
]
shape: (4,)
Series: 'MaritalStatus' [str]
[
	"married"
	"divorced"
	"single"
	null
]
shape: (4,)
Series: 'JobRole' [str]
[
	"self-employed"
	"employed"
	"unemployed"
	null
]
shape: (4,)
Series: 'Feedback' [str]
[
	"average"
	"poor"
	"good"
	null
]
shape: (4,)
Series: 'ResidenceType' [str]
[
	"urban"
	"suburban"
	null
	"rural"
]
shape: (3,)
Series: 'Smoking' [str]
[
	null
	"yes"
	"no"
]
shape: (3,)
Series: 'Sex' [str]
[
	"female"
	null
	"male"
]
shape: (5,)
Series: 'ExerciseFreq' [str]
[
	null
	"daily"
	"rarely"
	"monthly"
	"weekly"
]
shape: (5,)
Series: 'EducationLevel' [str]
[
	null
	"master's"
	"phd"
	"bachelor's"
	"high school"
]


I note the following features to be ordinal: `PlanType`, `Feedback`, `ExerciseFreq`, `EducationLevel`.

In [55]:
ordinal_features = {
    "PlanType": ["Unknown", "basic", "comprehensive", "premium"],
    "Feedback": ["Unknown", "poor", "average", "good"],
    "ExerciseFreq": ["Unknown", "rarely", "monthly", "weekly", "daily"],
    "EducationLevel": ["Unknown", "high school", "bachelor's", "master's", "phd"],
}

There are also binary features: `Smoking`, `Sex`. It could be a good idea to encode them without one-hot but with labels, as for those features order does matter. But if the have missing values it will be nonsense.

In [57]:
whole_data.select(pl.col("Smoking"), pl.col("Sex")).null_count()

Smoking,Sex
u32,u32
21622,22116


They do have null's thus I will just use one hot encoding for them.

In [58]:
nominal_features = [
    col_name
    for col_name in categorical_features
    if col_name not in ordinal_features.keys()
]

In [59]:
ordinal_pipe = Pipeline(
    steps=[
        (
            "categorical_imputer",
            SimpleImputer(strategy="constant", fill_value="Unknown"),
        ),
        (
            "ordinal_encoder",
            OrdinalEncoder(
                categories=[categories for categories in ordinal_features.values()],
                handle_unknown="use_encoded_value",
                unknown_value=-1,
            ),
        ),
    ]
)

In [60]:
nominal_pipe = Pipeline(
    steps=[
        (
            "categorical_imputer",
            SimpleImputer(strategy="constant", fill_value="Unknown"),
        ),
        ("onehot_encoder", OneHotEncoder(handle_unknown="warn", drop="first")),
    ]
)

For some reason simple imputation works better than iterative one. I tried median and mean imputations strategy. Mean turned out to work the best.

In [61]:
skewed_numerical_features = ["EarningsBracket", "AnnualIncome"]

unskewed_numerical_features = [
    feature for feature in numeric_features if feature not in skewed_numerical_features
]

skewed_numeric_pipe = Pipeline(
    steps=[
        ("log1p", FunctionTransformer(lambda x: np.log1p(x))),
        ("simple_imputer", SimpleImputer(strategy="mean")),
        ("feature_normalizer", StandardScaler()),
    ]
)

unskewed_numeric_pipe = Pipeline(
    steps=[
        ("simple_imputer", SimpleImputer(strategy="mean")),
        ("feature_normalizer", StandardScaler()),
    ]
)

In [62]:
preprocessor = ColumnTransformer(
    transformers=[
        ("ordinal_categorical", ordinal_pipe, list(ordinal_features.keys())),
        ("nominal_categorical", nominal_pipe, nominal_features),
        ("skewed_numerical", skewed_numeric_pipe, skewed_numerical_features),
        ("unskewed_numerical", unskewed_numeric_pipe, unskewed_numerical_features),
    ],
    remainder="passthrough",
)

In [63]:
rf = RandomForestRegressor(
    n_estimators=20,
    random_state=42,
    n_jobs=-1,
    max_depth=10,
)

I also tried log transforming target, and then transforming it back, but it works worse.

In [64]:
pipe = regressor = make_pipeline(preprocessor, rf)

In [65]:
X_train, y_train = (
    train_data.drop("RowId", "Premium").to_pandas(),
    train_data.select(pl.col("Premium")).to_pandas(),
)

In [66]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    pipe, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
scores, scores.mean(), scores.std()


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


(array([-845.69544863, -836.03744859, -840.88420662, -840.1897478 ,
        -842.74302566]),
 np.float64(-841.109975460298),
 np.float64(3.172435546950765))

In [67]:
pipe.fit(X_train, y_train)

  return fit_method(estimator, *args, **kwargs)


0,1,2
,steps,"[('columntransformer', ...), ('randomforestregressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ordinal_categorical', ...), ('nominal_categorical', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Unknown', 'basic', ...], ['Unknown', 'poor', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'warn'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function <la...t 0x34a3d32e0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,20
,criterion,'squared_error'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [68]:
X_test = test_data.drop("RowId", "Premium")

In [69]:
def get_predicted_dataframe(test_data: pl.DataFrame, model) -> pl.DataFrame:
    predictions = model.predict(test_data)

    return pl.concat(
        [
            test_data_base.select(pl.col("RowId")),
            pl.from_numpy(predictions).select(pl.col("column_0").alias("Target")),
        ],
        how="horizontal",
    )


In [70]:
preds = get_predicted_dataframe(test_data, pipe)



In [71]:
preds.write_csv("data/sumbission_pipeline.csv")

---
Previous model worked unreasonably well (considering that I just guessed parameters). Now I want to try out other ensemble methods, together with grid search for parameters.

In [None]:
models_grid = {
    "random_forest": {
        "model": Pipeline(
            [
                ("preprocessoing", preprocessor),
                ("model", RandomForestRegressor(random_state=42, n_jobs=-1, verbose=3)),
            ]
        ),
        "parameters": {
            "model__n_estimators": scpt.randint(10, 100),
            "model__max_depth": scpt.randint(5, 30),
            "model__min_samples_split": scpt.randint(2, 15),
            "model__min_samples_leaf": scpt.randint(1, 15),
        },
    },
    "xgboost": {
        "model": Pipeline(
            [
                ("preprocessoing", preprocessor),
                ("model", xgb.XGBRegressor(random_state=42, n_jobs=-1, verbosity=3)),
            ]
        ),
        "parameters": {
            "model__n_estimators": scpt.randint(10, 150),
            "model__max_depth": scpt.randint(10, 50),
            "model__learning_rate": scpt.uniform(0.04, 0.1),
            "model__subsample": scpt.uniform(0.4, 0.9),
            "model__colsample_bytree": scpt.uniform(0.2, 0.8),
        },
    },
    "lgbm_gdbt": {
        "model": Pipeline(
            [
                ("preprocessoing", preprocessor),
                (
                    "model",
                    lgb.LGBMRegressor(
                        boosting_type="gbdt",
                        random_state=42,
                        metric="rmse",
                        verbose=3,
                    ),
                ),
            ]
        ),
        "parameters": {
            "model__n_estimators": scpt.randint(10, 150),
            "model__max_depth": scpt.randint(5, 40),
            "model__learning_rate": scpt.uniform(0.03, 0.1),
            "model__subsample": scpt.uniform(0.4, 0.9),
            "model__colsample_bytree": scpt.uniform(0.2, 0.8),
            "model__num_leaves": scpt.randint(20, 50),
            "model__min_child_samples": scpt.randint(10, 30),
        },
    },
    "lgbm_goss": {
        "model": Pipeline(
            [
                ("preprocessoing", preprocessor),
                (
                    "model",
                    lgb.LGBMRegressor(
                        boosting_type="goss",
                        random_state=42,
                        metric="rmse",
                        verbose=3,
                    ),
                ),
            ]
        ),
        "parameters": {
            "model__n_estimators": scpt.randint(10, 150),
            "model__max_depth": scpt.randint(5, 40),
            "model__learning_rate": scpt.uniform(0.03, 0.1),
            "model__subsample": scpt.uniform(0.4, 0.9),
            "model__colsample_bytree": scpt.uniform(0.2, 0.8),
            "model__num_leaves": scpt.randint(20, 50),
            "model__min_child_samples": scpt.randint(10, 30),
        },
    },
}

This is very promising grid search, that I did not had time to run, unfortunately. I started running it but comptetition ended quicker.

In [None]:
for model_type, model_params in models_grid.items():
    print("-" * 10, model_type, "-" * 10)
    model = model_params["model"]
    parameters = model_params["parameters"]
    random_grid = RandomizedSearchCV(
        model,
        parameters,
        n_iter=30,
        cv=5,
        scoring="neg_root_mean_squared_error",
        n_jobs=-1,
        verbose=3,
        random_state=42,
    )

    random_grid.fit(X_train, y_train.to_numpy().ravel())

    print(random_grid.best_params_)
    print(random_grid.best_score_)
    print(random_grid.best_estimator_)

Below you can see me trying boosted trees instead of random forest. I tried them in hope they will have less overfit for the data.

---
### XGBoost

In [73]:
xgb_iso = xgb.XGBRegressor(
    n_estimators=20,
    max_depth=10,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
    tree_method="hist",
)

In [74]:
pipe_xgb = make_pipeline(preprocessor, xgb_iso)

In [75]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    pipe_xgb, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
scores, scores.mean(), scores.std()


(array([-848.98303223, -839.23565674, -844.14044189, -843.07244873,
        -845.62701416]),
 np.float64(-844.21171875),
 np.float64(3.18850131258041))

---
### LightGBM

In [76]:
lgb_gbdt_iso = lgb.LGBMRegressor(
    boosting_type="gbdt",
    n_estimators=20,
    max_depth=10,
    num_leaves=2**10,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
)

pipe_lgb = make_pipeline(preprocessor, lgb_gbdt_iso)

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    pipe_lgb, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
scores, scores.mean(), scores.std()


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2700
[LightGBM] [Info] Number of data points in the train set: 576000, number of used features: 35
[LightGBM] [Info] Start training from score 1101.456642


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003466 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2701
[LightGBM] [Info] Number of data points in the train set: 576001, number of used features: 35
[LightGBM] [Info] Start training from score 1102.729745


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003438 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2700
[LightGBM] [Info] Number of data points in the train set: 576001, number of used features: 35
[LightGBM] [Info] Start training from score 1102.189960


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003301 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2703
[LightGBM] [Info] Number of data points in the train set: 576001, number of used features: 35
[LightGBM] [Info] Start training from score 1103.303097


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003986 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2702
[LightGBM] [Info] Number of data points in the train set: 576001, number of used features: 35
[LightGBM] [Info] Start training from score 1102.800877




(array([-849.01801461, -839.34696799, -844.055335  , -843.09318498,
        -845.61012242]),
 np.float64(-844.2247249994498),
 np.float64(3.162168259127136))

In [57]:
pipe_lgb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2700
[LightGBM] [Info] Number of data points in the train set: 720001, number of used features: 35
[LightGBM] [Info] Start training from score 1102.496065


0,1,2
,steps,"[('columntransformer', ...), ('lgbmregressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('ordinal_categorical', ...), ('nominal_categorical', ...), ...]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,"[['Unknown', 'basic', ...], ['Unknown', 'poor', ...], ...]"
,dtype,<class 'numpy.float64'>
,handle_unknown,'use_encoded_value'
,unknown_value,-1
,encoded_missing_value,
,min_frequency,
,max_categories,

0,1,2
,missing_values,
,strategy,'constant'
,fill_value,'Unknown'
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'warn'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,func,<function <la...t 0x131b142c0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'mean'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,boosting_type,'gbdt'
,num_leaves,1024
,max_depth,10
,learning_rate,0.05
,n_estimators,20
,subsample_for_bin,200000
,objective,
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


---
### LightGBM goss

In [77]:
lgb_gbdt_iso = lgb.LGBMRegressor(
    boosting_type="goss",
    n_estimators=20,
    max_depth=10,
    num_leaves=2**10,
    learning_rate=0.05,
    random_state=42,
    n_jobs=-1,
)

pipe_lgb_goss = make_pipeline(preprocessor, lgb_gbdt_iso)

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(
    pipe_lgb_goss, X_train, y_train, cv=cv, scoring="neg_root_mean_squared_error"
)
scores, scores.mean(), scores.std()


  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003462 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2700
[LightGBM] [Info] Number of data points in the train set: 576000, number of used features: 35
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1101.456642






  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003013 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2701
[LightGBM] [Info] Number of data points in the train set: 576001, number of used features: 35
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1102.729745






  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003208 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2700
[LightGBM] [Info] Number of data points in the train set: 576001, number of used features: 35
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1102.189960






  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2703
[LightGBM] [Info] Number of data points in the train set: 576001, number of used features: 35
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1103.303097






  y = column_or_1d(y, warn=True)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002839 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2702
[LightGBM] [Info] Number of data points in the train set: 576001, number of used features: 35
[LightGBM] [Info] Using GOSS
[LightGBM] [Info] Start training from score 1102.800877




(array([-849.01801461, -839.34696799, -844.055335  , -843.09318498,
        -845.61012242]),
 np.float64(-844.2247249994498),
 np.float64(3.162168259127136))