In [None]:
import numpy as np
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import (
    FunctionTransformer,
    KBinsDiscretizer,
    OneHotEncoder,
    PolynomialFeatures,
    QuantileTransformer,
    StandardScaler,
)

# Make every transformer return a DataFrame: the second ColumnTransformer
# below selects its inputs by the prefixed column names ("<step>__<column>")
# that the first ColumnTransformer produces.
set_config(transform_output='pandas')

# Column groups, declared once so the two ColumnTransformers cannot drift apart.
GEO_COLUMNS = ['longitude', 'latitude']
QUANTILE_COLUMNS = ['population', 'longitude', 'latitude', 'total_rooms', 'total_bedrooms']
LOG_COLUMNS = ['total_rooms', 'total_bedrooms']
# Columns that reach stage 2 through remainder='passthrough' of stage 1.
# NOTE(review): assumes these columns exist in X and are not consumed by any
# stage-1 transformer -- confirm against X_train.columns.
PASSTHROUGH_COLUMNS = ['housing_median_age', 'households', 'median_income']

# Preprocessing pipeline: feature engineering -> scaling -> imputation ->
# polynomial expansion. Step names are kept as-is because they are part of the
# generated column names and of any `set_params` / `named_steps` access.
data_pipeline = Pipeline([
    # Stage 1: per-column feature engineering. Several transformers read the
    # same input columns; their outputs are told apart by the step-name prefix.
    ('normalisation', ColumnTransformer([
        # Coordinates bucketed into 10 equal-width ordinal bins.
        ('discretion', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform'), GEO_COLUMNS),
        # Categories are taken from the module-level `df`, so the one-hot
        # columns are fixed independently of the split being fitted.
        # NOTE(review): a NaN in df['ocean_proximity'] would become a category -- confirm.
        ('categorical_columns', OneHotEncoder(sparse_output=False, categories=[df['ocean_proximity'].unique().tolist()]), ['ocean_proximity']),
        # Default copy=True (the original passed copy=False): this transformer
        # must never write into data that the 'log' branch or the caller reads.
        ('normalisation', QuantileTransformer(output_distribution='normal'), QUANTILE_COLUMNS),
        # 'one-to-one' lets get_feature_names_out() work on the whole pipeline.
        ('log', FunctionTransformer(np.log1p, feature_names_out='one-to-one'), LOG_COLUMNS),
    ], remainder='passthrough')),
    # Stage 2: standardise the continuous stage-1 outputs; everything else
    # (bin indices, one-hot columns) passes through unchanged.
    ('encoder', ColumnTransformer([
        ('numerical_columns', StandardScaler(),
            [f"normalisation__{name}" for name in QUANTILE_COLUMNS] +
            [f"log__{name}" for name in LOG_COLUMNS] +
            [f"remainder__{name}" for name in PASSTHROUGH_COLUMNS]
        )
    ], remainder='passthrough')),
    # Fill missing values from the 10 nearest neighbours; add_indicator=True
    # appends missing-value indicator columns.
    ('imputer', KNNImputer(add_indicator=True, n_neighbors=10)),
    # Degree-2 expansion (default settings also emit a constant bias column).
    ('poly', PolynomialFeatures(degree=2)),
])

# Full estimator: the preprocessing pipeline followed by the regressor.
# Alternative predictor kept for experimentation:
#     ('predictor', RandomForestRegressor(n_estimators=100, max_depth=20))
model = Pipeline(steps=[
    ('data_pipeline', data_pipeline),
    ('predictor', LinearRegression()),
])

# Train on the training split, then report the estimator's default score on
# the validation split.
model.fit(X_train, y_train)
print(f"Score: {model.score(X_valid, y_valid)}")

# Preview the engineered features for the test split. `data_pipeline` is the
# same object used as the first step of `model`, so it is used here right
# after `model.fit` without a separate fit call.
data_pipeline.transform(X_test)
# Optional visual check of the transformed feature distributions:
# data_pipeline.transform(X_test).hist(bins=30, figsize=(20,10), edgecolor='black');