In [None]:
import numpy as np
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression

from diamond import data

In [None]:
# Chain the project's data helpers one named stage at a time:
# load_raw -> clean -> get_X_y -> split.
diamonds_raw = data.load_raw('datasets/diamonds/diamonds.csv')
diamonds_clean = data.clean(diamonds_raw)
features_and_target = data.get_X_y(diamonds_clean)

# get_X_y's result is star-unpacked into split's positional arguments.
X_train, X_test, y_train, y_test = data.split(*features_and_target)

In [None]:
# Options shared by both column transformers: untouched columns are passed
# through, and output column names are kept without a transformer prefix.
passthrough_options = dict(
    remainder='passthrough',
    verbose_feature_names_out=False,
)

# Apply the project's dedicated encoders to 'cut', 'color' and 'clarity'.
sequential_encoder = make_column_transformer(
    (data.cut_grades_encoder, ['cut']),
    (data.color_encoder, ['color']),
    (data.clarity_encoder, ['clarity']),
    **passthrough_options,
)

# Apply the project's log transformer to the size-related columns.
# 'volume' is not encoded above, so it must be produced by an earlier
# pipeline step (presumably the volume extractor — confirm in `data`).
column_log_transformer = make_column_transformer(
    (data.log_transformer, ['x', 'y', 'z', 'volume', 'carat']),
    **passthrough_options,
)

# Linear model fitted on log(y); predictions are mapped back with exp.
# An earlier variant passed `transformer=data.log_transformer` instead of
# the explicit func / inverse_func pair.
log_target_regressor = TransformedTargetRegressor(
    regressor=LinearRegression(positive=True),
    func=np.log,
    inverse_func=np.exp,
)

pipeline_steps = [
    ('encoder', sequential_encoder),
    ('volume_extractor', data.VolumeFeatureExtractor()),
    ('eccentricity_extractor', data.EccentricityFeatureExtractor()),
    ('table_extractor', data.TableDistanceExtractor()),
    ('depth_extractor', data.DepthDistanceExtractor()),
    ('log_transformer', column_log_transformer),
    ('scaler', StandardScaler()),
    # Placeholder step: a concrete feature selector is substituted here
    # through the grid-search parameter 'selector'.
    ('selector', 'passthrough'),
    ('linear', log_target_regressor),
]
pipeline = Pipeline(pipeline_steps)

# Make the transformers emit DataFrames so that later steps can address
# columns by name.
pipeline.set_output(transform='pandas')

In [None]:
# Candidate column subsets; each one is turned into a feature selector and
# offered as an option for the pipeline's 'selector' step.
# NOTE(review): the last two subsets use the raw 'table' column next to
# 'depth_distance' — confirm whether 'table_distance' was intended.
candidate_feature_sets = (
    ('carat',),
    ('volume',),
    ('carat', 'cut', 'color', 'clarity'),
    ('carat', 'cut', 'color', 'clarity', 'eccentricity'),
    ('volume', 'cut', 'color', 'clarity'),
    ('volume', 'cut', 'color', 'clarity', 'eccentricity'),
    ('carat', 'table', 'depth_distance', 'eccentricity', 'color', 'clarity'),
    ('volume', 'table', 'depth_distance', 'eccentricity', 'color', 'clarity'),
)

parameter_grid = {
    'selector': tuple(
        data.make_feature_selector(*columns)
        for columns in candidate_feature_sets
    ),
    # Try the linear model both with and without the non-negative
    # coefficient constraint; this overrides the value set in the pipeline.
    'linear__regressor__positive': [True, False],
}

# Exhaustive search over the grid, scored by R^2; with refit=True the best
# combination is refitted on the data passed to fit().
search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameter_grid,
    scoring='r2',
    refit=True,
)
search.fit(X_train, y_train)

In [None]:
# Mean cross-validated score (R^2, per the `scoring` argument of the search)
# of the best parameter combination found by the grid search.
search.best_score_

In [None]:
# List every evaluated parameter combination next to its mean test score
# across the cross-validation folds.
cv_results = search.cv_results_
param_score_pairs = zip(cv_results['params'], cv_results['mean_test_score'])
for params, score in param_score_pairs:
    print(params, score)

In [None]:
# Show the winning parameter combination first.
print(search.best_params_)

# Then show the coefficients of the refitted linear model. They come from the
# inner LinearRegression, which is fitted on the log-transformed target.
# The fitted intercept is available as `regressor_.intercept_` if needed.
best_linear_step = search.best_estimator_.named_steps['linear']
print(best_linear_step.regressor_.coef_)