In [None]:
import numpy as np
from sklearn.compose import make_column_transformer, TransformedTargetRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

from diamond import data

In [None]:
# Load the raw diamonds CSV, clean it, and separate features from target.
# Every stage is delegated to the project's `data` helpers.
diamonds_raw = data.load_raw('datasets/diamonds/diamonds.csv')
diamonds_clean = data.clean(diamonds_raw)
features_and_target = data.get_X_y(diamonds_clean)

# Partition into train/test sets; the split policy lives in `data.split`.
X_train, X_test, y_train, y_test = data.split(*features_and_target)

In [None]:
# Encode the three categorical columns with the encoders supplied by the
# `data` module; every other column is passed through unchanged.
# `verbose_feature_names_out=False` keeps the original column names (no
# transformer-name prefix), so later steps can address columns by name.
sequential_encoder = make_column_transformer(
    (data.cut_grades_encoder, ['cut']),
    (data.color_encoder, ['color']),
    (data.clarity_encoder, ['clarity']),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Apply the project's log transformer to the size-related columns only.
# 'volume' is not a raw column name listed above; presumably it is produced
# by the 'volume_extractor' step that runs earlier in the pipeline -- confirm.
column_log_transformer = make_column_transformer(
    (data.log_transformer, ['x', 'y', 'z', 'volume', 'carat']),
    remainder='passthrough',
    verbose_feature_names_out=False
)

# Full modelling pipeline: encode -> derive features -> log-transform ->
# (optional selection) -> scale -> linear regression on a transformed target.
# `steps` is a list, as documented by scikit-learn's Pipeline, so the step
# sequence can also be edited in place if needed.
pipeline = Pipeline(
    steps=[
        ('encoder', sequential_encoder),
        ('volume_extractor', data.VolumeFeatureExtractor()),
        ('eccentricity_extractor', data.EccentricityFeatureExtractor()),
        ('table_extractor', data.TableDistanceExtractor()),
        ('depth_extractor', data.DepthDistanceExtractor()),
        ('log_transformer', column_log_transformer),
        # Placeholder step that does nothing; presumably kept so a feature
        # selector can be swapped in via `set_params` -- confirm.
        ('selector', 'passthrough'),
        ('scaler', StandardScaler()),
        # The target is passed through `data.log_transformer` before the
        # linear model is fitted; predictions are mapped back through the
        # transformer's inverse.
        ('linear', TransformedTargetRegressor(
            LinearRegression(), transformer=data.log_transformer)),
    ]
)
# Make every transformer step emit pandas DataFrames, so column names
# survive between steps (needed by the name-based column selection above).
pipeline.set_output(transform='pandas')

In [None]:
# Fit the whole pipeline, then report its score on the same training data.
# `fit` returns the pipeline itself, so the calls can be chained.
# NOTE: this is an in-sample score, not a held-out estimate.
pipeline.fit(X_train, y_train).score(X_train, y_train)

In [None]:
# Predictions of the fitted pipeline for the training features; the final
# regressor returns them on the original (inverse-transformed) target scale.
train_predictions = pipeline.predict(X_train)
train_predictions