In [1]:
from pathlib import Path

import joblib
import pandas as pd
import seaborn as sns
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, LabelEncoder

# Load the 'diamonds' example dataset through seaborn.
df = sns.load_dataset('diamonds')

# Features: every column except the target 'cut'.
X = df.drop(columns='cut')

# Target: 'cut' labels mapped to integer codes. The fitted encoder is kept
# under its own name so the codes can be mapped back to labels later.
label_encoder = LabelEncoder().fit(df['cut'])
y = label_encoder.transform(df['cut'])

In [2]:
# Shorter alternative: build the preprocessing with make_column_transformer,
# selecting columns by dtype instead of listing them by name.
column_transformer = make_column_transformer(
    (
        # Numeric columns: impute missing values with the median, then min-max scale.
        make_pipeline(
            SimpleImputer(strategy='median'),
            MinMaxScaler()
        ),
        make_column_selector(dtype_include='number')
    ),
    (
        # Categorical columns: impute with the most frequent value, then one-hot encode.
        # handle_unknown='ignore' encodes a category unseen during fit as all zeros
        # instead of raising, so the persisted pipeline does not fail at predict time.
        make_pipeline(
            SimpleImputer(strategy='most_frequent'),
            OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        ),
        make_column_selector(dtype_include=['object', 'category'])  # automatically detects categorical columns
    )
)

pipeline = make_pipeline(column_transformer, RandomForestClassifier(random_state=42))
pipeline.fit(X, y)

# For a classifier, score() returns mean accuracy (not R2). It is computed here on
# the same data used for fitting, so it is an optimistic figure rather than an
# estimate of performance on unseen data.
print('Accuracy en train', pipeline.score(X, y))

# NOTE(review): the pipeline predicts the integer codes produced by label_encoder;
# only the pipeline is saved in this cell, so decoding predictions needs that encoder too.
model_path = '../models/pipeline_clasification.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)  # joblib.dump does not create missing directories
joblib.dump(pipeline, model_path)

R2 en train 0.999888765294772


['../models/pipeline_clasification.joblib']