In [1]:
from pathlib import Path

import joblib
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Load the 'diamonds' example dataset through seaborn and separate the
# regression target ('price') from the predictor columns.
df = sns.load_dataset('diamonds')
y = df['price']
X = df.drop(columns=['price'])

# Column groups: each list is routed to its own preprocessing branch below.
numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_cols = ['cut', 'color', 'clarity']

# Numeric branch: impute missing values with the column median, then rescale
# the features with MinMaxScaler.
numerical_pipeline = make_pipeline(SimpleImputer(strategy='median'), MinMaxScaler())

# Categorical branch: impute missing values with the most frequent category,
# then one-hot encode into a dense (non-sparse) array.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False),
)

# Apply each branch to its own group of columns.
preprocessing_branches = [
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
]
column_transformer = ColumnTransformer(transformers=preprocessing_branches)

# Full model: preprocessing followed by a random forest regressor. The fixed
# random_state makes repeated fits reproducible.
forest_regressor = RandomForestRegressor(random_state=42)
pipeline = make_pipeline(column_transformer, forest_regressor)

# Fit the whole pipeline (preprocessing + regressor) on the full dataset.
pipeline.fit(X, y)

# Report R^2 on the same data used for fitting. This is a training score, so
# it is optimistic and should not be read as performance on unseen data.
print('R2 en train:', pipeline.score(X, y))

# Persist the fitted pipeline. joblib.dump does not create missing
# directories, so create the target folder first; otherwise the save fails
# with FileNotFoundError after the (expensive) fit has already run.
model_path = '../models/pipeline_regresion.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)

# Predict the price of a single new diamond. The record uses the same column
# names as the training frame; the ColumnTransformer selects columns by name.
new_diamond = {
    'carat': 0.23,
    'cut': 'Ideal',
    'color': 'E',
    'clarity': 'SI2',
    'depth': 61.5,
    'table': 55,
    'x': 3.95,
    'y': 3.98,
    'z': 2.43,
}
X_new = pd.DataFrame([new_diamond])  # one-row frame, one column per key

y_pred = pipeline.predict(X_new)
print('Predicción para el nuevo dato:', y_pred)

R2 en train: 0.99743404541846
Predicción para el nuevo dato: [391.19]
