In [1]:
from pathlib import Path

import joblib
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

# Load the 'diamonds' example dataset through seaborn.
df = sns.load_dataset('diamonds')

# 'price' is the regression target; every remaining column is a feature.
y = df['price']
X = df.drop(columns='price')

# Print column names, dtypes and non-null counts as a quick sanity check.
df.info()

# Feature groups, split by the kind of preprocessing each one receives.
numerical_cols = ['carat', 'depth', 'table', 'x', 'y', 'z']
categorical_cols = ['cut', 'color', 'clarity']

# Categorical branch: fill missing values with the most frequent level,
# then one-hot encode into a dense (non-sparse) array.
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False),
)

# Numerical branch: fill missing values with the median, then min-max scale.
numerical_pipeline = make_pipeline(SimpleImputer(strategy='median'), MinMaxScaler())

# Route each column group to its branch. The numerical branch is listed
# first, so its outputs come first in the transformed feature matrix.
column_transformer = make_column_transformer(
    (numerical_pipeline, numerical_cols),
    (categorical_pipeline, categorical_cols),
)

# Preprocessing and model chained into a single estimator; random_state
# pins the forest's seed.
pipeline = make_pipeline(column_transformer, RandomForestRegressor(random_state=42))

# Fit on the full dataset (no hold-out split is made in this cell).
pipeline.fit(X, y)
# R^2 measured on the same rows the pipeline was fitted on: an optimistic
# sanity check, not an estimate of out-of-sample performance.
print('R2 en train', pipeline.score(X, y))

# Persist the fitted pipeline (preprocessing + model) as a single artifact.
# The target folder is created first so that a fresh checkout without a
# '../models' directory does not fail with FileNotFoundError at dump time.
model_path = '../models/pipeline_regresion.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipeline, model_path)

# A single unseen diamond, described with the same feature columns the
# pipeline was fitted on.
new_diamond = {
    'carat': 0.23,
    'cut': 'Ideal',
    'color': 'E',
    'clarity': 'SI2',
    'depth': 61.5,
    'table': 55,
    'x': 3.95,
    'y': 3.98,
    'z': 2.43,
}

# One record -> one-row DataFrame; column order follows the dict's key order.
X_new = pd.DataFrame([new_diamond])

# Run the full pipeline (preprocessing + forest) on the new row.
y_pred = pipeline.predict(X_new)

# Last expression of the cell, so the predicted price array is displayed.
y_pred

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB
R2 en train 0.99743404541846


array([391.19])