In [10]:
import pathlib
import pickle
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

DATA_DIR = pathlib.Path.cwd().parent / 'data'
processed_file_path = DATA_DIR / 'processed' / 'ames_with_correct_types.pkl'

# Carregar dados
with open(processed_file_path, 'rb') as file:
    (
        data,
        continuous_variables,
        discrete_variables,
        ordinal_variables,
        categorical_variables,
    ) = pickle.load(file)

# Funções de transformação de dados
def filter_columns(data):
    """Filtra colunas não desejadas e manipula valores específicos."""
    data = data[~data['MS.Zoning'].isin(['A (agr)', 'C (all)', 'I (all)'])]
    data['MS.Zoning'] = data['MS.Zoning'].cat.remove_unused_categories()
    
    return data

def remap_categories(data):
    """Redefine categorias em colunas específicas."""
    data['Sale.Type'] = data['Sale.Type'].cat.add_categories(['GroupedWD', 'Other'])
    data['Sale.Type'] = data['Sale.Type'].replace(['WD ', 'CWD', 'VWD'], 'GroupedWD')
    data['Sale.Type'] = data['Sale.Type'].replace(['COD', 'ConLI', 'Con', 'ConLD', 'Oth', 'ConLw'], 'Other')
    # Repita para outras colunas categóricas conforme necessário
    return data

def add_features(data):
    """Adiciona colunas calculadas para as idades da casa e da garagem."""
    data['Garage.Age'] = data['Yr.Sold'] - data['Garage.Yr.Blt'].fillna(data['Garage.Yr.Blt'].median())
    data['Remod.Age'] = data['Yr.Sold'] - data['Year.Remod.Add']
    data['House.Age'] = data['Yr.Sold'] - data['Year.Built']
    data['Lot.Frontage'] = data['Lot.Frontage'].fillna(data['Lot.Frontage'].median())
    return data

# Configuração da ColumnTransformer para dados categóricos e numéricos
categorical_cols = categorical_variables + ordinal_variables
numerical_cols = continuous_variables + discrete_variables

preprocessor = ColumnTransformer([
    ('categoricals', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('numericals', StandardScaler(), numerical_cols),
])

# Definindo a Pipeline
pipeline = Pipeline([
    ('filter_columns', FunctionTransformer(filter_columns, validate=False)),
    ('remap_categories', FunctionTransformer(remap_categories, validate=False)),
    ('add_features', FunctionTransformer(add_features, validate=False)),
    ('preprocessing', preprocessor)
])

# Executando a Pipeline nos dados
data_transformed = pipeline.fit_transform(data)
print(data_transformed)


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 232080 stored elements and shape (2901, 341)>
  Coords	Values
  (0, 0)	1.0
  (0, 18)	1.0
  (0, 21)	1.0
  (0, 24)	1.0
  (0, 28)	1.0
  (0, 29)	1.0
  (0, 49)	1.0
  (0, 64)	1.0
  (0, 73)	1.0
  (0, 79)	1.0
  (0, 86)	1.0
  (0, 95)	1.0
  (0, 99)	1.0
  (0, 109)	1.0
  (0, 132)	1.0
  (0, 142)	1.0
  (0, 145)	1.0
  (0, 151)	1.0
  (0, 157)	1.0
  (0, 159)	1.0
  (0, 170)	1.0
  (0, 171)	1.0
  (0, 178)	1.0
  (0, 180)	1.0
  (0, 184)	1.0
  :	:
  (2900, 316)	-0.09978760068925502
  (2900, 317)	0.9840761160179055
  (2900, 318)	0.8210573311919769
  (2900, 319)	0.7535250746016613
  (2900, 320)	0.006765345843651345
  (2900, 321)	-0.35317427864900647
  (2900, 322)	-0.10267336094353142
  (2900, 323)	-0.28639090724790933
  (2900, 324)	-0.06334656998175685
  (2900, 325)	-0.08951187219829294
  (2900, 326)	0.07760221633898341
  (2900, 327)	0.7068675669662986
  (2900, 328)	0.45770938434332714
  (2900, 329)	-0.8277746749894674
  (2900, 330)	-0.2498556318719

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['MS.Zoning'] = data['MS.Zoning'].cat.remove_unused_categories()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['Sale.Type'] = data['Sale.Type'].cat.add_categories(['GroupedWD', 'Other'])
  data['Sale.Type'] = data['Sale.Type'].replace(['WD ', 'CWD', 'VWD'], 'GroupedWD')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retu