In [None]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# ---------------------------------------------------------------------------
# Load the raw data and prepare the feature matrix / target.
# ---------------------------------------------------------------------------
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# The target is modelled on the log1p scale.
df_train['SalePrice'] = np.log1p(df_train['SalePrice'])

y = df_train['SalePrice']
X = df_train.drop('SalePrice', axis=1)

# "Ingredient" columns: raw inputs of the engineered features built below.
cols_ingredientes = [
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'FullBath', 'HalfBath', 'BsmtFullBath', 'BsmtHalfBath',
]

# Medians are learned from the training rows only and then applied to both
# frames, so the test set is filled with training statistics.
imputer_ingredientes = SimpleImputer(strategy='median')
imputer_ingredientes.fit(X[cols_ingredientes])

X[cols_ingredientes] = imputer_ingredientes.transform(X[cols_ingredientes])
df_test[cols_ingredientes] = imputer_ingredientes.transform(df_test[cols_ingredientes])


def _add_aggregate_features(frame):
    """Return a copy of ``frame`` with ``TotalSF`` and ``TotalBathrooms`` appended.

    Keeping both formulas in one place guarantees that the training and test
    frames are engineered identically.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain every column listed in ``cols_ingredientes``.

    Returns
    -------
    pandas.DataFrame
        New frame with two extra columns:
        ``TotalSF`` = TotalBsmtSF + 1stFlrSF + 2ndFlrSF, and
        ``TotalBathrooms`` = FullBath + 0.5 * HalfBath
        + BsmtFullBath + 0.5 * BsmtHalfBath.
    """
    return frame.assign(
        TotalSF=frame['TotalBsmtSF'] + frame['1stFlrSF'] + frame['2ndFlrSF'],
        TotalBathrooms=(
            frame['FullBath']
            + (0.5 * frame['HalfBath'])
            + frame['BsmtFullBath']
            + (0.5 * frame['BsmtHalfBath'])
        ),
    )


X = _add_aggregate_features(X)
df_test = _add_aggregate_features(df_test)

# Columns that will be one-hot encoded.
colunas_categoricas_nominais = [
    'MSZoning', 'LandContour', 'Street', 'Utilities', 'LotConfig',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
    'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
    'Foundation', 'Heating', 'Electrical', 'Functional', 'GarageType',
    'PavedDrive', 'SaleType', 'SaleCondition',
]

# Columns that will be ordinal encoded. The order of this list matters: the
# per-column category lists handed to the OrdinalEncoder are matched to these
# columns by position.
colunas_categoricas_ordinais = [
    'LotShape', 'OverallCond', 'OverallQual', 'LandSlope', 'ExterQual',
    'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
    'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond',
]

# Numeric features: every int64/float64 column except 'Id', the two Overall*
# ratings (they are in the ordinal list above) and the raw ingredient columns
# (already combined into TotalSF / TotalBathrooms).
colunas_a_remover_num = ['Id', 'OverallQual', 'OverallCond'] + cols_ingredientes
colunas_numericas_df = (
    X.select_dtypes(include=['int64', 'float64'])
     .drop(columns=colunas_a_remover_num, errors='ignore')
)

lista_colunas_numericas = colunas_numericas_df.columns.tolist()

# Defensive guard: the engineered columns are normally already picked up by the
# dtype filter above; append them only if they are somehow absent.
for coluna_derivada in ('TotalSF', 'TotalBathrooms'):
    if coluna_derivada not in lista_colunas_numericas:
        lista_colunas_numericas.append(coluna_derivada)

In [None]:
# Category orderings for the ordinal features. The position of a label inside
# its list is the integer code the OrdinalEncoder assigns to it.
# NOTE(review): the quality codes presumably run from worst to best according
# to the dataset's data dictionary -- confirm against it.

# Shared scale templates (tuples, so every feature below gets its own list copy).
_escala_qualidade = ('Po', 'Fa', 'TA', 'Gd', 'Ex')
_escala_qualidade_com_na = ('NA',) + _escala_qualidade
_escala_acabamento_porao = ('NA', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ')

lotshape_order = ['Reg', 'IR1', 'IR2', 'IR3']
landslope_order = ['Gtl', 'Mod', 'Sev']

# Integer ratings from 1 to 10.
overallcond_order = list(range(1, 11))
overallqual_order = list(range(1, 11))

# Five-level scales without an 'NA' entry.
exterqual_order = list(_escala_qualidade)
extercond_order = list(_escala_qualidade)
heatingqc_order = list(_escala_qualidade)
kitchenqual_order = list(_escala_qualidade)

# The same scale preceded by 'NA' as the lowest code.
bsmtqual_order = list(_escala_qualidade_com_na)
bsmtcond_order = list(_escala_qualidade_com_na)
fireplacequ_order = list(_escala_qualidade_com_na)
garagequal_order = list(_escala_qualidade_com_na)
garagecond_order = list(_escala_qualidade_com_na)

# Feature-specific scales.
bsmtexposure_order = ['NA', 'No', 'Mn', 'Av', 'Gd']
bsmtfintype1_order = list(_escala_acabamento_porao)
bsmtfintype2_order = list(_escala_acabamento_porao)
garagefinish_order = ['NA', 'Unf', 'RFn', 'Fin']

# One entry per ordinal column; the sequence below must stay in the same order
# as the list of ordinal column names given to the encoder.
lista_de_ordens = [
    lotshape_order,
    overallcond_order,
    overallqual_order,
    landslope_order,
    exterqual_order,
    extercond_order,
    bsmtqual_order,
    bsmtcond_order,
    bsmtexposure_order,
    bsmtfintype1_order,
    bsmtfintype2_order,
    heatingqc_order,
    kitchenqual_order,
    fireplacequ_order,
    garagefinish_order,
    garagequal_order,
    garagecond_order,
]

In [None]:
# Hyper-parameters passed to the XGBoost regressor.
# NOTE(review): presumably the winners of an earlier hyper-parameter search
# that is not part of this notebook -- confirm.
params_campeoes = dict(
    subsample=0.7,
    n_estimators=2000,
    max_depth=3,
    learning_rate=0.01,
    colsample_bytree=0.7,
)

# Numeric features: fill gaps with the column median, then standardise.
pipeline_numerica = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Nominal features: missing values become their own 'Missing' category before
# one-hot encoding; categories not seen during fit are ignored at transform time.
codificador_onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
pipeline_categorica_nominal = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', codificador_onehot),
])

# Ordinal features: missing values are filled with the label 'NA' and each
# column is encoded with its explicit category order.
# 'use_encoded_value' prevents an error when a value outside the declared
# order appears (for example in validation or test data); such a value is
# encoded as unknown_value, i.e. -1.
codificador_ordinal = OrdinalEncoder(
    categories=lista_de_ordens,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
pipeline_categorica_ordinal = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='NA')),
    ('ordinal', codificador_ordinal),
])

# Route each column group to its sub-pipeline; columns that belong to no group
# are dropped.
pre_processador = ColumnTransformer(
    [
        ('num', pipeline_numerica, lista_colunas_numericas),
        ('cat_nom', pipeline_categorica_nominal, colunas_categoricas_nominais),
        ('cat_ord', pipeline_categorica_ordinal, colunas_categoricas_ordinais),
    ],
    remainder='drop',
)

# Full model: preprocessing followed by the XGBoost regressor.
# The step is named 'classifier' although it holds a regressor; the name is
# kept so that any 'classifier__<param>' reference keeps working.
modelo_xgb = XGBRegressor(random_state=42, tree_method='hist', **params_campeoes)
pipeline_base_xgb = Pipeline([
    ('preprocessor', pre_processador),
    ('classifier', modelo_xgb),
])

In [None]:
# Name of the CSV file written at the end of this cell.
NOME_ARQUIVO_FINAL = 'submission.csv'

# Train the complete pipeline (preprocessing + model) on the training data.
pipeline_base_xgb.fit(X, y)

# Predict for the test frame, then apply expm1 (the inverse of log1p) to the
# raw predictions to obtain the values that are submitted.
previsoes_finais = pipeline_base_xgb.predict(df_test)
previsoes_em_dolar = np.expm1(previsoes_finais)

# Build the two-column submission table and write it without the index.
new_df = {
    'Id': df_test['Id'],
    'SalePrice': previsoes_em_dolar,
}
submission_df = pd.DataFrame(data=new_df)
submission_df.to_csv(path_or_buf=NOME_ARQUIVO_FINAL, index=False)

print(f"Arquivo '{NOME_ARQUIVO_FINAL}' gerado com sucesso.")