In [1]:
import pandas as pd
import numpy as np

In [9]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

X, y = train_df.drop(columns=["SalePrice"]), train_df["SalePrice"]

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import root_mean_squared_error, mean_squared_error
# Select numerical and categorical columns
numerical_attributes = X_train.select_dtypes(include=[np.number]).columns
categorical_attributes = X_train.select_dtypes(include=["object"]).columns

# Pipelines
numerical_pipeline = Pipeline([
    ('std_scaler', StandardScaler()),
    ('imputer', SimpleImputer(strategy="median"))
])

categorical_pipeline = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ('imputer', SimpleImputer(strategy="most_frequent" \
    ""))
])

# Full preprocessing pipeline
full_pipeline = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_attributes),
    ('cat', categorical_pipeline, categorical_attributes)
])

# Prepare data (fit on train, transform train and test)

X_train_prepared = full_pipeline.fit_transform(X_train)
X_test_prepared = full_pipeline.transform(X_test)

# Full pipeline including model
model_pipeline = Pipeline([
    ('preprocessor', full_pipeline),
    ('model', XGBRegressor(
        n_estimators=330,
        max_depth=10,
        random_state=42
    ))
])

# Fit the model
model_pipeline.fit(X_train, y_train)

# Predict
y_pred = model_pipeline.predict(X_test)

print(f"RMSE: {mean_squared_error(y_pred, y_test)}, RMSE: {root_mean_squared_error(y_pred, y_test)}")

RMSE: 764340224.0, RMSE: 27646.703125
