In [None]:
import pandas as pd

# Read the training and test CSV files of the housing-prices dataset.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/housing_prices/train.csv",
        "../datasets/housing_prices/test.csv",
    )
)

In [None]:
# Display summary statistics of the training frame as the cell output.
train_data.describe()

In [None]:
# Display the first rows of the training frame as the cell output.
train_data.head()

In [None]:
# Separate the training frame into the feature matrix and the prediction target.
X = train_data.drop(columns="SalePrice")
y = train_data["SalePrice"]

In [None]:
# Numeric features: columns whose dtype is int64 or float64.
numerical_columns = [
    name for name, dtype in X.dtypes.items() if dtype in ['int64', 'float64']
]

# Categorical features: object-typed columns for which nunique() is below 10,
# which keeps the later one-hot encoding small.
categorical_columns = [
    name
    for name, dtype in X.dtypes.items()
    if dtype == "object" and X[name].nunique() < 10
]

# Independent copy of the feature matrix restricted to the selected columns.
X_train = X.loc[:, numerical_columns + categorical_columns].copy()

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted regressor used as the final pipeline step.
# The tree-count keyword is `n_estimators` (plural); a misspelled keyword is not
# recognised as the number of boosting rounds, so the library default would be
# used instead of 500.
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.1
)

# Preprocessing for numerical values: fill missing entries with a constant
# (no fill_value is given, so SimpleImputer's default constant applies).
numerical_transformer = SimpleImputer(strategy="constant")

# Preprocessing for categorical values: fill missing entries with the most
# frequent category, then one-hot encode. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own transformer. The step names
# ("num", "cat") are what nested parameter names such as
# "preprocessor__num__strategy" refer to.
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns)
])

# Full estimator: preprocessing followed by the regressor.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

In [None]:

# Hyper-parameter grid, addressed with <step>__<param> names of the pipeline.
# The regressor's tree-count parameter is spelled `n_estimators`; a misspelled
# key would be passed through as an unused extra parameter.
param_grid = {
    "preprocessor__num__strategy": ["constant", "median"],
    "model__n_estimators": [500],
    "model__learning_rate": [0.1, 0.05, 0.01],
}

# Exhaustive cross-validated search over the grid, using all available cores.
# No `scoring` is given, so the estimator's default score method is used.
search = GridSearchCV(pipeline, param_grid, n_jobs=-1)
search.fit(X_train, y)

# Report the best mean cross-validated score (3 decimals) and its parameters.
print("Best parameter (CV score: %0.3f):" % search.best_score_)
print(search.best_params_)

In [None]:
# Final regressor, rebuilt with fixed hyper-parameters.
# NOTE(review): learning_rate=0.05 and the "median" imputer below presumably
# come from the grid-search result; confirm against the search output.
# The tree-count keyword is `n_estimators` (plural); a misspelled keyword is not
# recognised as the number of boosting rounds, so the library default would be
# used instead of 500.
model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05
)

# Preprocessing for numerical values: fill missing entries with the column median.
numerical_transformer = SimpleImputer(strategy="median")

# Preprocessing for categorical values: fill missing entries with the most
# frequent category, then one-hot encode. Categories not seen during fit are
# ignored at transform time instead of raising.
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Route each column group through its own transformer.
preprocessor = ColumnTransformer(transformers=[
    ("num", numerical_transformer, numerical_columns),
    ("cat", categorical_transformer, categorical_columns)
])

# Full estimator: preprocessing followed by the regressor.
pipeline = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

# Fit on the complete training set; the fitted pipeline is the cell output.
pipeline.fit(X_train, y)

In [None]:
# Select from the test frame the same feature columns that were used for
# training, then predict with the fitted pipeline.
X_test = test_data.loc[:, numerical_columns + categorical_columns].copy()

preds = pipeline.predict(X_test)

In [None]:
# Build the submission table (test-set Id alongside its predicted SalePrice)
# and write it to disk without the DataFrame index.
output = pd.DataFrame({"Id": X_test["Id"], "SalePrice": preds})
output.to_csv("submission.csv", index=False)