In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")

y = np.log1p(train["SalePrice"])
X = train.drop(["SalePrice", "Id"], axis=1)
X_test = test.drop(["Id"], axis=1)

num_cols = X.select_dtypes(include=["int64", "float64"]).columns
cat_cols = X.select_dtypes(include=["object"]).columns

num_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

cat_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer([
    ("num", num_pipe, num_cols),
    ("cat", cat_pipe, cat_cols)
])

model = Pipeline([
    ("prep", preprocess),
    ("rf", RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    ))
])

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

val_pred = model.predict(X_val)
rmsle = np.sqrt(mean_squared_log_error(y_val, val_pred))
print("Validation RMSLE:", rmsle)

test_pred = model.predict(X_test)
final_pred = np.expm1(test_pred)

submission = pd.DataFrame({
    "Id": test["Id"],
    "SalePrice": final_pred
})

submission.to_csv("/content/submission.csv", index=False)

from google.colab import files
files.download("/content/submission.csv")


Validation RMSLE: 0.011464017716461865


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>