In [3]:
import pandas as pd
import os

# Download latest version
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")


In [4]:
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [16]:
df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [5]:
y = df_train["SalePrice"]
x = df_train.drop(columns=["SalePrice", "Id"])
x_test = df_test.drop(columns=["Id"])

In [6]:
import numpy as np
y_log = np.log1p(y)

In [7]:
num_features = x.select_dtypes(include=["int64", "float64"]).columns

In [8]:
print(num_features)

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')


In [9]:
cat_features = x.select_dtypes(include=["object"]).columns
print(cat_features)

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')


In [15]:
from sklearn.compose import ColumnTransformer

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

In [17]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy = "median")),
    ("scaler", StandardScaler()),
])
cat_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown = "ignore")),
])

In [18]:
preprocess = ColumnTransformer([
    ("num", numeric_transformer, num_features),
    ("cat", cat_transformer, cat_features),
])

In [19]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [20]:
x_train, x_eval, y_train, y_eval = train_test_split(
    x, y_log, test_size = 0.2, random_state = 42)

In [28]:
model = Ridge(alpha=10.0)

In [29]:
pipe = Pipeline([
    ("preprocess", preprocess),
    ("model", model),
])


In [30]:
pipe.fit(x_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,alpha,10.0
,fit_intercept,True
,copy_X,True
,max_iter,
,tol,0.0001
,solver,'auto'
,positive,False
,random_state,


In [31]:
y_eval_pred = pipe.predict(x_eval)

In [32]:
mse = mean_squared_error(y_eval, y_eval_pred)

In [33]:
print(np.sqrt(mse))

0.13610261291763354


In [34]:
from sklearn.tree import DecisionTreeRegressor

In [35]:
num_transformer_tree = Pipeline([
    ("imputer", SimpleImputer(strategy = "median")),
])

In [36]:
cat_transformer_tree = Pipeline([
    ("imputer", SimpleImputer(strategy = "most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown = "ignore")),
])

In [37]:
process_tree = ColumnTransformer([
    ("num", num_transformer_tree, num_features),
    ("cat", cat_transformer_tree, cat_features),
])


In [38]:
tree = DecisionTreeRegressor(
    random_state = 42,
    max_depth = 10,
    min_samples_leaf = 5,
)

In [39]:
pipe_tree = Pipeline([
    ("preprocess", process_tree),
    ("model", tree),
])

In [40]:
pipe_tree.fit(x_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,10
,min_samples_split,2
,min_samples_leaf,5
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [41]:
tree_predict = pipe_tree.predict(x_eval)

In [42]:
rmse_tree = np.sqrt(mean_squared_error(y_eval, tree_predict))

In [43]:
rmse_tree

np.float64(0.188392453746671)

In [44]:
from sklearn.ensemble import RandomForestRegressor

In [45]:
rf = RandomForestRegressor(
    n_estimators = 600,
    random_state = 42,
    n_jobs = -1,
    max_depth = None,
    min_samples_leaf = 1,
    max_features="sqrt",
)

In [46]:
pipe_rf = Pipeline([
    ("preprocess", process_tree),
    ("model", rf)
])

In [47]:
pipe_rf.fit(x_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,600
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [48]:
y_predict_rf = pipe_rf.predict(x_eval)

In [49]:
rmse_rf = np.sqrt(mean_squared_error(y_eval, y_predict_rf))

In [50]:
rmse_rf

np.float64(0.15920712327555045)

In [51]:
from sklearn.ensemble import GradientBoostingRegressor

In [52]:
gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42,
)

In [53]:
pipe_gbr = Pipeline([
    ("preprocess", process_tree),
    ("model", gbr),
])

In [54]:
pipe_gbr.fit(x_train, y_train)

0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'squared_error'
,learning_rate,0.05
,n_estimators,300
,subsample,1.0
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [55]:
y_gbr = pipe_gbr.predict(x_eval)

In [56]:
rmse_rf

np.float64(0.15920712327555045)

In [57]:
%pip install -U xgboost

Note: you may need to restart the kernel to use updated packages.


In [58]:
import xgboost
xgboost.__version__

'3.1.3'

In [59]:
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
xgb_model = XGBRegressor(
    n_estimators=2000,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_lambda=1.0,
    objective="reg:squarederror",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,
)

pipe_xgb = Pipeline([
    ("preprocess", process_tree),
    ("model", xgb_model),
])

pipe_xgb.fit(x_train, y_train)
pred = pipe_xgb.predict(x_eval)
rmse_xgb = np.sqrt(mean_squared_error(y_eval, pred))
rmse_xgb

np.float64(0.13207364925786122)

In [63]:
import numpy as np
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from xgboost.callback import EarlyStopping

Xtr = process_tree.fit_transform(x_train)
Xva = process_tree.transform(x_eval)

xgb_es = XGBRegressor(
    n_estimators=20000,
    learning_rate=0.03,
    max_depth=4,
    min_child_weight=1,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.0,
    reg_lambda=1.0,
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    random_state=42,
    n_jobs=-1,

    early_stopping_rounds=200,
)

xgb_es.fit(
    Xtr, y_train,
    eval_set=[(Xva, y_eval)],
    verbose=False
)

pred = xgb_es.predict(Xva)
rmse = np.sqrt(mean_squared_error(y_eval, pred))
rmse, xgb_es.best_iteration



(np.float64(0.1297966410009719), 663)

In [12]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

num_pre_mlp = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

cat_pre_mlp = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

process_mlp = ColumnTransformer([
    ("num", num_pre_mlp, num_features),
    ("cat", cat_pre_mlp, cat_features),
])

In [13]:
import numpy as np
import torch

In [21]:
Xtr = process_mlp.fit_transform(x_train)
type(Xtr)

scipy.sparse._csr.csr_matrix

In [24]:
Xva = process_mlp.transform(x_eval)
type(Xva)

scipy.sparse._csr.csr_matrix

In [25]:
Xtr = Xtr.toarray() if hasattr(Xtr, "toarray") else Xtr
Xva = Xva.toarray() if hasattr(Xva, "toarray") else Xva

Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
ytr_t = torch.tensor(y_train.values if hasattr(y_train, "values") else y_train, dtype=torch.float32).view(-1, 1)

Xva_t = torch.tensor(Xva, dtype=torch.float32)
yva_t = torch.tensor(y_eval.values if hasattr(y_eval, "values") else y_eval, dtype=torch.float32).view(-1, 1)

Xtr_t.shape, ytr_t.shape

(torch.Size([1168, 285]), torch.Size([1168, 1]))

In [28]:
import torch.nn as nn

class MLPRegressor(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.2),

            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.1),

            nn.Linear(128, 1),
        )

    def forward(self, x):
        return self.net(x)


In [31]:
def rmse(y_true, y_pred):
    return torch.sqrt(torch.mean((y_true - y_pred) ** 2)).item()

train_loader = DataLoader(TensorDataset(Xtr_t, ytr_t), batch_size=64, shuffle=True)

best_val = float("inf")
best_state = None

optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=5
)

for epoch in range(1, 101):
    model.train()
    for xb, yb in train_loader:
        pred = model(xb)
        loss = loss_fn(pred, yb)
        optimizer.zero_grad()

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    model.eval()
    with torch.no_grad():
        val_pred = model(Xva_t)
        val_rmse = rmse(yva_t, val_pred)

    if val_rmse < best_val:
        best_val = val_rmse
        best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

    if epoch % 5 == 0 or epoch == 1:
        print(f"Epoch {epoch:02d} | val RMSE: {val_rmse:.5f} | best: {best_val:.5f}")

    scheduler.step(val_rmse)
    

# restore best
model.load_state_dict(best_state)
best_val


Epoch 01 | val RMSE: 0.23390 | best: 0.23390
Epoch 05 | val RMSE: 0.19919 | best: 0.18571
Epoch 10 | val RMSE: 0.20511 | best: 0.18571
Epoch 15 | val RMSE: 0.21200 | best: 0.17030
Epoch 20 | val RMSE: 0.20312 | best: 0.16070
Epoch 25 | val RMSE: 0.20150 | best: 0.16070
Epoch 30 | val RMSE: 0.17380 | best: 0.15889
Epoch 35 | val RMSE: 0.15138 | best: 0.14709
Epoch 40 | val RMSE: 0.22404 | best: 0.14392
Epoch 45 | val RMSE: 0.14789 | best: 0.14392
Epoch 50 | val RMSE: 0.16614 | best: 0.14392
Epoch 55 | val RMSE: 0.14956 | best: 0.14392
Epoch 60 | val RMSE: 0.16448 | best: 0.14392
Epoch 65 | val RMSE: 0.16102 | best: 0.14392
Epoch 70 | val RMSE: 0.15843 | best: 0.14392
Epoch 75 | val RMSE: 0.16210 | best: 0.14392
Epoch 80 | val RMSE: 0.16563 | best: 0.14392
Epoch 85 | val RMSE: 0.16577 | best: 0.14392
Epoch 90 | val RMSE: 0.16555 | best: 0.14392
Epoch 95 | val RMSE: 0.16487 | best: 0.14392
Epoch 100 | val RMSE: 0.16439 | best: 0.14392


0.143915593624115