In [1]:
from func import *

In [2]:
# Load the train and test splits from their `*_clean.csv` files
# (paths are relative to the notebook's working directory).
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("train_df_clean.csv", "test_df_clean.csv")
)

In [3]:
# Show the data-quality table for the training frame; the rendered output
# reports dtype, n_unique, missing_% and num_missing for each column.
data_quality_summary(train_df)

Unnamed: 0,column,dtype,n_unique,missing_%,num_missing
22,SubdivisionName,object,5592,65.23,42738
23,BuyerOfficeAOR,object,59,5.9,3865
6,ListAgentEmail,object,30616,0.35,228
25,StreetNumberNumeric,float64,23252,0.08,55
11,UnparsedAddress,object,65226,0.06,41
0,BuyerAgentAOR,object,52,0.02,11
1,ListAgentAOR,object,52,0.02,11
41,log_ClosePrice,float64,7373,0.0,0
33,LotSizeArea,float64,14538,0.0,0
34,MainLevelBedrooms,float64,22,0.0,0


In [4]:
# Remove the `*_target_mean` and `*_std` columns that came with the CSV.
# NOTE(review): presumably because encoding/scaling is redone inside
# fit_predict (it takes smoothing / num_scaler arguments) — confirm.
train_df = train_df.drop(
    columns=[
        "HighSchoolDistrict_target_mean",
        "Flooring_target_mean",
        "LivingArea_std",
        "LotSizeSquareFeet_std",
        "AssociationFee_std",
    ]
)

In [5]:
# Keep only the training columns that contain no missing values at all.
no_missing_train_df = train_df.dropna(axis="columns", how="any")

In [6]:
# Show the same data-quality table for the test frame; the rendered output
# reports dtype, n_unique, missing_% and num_missing for each column.
data_quality_summary(test_df)

Unnamed: 0,column,dtype,n_unique,missing_%,num_missing
22,SubdivisionName,object,1708,66.25,6618
23,BuyerOfficeAOR,object,55,9.27,926
6,ListAgentEmail,object,7744,0.26,26
25,StreetNumberNumeric,float64,6917,0.16,16
11,UnparsedAddress,object,9977,0.11,11
0,BuyerAgentAOR,object,52,0.0,0
42,ZIP_prefix,int64,57,0.0,0
33,LotSizeArea,float64,4670,0.0,0
34,MainLevelBedrooms,float64,14,0.0,0
35,NewConstructionYN,bool,2,0.0,0


In [7]:
# Remove the same `*_target_mean` and `*_std` columns from the test frame
# that are removed from the training frame.
test_df = test_df.drop(
    columns=[
        "HighSchoolDistrict_target_mean",
        "Flooring_target_mean",
        "LivingArea_std",
        "LotSizeSquareFeet_std",
        "AssociationFee_std",
    ]
)

In [8]:
# Keep only the test columns that contain no missing values at all.
# NOTE(review): missingness is evaluated on the test frame alone, so the
# surviving columns need not match the training frame's (the summaries show
# BuyerAgentAOR with 11 missing values in train but none in test).
no_missing_test_df = test_df.dropna(axis="columns", how="any")

In [9]:
# Train and test were filtered for missing values independently, so their
# surviving columns can differ (the summaries above show BuyerAgentAOR with 11
# missing values in train but none in test). Keep only the columns that are
# complete in BOTH frames, in training-column order, so the model sees the same
# schema at fit and predict time.
shared_complete_cols = [
    col for col in no_missing_train_df.columns if col in no_missing_test_df.columns
]

# Explicit copies: later column assignments then operate on independent frames
# instead of on the dropna results themselves.
train_df = no_missing_train_df[shared_complete_cols].copy()
test_df = no_missing_test_df[shared_complete_cols].copy()

In [10]:
def _with_log_target(frame):
    """Return ``frame`` with ``ClosePrice`` replaced by ``log_ClosePrice``.

    ``log_ClosePrice`` is ``np.log1p(ClosePrice)``; an existing column of that
    name is overwritten. If ``ClosePrice`` is already gone and
    ``log_ClosePrice`` is present, the frame is returned unchanged so the cell
    can be re-run without a ``KeyError``.
    """
    already_converted = (
        "ClosePrice" not in frame.columns and "log_ClosePrice" in frame.columns
    )
    if already_converted:
        return frame
    return frame.assign(log_ClosePrice=np.log1p(frame["ClosePrice"])).drop(
        columns=["ClosePrice"]
    )


# Build new frames instead of mutating in place, so any other name still bound
# to the same DataFrame object is left untouched and no SettingWithCopyWarning
# can be triggered.
# NOTE(review): the training data-quality summary already lists a
# log_ClosePrice column before this cell runs; it is overwritten here — confirm
# the stored values were also log1p(ClosePrice).
train_df = _with_log_target(train_df)
test_df = _with_log_target(test_df)

# Ridge

In [11]:
from sklearn.linear_model import Ridge

# Baseline: ridge regression with a fixed L2 penalty (alpha=10).
model = Ridge(random_state=42, alpha=10.0)

# fit_predict is a project helper; the mapping it returns exposes "r2" and
# "mdape". Presumably it fits on train_df and scores on test_df — confirm.
# NOTE(review): the target is already log1p-transformed and log_transform=True
# is passed as well — confirm fit_predict does not transform it a second time.
# NOTE(review): "ClosePrice" is listed in col_drop_list although an earlier
# cell already removed that column.
ridge_result = fit_predict(
    model=model,
    train_df=train_df,
    test_df=test_df,
    target_col="log_ClosePrice",
    col_drop_list=["ClosePrice"],
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True,
)
print(
    "R²: {:.4f}\nMdAPE (%): {:.2f}".format(ridge_result["r2"], ridge_result["mdape"])
)

R²: 0.1202
MdAPE (%): 33.88


# ElasticNet

In [12]:
from sklearn.linear_model import ElasticNet

# Baseline: Elastic Net with an even L1/L2 mix and a small penalty.
model = ElasticNet(l1_ratio=0.5, alpha=0.01)

# Same fit_predict settings as the other baselines; the returned mapping
# exposes "r2" and "mdape".
elastic_net_result = fit_predict(
    model=model,
    train_df=train_df,
    test_df=test_df,
    target_col="log_ClosePrice",
    col_drop_list=["ClosePrice"],
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True,
)

print(
    "R²: {:.4f}\nMdAPE (%): {:.2f}".format(
        elastic_net_result["r2"], elastic_net_result["mdape"]
    )
)

R²: 0.7535
MdAPE (%): 12.17


# Histogram-based Gradient Boosting Regression Tree

In [13]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Baseline: histogram-based gradient boosting (300 iterations, depth <= 6).
model = HistGradientBoostingRegressor(
    learning_rate=0.05,
    max_iter=300,
    max_depth=6,
    random_state=42,
)

# Same fit_predict settings as the other baselines; the returned mapping
# exposes "r2" and "mdape".
# NOTE(review): the recorded output shows a negative R² here, far below the
# Elastic Net baseline — worth checking how fit_predict handles this model.
hist_tree_result = fit_predict(
    model=model,
    train_df=train_df,
    test_df=test_df,
    target_col="log_ClosePrice",
    col_drop_list=["ClosePrice"],
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True,
)

print(
    "R²: {:.4f}\nMdAPE (%): {:.2f}".format(
        hist_tree_result["r2"], hist_tree_result["mdape"]
    )
)

R²: -0.0359
MdAPE (%): 39.28


# Random forest

In [14]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: random forest (300 trees, depth <= 12), using all CPU cores.
model = RandomForestRegressor(
    max_depth=12,
    n_estimators=300,
    n_jobs=-1,
    random_state=42,
)

# Same fit_predict settings as the other baselines; the returned mapping
# exposes "r2" and "mdape".
# NOTE(review): the recorded output shows a negative R² here, far below the
# Elastic Net baseline — worth checking how fit_predict handles this model.
rf_result = fit_predict(
    model=model,
    train_df=train_df,
    test_df=test_df,
    target_col="log_ClosePrice",
    col_drop_list=["ClosePrice"],
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True,
)

print("R²: {:.4f}\nMdAPE (%): {:.2f}".format(rf_result["r2"], rf_result["mdape"]))

R²: -0.0360
MdAPE (%): 39.27


# Multi-layer Perceptron

In [15]:
from sklearn.neural_network import MLPRegressor

# Baseline: two-hidden-layer MLP (128 -> 64, ReLU) with early stopping.
model = MLPRegressor(
    activation="relu",
    hidden_layer_sizes=(128, 64),
    learning_rate_init=1e-3,
    alpha=1e-3,
    early_stopping=True,
    max_iter=300,
    random_state=42,
)

# Same fit_predict settings as the other baselines; the returned mapping
# exposes "r2" and "mdape".
mlp_result = fit_predict(
    model=model,
    train_df=train_df,
    test_df=test_df,
    target_col="log_ClosePrice",
    col_drop_list=["ClosePrice"],
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True,
)

print("R²: {:.4f}\nMdAPE (%): {:.2f}".format(mlp_result["r2"], mlp_result["mdape"]))

R²: 0.7983
MdAPE (%): 12.89


# Hyperparameter tuning

## Elastic Net

In [16]:
# Search space: 25 log-spaced alphas (1e-4 .. 1e2) x 19 evenly spaced
# l1_ratio values (0.05 .. 0.95) = 475 candidates. The "model__" prefix
# addresses the estimator step of the pipeline built by the helper.
param_grid = dict(
    model__alpha=np.logspace(start=-4, stop=2, num=25),
    model__l1_ratio=np.linspace(start=0.05, stop=0.95, num=19),
)

# 5-fold cross-validated grid search scored by R²; the helper returns a mapping
# with "best_pipeline", "best_score" and "best_params".
res = grid_tune_with_make_model_pipeline(
    model=ElasticNet(max_iter=20000),
    param_grid=param_grid,
    train_df=train_df,
    target_col="log_ClosePrice",
    col_drop_list=["ClosePrice"],
    card_threshold=20,
    scoring="r2",
    cv=5,
)

best_pipe = res["best_pipeline"]
print(*(res[field] for field in ("best_score", "best_params")))

Fitting 5 folds for each of 475 candidates, totalling 2375 fits
0.8628536369895269 {'model__alpha': 0.01778279410038923, 'model__l1_ratio': 0.05}


In [17]:
# Refit Elastic Net with the hyperparameters selected by the grid search and
# score it through fit_predict.
# The search cross-validated ElasticNet(max_iter=20000); the same iteration
# budget is used here so the final model matches the configuration that was
# selected (scikit-learn's default budget is much smaller and may stop before
# the coordinate-descent solver converges).
# NOTE(review): `res` must still hold the Elastic Net search result — it is
# rebound by the MLP search further down if cells are run out of order.
model = ElasticNet(
    alpha=res["best_params"]["model__alpha"],
    l1_ratio=res["best_params"]["model__l1_ratio"],
    max_iter=20000,
)

elastic_net_result = fit_predict(
    train_df=train_df,
    test_df=test_df,
    model=model,
    col_drop_list=["ClosePrice"],
    target_col="log_ClosePrice",
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True
)

print(f"R²: {elastic_net_result['r2']:.4f}")
print(f"MdAPE (%): {elastic_net_result['mdape']:.2f}")

R²: 0.7555
MdAPE (%): 12.22


## MLP

In [18]:
# Search space: 3 architectures x 3 L2 penalties x 3 initial learning rates
# = 27 candidates. The "model__" prefix addresses the estimator step of the
# pipeline built by the helper.
param_grid_mlp_stable = dict(
    model__hidden_layer_sizes=[(128, 64), (256, 128), (256, 128, 64)],
    model__alpha=[1e-5, 1e-4, 1e-3],
    model__learning_rate_init=[1e-4, 5e-4, 1e-3],
)

# Base estimator; the grid above overrides its size, alpha and learning rate.
mlp = MLPRegressor(
    random_state=42,
    early_stopping=True,
    max_iter=300,
    activation="relu",
)

# 3-fold cross-validated grid search scored by R², run on all CPU cores.
res = grid_tune_with_make_model_pipeline(
    model=mlp,
    param_grid=param_grid_mlp_stable,
    train_df=train_df,
    target_col="log_ClosePrice",
    col_drop_list=["ClosePrice"],
    card_threshold=20,
    scoring="r2",
    cv=3,
    n_jobs=-1,
)

print(*(res[field] for field in ("best_score", "best_params")))

Fitting 3 folds for each of 27 candidates, totalling 81 fits
0.8897492586490818 {'model__alpha': 0.001, 'model__hidden_layer_sizes': (256, 128, 64), 'model__learning_rate_init': 0.001}


In [19]:
# Refit the MLP with the hyperparameters selected by the MLP grid search
# (`res` must hold that search's result) and score it through fit_predict.
model = MLPRegressor(
    hidden_layer_sizes=res["best_params"]["model__hidden_layer_sizes"],
    activation="relu",
    alpha=res["best_params"]["model__alpha"],
    learning_rate_init=res["best_params"]["model__learning_rate_init"],
    max_iter=300,
    early_stopping=True,
    random_state=42
)

# Keep the MLP metrics under their own name so they do not overwrite
# `elastic_net_result`, which holds the tuned Elastic Net metrics.
mlp_tuned_result = fit_predict(
    train_df=train_df,
    test_df=test_df,
    model=model,
    col_drop_list=["ClosePrice"],
    target_col="log_ClosePrice",
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True
)

print(f"R²: {mlp_tuned_result['r2']:.4f}")
print(f"MdAPE (%): {mlp_tuned_result['mdape']:.2f}")

R²: 0.8076
MdAPE (%): 10.28
