In [1]:
import numpy as np
import pandas as pd

from func import *

In [2]:
# Load the raw training listings. No date_range is given, so the helper's
# default range applies. "/" is forwarded unchanged as the local directory.
raw_local_dir = "/"
train_df = load_csvs_from_ftp_to_df(provided_local_dir=raw_local_dir)

In [3]:
# Column groups forwarded to pre_process() for both the train and test runs.

# Forwarded as `positive_col_list`
# (presumably columns whose values must be > 0 — confirm in func.pre_process).
positive_col_list = [
    "BedroomsTotal",
    "BathroomsTotalInteger",
    "LotSizeAcres",
    "LotSizeArea",
    "LotSizeSquareFeet",
    "LivingArea",
]

# Forwarded as `non_negative_col_list`
# (presumably columns where 0 is allowed but negatives are not — confirm).
non_negative_col_list = ["ParkingTotal"]

# Forwarded as `col_drop` for the training run: identifier and list-price columns.
default_drop_col = [
    "ListingId",
    "ListingKey",
    "ListingKeyNumeric",
    "ListPrice",
    "OriginalListPrice",
]

In [4]:
# Run pre_process() on the training listings with train_data=True.
# Every model/reference argument is None here; the objects returned by this
# call are the ones handed back to pre_process() for the test listings later.
train_preprocess_kwargs = dict(
    price_col="ClosePrice",
    default_threshold=0.8,
    col_drop=default_drop_col,
    positive_col_list=positive_col_list,
    non_negative_col_list=non_negative_col_list,
    flag_col_list=["Flooring"],
    yn_col_list=[
        "AttachedGarageYN",
        "ViewYN",
        "NewConstructionYN",
        "PoolPrivateYN",
        "FireplaceYN",
    ],
    knn_k=3,
    knn_model=None,
    train_df_ref=None,
    reference_col_list=None,
    num_clusters=10,
    clustering_method="k-means",
    clustering_model=None,
    train_df_cluster_ref=None,
    scaler_method="robust",
    scaler=None,
    save_name="processed",
    train_data=True,
    save=True,
    cols_with_na=None,
)

# pre_process() returns ten values; all of them are kept for the test run.
(
    train_df_clean,
    knn_model,
    train_df_ref,
    reference_col_list,
    k_means_model,
    col_drop_list,
    scaler,
    cols_with_na,
    train_df_cluster_ref,
    save_name,
) = pre_process(train_df, **train_preprocess_kwargs)

  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df_clean[yn_col_list] = df_clean[yn_col_list].fillna(False)


In [5]:
# Load the raw test listings. range(12, 13) yields the single value 12
# (presumably the month/file index held out for testing — confirm in func).
test_df = load_csvs_from_ftp_to_df(provided_local_dir="/",
                                   date_range=range(12, 13))

# Add the default drop columns to the list returned by the training run.
# Only columns that are not already present are appended, so re-running this
# cell does not keep growing `col_drop_list` with duplicate entries.
missing_default_cols = [col for col in default_drop_col
                        if col not in col_drop_list]
col_drop_list.extend(missing_default_cols)

In [6]:
# Run pre_process() on the test listings with train_data=False, passing in the
# objects returned by the training run (KNN model, clustering model, scaler,
# reference frames, NA-column list and save_name) instead of None.
test_preprocess_kwargs = dict(
    price_col="ClosePrice",
    default_threshold=0.8,
    col_drop=col_drop_list,
    positive_col_list=positive_col_list,
    non_negative_col_list=non_negative_col_list,
    flag_col_list=["Flooring"],
    yn_col_list=[
        "AttachedGarageYN",
        "ViewYN",
        "NewConstructionYN",
        "PoolPrivateYN",
        "FireplaceYN",
    ],
    knn_k=3,
    knn_model=knn_model,
    train_df_ref=train_df_ref,
    reference_col_list=reference_col_list,
    num_clusters=10,
    clustering_method="k-means",
    clustering_model=k_means_model,
    train_df_cluster_ref=train_df_cluster_ref,
    scaler_method="robust",
    scaler=scaler,
    save_name=save_name,
    train_data=False,
    save=True,
    cols_with_na=cols_with_na,
)

# Only the cleaned frame is needed; the other nine return values are discarded.
test_df_clean, _, _, _, _, _, _, _, _, _ = pre_process(test_df,
                                                       **test_preprocess_kwargs)

  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df[val + "YN"] = df[val + "YN"].fillna(False)
  df_clean[yn_col_list] = df_clean[yn_col_list].fillna(False)


# Read processed data

In [11]:
# Reload the processed train/test tables from disk. From here on, `train_df`
# and `test_df` refer to the processed data, not the raw frames loaded earlier.
# NOTE(review): the directory is hard-coded to "processed5", while pre_process
# was called with save_name="processed"; presumably the returned `save_name`
# holds the directory actually written — confirm and consider using it here.
processed_csv_paths = ("processed5/train_data.csv", "processed5/test_data.csv")
train_df, test_df = map(pd.read_csv, processed_csv_paths)

# Train model (Ridge regression)

In [12]:
def _with_log_target(df, price_col="ClosePrice", log_col="logClosePrice"):
    """Return a frame where `price_col` is replaced by `log_col = log1p(price_col)`.

    The input frame is not modified. If the frame has already been converted
    (`price_col` absent and `log_col` present) it is returned unchanged, so
    the cell can be re-run safely. If neither column is present, the lookup
    of `price_col` raises KeyError.
    """
    already_converted = price_col not in df.columns and log_col in df.columns
    if already_converted:
        return df
    return (
        df.assign(**{log_col: np.log1p(df[price_col])})
          .drop(columns=[price_col])
    )


# NOTE(review): fit_predict is later called with log_transform=True — confirm
# it does not apply a second log transform to this already-logged target.
train_df = _with_log_target(train_df)
test_df = _with_log_target(test_df)

In [13]:
from sklearn.linear_model import Ridge

# Baseline: Ridge regression fitted on the log-price target.
model = Ridge(alpha=10.0, random_state=42)

# Options forwarded unchanged to fit_predict().
ridge_fit_options = {
    "col_drop_list": ["ClosePrice"],
    "target_col": "logClosePrice",
    "card_threshold": 20,
    "num_scaler": "robust",
    "smoothing": 10,
    "min_samples_leaf": 20,
    "log_transform": True,
}
ridge_result = fit_predict(train_df=train_df,
                           test_df=test_df,
                           model=model,
                           **ridge_fit_options)

# fit_predict() returns a mapping that includes the "r2" and "mdape" metrics.
print(f"R²: {ridge_result['r2']:.4f}")
print(f"MdAPE (%): {ridge_result['mdape']:.2f}")

R²: 0.1575
MdAPE (%): 31.97


# Elastic Net (Ridge + Lasso hybrid)

In [14]:
from sklearn.linear_model import ElasticNet

# Baseline Elastic Net with an even L1/L2 mix (l1_ratio=0.5).
model = ElasticNet(alpha=0.01, l1_ratio=0.5)

# Options forwarded unchanged to fit_predict().
elastic_net_fit_options = {
    "col_drop_list": ["ClosePrice"],
    "target_col": "logClosePrice",
    "card_threshold": 20,
    "num_scaler": "robust",
    "smoothing": 10,
    "min_samples_leaf": 20,
    "log_transform": True,
}
elastic_net_result = fit_predict(train_df=train_df,
                                 test_df=test_df,
                                 model=model,
                                 **elastic_net_fit_options)

# Report the test metrics returned by fit_predict().
print(f"R²: {elastic_net_result['r2']:.4f}")
print(f"MdAPE (%): {elastic_net_result['mdape']:.2f}")

R²: 0.7476
MdAPE (%): 11.96


# Histogram-based Gradient Boosting Regression Tree

In [15]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Histogram-based gradient boosting with fixed, untuned hyperparameters.
# NOTE(review): the recorded run scored R² close to 0 for this model while the
# linear and MLP models scored far higher — worth checking how fit_predict
# prepares features/targets for tree models.
hist_gb_params = {
    "max_depth": 6,
    "learning_rate": 0.05,
    "max_iter": 300,
    "random_state": 42,
}
model = HistGradientBoostingRegressor(**hist_gb_params)

# Options forwarded unchanged to fit_predict().
hist_tree_fit_options = {
    "col_drop_list": ["ClosePrice"],
    "target_col": "logClosePrice",
    "card_threshold": 20,
    "num_scaler": "robust",
    "smoothing": 10,
    "min_samples_leaf": 20,
    "log_transform": True,
}
hist_tree_result = fit_predict(train_df=train_df,
                               test_df=test_df,
                               model=model,
                               **hist_tree_fit_options)

# Report the test metrics returned by fit_predict().
print(f"R²: {hist_tree_result['r2']:.4f}")
print(f"MdAPE (%): {hist_tree_result['mdape']:.2f}")

R²: -0.0077
MdAPE (%): 38.01


# Random forest

In [16]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with fixed, untuned hyperparameters; n_jobs=-1 uses all cores.
# NOTE(review): the recorded run scored R² close to 0 here as well — see the
# same concern for the gradient-boosting model.
random_forest_params = {
    "n_estimators": 300,
    "max_depth": 12,
    "random_state": 42,
    "n_jobs": -1,
}
model = RandomForestRegressor(**random_forest_params)

# Options forwarded unchanged to fit_predict().
rf_fit_options = {
    "col_drop_list": ["ClosePrice"],
    "target_col": "logClosePrice",
    "card_threshold": 20,
    "num_scaler": "robust",
    "smoothing": 10,
    "min_samples_leaf": 20,
    "log_transform": True,
}
rf_result = fit_predict(train_df=train_df,
                        test_df=test_df,
                        model=model,
                        **rf_fit_options)

# Report the test metrics returned by fit_predict().
print(f"R²: {rf_result['r2']:.4f}")
print(f"MdAPE (%): {rf_result['mdape']:.2f}")

R²: 0.0219
MdAPE (%): 37.94


# Multi-layer Perceptron

In [17]:
from sklearn.neural_network import MLPRegressor

# Baseline MLP: two hidden layers (128 and 64 units) with early stopping.
mlp_baseline_params = {
    "hidden_layer_sizes": (128, 64),
    "activation": "relu",
    "alpha": 1e-3,
    "learning_rate_init": 1e-3,
    "max_iter": 300,
    "early_stopping": True,
    "random_state": 42,
}
model = MLPRegressor(**mlp_baseline_params)

# Options forwarded unchanged to fit_predict().
mlp_fit_options = {
    "col_drop_list": ["ClosePrice"],
    "target_col": "logClosePrice",
    "card_threshold": 20,
    "num_scaler": "robust",
    "smoothing": 10,
    "min_samples_leaf": 20,
    "log_transform": True,
}
mlp_result = fit_predict(train_df=train_df,
                         test_df=test_df,
                         model=model,
                         **mlp_fit_options)

# Report the test metrics returned by fit_predict().
print(f"R²: {mlp_result['r2']:.4f}")
print(f"MdAPE (%): {mlp_result['mdape']:.2f}")

R²: 0.8235
MdAPE (%): 10.44


# Tuning version

## Elastic Net

In [18]:

# Elastic Net search space: 25 log-spaced alphas in [1e-4, 1e2] crossed with
# 19 evenly spaced l1 ratios in [0.05, 0.95], i.e. 475 candidates.
alpha_candidates = np.logspace(-4, 2, 25)
l1_ratio_candidates = np.linspace(0.05, 0.95, 19)
param_grid = {
    "model__alpha": alpha_candidates,
    "model__l1_ratio": l1_ratio_candidates,
}

# 5-fold cross-validated grid search on the training data, scored by R².
# max_iter is raised to 20000 on the estimator passed into the search.
elastic_net_search_options = {
    "train_df": train_df,
    "target_col": "logClosePrice",
    "model": ElasticNet(max_iter=20000),
    "param_grid": param_grid,
    "col_drop_list": ["ClosePrice"],
    "card_threshold": 20,
    "scoring": "r2",
    "cv": 5,
}
res = grid_tune_with_make_model_pipeline(**elastic_net_search_options)

# Keep the best pipeline and show the best CV score with its parameters.
best_pipe = res["best_pipeline"]
print(res["best_score"], res["best_params"])

Fitting 5 folds for each of 475 candidates, totalling 2375 fits
0.8627505332713833 {'model__alpha': 0.01778279410038923, 'model__l1_ratio': 0.05}


In [19]:
# Refit Elastic Net with the best grid-search parameters and score it on the
# test set. max_iter=20000 matches the estimator that was tuned in the grid
# search; without it the refit silently falls back to the library default and
# the evaluated model no longer corresponds to the tuned configuration.
# NOTE(review): `res` must still hold the Elastic Net search result here; the
# MLP tuning cell further down reuses the same name.
best_elastic_net_params = res["best_params"]
model = ElasticNet(alpha=best_elastic_net_params["model__alpha"],
                   l1_ratio=best_elastic_net_params["model__l1_ratio"],
                   max_iter=20000)

elastic_net_result = fit_predict(
    train_df=train_df,
    test_df=test_df,
    model=model,
    col_drop_list=["ClosePrice"],
    target_col="logClosePrice",
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True
)

# Report the test metrics returned by fit_predict().
print(f"R²: {elastic_net_result['r2']:.4f}")
print(f"MdAPE (%): {elastic_net_result['mdape']:.2f}")

R²: 0.7519
MdAPE (%): 11.90


## MLP

In [25]:
# MLP search space: 3 architectures × 3 L2 penalties × 3 initial learning
# rates, i.e. 27 candidates.
hidden_layer_candidates = [(128, 64), (256, 128), (256, 128, 64)]
l2_penalty_candidates = [1e-5, 1e-4, 1e-3]
learning_rate_candidates = [1e-4, 5e-4, 1e-3]
param_grid_mlp_stable = {
    "model__hidden_layer_sizes": hidden_layer_candidates,
    "model__alpha": l2_penalty_candidates,
    "model__learning_rate_init": learning_rate_candidates,
}

# Base estimator; the grid above overrides the searched hyperparameters.
mlp = MLPRegressor(
    activation="relu",
    max_iter=300,
    early_stopping=True,
    random_state=42,
)

# 3-fold cross-validated grid search on the training data, scored by R².
# Note that `res` is rebound here and no longer holds the Elastic Net search.
mlp_search_options = {
    "train_df": train_df,
    "target_col": "logClosePrice",
    "model": mlp,
    "param_grid": param_grid_mlp_stable,
    "col_drop_list": ["ClosePrice"],
    "card_threshold": 20,
    "scoring": "r2",
    "cv": 3,
    "n_jobs": -1,
}
res = grid_tune_with_make_model_pipeline(**mlp_search_options)

# Show the best CV score with its parameters.
print(res["best_score"], res["best_params"])

Fitting 3 folds for each of 27 candidates, totalling 81 fits
0.879909759161056 {'model__alpha': 0.0001, 'model__hidden_layer_sizes': (256, 128), 'model__learning_rate_init': 0.0001}


In [26]:
# Refit the MLP with the best grid-search parameters and score it on the test
# set. The remaining settings match the estimator used in the MLP grid search.
best_mlp_params = res["best_params"]
model = MLPRegressor(
    hidden_layer_sizes=best_mlp_params["model__hidden_layer_sizes"],
    activation="relu",
    alpha=best_mlp_params["model__alpha"],
    learning_rate_init=best_mlp_params["model__learning_rate_init"],
    max_iter=300,
    early_stopping=True,
    random_state=42
)

# Stored under its own name: this cell previously assigned the MLP result to
# `elastic_net_result`, overwriting the tuned Elastic Net result.
mlp_tuned_result = fit_predict(
    train_df=train_df,
    test_df=test_df,
    model=model,
    col_drop_list=["ClosePrice"],
    target_col="logClosePrice",
    card_threshold=20,
    num_scaler="robust",
    smoothing=10,
    min_samples_leaf=20,
    log_transform=True
)

# Report the test metrics returned by fit_predict().
print(f"R²: {mlp_tuned_result['r2']:.4f}")
print(f"MdAPE (%): {mlp_tuned_result['mdape']:.2f}")

R²: 0.8120
MdAPE (%): 10.63
