In [1]:
import pandas as pd
df = pd.read_csv('realtor-data.zip.csv')
df_clean = df.drop(columns=["brokered_by", "street", "prev_sold_date"])
df_clean = df_clean.dropna(axis=0, how="any")

df_clean = df_clean[df_clean["price"] > 1000]  # e.g., ignore anything below $10k

price_min = df_clean["price"].min()
price_max = df_clean["price"].max()

print(f"Min: {price_min}")
print(f"Max: {price_max}")

yVal = df_clean["price"]

xVal =  df_clean.drop(columns=["price"])

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    xVal, yVal, test_size=0.2, random_state=42
)

categorical_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
print("Categorical features:", categorical_features)

train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)

model = CatBoostRegressor(
    iterations=420,        # Number of trees
    depth=8,                # Tree depth
    learning_rate=0.05,      # Step size
    loss_function='RMSE',   # Regression loss
    eval_metric='RMSE',
    random_seed=42,
    verbose=100
)

# Train
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

preds = model.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("RMSE:", mse**0.5)
print("R2 score:", r2)

# Save model to file
model.save_model("catboost_price_model.cbm")

Min: 1111.0
Max: 515000000.0
Categorical features: ['status', 'city', 'state']
0:	learn: 1220685.3815629	test: 1151883.8369732	best: 1151883.8369732 (0)	total: 455ms	remaining: 3m 10s
100:	learn: 888227.4600391	test: 794414.3018847	best: 794414.3018847 (100)	total: 22.5s	remaining: 1m 11s
200:	learn: 847820.0145074	test: 756941.4858153	best: 756941.4858153 (200)	total: 49.3s	remaining: 53.7s
300:	learn: 826611.5674157	test: 739282.6644691	best: 739282.6644691 (300)	total: 1m 21s	remaining: 32.3s
400:	learn: 811145.4833053	test: 729723.1939258	best: 729723.1939258 (400)	total: 1m 55s	remaining: 5.47s
419:	learn: 807781.6866899	test: 728758.5578982	best: 728571.0555477 (413)	total: 1m 59s	remaining: 0us

bestTest = 728571.0555
bestIteration = 413

Shrink model to first 414 iterations.
RMSE: 728571.0555477351
R2 score: 0.6133312668112134


In [2]:
print(df_clean.columns.tolist())

print(yVal.unique())

['status', 'price', 'bed', 'bath', 'acre_lot', 'city', 'state', 'zip_code', 'house_size']
[105000.  80000.  67000. ... 386464. 488512. 280290.]
