In [1]:
import pandas as pd
df = pd.read_csv('realtor-data.zip.csv')
df_clean = df.drop(columns=["brokered_by", "street", "prev_sold_date"])
df_clean = df_clean.dropna(axis=0, how="any")

# 10 equal-width bins
df_clean["price_bin_equal_width"] = pd.cut(df_clean["price"], bins=100, labels=False)

df_clean.drop(columns=["price"], inplace= True)

yVal = df_clean["price_bin_equal_width"]

xVal =  df_clean.drop(columns=["price_bin_equal_width"])

from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    xVal, yVal, test_size=0.2, random_state=42
)

categorical_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()
print("Categorical features:", categorical_features)

train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features)

model = CatBoostRegressor(
    iterations=420,        # Number of trees
    depth=8,                # Tree depth
    learning_rate=0.05,      # Step size
    loss_function='RMSE',   # Regression loss
    eval_metric='RMSE',
    random_seed=42,
    verbose=100
)

# Train
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

preds = model.predict(X_test)

from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, preds)
r2 = r2_score(y_test, preds)

print("RMSE:", mse**0.5)
print("R2 score:", r2)

# Save model to file
model.save_model("catboost_price_model.cbm")

Categorical features: ['status', 'city', 'state']
0:	learn: 0.2044285	test: 0.1634535	best: 0.1634535 (0)	total: 731ms	remaining: 5m 6s
100:	learn: 0.1594282	test: 0.1203414	best: 0.1203414 (100)	total: 30.9s	remaining: 1m 37s
200:	learn: 0.1531538	test: 0.1170388	best: 0.1170388 (200)	total: 1m 1s	remaining: 1m 7s
300:	learn: 0.1472035	test: 0.1143170	best: 0.1143037 (299)	total: 1m 27s	remaining: 34.4s
400:	learn: 0.1442421	test: 0.1135449	best: 0.1135340 (398)	total: 1m 57s	remaining: 5.57s
419:	learn: 0.1432292	test: 0.1135008	best: 0.1135008 (419)	total: 2m 1s	remaining: 0us

bestTest = 0.1135008156
bestIteration = 419

RMSE: 0.11350081564849655
R2 score: 0.5286848511230314


In [2]:
print(df_clean.columns.tolist())


['status', 'bed', 'bath', 'acre_lot', 'city', 'state', 'zip_code', 'house_size', 'price_bin_equal_width']
