In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("2025_Airbnb_NYC_listings_for_modeling.csv", encoding='utf-8')
df.columns

Index(['neighbourhood_group_cleansed', 'neighbourhood_cleansed', 'latitude',
       'longitude', 'dist_from_center(km)', 'room_type', 'accommodates',
       'bedrooms', 'beds', 'bathrooms_cleansed', 'price($)',
       'minimum_nights_cleansed', 'luxury_amenities_cnt',
       'service_amenities_cnt', 'design_amenities_cnt',
       'essential_amenities_cnt', 'estimated_occupancy'],
      dtype='object')

In [5]:
import pandas as pd
import numpy as np

from catboost import Pool
import optuna

from sklearn.model_selection import train_test_split

# Train-test split
X = df.drop(columns=["price($)"])[:int(len(df)*0.9)]
X_test = df.drop(columns=["price($)"])[int(len(df)*0.9):]
y = np.log1p(df['price($)'])[:int(len(df)*0.9)]
y_test = np.log1p(df['price($)'])[int(len(df)*0.9):]

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42
)

# Identify categorical features
cat_cols = X_train.select_dtypes(
    include=["object", "category"]
).columns.tolist()

cat_features = [X_train.columns.get_loc(col) for col in cat_cols]

# Create CatBoost Pools
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_features
)

val_pool = Pool(
    data=X_val,
    label=y_val,
    cat_features=cat_features
)

from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Define the objective function for Optuna
def objective(trial):

    params = {
        "iterations": 2000,
        "learning_rate": trial.suggest_float(
            "learning_rate", 0.01, 0.3, log=True
        ),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float(
            "l2_leaf_reg", 1.0, 10.0, log=True
        ),
        "bagging_temperature": trial.suggest_float(
            "bagging_temperature", 0.0, 1.0
        ),
        "random_strength": trial.suggest_float(
            "random_strength", 0.0, 1.0
        ),
        "loss_function": "RMSE",
        "eval_metric": "RMSE",
        "random_seed": 42,
        "verbose": False
    }

    model = CatBoostRegressor(**params)

    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=100,
        verbose=False
    )

    preds = model.predict(val_pool)
    rmse = float(np.sqrt(mean_squared_error(y_val, preds)))
    
    return rmse  # minimize

# Optimize hyperparameters using Optuna
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)

print("Best RMSE:", study.best_value)
print("Best Params:", study.best_params)

best_params = {'learning_rate' : 0.0179,
    'depth':  9,
    'l2_leaf_reg' : 1.6818,
    'bagging_temperature':  0.053,
    'random_strength': 0.2266 }

# Train the final model with the best hyperparameters
final_model = CatBoostRegressor(
    **best_params,
    iterations=3000,
    loss_function="RMSE",
    eval_metric="RMSE",
    random_seed=42
)

final_model.fit(
    train_pool,
    eval_set=val_pool,
    early_stopping_rounds=100,
    verbose=100
)


[I 2025-12-29 15:01:49,641] A new study created in memory with name: no-name-92c1e095-0ec2-4d14-b6bd-588472c77f3a
[I 2025-12-29 15:02:02,260] Trial 0 finished with value: 0.358845087607268 and parameters: {'learning_rate': 0.2165127435875816, 'depth': 10, 'l2_leaf_reg': 1.1862189090982573, 'bagging_temperature': 0.7331877788633671, 'random_strength': 0.8610370434778359}. Best is trial 0 with value: 0.358845087607268.
[I 2025-12-29 15:02:45,355] Trial 1 finished with value: 0.3605870861117234 and parameters: {'learning_rate': 0.014691411392438665, 'depth': 7, 'l2_leaf_reg': 9.931603202077106, 'bagging_temperature': 0.19501388811041287, 'random_strength': 0.18708610975898377}. Best is trial 0 with value: 0.358845087607268.
[I 2025-12-29 15:03:18,843] Trial 2 finished with value: 0.3540311169975253 and parameters: {'learning_rate': 0.04601654621081967, 'depth': 7, 'l2_leaf_reg': 3.757823119384694, 'bagging_temperature': 0.24186887372998633, 'random_strength': 0.3181679514443869}. Best is 

Best RMSE: 0.3519813333281546
Best Params: {'learning_rate': 0.020514335327978993, 'depth': 9, 'l2_leaf_reg': 2.0161588102923163, 'bagging_temperature': 0.7298000442955633, 'random_strength': 0.653820102935922}
0:	learn: 0.6211791	test: 0.6289332	best: 0.6289332 (0)	total: 27.7ms	remaining: 1m 23s
100:	learn: 0.3704147	test: 0.3986818	best: 0.3986818 (100)	total: 3.07s	remaining: 1m 28s
200:	learn: 0.3345791	test: 0.3735261	best: 0.3735261 (200)	total: 5.75s	remaining: 1m 20s
300:	learn: 0.3204508	test: 0.3666533	best: 0.3666533 (300)	total: 8.53s	remaining: 1m 16s
400:	learn: 0.3114107	test: 0.3634280	best: 0.3634280 (400)	total: 11.5s	remaining: 1m 14s
500:	learn: 0.3040449	test: 0.3615021	best: 0.3615021 (500)	total: 14.5s	remaining: 1m 12s
600:	learn: 0.2969129	test: 0.3599084	best: 0.3599036 (599)	total: 17.7s	remaining: 1m 10s
700:	learn: 0.2899686	test: 0.3586198	best: 0.3586198 (700)	total: 20.7s	remaining: 1m 7s
800:	learn: 0.2837339	test: 0.3578277	best: 0.3578277 (800)	total

<catboost.core.CatBoostRegressor at 0x1bd00225050>

In [None]:
# Evaluate the final model on the test set
y_pred_cat = final_model.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define evaluation function
def evaluate(y_true, y_pred):
    return {
        'MAE': round(mean_absolute_error(y_true, y_pred),4),
        'MSE' : round(mean_squared_error(y_true, y_pred),4),
        'RMSE': round(float(np.sqrt(mean_squared_error(y_true, y_pred))),4),
        'R2': round(r2_score(y_true, y_pred),4)
    }

cat_eval = evaluate(y_test, y_pred_cat)
print("CatBoost Evaluation:", cat_eval)

CatBoost Evaluation: {'MAE': 0.2634, 'MSE': 0.1101, 'RMSE': 0.3318, 'R2': 0.7759}


In [31]:
for k,v in best_params.items():
    print(f'{k}: {round(v,4)}')

learning_rate: 0.0179
depth: 9
l2_leaf_reg: 1.6818
bagging_temperature: 0.053
random_strength: 0.2266


In [None]:
# Save the final model
import joblib
joblib.dump(final_model, "model.pkl")

model_path = "catboost_model.cbm"

final_model.save_model(model_path)
