# CatBoost Model

## Importing Libraries & Data

In [None]:
import numpy as np 
import pandas as pd 
import geopandas as gpd

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from catboost import CatBoostRegressor, Pool

import warnings
warnings.filterwarnings("ignore", category = RuntimeWarning)

In [4]:
listings = pd.read_csv(r"listings_cleaner.csv")

## Dropping & Renaming Columns

In [5]:
listings.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'accommodates', 'availability_30',
       'availability_365', 'bathrooms', 'bedrooms', 'beds',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'description',
       'has_availability', 'host_acceptance_rate', 'host_has_profile_pic',
       'host_id', 'host_identity_verified', 'host_is_superhost',
       'host_listings_count', 'host_neighbourhood', 'host_response_rate',
       'host_response_time', 'host_total_listings_count', 'id',
       'instant_bookable', 'last_scraped', 'latitude', 'longitude',
       'maximum_nights', 'minimum_nights', 'name',
       'neighbourhood_group_cleansed', 'number_of_reviews', 'price',
       'review_scores_accuracy', 'review_scores_checkin',
       'review_scores_cleanliness', 'review_scores_communication',
       'review_scores_location', 'review_scores_rating', '

In [8]:
# Listings preprocessing
listings_pp = listings.drop(columns = 
                ["Unnamed: 0.1", "Unnamed: 0", "host_id", "id", # Removing index columns and IDs
                "name", "last_scrape_period", "description", "last_scraped"]) # Removing free text columns

# Renaming long column names to make them easier to work with
listings_pp = listings_pp.rename(columns = {
    "accommodates": "accomm", "instant_bookable": "instant_book",
    "neighbourhood_group_cleansed": "boro", "host_neighbourhood": "nbhd",
    "host_has_profile_pic": "has_pfp", "host_tenure": "tenure", "host_is_superhost": "is_superhost",
    "host_identity_verified": "id_ver", "host_listings_count": "lst_cnt", "host_total_listings_count": "total_lst_cnt",
    "host_response_rate": "rsp_rate", "host_response_time": "rsp_time", "host_acceptance_rate": "accept_rate",
    "availability_30": "avail_30", "availability_365": "avail_365", "has_availability": "has_avail",
    "maximum_nights": "max_nights", "minimum_nights": "min_nights",
    "number_of_reviews": "no_rev", "review_scores_accuracy": "rev_acc", "review_scores_checkin": "rev_checkin",
    "review_scores_cleanliness": "rev_clean", "review_scores_communication": "rev_coms", "review_scores_location": "rev_loc",
    "review_scores_value": "rev_val", "review_scores_rating": "rev_rating",
    "scrape_2024-10": "oct", "scrape_2024-11": "nov", "scrape_2024-12": "dec", "scrape_2025-01": "jan",
    "scrape_2025-02": "feb", "scrape_2025-03": "mar", "scrape_2025-04": "apr", "scrape_2025-05": "may", 
    "scrape_2025-06": "june", "scrape_2025-07": "jul", "scrape_2025-08": "aug", 
    "prop_apartment": "prop_apt", "prop_entire home/apt": "prop_entire_home_apt","prop_hotel room": "prop_hotel_room",
    "prop_private room": "prop_private_room", "prop_shared room": "prop_shared_room",
    "latitude": "lat", "longitude": "lon"
})

In [9]:
# Setting cap of prices at 1500 - accounts for about 95% of Airbnb listings, reducing the heavy right skew and removing outliers
listings_pp = listings_pp[listings_pp["price"] <= 1500]

In [10]:
# Checking missing values
listings_pp.isna().sum()

accomm                                              0
avail_30                                            0
avail_365                                           0
bathrooms                                          72
bedrooms                                          673
beds                                             1037
calculated_host_listings_count                      0
calculated_host_listings_count_entire_homes         0
calculated_host_listings_count_private_rooms        0
calculated_host_listings_count_shared_rooms         0
has_avail                                        2514
accept_rate                                         0
has_pfp                                            46
id_ver                                             46
is_superhost                                     4060
lst_cnt                                            46
nbhd                                            49209
rsp_rate                                            0
rsp_time                    

# Imputing Data

CatBoost deals with missing values natively and does not need any scaling. I plan to leave most missing values as is - but neighborhoods can be dealt with as there are no missing lat/lon values.

In [11]:
# Checking neighborhood names - Tons of Non-NYC neighborhoods (Sao Paulo, Miami Beach??), Gonna use new datasets to map out neighborhoods
listings_pp["nbhd"].unique();

In [13]:
# Loading GeoJSON to map neighborhoods onto with lat/lon
nbhd_gdf = gpd.read_file(r"custom_neighborhoods_nyc.geojson")
nbhd_gdf.columns

Index(['neighborhood', 'boroughCode', 'borough', 'X.id', 'geometry'], dtype='object')

In [14]:
# Resetting index before creating gpd df
listings_pp = listings_pp.reset_index(drop = False) 

# Dropping old columns
listings_pp = listings_pp.drop(columns=["nbhd", "boro"])

# Converting listings to gpd df
listings_gdf = gpd.GeoDataFrame(listings_pp, geometry = gpd.points_from_xy(listings_pp.lon, listings_pp.lat), crs = nbhd_gdf.crs)

# Spatial Join
listings_gdf = gpd.sjoin(listings_gdf, nbhd_gdf[["neighborhood", "borough", "geometry"]], how = "left", predicate = "within")

# Renaming columns to match original df column names
listings_gdf = listings_gdf.rename(columns={"neighborhood": "nbhd", "borough": "boro"})

# Mapping new values by joining
listings_pp = listings_pp.merge(listings_gdf[["index", "nbhd", "boro"]], on = "index", how = "left")

# Dropping index
listings_pp = listings_pp.drop(columns = ["index"])

# Filling in points that did not map out
listings_pp["nbhd"] = listings_pp["nbhd"].fillna("Unknown")
listings_pp["boro"] = listings_pp["boro"].fillna("Unknown")

In [15]:
# Checking missing values again - No missing boroughs or neighborhoods
listings_pp.isna().sum();

# Train Test Split

In [16]:
# Separating target variable from features
X = listings_pp.drop(columns = ["price"])
y = listings_pp["price"]

In [17]:
# Train-test-split for dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2025)

In [18]:
# Log transforming target variable for improved stability
y_train_log = np.log(y_train)
y_test_log = np.log(y_test)

In [19]:
# Assigning non one-hot encoded categories for CatBoost
cat_features = ["boro", "nbhd", "rsp_time"]

In [20]:
# Replacing NaN values with string values for CatBoost
X_train[cat_features] = X_train[cat_features].fillna("Missing")
X_test[cat_features]  = X_test[cat_features].fillna("Missing")

In [21]:
# Assigning CatBoost Pool objects for training and testing sets (after log scale)
train_pool = Pool(data = X_train, label = y_train_log, cat_features = cat_features)
test_pool = Pool(data = X_test, label = y_test_log, cat_features = cat_features)

# CatBoost

## Optuna Trials

In [22]:
# Defining Optuna objective function
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 400, 1600),
        "depth": trial.suggest_int("depth", 4, 14),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log = True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 2, log = True),
        "border_count": trial.suggest_int("border_count", 64, 128),
        "loss_function": "RMSE",
        "verbose": 0,
        "thread_count": -1,
        "task_type": "GPU"
    }
    # Instantiating CatBoost Model 
    model = CatBoostRegressor(**params)
    # Fitting CatBoost
    model.fit(train_pool, eval_set = test_pool, early_stopping_rounds = 50, verbose = 0)
    # Model predictions
    preds_log = model.predict(test_pool)
    # RMSE for each Trial
    rmse_log = root_mean_squared_error(y_test_log, preds_log)
    return rmse_log

In [23]:
# Creating Optuna study for hyperparameter tuning
study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 20)

[I 2025-09-27 21:18:28,841] A new study created in memory with name: no-name-32fb2da8-f99d-4c76-b6cc-221f21e73eca
[I 2025-09-27 21:19:04,390] Trial 0 finished with value: 0.18753091760173493 and parameters: {'iterations': 1280, 'depth': 9, 'learning_rate': 0.25848132486041486, 'l2_leaf_reg': 0.06907022265812623, 'border_count': 97}. Best is trial 0 with value: 0.18753091760173493.
[I 2025-09-27 21:19:19,184] Trial 1 finished with value: 0.3164062017382884 and parameters: {'iterations': 1044, 'depth': 6, 'learning_rate': 0.03249734100038222, 'l2_leaf_reg': 0.4369291063486238, 'border_count': 68}. Best is trial 0 with value: 0.18753091760173493.
[I 2025-09-27 21:23:58,505] Trial 2 finished with value: 0.21835518098975276 and parameters: {'iterations': 595, 'depth': 13, 'learning_rate': 0.03785169992826669, 'l2_leaf_reg': 0.1299591808750268, 'border_count': 74}. Best is trial 0 with value: 0.18753091760173493.
[I 2025-09-27 21:24:15,843] Trial 3 finished with value: 0.2548900240571038 and

In [25]:
# Getting best parameters of Optuna study and running that model
best_params = study.best_trial.params
best_model = CatBoostRegressor(**best_params, task_type = "GPU")
best_model.fit(train_pool, eval_set = test_pool, verbose = 0)

<catboost.core.CatBoostRegressor at 0x1f2290ac110>

In [26]:
# Getting feature importance
feature_importance = best_model.get_feature_importance(prettified = True)
print(feature_importance)

                                      Feature Id  Importances
0                                            lon     6.899270
1                                           nbhd     6.626849
2                                     min_nights     5.994288
3                                         accomm     5.993694
4                                            lat     5.770466
5                                        lst_cnt     4.101923
6                                  total_lst_cnt     3.945096
7                               host_since_month     3.786401
8                                       avail_30     3.698416
9                                    accept_rate     3.611715
10                                        tenure     3.575482
11                                     bathrooms     3.488931
12                             prop_private_room     2.914191
13                calculated_host_listings_count     2.844270
14                                     avail_365     2.717939
15      

# Feature Engineering

In [27]:
# Dropping features with lowest feature importance - narrowing down to 38 features total
listings_pp = listings_pp.drop(columns = ["prop_vacation_home", "prop_other", "calculated_host_listings_count_shared_rooms",
                                        "aug", "nov", "has_avail", "may", "june", "apr", "jul", "prop_shared_room", "has_pfp",
                                        "oct", "mar", "prop_apt", "id_ver", "is_superhost", "prop_hotel_room", "prop_entire_home_apt"])

In [29]:
# Trying again with less features
study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 20)

[I 2025-09-27 22:40:14,059] A new study created in memory with name: no-name-00a2008e-e4d6-47e5-a5e0-8ecb2540d0d0
[I 2025-09-27 22:43:24,059] Trial 0 finished with value: 0.27409434593544024 and parameters: {'iterations': 1152, 'depth': 10, 'learning_rate': 0.019995780575941858, 'l2_leaf_reg': 0.2029062421106852, 'border_count': 122}. Best is trial 0 with value: 0.27409434593544024.
[I 2025-09-27 22:43:34,000] Trial 1 finished with value: 0.35421710314646077 and parameters: {'iterations': 1041, 'depth': 4, 'learning_rate': 0.018046193830287143, 'l2_leaf_reg': 0.05577537510093967, 'border_count': 117}. Best is trial 0 with value: 0.27409434593544024.
[I 2025-09-27 22:44:09,268] Trial 2 finished with value: 0.29931989669737663 and parameters: {'iterations': 1218, 'depth': 6, 'learning_rate': 0.046645075400369745, 'l2_leaf_reg': 0.010565762641304191, 'border_count': 79}. Best is trial 0 with value: 0.27409434593544024.
[I 2025-09-27 22:50:42,537] Trial 3 finished with value: 0.18392516886

In [30]:
# Getting best parameters of Optuna study and running that model
best_params = study.best_trial.params
best_model = CatBoostRegressor(**best_params, task_type = "GPU")
best_model.fit(train_pool, eval_set = test_pool, verbose = 0)

<catboost.core.CatBoostRegressor at 0x1f2290afa10>

In [31]:
# Getting feature importance
feature_importance = best_model.get_feature_importance(prettified = True)
print(feature_importance)

                                      Feature Id  Importances
0                                           nbhd     7.722214
1                                            lon     6.249983
2                                         accomm     5.813094
3                                     min_nights     5.804737
4                                            lat     5.555680
5                               host_since_month     4.179289
6                                       avail_30     3.964570
7                                        lst_cnt     3.936082
8                                      bathrooms     3.478107
9                                    accept_rate     3.454592
10                                        tenure     3.258653
11                                 total_lst_cnt     2.828612
12                                     avail_365     2.773913
13                calculated_host_listings_count     2.752757
14                                      bedrooms     2.734219
15      