# CatBoost Model

## Importing Libraries & Data

In [1]:
import numpy as np 
import pandas as pd 
import geopandas as gpd

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from catboost import CatBoostRegressor, Pool

import warnings
warnings.filterwarnings("ignore", category = RuntimeWarning)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
listings = pd.read_csv(r"listings_cleaner.csv")

## Dropping & Renaming Columns

In [3]:
listings.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'accommodates', 'availability_30',
       'availability_365', 'bathrooms', 'bedrooms', 'beds',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'description',
       'has_availability', 'host_acceptance_rate', 'host_has_profile_pic',
       'host_id', 'host_identity_verified', 'host_is_superhost',
       'host_listings_count', 'host_neighbourhood', 'host_response_rate',
       'host_response_time', 'host_total_listings_count', 'id',
       'instant_bookable', 'last_scraped', 'latitude', 'longitude',
       'maximum_nights', 'minimum_nights', 'name',
       'neighbourhood_group_cleansed', 'number_of_reviews', 'price',
       'review_scores_accuracy', 'review_scores_checkin',
       'review_scores_cleanliness', 'review_scores_communication',
       'review_scores_location', 'review_scores_rating', '

In [4]:
# Listings preprocessing
listings_pp = listings.drop(columns = 
                ["Unnamed: 0.1", "Unnamed: 0", "host_id", "id", # Removing index columns and IDs
                "name", "last_scrape_period", "description", "last_scraped"]) # Removing free text columns

# Renaming long column names to make them easier to work with
listings_pp = listings_pp.rename(columns = {
    "accommodates": "accomm", "instant_bookable": "instant_book",
    "neighbourhood_group_cleansed": "boro", "host_neighbourhood": "nbhd",
    "host_has_profile_pic": "has_pfp", "host_tenure": "tenure", "host_is_superhost": "is_superhost",
    "host_identity_verified": "id_ver", "host_listings_count": "lst_cnt", "host_total_listings_count": "total_lst_cnt",
    "host_response_rate": "rsp_rate", "host_response_time": "rsp_time", "host_acceptance_rate": "accept_rate",
    "availability_30": "avail_30", "availability_365": "avail_365", "has_availability": "has_avail",
    "maximum_nights": "max_nights", "minimum_nights": "min_nights",
    "number_of_reviews": "no_rev", "review_scores_accuracy": "rev_acc", "review_scores_checkin": "rev_checkin",
    "review_scores_cleanliness": "rev_clean", "review_scores_communication": "rev_coms", "review_scores_location": "rev_loc",
    "review_scores_value": "rev_val", "review_scores_rating": "rev_rating",
    "scrape_2024-10": "oct", "scrape_2024-11": "nov", "scrape_2024-12": "dec", "scrape_2025-01": "jan",
    "scrape_2025-02": "feb", "scrape_2025-03": "mar", "scrape_2025-04": "apr", "scrape_2025-05": "may", 
    "scrape_2025-06": "june", "scrape_2025-07": "jul", "scrape_2025-08": "aug", 
    "prop_apartment": "prop_apt", "prop_entire home/apt": "prop_entire_home_apt","prop_hotel room": "prop_hotel_room",
    "prop_private room": "prop_private_room", "prop_shared room": "prop_shared_room",
    "latitude": "lat", "longitude": "lon"
})

In [5]:
# Setting cap of prices at 1500 - accounts for about 95% of Airbnb listings, reducing the heavy right skew and removing outliers
listings_pp = listings_pp[listings_pp["price"] <= 1500]

In [6]:
# Checking missing values
listings_pp.isna().sum()

accomm                                              0
avail_30                                            0
avail_365                                           0
bathrooms                                          72
bedrooms                                          673
beds                                             1037
calculated_host_listings_count                      0
calculated_host_listings_count_entire_homes         0
calculated_host_listings_count_private_rooms        0
calculated_host_listings_count_shared_rooms         0
has_avail                                        2514
accept_rate                                         0
has_pfp                                            46
id_ver                                             46
is_superhost                                     4060
lst_cnt                                            46
nbhd                                            49209
rsp_rate                                            0
rsp_time                    

# Imputing Data

CatBoost deals with missing values natively and does not need any scaling. I plan to leave most missing values as is - but neighborhoods can be dealt with as there are no missing lat/lon values.

In [7]:
# Checking neighborhood names - Tons of Non-NYC neighborhoods (Sao Paulo, Miami Beach??), Gonna use new datasets to map out neighborhoods
listings_pp["nbhd"].unique();

In [8]:
# Loading GeoJSON to map neighborhoods onto with lat/lon
nbhd_gdf = gpd.read_file(r"custom_neighborhoods_nyc.geojson")
nbhd_gdf.columns

Index(['neighborhood', 'boroughCode', 'borough', 'X.id', 'geometry'], dtype='object')

In [9]:
# Resetting index before creating gpd df
listings_pp = listings_pp.reset_index(drop = False) 

# Dropping old columns
listings_pp = listings_pp.drop(columns=["nbhd", "boro"])

# Converting listings to gpd df
listings_gdf = gpd.GeoDataFrame(listings_pp, geometry = gpd.points_from_xy(listings_pp.lon, listings_pp.lat), crs = nbhd_gdf.crs)

# Spatial Join
listings_gdf = gpd.sjoin(listings_gdf, nbhd_gdf[["neighborhood", "borough", "geometry"]], how = "left", predicate = "within")

# Renaming columns to match original df column names
listings_gdf = listings_gdf.rename(columns={"neighborhood": "nbhd", "borough": "boro"})

# Mapping new values by joining
listings_pp = listings_pp.merge(listings_gdf[["index", "nbhd", "boro"]], on = "index", how = "left")

# Dropping index
listings_pp = listings_pp.drop(columns = ["index"])

# Filling in points that did not map out
listings_pp["nbhd"] = listings_pp["nbhd"].fillna("Unknown")
listings_pp["boro"] = listings_pp["boro"].fillna("Unknown")

In [10]:
# Checking missing values again - No missing boroughs or neighborhoods
listings_pp.isna().sum();

# Train Test Split

In [11]:
# Separating target variable from features
X = listings_pp.drop(columns = ["price"])
y = listings_pp["price"]

In [12]:
# Train-test-split for dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2025)

In [13]:
# # Log transforming target variable for improved stability
# y_train_log = np.log(y_train)
# y_test_log = np.log(y_test)

In [14]:
# Assigning non one-hot encoded categories for CatBoost
cat_features = ["boro", "nbhd", "rsp_time"]

In [15]:
# Replacing NaN values with string values for CatBoost
X_train[cat_features] = X_train[cat_features].fillna("Missing")
X_test[cat_features]  = X_test[cat_features].fillna("Missing")

In [16]:
# # Assigning CatBoost Pool objects for training and testing sets (after log scale)
# train_pool = Pool(data = X_train, label = y_train_log, cat_features = cat_features)
# test_pool = Pool(data = X_test, label = y_test_log, cat_features = cat_features)

In [17]:
# Assigning CatBoost Pool objects for training and testing sets
train_pool = Pool(data = X_train, label = y_train, cat_features = cat_features)
test_pool = Pool(data = X_test, label = y_test, cat_features = cat_features)

# CatBoost

## Optuna Trials

In [18]:
# # Defining Optuna objective function
# def objective(trial):
#     params = {
#         "iterations": trial.suggest_int("iterations", 400, 1600),
#         "depth": trial.suggest_int("depth", 4, 14),
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log = True),
#         "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 2, log = True),
#         "border_count": trial.suggest_int("border_count", 64, 128),
#         "loss_function": "RMSE",
#         "verbose": 0,
#         "thread_count": -1,
#         "task_type": "GPU"
#     }
#     # Instantiating CatBoost Model 
#     model = CatBoostRegressor(**params)
#     # Fitting CatBoost
#     model.fit(train_pool, eval_set = test_pool, early_stopping_rounds = 50, verbose = 0)
#     # Model predictions
#     preds_log = model.predict(test_pool)
#     # RMSE for each Trial
#     rmse_log = root_mean_squared_error(y_test_log, preds_log)
#     return rmse_log

In [19]:
# Defining Optuna objective function for MAE
def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 400, 1600),
        "depth": trial.suggest_int("depth", 4, 14),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 2, log=True),
        "border_count": trial.suggest_int("border_count", 64, 128),
        "loss_function": "MAE",  # Use MAE as CatBoost loss
        "verbose": 0,
        "thread_count": -1,
        "task_type": "GPU"
    }

    # Instantiating CatBoost Model 
    model = CatBoostRegressor(**params)

    # Fitting CatBoost
    model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50, verbose=0)

    # Model predictions (original scale)
    preds = model.predict(test_pool)

    # MAE for each trial
    mae = mean_absolute_error(y_test, preds)

    return mae


In [20]:
# Creating Optuna study for hyperparameter tuning
study = optuna.create_study(direction = "minimize")
study.optimize(objective, n_trials = 20)

[I 2025-10-04 14:58:54,281] A new study created in memory with name: no-name-4f3ce54a-d5de-4286-bcda-d83062b7859c
Default metric period is 5 because MAE is/are not implemented for GPU
[I 2025-10-04 14:59:09,560] Trial 0 finished with value: 103.77047960146687 and parameters: {'iterations': 741, 'depth': 8, 'learning_rate': 0.0524652001950576, 'l2_leaf_reg': 0.18517304827366027, 'border_count': 85}. Best is trial 0 with value: 103.77047960146687.
Default metric period is 5 because MAE is/are not implemented for GPU
[I 2025-10-04 15:15:12,163] Trial 1 finished with value: 103.97367249858857 and parameters: {'iterations': 1391, 'depth': 12, 'learning_rate': 0.024384893571368187, 'l2_leaf_reg': 0.4864330593514062, 'border_count': 80}. Best is trial 0 with value: 103.77047960146687.
Default metric period is 5 because MAE is/are not implemented for GPU
[I 2025-10-04 15:15:27,299] Trial 2 finished with value: 104.40839576488418 and parameters: {'iterations': 987, 'depth': 5, 'learning_rate': 

In [21]:
# Getting best parameters of Optuna study and running that model
best_params = study.best_trial.params
best_model = CatBoostRegressor(**best_params, task_type = "GPU")
best_model.fit(train_pool, eval_set = test_pool, verbose = 0)

<catboost.core.CatBoostRegressor at 0x21d55037f10>

In [22]:
# Getting feature importance
feature_importance = best_model.get_feature_importance(prettified = True)
print(feature_importance)

                                      Feature Id  Importances
0                                            lon     8.516266
1                                           nbhd     7.011327
2                                         accomm     6.286582
3                                     min_nights     6.062270
4                                      bathrooms     5.842530
5                                            lat     5.359759
6                                        lst_cnt     4.542871
7                                         tenure     4.034548
8                                  total_lst_cnt     3.946952
9                                    accept_rate     3.823221
10                                      avail_30     3.066339
11                                      bedrooms     2.926937
12                                     avail_365     2.839110
13                              host_since_month     2.663023
14                                    max_nights     2.435877
15      