# Feature Engineering

## Loading Libraries & Data

In [44]:
# Loading Libraries

# Data Analysis
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
# Other
import duckdb as ddb
from pathlib import Path
from tqdm.notebook import tqdm
tqdm.pandas() 

In [45]:
# Loading the CSV
file_path = Path("..") / "data" / "listings_clean.csv"
listings = pd.read_csv(file_path)

  listings = pd.read_csv(file_path)


## Collapsing Categorial Columns

In [46]:
# Viewing column names
listings.columns

Index(['accommodates', 'amenities', 'availability_30', 'availability_365',
       'bathrooms', 'bathrooms_text', 'bedrooms', 'beds',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'description',
       'has_availability', 'host_acceptance_rate', 'host_has_profile_pic',
       'host_id', 'host_identity_verified', 'host_is_superhost',
       'host_listings_count', 'host_neighbourhood', 'host_response_rate',
       'host_response_time', 'host_since', 'host_total_listings_count',
       'host_verifications', 'id', 'instant_bookable', 'last_scraped',
       'latitude', 'longitude', 'maximum_nights', 'minimum_nights', 'name',
       'neighbourhood_group_cleansed', 'number_of_reviews', 'price',
       'property_type', 'review_scores_accuracy', 'review_scores_checkin',
       'review_scores_cleanliness', 'review_scores_communication',
    

# Bed / Bath Columns

In [47]:
rooms = listings[["bathrooms", "bathrooms_text", "bedrooms", "beds"]].copy()

In [48]:
rooms.describe(include = "all")

Unnamed: 0,bathrooms,bathrooms_text,bedrooms,beds
count,245101.0,244983,244500.0,244132.0
unique,,37,,
top,,1 bath,,
freq,,122533,,
mean,1.195907,,1.345611,1.64237
std,0.557604,,0.964321,1.205072
min,0.0,,0.0,0.0
25%,1.0,,1.0,1.0
50%,1.0,,1.0,1.0
75%,1.0,,2.0,2.0


In [49]:
rooms["bathrooms_text"].unique()

array(['1 private bath', '1 shared bath', '1 bath', '2 baths',
       '1.5 shared baths', '1.5 baths', '3 baths', '2.5 baths', nan,
       '5 baths', '0 shared baths', 'Shared half-bath', '2 shared baths',
       '3.5 baths', '4.5 baths', 'Private half-bath', '0 baths',
       '15.5 baths', '3 shared baths', '10.5 baths', '4 baths',
       '2.5 shared baths', 'Half-bath', '3.5 shared baths',
       '4 shared baths', '4.5 shared baths', '5.5 baths', '6 baths',
       '7 baths', '6 shared baths', '7.5 baths', '5 shared baths',
       '9 baths', '7 shared baths', '6.5 baths', '7.5 shared baths',
       '11.5 baths', '8 baths'], dtype=object)

In [50]:
# Strings that implies that bathroom was shared
shared_bath_texts = "shared bath|shared baths"

# Adding a 0.5 value for Half-Baths
rooms.loc[rooms["bathrooms_text"].str.contains("half", case = False, na = False), "bathrooms"] = 0.5

# Adding a binary-indicator column for shared bathrooms, ignoring Na/NaNs (1 = shared)
rooms["shared_bath"] = (rooms["bathrooms_text"]
                        .str.contains(shared_bath_texts, case = False, na = False)
                        .astype(int))

In [51]:
# Now bathrooms_text can be removed and shared_bath can be added

# Adding shared_bath column
listings["shared_bath"] = rooms["shared_bath"].copy()

# Removing bathrooms_text column
listings = listings.drop(columns = ["bathrooms_text"])

In [52]:
# Bedrooms are fine
rooms["bedrooms"].describe()

count    244500.000000
mean          1.345611
std           0.964321
min           0.000000
25%           1.000000
50%           1.000000
75%           2.000000
max          16.000000
Name: bedrooms, dtype: float64

In [53]:
# Beds
listings["beds"].unique()

array([ 1.,  4.,  2.,  3.,  8.,  6.,  7.,  5.,  0.,  9., 19., 40., 12.,
       10., 11., 14., 13., nan, 16., 21., 42., 17.])

In [54]:
# Too many bedrooms?? If it is not an entire home or apartment and there are more than a dozen rooms, it could be an outlier
too_many_beds = listings[(listings["beds"] >= 12) & (listings["room_type"] != "Entire home/apt")]

In [55]:
# I guess a single row with 3 bedrooms and 12 beds isn"t impossible, though that would suck. It stays.
too_many_beds

Unnamed: 0,accommodates,amenities,availability_30,availability_365,bathrooms,bedrooms,beds,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,...,review_scores_accuracy,review_scores_checkin,review_scores_cleanliness,review_scores_communication,review_scores_location,review_scores_rating,review_scores_value,reviews_per_month,room_type,shared_bath
43202,2,"[""Self check-in"", ""BBQ grill"", ""Lockbox"", ""Fir...",29,364,2.0,3.0,12.0,1,0,1,...,,,,,,,,,Private room,0


# Availability Columns

In [56]:
# Creating copy slice of just the availability columns
avail = listings[["availability_30", "availability_365", "maximum_nights", "minimum_nights"]].copy()

In [57]:
# These columns are fine
avail.describe()

Unnamed: 0,availability_30,availability_365,maximum_nights,minimum_nights
count,245173.0,245173.0,245173.0,245173.0
mean,14.916365,241.655774,468.339303,26.988412
std,12.718885,110.858411,403.520096,27.195109
min,0.0,0.0,1.0,1.0
25%,0.0,153.0,120.0,30.0
50%,15.0,269.0,365.0,30.0
75%,29.0,348.0,730.0,30.0
max,30.0,365.0,10000.0,730.0


# Binary Columns

In [58]:
# Creating a copy slice of variables with boolean values
binaries = listings[["has_availability",
                    "host_has_profile_pic",
                    "host_is_superhost",
                    "host_identity_verified",
                    "instant_bookable"]].copy()

In [59]:
binaries.head()

Unnamed: 0,has_availability,host_has_profile_pic,host_is_superhost,host_identity_verified,instant_bookable
0,True,True,False,True,False
1,True,True,False,False,False
2,True,True,True,True,False
3,True,True,False,True,False
4,True,True,False,True,False


In [60]:
# Ensuring 
for col in binaries:
    print(listings[col].unique())

[True nan False]
[True False nan]
[False True nan]
[True False nan]
[False  True]


In [61]:
# Binary encoding boolean values - astype(int) cannot deal with missing values which will be dealt with later
for col in binaries:
    listings[col] = listings[col].map({True: 1, False: 0})

In [62]:
# Checking results
binaries.head()

Unnamed: 0,has_availability,host_has_profile_pic,host_is_superhost,host_identity_verified,instant_bookable
0,True,True,False,True,False
1,True,True,False,False,False
2,True,True,True,True,False
3,True,True,False,True,False
4,True,True,False,True,False


# Host Properties

In [63]:
# Creating copied slice of host properties
host_props = listings[["host_acceptance_rate",
                    "host_listings_count",
                    "host_response_rate",
                    "host_since",
                    "host_total_listings_count",
                    "host_verifications",
                    ]].copy()

In [64]:
host_props.head()

Unnamed: 0,host_acceptance_rate,host_listings_count,host_response_rate,host_since,host_total_listings_count,host_verifications
0,81%,4.0,100%,2008-09-07,13.0,"['email', 'phone']"
1,33%,2.0,100%,2009-05-07,2.0,"['email', 'phone', 'work_email']"
2,96%,4.0,100%,2009-06-12,4.0,"['email', 'phone']"
3,,2.0,100%,2009-07-10,3.0,"['email', 'phone']"
4,0%,1.0,,2009-08-06,1.0,"['email', 'phone']"


In [65]:
# Checking Host Verifications
listings["host_verifications"].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['phone', 'work_email']", "['email']", nan, '[]'],
      dtype=object)

# Amenities

In [66]:
# Amenities column contains far too much cardinality and irrelevance. A smoke detector and fire extinguishers are truly not amenities. The column is best removed.
listings = listings.drop(columns = ["amenities"])

# Host Acceptance & Response Rates

In [67]:
# Percentages not suitable
listings["host_acceptance_rate"].isna().sum()

np.int64(38154)

In [68]:
# Fixing acceptance rate feature
listings["host_acceptance_rate"] = listings["host_acceptance_rate"].fillna("0%")  # Temporary filling NaN values with 0s
listings["host_acceptance_rate"] = listings["host_acceptance_rate"].str.rstrip("%").astype(float) / 100 # Removing percentage signs and turning percentages into floats

In [69]:
# Fixing response rate feature
listings["host_response_rate"] = listings["host_response_rate"].fillna("0%")  # Temporary filling NaN values with 0s
listings["host_response_rate"] = listings["host_response_rate"].str.rstrip("%").astype(float) / 100 # Removing percentage signs and turning percentages into floats

# Host Verifications

In [70]:
# There is already a column for whether or not a host verified at all, which is likely more important than in which ways they verified, which would add too much redundancy to keep as features
listings["host_verifications"].unique()

array(["['email', 'phone']", "['email', 'phone', 'work_email']",
       "['phone']", "['phone', 'work_email']", "['email']", nan, '[]'],
      dtype=object)

In [71]:
listings = listings.drop(columns = ["host_verifications"])

# Date Features

In [72]:
# Timestamps would not make a great feature non time-series based ML

# Ensure datetime
listings["host_since"] = pd.to_datetime(listings["host_since"], errors="coerce")

# Host tenure in years (numeric) - how long they have been a host
listings["host_tenure"] = (pd.Timestamp.today() - listings["host_since"]).dt.days / 365

# Year the host joined (numeric, for trends)
listings["host_since_year"] = listings["host_since"].dt.year

# Month the host joined (numeric, 1-12, for seasonality)
listings["host_since_month"] = listings["host_since"].dt.month

In [73]:
# Dropping host_since column
listings = listings.drop(columns = ["host_since"])

In [74]:
with pd.option_context("display.max_columns", None):
    display(listings.head())

Unnamed: 0,accommodates,availability_30,availability_365,bathrooms,bedrooms,beds,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,description,has_availability,host_acceptance_rate,host_has_profile_pic,host_id,host_identity_verified,host_is_superhost,host_listings_count,host_neighbourhood,host_response_rate,host_response_time,host_total_listings_count,id,instant_bookable,last_scraped,latitude,longitude,maximum_nights,minimum_nights,name,neighbourhood_group_cleansed,number_of_reviews,price,property_type,review_scores_accuracy,review_scores_checkin,review_scores_cleanliness,review_scores_communication,review_scores_location,review_scores_rating,review_scores_value,reviews_per_month,room_type,shared_bath,host_tenure,host_since_year,host_since_month
0,2,23,358,1.0,1.0,1.0,4,0,4,0,Renovated apt home in elevator building.,1.0,0.81,1.0,2787,1.0,0.0,4.0,Gravesend,1.0,within an hour,13.0,2539,0,2025-04-02,40.64529,-73.97238,730,30,Superfast Wi-Fi. Clean & quiet home by the park,Brooklyn,9,$128.00,Private room in rental unit,4.88,5.0,5.0,5.0,4.75,4.89,4.88,0.08,Private room,0,17.060274,2008.0,9.0
1,1,23,83,1.0,1.0,1.0,2,0,2,0,This charming distancing-friendly month-to-mon...,1.0,0.33,1.0,16104,0.0,0.0,2.0,East Harlem,1.0,within a few hours,2.0,6872,0,2025-04-02,40.80107,-73.94255,180,30,Uptown Sanctuary w/ Private Bath (Month to Month),Manhattan,1,$59.00,Private room in condo,5.0,5.0,5.0,5.0,5.0,5.0,5.0,0.03,Private room,1,16.39726,2009.0,5.0
2,2,0,0,1.0,0.0,1.0,1,1,0,0,A huge loft in a repurposed factory building i...,1.0,0.96,1.0,21207,1.0,1.0,4.0,Williamsburg,1.0,within an hour,4.0,7801,0,2025-04-02,40.718807,-73.956177,120,30,Sunny Williamsburg Loft with Sauna,Brooklyn,13,$290.00,Entire place,4.78,4.89,4.67,4.78,5.0,4.91,4.89,0.07,Entire home/apt,0,16.29863,2009.0,6.0
3,5,3,165,1.0,1.0,4.0,2,2,0,0,Soak up the modern and vintage charm<br />of t...,1.0,0.0,1.0,25183,1.0,0.0,2.0,Bedford-Stuyvesant,1.0,within an hour,3.0,8490,0,2025-04-02,40.684556,-73.939634,1125,30,"Maison des Sirenes1,bohemian, luminous apartment",Brooklyn,190,$170.00,Entire loft,4.83,4.88,4.74,4.88,4.67,4.77,4.76,1.0,Entire home/apt,0,16.221918,2009.0,7.0
4,2,0,0,1.0,,1.0,1,1,0,0,PLEASE DO NOT REQUEST TO BOOK UNTIL WE HAVE ME...,1.0,0.0,1.0,30193,1.0,0.0,1.0,Hell's Kitchen,0.0,,1.0,9357,0,2025-04-02,40.76724,-73.98664,120,30,Midtown Pied-a-terre,Manhattan,58,$175.00,Entire rental unit,4.68,4.97,4.16,5.0,4.95,4.52,4.58,0.31,Entire home/apt,0,16.147945,2009.0,8.0


In [75]:
# Converting last_scraped to datetime
listings["last_scraped"] = pd.to_datetime(listings["last_scraped"])

# Turning "Last Scraped" into a monthly and yearly feature - unfortunately, although there is a temporal component, it is not continuous, so they are better treated as categories
listings['last_scrape_period'] = listings['last_scraped'].dt.to_period('M')

# One-hot encoding
last_scrape_onehot = pd.get_dummies(listings['last_scrape_period'], prefix = 'scrape').astype(int)
listings = pd.concat([listings, last_scrape_onehot], axis=1)

# Property & Room Type

In [76]:
listings["property_type"].unique()

array(['Private room in rental unit', 'Private room in condo',
       'Entire place', 'Entire loft', 'Entire rental unit',
       'Private room in guest suite', 'Private room in townhouse',
       'Entire guesthouse', 'Entire townhouse', 'Entire condo',
       'Private room in loft', 'Entire home', 'Private room in home',
       'Room in boutique hotel', 'Private room in bed and breakfast',
       'Entire serviced apartment', 'Entire guest suite',
       'Private room in houseboat', 'Private room',
       'Shared room in rental unit', 'Private room in guesthouse', 'Boat',
       'Private room in religious building', 'Casa particular',
       'Private room in villa', 'Shared room in home',
       'Shared room in townhouse', 'Private room in tiny home',
       'Entire bungalow', 'Room in hotel', 'Houseboat',
       'Shared room in condo', 'Entire villa', 'Entire cottage',
       'Room in serviced apartment', 'Tiny home', 'Shared room in loft',
       'Private room in serviced apartment',

In [77]:
# Making text lowercase
listings["property_type"] = listings["property_type"].str.lower()

# Function to removing redundant text captured in other features
def clean_property_types(col):
    rem_props = ["entire", "private", "shared", "room", "in"]
    return col.apply(lambda x: " ".join([word for word in x.split() if word not in rem_props]))

In [78]:
# Applying the function, checking results
listings["property_type_clean"] = clean_property_types(listings["property_type"])

# Dropping original column
listings = listings.drop(columns = ["property_type"])

In [79]:
# Checking counts of each property type
listings["property_type_clean"].unique()

array(['rental unit', 'condo', 'place', 'loft', 'guest suite',
       'townhouse', 'guesthouse', 'home', 'boutique hotel',
       'bed and breakfast', 'serviced apartment', 'houseboat', '', 'boat',
       'religious building', 'casa particular', 'villa', 'tiny home',
       'bungalow', 'hotel', 'cottage', 'aparthotel', 'vacation home',
       'dorm', 'earthen home', 'resort', 'barn', 'camper/rv', 'hostel',
       'kezhan', 'ranch', 'tower', 'castle', 'dome'], dtype=object)

In [80]:
listings["property_type_clean"].value_counts()

property_type_clean
rental unit           164362
home                   32239
condo                  11234
townhouse              10620
hotel                   8500
guest suite             4307
loft                    3971
serviced apartment      2604
boutique hotel          2189
guesthouse               657
bed and breakfast        654
place                    650
aparthotel               613
casa particular          454
resort                   309
                         300
vacation home            285
hostel                   278
bungalow                 221
villa                    174
tiny home                135
camper/rv                 99
houseboat                 68
boat                      49
earthen home              43
cottage                   43
tower                     37
religious building        21
dome                      11
kezhan                    11
ranch                     11
barn                      10
castle                    10
dorm                   

In [81]:
# Collapsing categories - I have never seen a barn, a dome(?), a tower, a castle, or an "earthen home" in NYC...
apartment_like = ["rental unit", "condo", "place", "loft", "guest suite", "townhouse", "guesthouse", "home", "serviced apartment"]
hotel_like = ["hotel", "boutique hotel", "aparthotel", "bed and breakfast", "resort"]
vacation_homes = ["villa", "bungalow", "tiny home", "cottage", "vacation home", "casa particular"]
other = ["houseboat", "boat", "religious building", "dorm", "earthen home", "barn", "camper/rv", "hostel", "kezhan", "ranch", "tower", "castle", "dome"]

# Creating a mapping
mapping = {key: "apartment" for key in apartment_like}
mapping.update({key: "hotel" for key in hotel_like})
mapping.update({key: "vacation_home" for key in vacation_homes})
mapping.update({key: "other" for key in other})

# Applying the mapping
listings["property_type"] = listings["property_type_clean"].map(mapping)

In [82]:
# One-Hot Encoding the property types
property_dummies = pd.get_dummies(listings["property_type"], prefix = "prop")
listings = pd.concat([listings, property_dummies], axis = 1)

# Dropping property_type column
listings = listings.drop(columns=["property_type"])

In [83]:
# Dropping property_types_clean
listings = listings.drop(columns = ["property_type_clean"])

In [84]:
# Converting booleans into 0/1
listings[["prop_apartment", "prop_hotel", "prop_other", "prop_vacation_home"]] = listings[["prop_apartment", "prop_hotel", "prop_other", "prop_vacation_home"]].astype(int)

In [85]:
# Dealing with room types
listings["room_type"] = listings["room_type"].str.lower()

room_type_dummies = pd.get_dummies(listings["room_type"], prefix = "prop").astype(int)
listings = pd.concat([listings, room_type_dummies], axis = 1)

# Dropping original room_type column
listings = listings.drop(columns = ["room_type"])

# Saving Processed Data 

In [86]:
listings.to_csv("listings_cleaner.csv")