In [12]:
# CELL 1 – imports

import pandas as pd
import numpy as np



In [13]:
# Read the three gzip-compressed CSV exports; the tuple order below fixes which
# file lands in which name (calendar, listings, reviews).
calendar, listings, reviews = (
    pd.read_csv(csv_path, compression="gzip")
    for csv_path in (
        "../Data/calendar.csv.gz",
        "../Data/listings.csv.gz",
        "../Data/reviews.csv.gz",  # <-- IMPORTANT
    )
)

# Report the (rows, columns) of every frame as a quick sanity check.
for _label, _frame in (
    ("calendar:", calendar),
    ("listings:", listings),
    ("reviews:", reviews),
):
    print(_label, _frame.shape)

print(reviews.columns)  # quick check

calendar: (365, 7)
listings: (2876, 79)
reviews: (318549, 6)
Index(['listing_id', 'id', 'date', 'reviewer_id', 'reviewer_name', 'comments'], dtype='object')


In [14]:
# CELL 3 – basic cleaning (dates, prices, availability)

# --- calendar types ---
calendar['date'] = pd.to_datetime(calendar['date'])
# Encode the 't'/'f' flag as 1/0. The mapping also sends 1 -> 1 and 0 -> 0 so
# that re-running this cell leaves an already-converted column intact; a plain
# {'t': 1, 'f': 0} map would turn every value into NaN on a second run.
# Any other value (including missing) still becomes NaN.
calendar['available'] = calendar['available'].map({'t': 1, 'f': 0, 1: 1, 0: 0})

# price cleaner
def clean_price(x):
    """Parse a price such as ``"$1,234.00"`` into a float.

    Parameters
    ----------
    x : object
        Raw price value. It is converted with ``str`` before parsing, so
        numbers and missing values are accepted as well as strings.

    Returns
    -------
    float
        The numeric amount with ``$`` and thousands separators removed, or
        ``np.nan`` when the remaining text is not a valid number.
    """
    try:
        return float(str(x).replace("$", "").replace(",", ""))
    except ValueError:
        # Only a failed numeric parse means "no price". Anything else
        # (e.g. KeyboardInterrupt) must propagate instead of being hidden.
        return np.nan

# Run both calendar price columns through clean_price so they become floats.
for _price_col in ("price", "adjusted_price"):
    calendar[_price_col] = calendar[_price_col].apply(clean_price)

# --- listings types ---
# The listing price goes through the same cleaner; review dates that cannot be
# parsed are coerced to NaT.
listings["price"] = listings["price"].apply(clean_price)
listings["last_review"] = pd.to_datetime(listings["last_review"], errors="coerce")

# --- reviews types ---
reviews["date"] = pd.to_datetime(reviews["date"], errors="coerce")

calendar.head()


Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,686088974677118082,2025-06-27,0,,,2,1125
1,686088974677118082,2025-06-28,0,,,2,1125
2,686088974677118082,2025-06-29,1,,,2,1125
3,686088974677118082,2025-06-30,1,,,2,1125
4,686088974677118082,2025-07-01,1,,,2,1125


In [15]:
# CELL 4 – select only useful columns

# Calendar columns carried forward; "available" is the target used later.
calendar_keep = [
    "listing_id",
    "date",
    "available",
    "price",
    "adjusted_price",
    "minimum_nights",
    "maximum_nights",
]

# Listing attributes carried forward; "price" here is the listing's base price.
listings_keep = [
    "id",
    "host_id",
    "neighbourhood",
    "latitude",
    "longitude",
    "room_type",
    "price",
    "number_of_reviews",
    "reviews_per_month",
    "calculated_host_listings_count",
    "availability_365",
    "last_review",
    "description",
    "neighborhood_overview",
    "amenities",
]

# Label-based selection; a missing column name raises KeyError.
calendar_small = calendar.loc[:, calendar_keep]
listings_small = listings.loc[:, listings_keep]

calendar_small.head()


Unnamed: 0,listing_id,date,available,price,adjusted_price,minimum_nights,maximum_nights
0,686088974677118082,2025-06-27,0,,,2,1125
1,686088974677118082,2025-06-28,0,,,2,1125
2,686088974677118082,2025-06-29,1,,,2,1125
3,686088974677118082,2025-06-30,1,,,2,1125
4,686088974677118082,2025-07-01,1,,,2,1125


In [16]:
# CELL 5 – aggregate reviews per listing

# Missing comments are left out of both text statistics. Casting the raw column
# with astype(str) would turn NaN into the literal string "nan", which adds a
# 3-character "review" to the average length and a "nan" token to the text.
# The length is computed once, vectorised, instead of inside a per-group lambda;
# rows with a missing comment get NaN there and are skipped by "mean".
reviews_agg = (
    reviews
    .assign(comment_length=reviews["comments"].dropna().astype(str).str.len())
    .groupby("listing_id")
    .agg(
        review_count=("id", "count"),
        avg_review_length=("comment_length", "mean"),
        combined_text=("comments", lambda texts: " ".join(texts.dropna().astype(str))),
    )
    .reset_index()
)

reviews_agg.head()



Unnamed: 0,listing_id,review_count,avg_review_length,combined_text
0,155305,454,212.200441,We had a wonderful time! The cottage was very ...
1,197263,87,196.827586,Timothy was kind and clean. He let me come and...
2,209068,67,415.522388,Absolutely enjoyed our stay w/ Kevin and Anne....
3,246315,53,387.339623,This cabin was perfect! It's literally a log c...
4,314540,35,328.685714,We spent four nights in Tom's beautiful house....


In [17]:
# CELL 6 – merge everything into one unified DataFrame

# 1) calendar + listings (listing_id ↔ id)
# Both inputs carry a "price" column, so pandas suffixes them in the result:
# price_x is the calendar price, price_y is the listing's base price.
# validate="many_to_one" makes pandas raise MergeError if a listing id appears
# more than once on the right-hand side, instead of silently multiplying the
# calendar rows.
df = calendar_small.merge(
    listings_small,
    left_on="listing_id",
    right_on="id",
    how="left",
    validate="many_to_one",
)

# 2) add aggregated review stats (reviews_agg has one row per listing_id)
df = df.merge(
    reviews_agg,
    on="listing_id",
    how="left",
    validate="many_to_one",
)

print("Unified df shape:", df.shape)
df.head()


Unified df shape: (365, 25)


Unnamed: 0,listing_id,date,available,price_x,adjusted_price,minimum_nights,maximum_nights,id,host_id,neighbourhood,...,reviews_per_month,calculated_host_listings_count,availability_365,last_review,description,neighborhood_overview,amenities,review_count,avg_review_length,combined_text
0,686088974677118082,2025-06-27,0,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,2.42,1,163,2025-05-26,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...
1,686088974677118082,2025-06-28,0,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,2.42,1,163,2025-05-26,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...
2,686088974677118082,2025-06-29,1,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,2.42,1,163,2025-05-26,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...
3,686088974677118082,2025-06-30,1,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,2.42,1,163,2025-05-26,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...
4,686088974677118082,2025-07-01,1,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,2.42,1,163,2025-05-26,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...


In [None]:
 # 6. Basic cleaning of the merged dataset
# ------------------------------------------------------------

# Label column: use "available_target" when such a column exists; otherwise
# fall back to "available", the 0/1 calendar flag carried through the merge.
target_col = "available_target" if "available_target" in df.columns else "available"

# Remove rows without target
df = df.dropna(subset=[target_col])

# Row identifiers; never imputed or encoded.
id_cols = ["listing_id", "date"]

# Join keys and free-text columns that must not be handled as ordinary
# numeric / categorical features: a median-imputed key would invent a listing,
# and free text cast to "category" cannot later be filled with "" or sensibly
# one-hot encoded.
key_cols = ["id", "host_id"]
free_text_cols = ["description", "neighborhood_overview", "amenities", "combined_text"]

# Identify numeric and categorical columns (excluding identifiers and target)
not_numeric_features = set(id_cols) | set(key_cols) | {target_col}
numeric_cols = df.select_dtypes(include=["number"]).columns.tolist()
numeric_feature_cols = [c for c in numeric_cols if c not in not_numeric_features]

categorical_cols = [
    c
    for c in df.select_dtypes(include=["object", "category"]).columns
    if c not in free_text_cols
]

# Fill numeric missing values with median (simple, robust strategy).
# A column that is entirely missing has a NaN median and therefore stays NaN.
if numeric_feature_cols:
    numeric_block = df[numeric_feature_cols]
    df[numeric_feature_cols] = numeric_block.fillna(numeric_block.median())

# Fill categorical missing with "Unknown".
# Going through object dtype first keeps the cell re-runnable: filling an
# existing categorical with a value outside its categories raises.
for col in categorical_cols:
    df[col] = df[col].astype(object).fillna("Unknown").astype("category")



In [None]:
# 7. Optional: One-hot encode categorical features
# ------------------------------------------------------------
# For an ML-ready dataset, it is often convenient to convert categories to dummies.
# We keep listing_id and date as identifier columns and do NOT encode them.
# Free-text columns are skipped as well: one-hot encoding them would create one
# dummy column per distinct description / review blob.

free_text_cols = ["description", "neighborhood_overview", "amenities", "combined_text"]
not_encoded = set(id_cols) | set(free_text_cols)

categorical_to_encode = [c for c in categorical_cols if c not in not_encoded]

if categorical_to_encode:
    # get_dummies returns a new frame, so df itself is left untouched.
    ml_dataset = pd.get_dummies(
        df,
        columns=categorical_to_encode,
        drop_first=True,  # avoid dummy trap
    )
else:
    ml_dataset = df.copy()

print("Final ML dataset shape:", ml_dataset.shape)

# Show a preview
ml_dataset.head()

In [18]:
# CELL 7 – basic feature engineering

# ---- temporal features ----
# Calendar components of each row's date; dayofweek counts Monday as 0, so
# values 5 and 6 are Saturday and Sunday.
date_parts = df["date"].dt
df["year"] = date_parts.year
df["month"] = date_parts.month
df["day"] = date_parts.day
df["day_of_week"] = date_parts.dayofweek
df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)

# ---- recency features ----
# Whole days between the calendar date and the listing's last review;
# 9999 stands in where that gap cannot be computed (e.g. no last review).
review_gap = df["date"] - df["last_review"]
df["days_since_last_review"] = review_gap.dt.days.fillna(9999)

# ---- fill missing simple numeric values ----
# Listings without aggregated review stats get zeros.
review_stat_cols = ["review_count", "avg_review_length"]
df[review_stat_cols] = df[review_stat_cols].fillna(0)

df.head()


Unnamed: 0,listing_id,date,available,price_x,adjusted_price,minimum_nights,maximum_nights,id,host_id,neighbourhood,...,amenities,review_count,avg_review_length,combined_text,year,month,day,day_of_week,is_weekend,days_since_last_review
0,686088974677118082,2025-06-27,0,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...,2025,6,27,4,0,32
1,686088974677118082,2025-06-28,0,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...,2025,6,28,5,1,33
2,686088974677118082,2025-06-29,1,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...,2025,6,29,6,1,34
3,686088974677118082,2025-06-30,1,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...,2025,6,30,0,0,35
4,686088974677118082,2025-07-01,1,,,2,1125,686088974677118082,27538155,Neighborhood highlights,...,"[""Room-darkening shades"", ""Outdoor dining area...",85,206.152941,You simply can not beat the views at this extr...,2025,7,1,1,0,36


In [19]:
# CELL 8 – prepare text columns (fill NaN with empty strings)

text_cols = ["description", "neighborhood_overview", "amenities", "combined_text"]

# One block assignment replaces missing text with "" in all four columns.
df[text_cols] = df[text_cols].fillna("")

df[text_cols].head()


Unnamed: 0,description,neighborhood_overview,amenities,combined_text
0,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",You simply can not beat the views at this extr...
1,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",You simply can not beat the views at this extr...
2,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",You simply can not beat the views at this extr...
3,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",You simply can not beat the views at this extr...
4,Come get away with these incredible views at 3...,Extremely quiet neighborhood walk that offers ...,"[""Room-darkening shades"", ""Outdoor dining area...",You simply can not beat the views at this extr...


In [21]:
# CELL 9 – final checks

# Summarise the assembled frame: dimensions first, then every column name.
final_shape = df.shape
final_columns = df.columns.tolist()
print("Final dataset shape:", final_shape)
print("\nColumns:\n", final_columns)

# Optional: save intermediate version
# df.to_csv("airbnb_unified_features.csv", index=False)


# Plain-text dump of the frame (pandas abbreviates long output).
print(df)

Final dataset shape: (365, 31)

Columns:
 ['listing_id', 'date', 'available', 'price_x', 'adjusted_price', 'minimum_nights', 'maximum_nights', 'id', 'host_id', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price_y', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365', 'last_review', 'description', 'neighborhood_overview', 'amenities', 'review_count', 'avg_review_length', 'combined_text', 'year', 'month', 'day', 'day_of_week', 'is_weekend', 'days_since_last_review']
             listing_id       date  available  price_x  adjusted_price  \
0    686088974677118082 2025-06-27          0      NaN             NaN   
1    686088974677118082 2025-06-28          0      NaN             NaN   
2    686088974677118082 2025-06-29          1      NaN             NaN   
3    686088974677118082 2025-06-30          1      NaN             NaN   
4    686088974677118082 2025-07-01          1      NaN             NaN   
..                  ...        ..