# Notebook 03: Feature Engineering (Fixed for Spark Connect ML)
This notebook uses direct DataFrame transformations instead of an ML Pipeline to avoid Spark Connect model size limits.

## Import libraries

In [0]:
%python
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
import numpy as np

## Initialize the Spark session with adaptive query execution enabled


In [0]:
%python
# Build (or fetch the already-running) Spark session for this notebook, with
# both adaptive-execution settings listed below set to "true".
session_builder = SparkSession.builder.appName("Hotel_Churn_Feature_Engineering")
for conf_key in (
    "spark.sql.adaptive.enabled",
    "spark.sql.adaptive.coalescePartitions.enabled",
):
    session_builder = session_builder.config(conf_key, "true")
spark = session_builder.getOrCreate()

## Load Dataset from Silver

In [0]:
%python
# Read the cleaned bookings from the silver layer and report how many rows came in.
# The count triggers a Spark job over the source table.
silver_table_name = "hotel_catalog.silver.cleaned_hotel_bookings"
df = spark.table(silver_table_name)
silver_row_count = df.count()
print(f"Total records: {silver_row_count:,}")

## Create features using direct SQL transformations (no Pipeline!)

In [0]:
%python
print("Creating features with direct transformations...")

# Replacement values for nulls, keyed by column name.
# NOTE(review): the string defaults only make sense if `agent`, `company` and
# `country` are string-typed in the silver table - confirm against its schema.
null_defaults = dict(
    children=0,
    agent="Unknown",
    company="Unknown",
    country="UNK",
)
df = df.fillna(null_defaults)

## Create basic features

In [0]:
%python
def _as_flag(condition):
    """Return a Column that is 1 where `condition` holds and 0 otherwise."""
    return when(condition, 1).otherwise(0)


# (output column, expression) pairs, applied in order. Order matters:
# "total_price" reads "total_nights", which is created earlier in the list.
basic_feature_exprs = [
    ("total_nights", col("stays_in_weekend_nights") + col("stays_in_week_nights")),
    ("total_guests", col("adults") + col("children") + col("babies")),
    ("total_price", col("adr") * col("total_nights")),
    ("is_weekend_stay", _as_flag(col("stays_in_weekend_nights") > 0)),
    ("room_type_match", _as_flag(col("reserved_room_type") == col("assigned_room_type"))),
    ("has_special_requests", _as_flag(col("total_of_special_requests") > 0)),
]

df_features = df
for feature_name, feature_expr in basic_feature_exprs:
    df_features = df_features.withColumn(feature_name, feature_expr)

## Create numerical features with manual scaling

In [0]:
%python
# Min-max scale four numeric columns using bounds gathered in a single
# aggregation pass over df_features.
# NOTE: `min` / `max` below are the pyspark.sql.functions aggregates pulled in
# by the wildcard import at the top of the notebook, not the Python builtins.
stats = df_features.agg(
    min("lead_time").alias("min_lead_time"),
    max("lead_time").alias("max_lead_time"),
    min("adr").alias("min_adr"),
    max("adr").alias("max_adr"),
    min("total_nights").alias("min_nights"),
    max("total_nights").alias("max_nights"),
    min("previous_cancellations").alias("min_prev_cancel"),
    max("previous_cancellations").alias("max_prev_cancel")
).collect()[0]


def _min_max_scaled(column_name, lower, upper):
    """Return a Column expression equal to (column - lower) / (upper - lower).

    Args:
        column_name: Name of the column to scale.
        lower: Observed minimum of that column.
        upper: Observed maximum of that column.

    Raises:
        ValueError: If either bound is None, which happens when the input has
            no non-null values for the column (for example an empty table).
    """
    if lower is None or upper is None:
        raise ValueError(
            f"Cannot scale '{column_name}': no non-null values found to compute min/max"
        )
    span = upper - lower
    if span == 0:
        # Constant column: dividing by zero would make every scaled value null
        # (or raise under ANSI mode). Using 1 maps the whole column to 0.
        span = 1
    return (col(column_name) - lower) / span


# Every column goes through the same guarded helper so the zero-range
# protection is applied uniformly, not only to previous_cancellations.
df_features = df_features.withColumn(
    "lead_time_scaled",
    _min_max_scaled("lead_time", stats["min_lead_time"], stats["max_lead_time"])
).withColumn(
    "adr_scaled",
    _min_max_scaled("adr", stats["min_adr"], stats["max_adr"])
).withColumn(
    "total_nights_scaled",
    _min_max_scaled("total_nights", stats["min_nights"], stats["max_nights"])
).withColumn(
    "prev_cancellations_scaled",
    _min_max_scaled(
        "previous_cancellations", stats["min_prev_cancel"], stats["max_prev_cancel"]
    )
)

## Encode categorical features manually (no StringIndexer/OneHotEncoder)

In [0]:
%python
# hotel -> single binary column: "Resort Hotel" becomes 0; every other value
# (whatever falls through to `otherwise`) becomes 1.
is_resort = col("hotel") == "Resort Hotel"
df_features = df_features.withColumn("hotel_encoded", when(is_resort, 0).otherwise(1))

# One indicator column per (output column, source column, category value).
# A row gets 1 only in the indicator whose category equals its source value.
# Per the original author's notes the categories left out on purpose are the
# reference levels: "Refundable" for deposit_type (3 categories -> 2 columns)
# and "Group" for customer_type (4 categories -> 3 columns).
indicator_specs = [
    ("deposit_no_deposit", "deposit_type", "No Deposit"),
    ("deposit_non_refund", "deposit_type", "Non Refund"),
    ("customer_transient", "customer_type", "Transient"),
    ("customer_contract", "customer_type", "Contract"),
    ("customer_transient_party", "customer_type", "Transient-Party"),
]

for indicator_name, source_column, category_value in indicator_specs:
    df_features = df_features.withColumn(
        indicator_name,
        when(col(source_column) == category_value, 1).otherwise(0),
    )

## Create interaction features

In [0]:
%python
# Pairwise products of features built in earlier cells.
lead_time_adr_product = col("lead_time_scaled") * col("adr_scaled")

# total_guests is not min-max scaled; it is divided by a fixed 10.0 instead.
guests_over_ten = col("total_guests") / 10.0
nights_guests_product = col("total_nights_scaled") * guests_over_ten

df_features = df_features.withColumn("lead_time_x_adr", lead_time_adr_product)
df_features = df_features.withColumn("total_nights_x_guests", nights_guests_product)

## Select final feature columns

In [0]:
%python
print("Selecting final feature columns...")

# Model input columns, grouped by how they were produced. The group labels are
# documentation only; just the flattened, ordered list is used downstream.
_feature_groups = {
    "scaled numerical": [
        "lead_time_scaled",
        "adr_scaled",
        "total_nights_scaled",
        "prev_cancellations_scaled",
    ],
    "encoded categorical": [
        "hotel_encoded",
        "deposit_no_deposit",
        "deposit_non_refund",
        "customer_transient",
        "customer_contract",
        "customer_transient_party",
    ],
    "binary flags": [
        "is_weekend_stay",
        "room_type_match",
        "has_special_requests",
        "is_repeated_guest",
    ],
    "interactions": [
        "lead_time_x_adr",
        "total_nights_x_guests",
    ],
    # Passed through as-is, without any scaling applied in this notebook.
    "raw counts": [
        "booking_changes",
        "days_in_waiting_list",
        "total_of_special_requests",
    ],
}

# Flatten in declaration order (dicts preserve insertion order).
feature_cols = [
    column_name
    for group_columns in _feature_groups.values()
    for column_name in group_columns
]

print(f"Total features created: {len(feature_cols)}")

## Handle any remaining nulls

In [0]:
%python
# Zero-fill every feature column in ONE fillna call. The column -> value
# mapping has the same per-column semantics as calling fillna once per
# feature, but adds a single transformation to the query plan instead of
# one per feature.
df_features = df_features.fillna(dict.fromkeys(feature_cols, 0))

## Assemble features using VectorAssembler (this is lightweight)

In [0]:
%python
print("Assembling final feature vector...")

# Pack the individual feature columns into one vector column named "features".
# handleInvalid="skip" asks the assembler to drop rows it treats as invalid
# rather than raise on them.
assembler_params = {
    "inputCols": feature_cols,
    "outputCol": "features",
    "handleInvalid": "skip",
}
assembler = VectorAssembler(**assembler_params)

df_final = assembler.transform(df_features)

## Create final dataset with metadata

In [0]:
%python
# Columns kept in the modelling table: a handful of human-readable booking
# attributes, the "churn" column, and the assembled feature vector.
final_columns = (
    ["hotel", "churn"]
    + ["lead_time", "total_nights", "adr"]
    + ["deposit_type", "customer_type"]
    + ["features"]
)

# Project down to those columns, then attach a generated row identifier.
# NOTE(review): monotonically_increasing_id() yields unique, increasing ids
# that are not consecutive and not guaranteed stable across runs - confirm
# nothing downstream treats booking_id as a durable key.
row_id = monotonically_increasing_id()
df_final_model = df_final.select(*final_columns).withColumn("booking_id", row_id)

## Write Dataset to Gold layer

In [0]:
%python
print("Saving to Gold layer...")

gold_table_name = "hotel_catalog.gold.hotel_features_final"

# Overwrite mode: whatever the gold table currently holds is replaced.
gold_writer = df_final_model.write.format("delta").mode("overwrite")
gold_writer.saveAsTable(gold_table_name)

## Create feature metadata table for reference

In [0]:
%python
# Hand-maintained catalogue of the model features: one (name, description,
# type) row per feature. It is NOT derived from feature_cols, so both must be
# edited together when features change.
metadata_schema = ["feature_name", "description", "feature_type"]
metadata_rows = [
    ("lead_time_scaled", "Scaled lead time (0-1)", "Numerical"),
    ("adr_scaled", "Scaled average daily rate (0-1)", "Numerical"),
    ("total_nights_scaled", "Scaled total nights (0-1)", "Numerical"),
    ("prev_cancellations_scaled", "Scaled previous cancellations (0-1)", "Numerical"),
    ("hotel_encoded", "Hotel type (0=Resort, 1=City)", "Categorical"),
    ("deposit_no_deposit", "No deposit indicator", "Categorical"),
    ("deposit_non_refund", "Non-refundable deposit indicator", "Categorical"),
    ("customer_transient", "Transient customer indicator", "Categorical"),
    ("customer_contract", "Contract customer indicator", "Categorical"),
    ("customer_transient_party", "Transient-party customer indicator", "Categorical"),
    ("is_weekend_stay", "Includes weekend stay", "Binary"),
    ("room_type_match", "Room type matches reservation", "Binary"),
    ("has_special_requests", "Has special requests", "Binary"),
    ("is_repeated_guest", "Is repeated guest", "Binary"),
    ("lead_time_x_adr", "Lead time × ADR interaction", "Interaction"),
    ("total_nights_x_guests", "Nights × guests interaction", "Interaction"),
    ("booking_changes", "Number of booking changes", "Numerical"),
    ("days_in_waiting_list", "Days in waiting list", "Numerical"),
    ("total_of_special_requests", "Total special requests", "Numerical"),
]

feature_metadata = spark.createDataFrame(metadata_rows, metadata_schema)

# Persist the catalogue next to the feature table, replacing any previous version.
metadata_writer = feature_metadata.write.format("delta").mode("overwrite")
metadata_writer.saveAsTable("hotel_catalog.gold.feature_metadata")

## Summary

In [0]:
%python
# Closing report: run totals, a five-row preview, and the ordered feature list.
banner = "=" * 60
feature_count = len(feature_cols)

print("\n" + banner)
print("FEATURE ENGINEERING COMPLETE")
print(banner)
# df.count() launches another Spark job over the source data.
print(f"Original dataset size: {df.count():,} records")
print(f"Features created: {feature_count}")
print(f"Feature vector dimension: {feature_count}")
print(f"Final table: {gold_table_name}")

# Preview only the scalar columns, pulled to the driver as a pandas frame.
print("\nSample of features created:")
preview_columns = ["hotel", "churn", "lead_time", "total_nights", "adr"]
sample_df = df_final_model.select(*preview_columns).limit(5).toPandas()
print(sample_df.to_string())

# Numbered listing of the feature names, starting at 1.
print("\nFeatures created:")
for position, feature_name in enumerate(feature_cols, start=1):
    print(f"{position:2d}. {feature_name}")

print("\nFeature engineering complete!")
print("   No Pipeline used - all transformations are direct SQL operations")
print("   Features are manually scaled and encoded to avoid model size limits")