# Hotel Booking Demand - Feature Engineering

This notebook creates features from the hotel booking dataset to prepare for machine learning.

**Goal**: Transform raw data into meaningful features for cancellation prediction

## 1. Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import joblib
import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

Libraries imported successfully!


## 2. Load Data

In [2]:
# Load the explored dataset
df = pd.read_csv('data/hotel_bookings_explored.csv')
print(f"Dataset loaded: {df.shape}")
df.head()

Dataset loaded: (119390, 34)


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date,total_guests,total_nights
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,,0,Transient,0.0,0,0,Check-Out,2015-07-01,2.0,0
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,,0,Transient,0.0,0,0,Check-Out,2015-07-01,2.0,0
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,1.0,1
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,,0,Transient,75.0,0,0,Check-Out,2015-07-02,1.0,1
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,,0,Transient,98.0,0,1,Check-Out,2015-07-03,2.0,2


## 3. Handle Missing Values

In [3]:
# Fill missing children with 0
if 'children' in df.columns:
    df['children'].fillna(0, inplace=True)

# Fill missing country with 'Unknown'
if 'country' in df.columns:
    df['country'].fillna('Unknown', inplace=True)

# Fill missing agent with 0
if 'agent' in df.columns:
    df['agent'].fillna(0, inplace=True)

# Fill missing company with 0
if 'company' in df.columns:
    df['company'].fillna(0, inplace=True)

print("✓ Missing values handled")
print(f"Remaining missing values: {df.isnull().sum().sum()}")

✓ Missing values handled
Remaining missing values: 4


## 4. Feature Engineering - Temporal Features

In [4]:
# Map months to numbers
month_map = {
    'January': 1, 'February': 2, 'March': 3, 'April': 4,
    'May': 5, 'June': 6, 'July': 7, 'August': 8,
    'September': 9, 'October': 10, 'November': 11, 'December': 12
}
df['arrival_month_num'] = df['arrival_date_month'].map(month_map)

# Create season feature
def get_season(month):
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    else:
        return 'Fall'

df['season'] = df['arrival_month_num'].apply(get_season)

print("✓ Temporal features created")
print(f"  - arrival_month_num")
print(f"  - season")

✓ Temporal features created
  - arrival_month_num
  - season


## 5. Feature Engineering - Booking Features

In [5]:
# Total stay in nights (if not already created)
if 'total_nights' not in df.columns:
    df['total_nights'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

# Total guests (if not already created)
if 'total_guests' not in df.columns:
    df['total_guests'] = df['adults'] + df['children'] + df['babies']

# Has children flag
df['has_children'] = (df['children'] > 0).astype(int)

# Has babies flag
df['has_babies'] = (df['babies'] > 0).astype(int)

# Special requests flag
df['has_special_requests'] = (df['total_of_special_requests'] > 0).astype(int)

# Is repeated guest flag already exists in original data

print("✓ Booking features created")
print(f"  - total_nights: {df['total_nights'].mean():.2f} avg")
print(f"  - total_guests: {df['total_guests'].mean():.2f} avg")
print(f"  - has_children: {df['has_children'].sum()} bookings")
print(f"  - has_babies: {df['has_babies'].sum()} bookings")
print(f"  - has_special_requests: {df['has_special_requests'].sum()} bookings")

✓ Booking features created
  - total_nights: 3.43 avg
  - total_guests: 1.97 avg
  - has_children: 8590 bookings
  - has_babies: 917 bookings
  - has_special_requests: 49072 bookings


## 6. Select Features for Modeling

In [6]:
# Select relevant features
feature_columns = [
    'hotel', 'lead_time', 'arrival_month_num', 'season',
    'stays_in_weekend_nights', 'stays_in_week_nights', 'total_nights',
    'adults', 'children', 'babies', 'total_guests',
    'meal', 'market_segment', 'distribution_channel',
    'is_repeated_guest', 'previous_cancellations',
    'previous_bookings_not_canceled', 'reserved_room_type',
    'assigned_room_type', 'booking_changes', 'deposit_type',
    'days_in_waiting_list', 'customer_type', 'adr',
    'required_car_parking_spaces', 'total_of_special_requests',
    'has_children', 'has_babies', 'has_special_requests'
]

# Keep only features that exist
feature_columns = [col for col in feature_columns if col in df.columns]

X = df[feature_columns].copy()
y = df['is_canceled'].copy()

print(f"✓ Selected {len(feature_columns)} features")
print(f"✓ Target variable: is_canceled")
print(f"✓ Dataset shape: {X.shape}")

✓ Selected 29 features
✓ Target variable: is_canceled
✓ Dataset shape: (119390, 29)


## 7. Encode Categorical Variables

In [7]:
# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

print(f"Encoding {len(categorical_cols)} categorical columns:")
print(categorical_cols)

# Encode categorical variables
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    encoders[col] = le

print(f"\n✓ All categorical variables encoded")
print(f"✓ Encoders saved: {len(encoders)}")

Encoding 9 categorical columns:
['hotel', 'season', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type']



✓ All categorical variables encoded
✓ Encoders saved: 9


## 8. Train-Test Split

In [8]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("Train-Test Split:")
print("=" * 50)
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"\nTraining set cancellation rate: {y_train.mean()*100:.2f}%")
print(f"Test set cancellation rate: {y_test.mean()*100:.2f}%")

Train-Test Split:
Training set: 95512 samples
Test set: 23878 samples

Training set cancellation rate: 37.04%
Test set cancellation rate: 37.04%


## 9. Feature Scaling

In [9]:
# Scale numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrame
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print("✓ Features scaled using StandardScaler")
print(f"✓ Training set shape: {X_train_scaled.shape}")
print(f"✓ Test set shape: {X_test_scaled.shape}")

✓ Features scaled using StandardScaler
✓ Training set shape: (95512, 29)
✓ Test set shape: (23878, 29)


## 10. Save Processed Data and Artifacts

In [10]:
# Save datasets
X_train_scaled.to_csv('data/X_train.csv', index=False)
X_test_scaled.to_csv('data/X_test.csv', index=False)
y_train.to_csv('data/y_train.csv', index=False, header=True)
y_test.to_csv('data/y_test.csv', index=False, header=True)

# Save scaler and encoders
joblib.dump(scaler, 'artifacts/scaler.joblib')
joblib.dump(encoders, 'artifacts/encoders.joblib')

# Save feature names
feature_names = X_train.columns.tolist()
joblib.dump(feature_names, 'artifacts/feature_names.joblib')

print("✓ Data saved successfully!")
print("\nSaved files:")
print("  - data/X_train.csv")
print("  - data/X_test.csv")
print("  - data/y_train.csv")
print("  - data/y_test.csv")
print("  - artifacts/scaler.joblib")
print("  - artifacts/encoders.joblib")
print("  - artifacts/feature_names.joblib")
print(f"\n✓ Ready for model training with {len(feature_names)} features!")

✓ Data saved successfully!

Saved files:
  - data/X_train.csv
  - data/X_test.csv
  - data/y_train.csv
  - data/y_test.csv
  - artifacts/scaler.joblib
  - artifacts/encoders.joblib
  - artifacts/feature_names.joblib

✓ Ready for model training with 29 features!
