# Hotel Booking Cancellation - Exploratory Data Analysis

This notebook performs comprehensive EDA on the hotel bookings dataset to understand patterns and inform feature engineering decisions.

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Settings
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
pd.set_option('display.max_columns', None)
%matplotlib inline

## 1. Data Loading and Overview

In [None]:
# Read the raw reservations CSV (path is resolved against the current working directory)
raw_csv = Path('../data/raw/Hotel_Reservations.csv')
df = pd.read_csv(raw_csv)

# Report the dimensions of the loaded frame
print(f"Dataset Shape: {df.shape}")
print(f"\nColumns: {len(df.columns)}")
print(f"Rows: {len(df)}")

In [None]:
# First few rows
# Left as a bare last expression so the notebook renders the frame as a rich
# table; head() with no argument returns the first 5 rows.
df.head()

In [None]:
# Data types and info
# Prints each column's dtype and non-null count, plus the frame's memory usage.
df.info()

In [None]:
# Summary statistics
# With default arguments describe() summarises only the numeric columns of a
# mixed-dtype frame; object columns are left out of this table.
df.describe()

## 2. Data Quality Assessment

In [None]:
# Null count per column; only columns that actually contain nulls are listed
missing = df.isna().sum()
print("Missing Values:")
print(missing.loc[missing.gt(0)])
print(f"\nTotal missing values: {missing.sum()}")

In [None]:
# Duplicates
# Rows that are identical to an earlier row in every column.
print(f"Duplicate rows: {df.duplicated().sum()}")
# Booking IDs that repeat an earlier row's ID (every occurrence after the first is counted).
print(f"Duplicate Booking IDs: {df['Booking_ID'].duplicated().sum()}")

## 3. Target Variable Analysis

In [None]:
# Absolute number of bookings per status, most frequent first
status_col = df['booking_status']
target_counts = status_col.value_counts()
print("Booking Status Distribution:")
print(target_counts)

# The same breakdown expressed as a percentage of all bookings
print(f"\nClass Balance:")
print(status_col.value_counts(normalize=True).mul(100))

In [None]:
# Visualize target distribution: bar chart of counts next to a pie chart of shares
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Bind each colour to the status label rather than to its position in
# target_counts (which is ordered by frequency), so 'Canceled' is always red
# and the other status green, whichever class happens to be the majority.
status_colors = ['#e74c3c' if status == 'Canceled' else '#2ecc71'
                 for status in target_counts.index]

# Count plot
target_counts.plot(kind='bar', ax=axes[0], color=status_colors)
axes[0].set_title('Booking Status Distribution', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Status')
axes[0].set_ylabel('Count')
axes[0].tick_params(axis='x', rotation=0)

# Pie chart
axes[1].pie(target_counts, labels=target_counts.index, autopct='%1.1f%%',
            colors=status_colors, startangle=90)
axes[1].set_title('Booking Status Proportion', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

## 4. Numeric Features Analysis

In [None]:
# Names of every column with a numeric dtype
numeric_cols = list(df.select_dtypes(include='number').columns)
print(f"Numeric features: {len(numeric_cols)}")
print(numeric_cols)

In [None]:
# Histograms of the key numeric features, one panel per feature
key_features = [
    'lead_time', 'avg_price_per_room', 'no_of_adults',
    'no_of_weekend_nights', 'no_of_week_nights', 'no_of_special_requests',
]

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 10))
axes = axes.ravel()  # flatten the 2x3 grid so panels pair up with features

for ax, col in zip(axes, key_features):
    ax.hist(df[col], bins=50, edgecolor='black', alpha=0.7)
    ax.set_title(f'{col} Distribution', fontweight='bold')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

plt.tight_layout()
plt.show()

In [None]:
# Box plots of the two features inspected for outliers, side by side
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (column, panel title, y-axis label) for each panel, left to right
panels = [
    ('lead_time', 'Lead Time - Outlier Detection', 'Days'),
    ('avg_price_per_room', 'Average Price Per Room - Outlier Detection', 'Price'),
]

for ax, (col, title, ylabel) in zip(axes, panels):
    ax.boxplot(df[col])
    ax.set_title(title, fontweight='bold')
    ax.set_ylabel(ylabel)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlations (corr() default method) of the numeric columns, drawn as an annotated heatmap
correlation = df[numeric_cols].corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(correlation, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=1, ax=ax)
ax.set_title('Correlation Matrix - Numeric Features', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.show()

## 5. Categorical Features Analysis

In [None]:
# Object-dtype columns, minus the row identifier and the target
categorical_cols = list(df.select_dtypes(include='object').columns)
for excluded in ('Booking_ID', 'booking_status'):
    # remove() raises ValueError if the column is not among the object columns
    categorical_cols.remove(excluded)

print(f"Categorical features: {len(categorical_cols)}")
print(categorical_cols)

In [None]:
# Bar charts of category frequencies, one panel per categorical feature
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# (column, bar colour, panel title, x-axis label, tick rotation) per panel
panels = [
    ('type_of_meal_plan', 'skyblue', 'Meal Plan Distribution', 'Meal Plan Type', 0),
    ('room_type_reserved', 'lightcoral', 'Room Type Distribution', 'Room Type', 0),
    ('market_segment_type', 'lightgreen', 'Market Segment Distribution', 'Market Segment', 45),
]

for ax, (col, bar_color, title, xlabel, rotation) in zip(axes, panels):
    df[col].value_counts().plot(kind='bar', ax=ax, color=bar_color)
    ax.set_title(title, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=rotation)

plt.tight_layout()
plt.show()

## 6. Relationship with Target Variable

In [None]:
# Cancellation rate by categorical features, one panel per feature, with the
# dataset-wide cancellation rate drawn as a dashed reference line.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Boolean flag per booking; the mean of the flag is the cancellation share.
is_canceled = df['booking_status'].eq('Canceled')

# Loop-invariant, so computed once instead of once per panel. Using the mean
# of the flag also yields 0 (rather than a KeyError) if nothing was canceled.
overall_rate = is_canceled.mean() * 100

for ax, col in zip(axes, ['type_of_meal_plan', 'room_type_reserved', 'market_segment_type']):
    # Per-category cancellation percentage, highest first
    cancel_rate = (is_canceled.groupby(df[col]).mean() * 100).sort_values(ascending=False)

    cancel_rate.plot(kind='bar', ax=ax, color='salmon')
    ax.set_title(f'Cancellation Rate by {col}', fontweight='bold')
    ax.set_ylabel('Cancellation Rate (%)')
    ax.tick_params(axis='x', rotation=45)
    ax.axhline(y=overall_rate, color='red', linestyle='--', label='Overall Rate')
    ax.legend()

plt.tight_layout()
plt.show()

In [None]:
# Numeric features vs target: one box plot per feature, split by booking status
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes = axes.ravel()  # flatten the 2x3 grid so panels pair up with features

for ax, col in zip(axes, ['lead_time', 'avg_price_per_room', 'no_of_special_requests',
                          'no_of_adults', 'no_of_weekend_nights', 'no_of_week_nights']):
    df.boxplot(column=col, by='booking_status', ax=ax)
    ax.set_title(f'{col} by Booking Status')
    ax.set_xlabel('Booking Status')
    ax.set_ylabel(col)
    # Rotate tick labels on the axes object directly instead of switching the
    # pyplot "current axes" with plt.sca() first.
    ax.tick_params(axis='x', rotation=45)

# DataFrame.boxplot(by=...) stamps a figure-level "Boxplot grouped by ..."
# suptitle on every call; it overlaps the top-row panel titles, so blank it.
fig.suptitle('')

plt.tight_layout()
plt.show()

## 7. Temporal Patterns

In [None]:
# Arrival-month view: booking volume (left) and cancellation rate (right)
def _cancel_pct(statuses):
    """Return the percentage of entries in `statuses` equal to 'Canceled'."""
    return (statuses == 'Canceled').sum() / len(statuses) * 100


fig, axes = plt.subplots(1, 2, figsize=(16, 5))
volume_ax, rate_ax = axes

# Total bookings by month, in calendar (index) order
monthly_bookings = df['arrival_month'].value_counts().sort_index()
monthly_bookings.plot(kind='bar', ax=volume_ax, color='steelblue')
volume_ax.set_title('Bookings by Arrival Month', fontweight='bold')
volume_ax.set_xlabel('Month')
volume_ax.set_ylabel('Number of Bookings')

# Cancellation rate by month, with the dataset-wide rate as a reference line
monthly_cancel = df.groupby('arrival_month')['booking_status'].apply(_cancel_pct)
monthly_cancel.plot(kind='bar', ax=rate_ax, color='coral')
rate_ax.set_title('Cancellation Rate by Arrival Month', fontweight='bold')
rate_ax.set_xlabel('Month')
rate_ax.set_ylabel('Cancellation Rate (%)')
overall_rate = df['booking_status'].value_counts(normalize=True)['Canceled'] * 100
rate_ax.axhline(y=overall_rate, color='red', linestyle='--', label='Overall Rate')
rate_ax.legend()

plt.tight_layout()
plt.show()

## 8. Guest Behavior Analysis

In [None]:
# Repeated guests analysis: share of each booking status (in %) within
# first-time vs. repeated guests.
# fill_value=0 gives a 0% entry instead of NaN when a group never has a status.
repeated_analysis = (
    df.groupby('repeated_guest')['booking_status']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    * 100
)

# unstack() orders the status columns alphabetically, which puts 'Canceled'
# first; assigning colours by position would therefore paint cancellations
# green. Map by label so 'Canceled' is red and the other status green.
status_colors = ['#e74c3c' if status == 'Canceled' else '#2ecc71'
                 for status in repeated_analysis.columns]

ax = repeated_analysis.plot(kind='bar', figsize=(10, 6), color=status_colors)
ax.set_title('Booking Status by Repeated Guest Status', fontsize=14, fontweight='bold')
ax.set_xlabel('Repeated Guest (0=No, 1=Yes)')
ax.set_ylabel('Percentage')
ax.legend(title='Booking Status')
ax.tick_params(axis='x', rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Special requests impact: share of each booking status (in %) for bookings
# with vs. without at least one special request.
# NOTE: this adds a boolean helper column to df; re-running the cell simply
# overwrites it with the same values.
df['has_special_requests'] = df['no_of_special_requests'] > 0

# fill_value=0 gives a 0% entry instead of NaN when a group never has a status.
special_req_analysis = (
    df.groupby('has_special_requests')['booking_status']
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    * 100
)

# unstack() orders the status columns alphabetically, which puts 'Canceled'
# first; assigning colours by position would therefore paint cancellations
# green. Map by label so 'Canceled' is red and the other status green.
status_colors = ['#e74c3c' if status == 'Canceled' else '#2ecc71'
                 for status in special_req_analysis.columns]

ax = special_req_analysis.plot(kind='bar', figsize=(10, 6), color=status_colors)
ax.set_title('Booking Status by Special Requests', fontsize=14, fontweight='bold')
ax.set_xlabel('Has Special Requests')
ax.set_ylabel('Percentage')
ax.legend(title='Booking Status')
ax.tick_params(axis='x', rotation=0)
plt.tight_layout()
plt.show()

## 9. Key Insights Summary

### Data Quality
- ✅ No missing values
- ✅ No duplicate records
- ✅ Data types are appropriate

### Target Variable
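- 67% Not Canceled, 33% Canceled (imbalanced - will use SMOTE, applied to the training split only so that synthetic samples do not leak into validation/test data)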

### Important Patterns
1. **Lead Time**: Strong predictor - cancellation behavior differs markedly between short and long lead times (see the lead_time box plot in Section 6)
2. **Special Requests**: Customers with special requests are less likely to cancel
3. **Repeated Guests**: Loyal customers have lower cancellation rates
4. **Price**: Higher room prices correlate with cancellations
5. **Market Segment**: Different acquisition channels show varying cancellation rates

### Outliers Detected
- Lead time has extreme values (will apply winsorization)
- Average price per room has outliers (will cap at percentiles)

### Feature Engineering Opportunities
1. Total stay nights (weekend + weekday)
2. Total guests (adults + children)
3. Price per guest
4. Lead time categories
5. Weekend booking flag
6. Special requests flag
7. Peak season indicator
8. Loyalty indicators
