## Step 1: Data Preparation & Initial Analysis

In [43]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import IsolationForest
import holidays

# Load the raw check-level data from Excel.
# - parse_dates=['Date']: parse the 'Date' column to datetime so the .dt accessors below work
# - engine='openpyxl': reader engine used for the .xlsx file
df = pd.read_excel('../data/raw/Hotel_Revenue_Data.xlsx', parse_dates=['Date'], engine='openpyxl')

# Create temporal features directly in the main DataFrame
df['Year'] = df['Date'].dt.year
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.weekday  # Monday=0, Sunday=6
# Weekend flag for a Friday/Saturday weekend (weekday codes 4 and 5).
# A plain `weekday >= 4` comparison would also flag Sunday (6), which contradicts
# the Friday/Saturday definition, so the two days are listed explicitly.
# NOTE(review): confirm that Friday/Saturday is the weekend definition that applies
# to this property for the dates covered by the data.
df['IsWeekend'] = df['Date'].dt.weekday.isin([4, 5])

# Revenue center names 'RevenueCenter_1' .. 'RevenueCenter_9'.
# Rows whose RevenueCenterName is not in this list are not carried into df_complete,
# because the left merge below keeps only the combinations present in full_index.
all_revenue_centers = [f'RevenueCenter_{i}' for i in range(1, 10)]

# Unique dates and meal periods as they appear in the data (order of first appearance)
date_range = df['Date'].unique()
meal_periods = df['MealPeriod'].unique()


# Multi-index holding every (Date, MealPeriod, RevenueCenterName) combination,
# including combinations that never occur in the raw data. Merging the raw rows
# onto this grid exposes the missing combinations as NaN, which are then set to 0.
full_index = pd.MultiIndex.from_product(
    [date_range, meal_periods, all_revenue_centers],
    names=['Date', 'MealPeriod', 'RevenueCenterName']
)

# Only the key columns and the revenue value are needed for the merge
base_df = df[['Date', 'MealPeriod', 'RevenueCenterName', 'CheckTotal']]

# Turn the grid into a DataFrame with the three index levels as columns
df_complete = pd.DataFrame(index=full_index).reset_index()
# Preview of the empty grid before revenue values are attached
print(df_complete)
# how='left' keeps every grid combination; combinations without raw rows get NaN
# in 'CheckTotal', and combinations with several raw rows yield several merged rows
# (they are summed in the groupby further down).
df_complete = pd.merge(
    df_complete,
    base_df,
    on=['Date', 'MealPeriod', 'RevenueCenterName'],
    how='left'
)

# Treat a missing 'CheckTotal' as zero revenue for that combination
df_complete['CheckTotal'] = df_complete['CheckTotal'].fillna(0)

# Attach the temporal features by date.
# drop_duplicates() reduces the feature columns to distinct rows so the merge
# on 'Date' does not multiply the rows of df_complete.
df_complete = pd.merge(
    df_complete,
    df[['Date', 'Year', 'Month', 'DayOfWeek', 'IsWeekend']].drop_duplicates(),
    on='Date',
    how='left'
)

# Sum 'CheckTotal' per (Date, MealPeriod, RevenueCenterName); DayOfWeek, Month and
# Year are included as keys so they are carried into the result.
# 'IsWeekend' is not a grouping key and therefore is not part of df_totals.
df_totals = df_complete.groupby(
    ['Date', 'MealPeriod', 'RevenueCenterName', 'DayOfWeek', 'Month', 'Year'],
    as_index=False
)['CheckTotal'].sum()

# Show the first 20 rows of the totals to verify the result
df_totals.head(20)

            Date MealPeriod RevenueCenterName
0     2023-04-18     Dinner   RevenueCenter_1
1     2023-04-18     Dinner   RevenueCenter_2
2     2023-04-18     Dinner   RevenueCenter_3
3     2023-04-18     Dinner   RevenueCenter_4
4     2023-04-18     Dinner   RevenueCenter_5
...          ...        ...               ...
13117 2024-04-04  BreakFast   RevenueCenter_5
13118 2024-04-04  BreakFast   RevenueCenter_6
13119 2024-04-04  BreakFast   RevenueCenter_7
13120 2024-04-04  BreakFast   RevenueCenter_8
13121 2024-04-04  BreakFast   RevenueCenter_9

[13122 rows x 3 columns]


Unnamed: 0,Date,MealPeriod,RevenueCenterName,DayOfWeek,Month,Year,CheckTotal
0,2023-01-01,BreakFast,RevenueCenter_1,6,1,2023,1499.4
1,2023-01-01,BreakFast,RevenueCenter_2,6,1,2023,35.0
2,2023-01-01,BreakFast,RevenueCenter_3,6,1,2023,0.0
3,2023-01-01,BreakFast,RevenueCenter_4,6,1,2023,0.0
4,2023-01-01,BreakFast,RevenueCenter_5,6,1,2023,21807.0
5,2023-01-01,BreakFast,RevenueCenter_6,6,1,2023,93.0
6,2023-01-01,BreakFast,RevenueCenter_7,6,1,2023,0.0
7,2023-01-01,BreakFast,RevenueCenter_8,6,1,2023,0.0
8,2023-01-01,BreakFast,RevenueCenter_9,6,1,2023,0.0
9,2023-01-01,Dinner,RevenueCenter_1,6,1,2023,4374.5


In [42]:
# Inspect the (Date, MealPeriod, RevenueCenterName) grid built in the data-preparation cell
print(full_index)

MultiIndex([('2023-04-18',    'Dinner', 'RevenueCenter_1'),
            ('2023-04-18',    'Dinner', 'RevenueCenter_2'),
            ('2023-04-18',    'Dinner', 'RevenueCenter_3'),
            ('2023-04-18',    'Dinner', 'RevenueCenter_4'),
            ('2023-04-18',    'Dinner', 'RevenueCenter_5'),
            ('2023-04-18',    'Dinner', 'RevenueCenter_6'),
            ('2023-04-18',    'Dinner', 'RevenueCenter_7'),
            ('2023-04-18',    'Dinner', 'RevenueCenter_8'),
            ('2023-04-18',    'Dinner', 'RevenueCenter_9'),
            ('2023-04-18',     'Lunch', 'RevenueCenter_1'),
            ...
            ('2024-04-04',     'Lunch', 'RevenueCenter_9'),
            ('2024-04-04', 'BreakFast', 'RevenueCenter_1'),
            ('2024-04-04', 'BreakFast', 'RevenueCenter_2'),
            ('2024-04-04', 'BreakFast', 'RevenueCenter_3'),
            ('2024-04-04', 'BreakFast', 'RevenueCenter_4'),
            ('2024-04-04', 'BreakFast', 'RevenueCenter_5'),
            ('2024-04-04

## Step 2: Zero Value Identification

In [44]:
import pandas as pd
import numpy as np
import holidays

def _report_extreme_centers(zero_analysis, meals, pick_largest):
    """Print, per meal period, the revenue center with the highest or lowest zero rate.

    Meal periods that do not occur in ``zero_analysis['MealPeriod']`` are skipped.
    ``pick_largest=True`` selects via ``nlargest``; otherwise ``nsmallest`` is used.
    """
    present_meals = zero_analysis['MealPeriod'].unique()
    for meal in meals:
        if meal in present_meals:
            meal_rows = zero_analysis[zero_analysis['MealPeriod'] == meal]
            if pick_largest:
                extreme = meal_rows.nlargest(1, 'zero_pct')
            else:
                extreme = meal_rows.nsmallest(1, 'zero_pct')
            name = extreme['RevenueCenterName'].values[0]
            pct = extreme['zero_pct'].values[0]
            print(f"- {meal}: {name} ({pct:.1f}% zero days)")


def analyze_zeros(df_totals):
    """Summarize how often each revenue center / meal period has zero revenue.

    Prints a step-by-step progress report and the key observations.

    Side effects on ``df_totals`` (modified in place):
        - 'MealPeriod' is stripped and title-cased (e.g. 'BreakFast' -> 'Breakfast').
        - an integer 'is_zero' column is added (1 where 'CheckTotal' == 0).

    Args:
        df_totals: DataFrame with at least the columns 'CheckTotal',
            'RevenueCenterName', 'MealPeriod' and 'Date'.

    Returns:
        (zero_analysis, pivot_table):
            zero_analysis has one row per (RevenueCenterName, MealPeriod) with
            'zero_days', 'total_days' and 'zero_pct' (percentage, 0-100).
            pivot_table has revenue centers as index and exactly the columns
            'Breakfast', 'Lunch', 'Dinner' holding 'zero_pct'; an expected meal
            period missing from the data is added as a column of 0, and any other
            meal period is left out of the pivot.

    Raises:
        KeyError: if any of the required columns is missing.
    """
    # Validate before any column is accessed: otherwise a missing 'MealPeriod'
    # surfaces as a bare pandas KeyError from the normalization step, and the
    # input may already be partly modified when the check fails.
    required_columns = ['CheckTotal', 'RevenueCenterName', 'MealPeriod', 'Date']
    missing_cols = [col for col in required_columns if col not in df_totals.columns]
    if missing_cols:
        raise KeyError(f"Missing critical columns: {missing_cols}")

    print("\n" + "="*50)
    print("1. Normalizing meal period names...")
    # Standardize meal period names
    df_totals['MealPeriod'] = df_totals['MealPeriod'].str.strip().str.title()
    print(f"Unique meal periods: {df_totals['MealPeriod'].unique()}")

    # --- Data Validation ---
    print("\n" + "="*50)
    print("2. Data integrity checks...")
    # The column check itself already ran at the top of the function
    print("✓ All required columns present")

    # --- Temporal Analysis ---
    print("\n" + "="*50)
    print("3. Performing temporal analysis...")

    # Zero-revenue indicator (1 = zero revenue, 0 = otherwise)
    df_totals['is_zero'] = (df_totals['CheckTotal'] == 0).astype(int)

    # Zero statistics per revenue center and meal period:
    # sum of the indicator = zero rows, count of the indicator = all rows
    zero_analysis = df_totals.groupby(['RevenueCenterName', 'MealPeriod']).agg(
        zero_days=('is_zero', 'sum'),
        total_days=('is_zero', 'count'),
    ).reset_index()

    # Zero percentage
    zero_analysis['zero_pct'] = zero_analysis['zero_days'] / zero_analysis['total_days'] * 100

    # --- Pivot Table Creation ---
    print("\n" + "="*50)
    print("4. Creating zero-revenue summary table...")

    # Revenue centers as rows, meal periods as columns, zero percentage as values
    pivot_table = zero_analysis.pivot_table(
        values='zero_pct',
        index='RevenueCenterName',
        columns='MealPeriod',
        fill_value=0
    )

    # Ensure all expected meal periods are present
    expected_meals = ['Breakfast', 'Lunch', 'Dinner']
    for meal in expected_meals:
        if meal not in pivot_table.columns:
            pivot_table[meal] = 0

    # Reorder columns (this also restricts the pivot to the expected meal periods)
    pivot_table = pivot_table[expected_meals]

    # --- Key Observations ---
    print("\n" + "="*50)
    print("5. Key Observations:")

    # 1. Revenue centers with highest zero rates
    print("\nRevenue Centers with Highest Zero-Revenue Rates:")
    _report_extreme_centers(zero_analysis, expected_meals, pick_largest=True)

    # 2. Most reliable revenue centers
    print("\nMost Reliable Revenue Centers (Lowest Zero Rates):")
    _report_extreme_centers(zero_analysis, expected_meals, pick_largest=False)

    # 3. Overall zero statistics
    overall_zero_pct = (df_totals['CheckTotal'] == 0).mean() * 100
    print(f"\nOverall Zero-Revenue Percentage: {overall_zero_pct:.1f}%")

    return zero_analysis, pivot_table


# =============================================================================
# Main Execution Pipeline
# =============================================================================
if __name__ == "__main__":
    # Relies on `df_totals` produced by the data-preparation cell and on
    # `analyze_zeros` defined above in this cell.

    print("\n" + "="*50)
    print("INITIALIZING ZERO-REVENUE ANALYSIS")
    print(f"Date range: {df_totals['Date'].min().date()} to {df_totals['Date'].max().date()}")
    print(f"Total records: {len(df_totals):,}")

    try:
        zero_analysis, zero_summary = analyze_zeros(df_totals)

        print("\n" + "="*50)
        print("ANALYSIS COMPLETE")

        print("\nZero Analysis Summary:")
        # Per meal period, highest zero percentage first
        print(zero_analysis.sort_values(['MealPeriod', 'zero_pct'], ascending=[True, False]))

        print("\nZero-Revenue Percentage by Center and Meal Period:")
        print(zero_summary)

    except Exception as e:
        print("\n" + "="*50)
        print("❌ ANALYSIS FAILED")
        print(f"Error: {str(e)}")
        print("\nTroubleshooting Tips:")
        print("- Verify meal period values: should include Breakfast, Lunch, Dinner")
        print("- Check for missing columns in input data")
        print("- Ensure date format is correct")
        print("- Confirm revenue center names are consistent")
        # Re-raise after printing the tips: swallowing the error would discard the
        # traceback and leave `zero_analysis` / `zero_summary` undefined while the
        # notebook keeps running.
        raise


INITIALIZING ZERO-REVENUE ANALYSIS
Date range: 2023-01-01 to 2024-04-30
Total records: 13,122

1. Normalizing meal period names...
Unique meal periods: ['Breakfast' 'Dinner' 'Lunch']

2. Data integrity checks...
✓ All required columns present

3. Performing temporal analysis...

4. Creating zero-revenue summary table...

5. Key Observations:

Revenue Centers with Highest Zero-Revenue Rates:
- Breakfast: RevenueCenter_4 (100.0% zero days)
- Lunch: RevenueCenter_4 (100.0% zero days)
- Dinner: RevenueCenter_4 (99.8% zero days)

Most Reliable Revenue Centers (Lowest Zero Rates):
- Breakfast: RevenueCenter_5 (0.0% zero days)
- Lunch: RevenueCenter_3 (0.0% zero days)
- Dinner: RevenueCenter_1 (0.0% zero days)

Overall Zero-Revenue Percentage: 31.3%

ANALYSIS COMPLETE

Zero Analysis Summary:
   RevenueCenterName MealPeriod  zero_days  total_days    zero_pct
9    RevenueCenter_4  Breakfast        486         486  100.000000
21   RevenueCenter_8  Breakfast        447         486   91.975309
15