In [2]:
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Notebook-wide plot styling: seaborn white-grid theme, then a (10, 5) default figure size.
sns.set_theme(style="whitegrid")
plt.rcParams.update({"figure.figsize": (10, 5)})

In [3]:
# Notebook configuration ------------------------------------------
CENTER_NAME   = "RevenueCenter_1"
DATA_DIR      = "revenue_center_data"
# Input CSV. The data folder is resolved one level above the working directory,
# which is the location the load and save cells of this notebook actually use.
FILE_PATH     = os.path.join("..", DATA_DIR, f"{CENTER_NAME}_data.csv")
# Event-flag columns removed from the frame right after loading.
LOW_IMPACT    = ["IsArtDubai", "IsFilmFestival", "IsWorldCup"]
FORECAST_DAYS = 90            # ≈ three months
VALID_START   = "2023-10-01"  # use last 3 months for validation


# Numeric encoding maps  ------------------------------------------
# TI_MAP encodes the TourismIntensity labels, RI_MAP the RevenueImpact labels.
TI_MAP = {"Low": 0, "Normal": 1, "Medium": 2, "High": 3}
RI_MAP = {"Decrease": -1, "Neutral": 0, "Boost": 1}

In [None]:
# Load the revenue-centre extract and normalise its columns.
# NOTE(review): the ".." prefix assumes the notebook is run from a sub-folder
# that sits next to DATA_DIR — confirm against the project layout.
FILE_PATH = os.path.join("..", DATA_DIR, f"{CENTER_NAME}_data.csv")
assert os.path.exists(FILE_PATH), f"CSV not found → {FILE_PATH}"

# Validate the schema from the header row *before* the full read. With
# parse_dates=["Date"], pandas raises its own error on a missing Date column,
# so a check placed after the read could never report it. TourismIntensity and
# RevenueImpact are included because they are dereferenced unconditionally below.
required_cols = ["Date", "MealPeriod", "CheckTotal", "TourismIntensity", "RevenueImpact"]
csv_columns = pd.read_csv(FILE_PATH, nrows=0).columns
for col in required_cols:
    if col not in csv_columns:
        raise KeyError(f"Missing column: {col}")

raw = pd.read_csv(FILE_PATH, parse_dates=["Date"])

# Drop the low-impact event flags that are present (returns a new frame
# instead of mutating in place, so re-running the cell is harmless).
raw = raw.drop(columns=[c for c in LOW_IMPACT if c in raw.columns])

# Every "Is*" column is treated as a 0/1 flag; IslamicPeriod shares the prefix
# but holds text labels, so it is excluded. Missing flags count as 0.
flag_cols = [c for c in raw.columns if c.startswith("Is") and c != "IslamicPeriod"]
for f in flag_cols:
    raw[f] = raw[f].fillna(0).astype(int)

# Encode the ordinal labels. Labels absent from the maps (and missing values)
# fall back to 1 ("Normal") and 0 ("Neutral") respectively.
raw["TourismIntensity"] = raw["TourismIntensity"].map(TI_MAP).fillna(1).astype(int)
raw["RevenueImpact"] = raw["RevenueImpact"].map(RI_MAP).fillna(0).astype(int)


print("✓ Pre-processing done →", raw.shape)
print(raw.head(2))

✓ Pre-processing done → (1458, 26)
        Date MealPeriod RevenueCenterName  DayOfWeek  Month  Year  CheckTotal  \
0 2023-01-01  Breakfast   RevenueCenter_1          6      1  2023      1499.4   
1 2023-01-01     Dinner   RevenueCenter_1          6      1  2023      4374.5   

   is_zero          IslamicPeriod  IsRamadan  ...  IsNationalDay  IsNewYear  \
0        0  New-Year-Celebrations          0  ...              0          1   
1        0  New-Year-Celebrations          0  ...              0          1   

   IsMarathon  IsGITEX  IsAirshow  IsFoodFestival  IsPreEvent  IsPostEvent  \
0           0        0          0               0           0            0   
1           0        0          0               0           0            0   

   TourismIntensity  RevenueImpact  
0                 3              1  
1                 3              1  

[2 rows x 26 columns]


In [7]:
# Count the zero-revenue rows in CheckTotal once, then derive the yes/no answer from it
zero_count = raw['CheckTotal'].eq(0).sum()
has_zeros = zero_count > 0

print(f"Contains zero values: {has_zeros}")
print(f"Number of zeros: {zero_count}")

Contains zero values: True
Number of zeros: 10


In [8]:
# Zero imputation, step 1: decide the grouping keys and order the rows in time
print("Replacing zero values with 30-day rolling average...")

# Group only by the key columns that are actually present in the frame
groupby_cols = [
    key for key in ('RevenueCenterName', 'MealPeriod') if key in raw.columns
]

# Rolling windows are positional, so rows must be in chronological order first
raw = raw.sort_values(by=['Date', *groupby_cols])

# Function to replace zeros with rolling average
def replace_zeros_with_average(series, window=30, min_periods=5):
    """Replace zero entries of ``series`` with a rolling mean of nearby non-zero values.

    Zeros are masked out *before* averaging, so the value being imputed (and any
    neighbouring zeros) cannot drag its own replacement downward.

    Parameters
    ----------
    series : pandas.Series
        Numeric values in the order the window should slide over them. The
        window is counted in rows, not calendar days.
    window : int, default 30
        Number of rows in the rolling window.
    min_periods : int, default 5
        Minimum number of non-zero observations a window needs to yield a mean.

    Returns
    -------
    pandas.Series
        Float copy of ``series`` with the same index. Non-zero entries are
        untouched. A zero becomes NaN only when no window anywhere in the
        series reaches ``min_periods`` non-zero observations.
    """
    # Work on a float copy: the input stays untouched and integer input can
    # receive fractional means.
    values = series.astype(float)
    zero_mask = values == 0

    # Exclude zeros from the averages entirely.
    observed = values.mask(zero_mask)

    # Prefer a centered window; fall back to a trailing one where it is empty.
    centered = observed.rolling(window=window, center=True, min_periods=min_periods).mean()
    trailing = observed.rolling(window=window, min_periods=min_periods).mean()
    estimate = centered.fillna(trailing)

    # Long zero runs can leave gaps in both averages; borrow the nearest estimate.
    estimate = estimate.ffill().bfill()

    values[zero_mask] = estimate[zero_mask]
    return values

# Apply the replacement, per group when grouping keys exist so that averages
# never mix different revenue centres / meal periods.
if groupby_cols:
    raw['CheckTotal'] = raw.groupby(groupby_cols)['CheckTotal'].transform(replace_zeros_with_average)
else:
    raw['CheckTotal'] = replace_zeros_with_average(raw['CheckTotal'])

# Check results. The imputation can leave either a zero or a NaN behind
# (NaN when a window had too few observations), so both are counted.
remaining_zeros = (raw['CheckTotal'] == 0).sum()
remaining_nans = raw['CheckTotal'].isna().sum()
print("✓ Zero replacement complete")
print(f"✓ Remaining zeros: {remaining_zeros}")

# Handle any leftovers (edge cases) with forward/backward fill.
# fillna-style filling only touches NaN, so leftover zeros are first turned
# into NaN; otherwise the fill would silently do nothing to them.
if remaining_zeros > 0 or remaining_nans > 0:
    raw['CheckTotal'] = raw['CheckTotal'].mask(raw['CheckTotal'] == 0)
    if groupby_cols:
        raw['CheckTotal'] = raw.groupby(groupby_cols)['CheckTotal'].transform(
            lambda x: x.ffill().bfill()
        )
    else:
        raw['CheckTotal'] = raw['CheckTotal'].ffill().bfill()

    final_zeros = (raw['CheckTotal'] == 0).sum()
    print(f"✓ Final zeros after fill: {final_zeros}")
    # A group with no usable value at all cannot be filled; surface it.
    print(f"✓ Final NaNs after fill: {raw['CheckTotal'].isna().sum()}")

Replacing zero values with 30-day rolling average...
✓ Zero replacement complete
✓ Remaining zeros: 0


In [9]:
# Write the cleaned frame to the data folder, without the index column
output_path = os.path.join(
    "..",
    DATA_DIR,
    f"{CENTER_NAME}_preprocessed.csv",
)
raw.to_csv(path_or_buf=output_path, index=False)

In [10]:
# Preview the first five rows of the cleaned frame
raw.head(n=5)

Unnamed: 0,Date,MealPeriod,RevenueCenterName,DayOfWeek,Month,Year,CheckTotal,is_zero,IslamicPeriod,IsRamadan,...,IsNationalDay,IsNewYear,IsMarathon,IsGITEX,IsAirshow,IsFoodFestival,IsPreEvent,IsPostEvent,TourismIntensity,RevenueImpact
0,2023-01-01,Breakfast,RevenueCenter_1,6,1,2023,1499.4,0,New-Year-Celebrations,0,...,0,1,0,0,0,0,0,0,3,1
1,2023-01-01,Dinner,RevenueCenter_1,6,1,2023,4374.5,0,New-Year-Celebrations,0,...,0,1,0,0,0,0,0,0,3,1
2,2023-01-01,Lunch,RevenueCenter_1,6,1,2023,1260.0,0,New-Year-Celebrations,0,...,0,1,0,0,0,0,0,0,3,1
3,2023-01-02,Breakfast,RevenueCenter_1,0,1,2023,771.0,0,Post-New-Year-Celebrations,0,...,0,1,0,0,0,0,0,1,1,0
4,2023-01-02,Dinner,RevenueCenter_1,0,1,2023,3460.0,0,Post-New-Year-Celebrations,0,...,0,1,0,0,0,0,0,1,1,0
