In [1]:
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Global plot styling: apply the seaborn theme first, then set the default
# figure size so the size override is the last thing written.
sns.set_theme(style="whitegrid")
plt.rc("figure", figsize=(10, 5))

In [2]:
# Run configuration ------------------------------------------------
CENTER_NAME   = "RevenueCenter_1"
DATA_DIR      = "revenue_center_data"
# The cells that read and write CSVs resolve DATA_DIR through "..", so the
# default input path is built the same way to keep a single consistent value.
FILE_PATH     = os.path.join("..", DATA_DIR, f"{CENTER_NAME}_data.csv")
LOW_IMPACT    = ["IsArtDubai", "IsFilmFestival", "IsWorldCup"]  # flags dropped after loading
FORECAST_DAYS = 90            # ≈ three months
VALID_START   = "2023-10-01"  # use last 3 months for validation


# Numeric encoding maps  ------------------------------------------
# Applied to the TourismIntensity / RevenueImpact label columns after loading.
TI_MAP = {"Low": 0, "Normal": 1, "Medium": 2, "High": 3}
RI_MAP = {"Decrease": -1, "Neutral": 0, "Boost": 1}

In [3]:
# Load the raw CSV; the data directory is resolved relative to the parent
# of the current working directory.
FILE_PATH = os.path.join("..", DATA_DIR, f"{CENTER_NAME}_data.csv")
assert os.path.exists(FILE_PATH), f"CSV not found → {FILE_PATH}"
raw = pd.read_csv(FILE_PATH, parse_dates=["Date"])

# Fail fast, before any mutation, on every column this cell indexes directly.
# TourismIntensity / RevenueImpact are included because they are mapped below.
required_cols = ["Date", "MealPeriod", "CheckTotal", "TourismIntensity", "RevenueImpact"]
missing_cols = [c for c in required_cols if c not in raw.columns]
if missing_cols:
    raise KeyError(f"Missing column: {', '.join(missing_cols)}")

# Drop the low-impact event flags (absent ones are ignored); reassigning
# instead of inplace=True keeps the cell free of hidden in-place mutation.
raw = raw.drop(columns=[c for c in LOW_IMPACT if c in raw.columns], errors="ignore")

# Event flags: every "Is*" column except the categorical IslamicPeriod.
# Missing flags are treated as 0 and the column is stored as int.
flag_cols = [c for c in raw.columns if c.startswith("Is") and c != "IslamicPeriod"]
for flag_col in flag_cols:
    raw[flag_col] = raw[flag_col].fillna(0).astype(int)

# Label → integer encoding. Labels that are missing or not in the map fall
# back to 1 for TourismIntensity and 0 for RevenueImpact.
raw["TourismIntensity"] = raw["TourismIntensity"].map(TI_MAP).fillna(1).astype(int)
raw["RevenueImpact"] = raw["RevenueImpact"].map(RI_MAP).fillna(0).astype(int)


print("✓ Pre-processing done →", raw.shape)
print(raw.head(2))

✓ Pre-processing done → (1458, 26)
        Date MealPeriod RevenueCenterName  DayOfWeek  Month  Year  CheckTotal  \
0 2023-01-01  Breakfast   RevenueCenter_1          6      1  2023      1499.4   
1 2023-01-01     Dinner   RevenueCenter_1          6      1  2023      4374.5   

   is_zero          IslamicPeriod  IsRamadan  ...  IsNationalDay  IsNewYear  \
0        0  New-Year-Celebrations          0  ...              0          1   
1        0  New-Year-Celebrations          0  ...              0          1   

   IsMarathon  IsGITEX  IsAirshow  IsFoodFestival  IsPreEvent  IsPostEvent  \
0           0        0          0               0           0            0   
1           0        0          0               0           0            0   

   TourismIntensity  RevenueImpact  
0                 3              1  
1                 3              1  

[2 rows x 26 columns]


In [4]:
# Quick audit: does CheckTotal contain exact zeros, and how many?
check_total_is_zero = raw['CheckTotal'].eq(0)
zero_count = check_total_is_zero.sum()
has_zeros = check_total_is_zero.any()

print(f"Contains zero values: {has_zeros}")
print(f"Number of zeros: {zero_count}")

Contains zero values: True
Number of zeros: 10


In [5]:
# Zero replacement, step 1: decide the grouping and put rows in time order
print("Replacing zero values with 30-day rolling average...")

# Group by whichever of these identifier columns the frame actually has
groupby_cols = [
    candidate
    for candidate in ('RevenueCenterName', 'MealPeriod')
    if candidate in raw.columns
]

# Rolling windows are positional, so rows must be sorted by date first
sort_keys = ['Date', *groupby_cols]
raw = raw.sort_values(sort_keys)
# Function to replace zeros with a rolling average of the non-zero neighbours
def replace_zeros_with_average(series):
    """
    Return a copy of ``series`` with exact zeros replaced by a rolling mean.

    The mean is taken over a 30-row window using only the non-zero values:
    zeros are masked out first so they cannot drag down the very average
    that is meant to replace them. A centred window is tried first; where it
    yields no value, a backward-looking window of the same length is used.

    Parameters
    ----------
    series : pandas.Series
        Values ordered in time (the window is positional, not date-based).

    Returns
    -------
    pandas.Series
        Same index as ``series``. Non-zero entries are unchanged. A zero is
        replaced by NaN when neither window contains at least 5 non-zero
        values; the caller is expected to handle those leftovers.
    """
    zero_mask = (series == 0)

    # Nothing to replace: hand back an untouched copy (dtype preserved)
    if not zero_mask.any():
        return series.copy()

    # Zeros become NaN so rolling().mean() skips them and min_periods
    # counts only real observations
    observed = series.mask(zero_mask)

    # Calculate 30-row rolling average (centered window)
    rolling_avg = observed.rolling(window=30, center=True, min_periods=5).mean()

    # For positions the centred window cannot fill, use a backward-looking one
    rolling_avg_backward = observed.rolling(window=30, min_periods=5).mean()
    rolling_avg = rolling_avg.fillna(rolling_avg_backward)

    # where() builds a new Series (input is never modified) and upcasts
    # integer input cleanly when float averages are inserted
    return series.where(~zero_mask, rolling_avg)

# Apply the replacement
# Remember which rows were zero so the checks below look only at those rows
was_zero = raw['CheckTotal'].eq(0)

if groupby_cols:
    # Each group gets its own rolling window
    raw['CheckTotal'] = raw.groupby(groupby_cols)['CheckTotal'].transform(replace_zeros_with_average)
else:
    # Apply to entire series if no grouping needed
    raw['CheckTotal'] = replace_zeros_with_average(raw['CheckTotal'])

# Check results
# A zero whose window held too few observations comes back as NaN rather
# than 0, so a plain "== 0" count would miss it; count both as unresolved.
unresolved = was_zero & (raw['CheckTotal'].isna() | raw['CheckTotal'].eq(0))
remaining_zeros = unresolved.sum()
print("✓ Zero replacement complete")
print(f"✓ Remaining zeros: {remaining_zeros}")

# Handle any remaining zeros (edge cases) with forward/backward fill
if remaining_zeros > 0:
    # Blank out the unresolved rows so ffill/bfill has something to fill
    candidates = raw['CheckTotal'].mask(unresolved)
    if groupby_cols:
        filled = (
            raw.assign(CheckTotal=candidates)
            .groupby(groupby_cols)['CheckTotal']
            .transform(lambda x: x.ffill().bfill())
        )
    else:
        filled = candidates.ffill().bfill()

    # Write back only the rows that were originally zero
    raw.loc[unresolved, 'CheckTotal'] = filled[unresolved]

    final_zeros = (unresolved & (raw['CheckTotal'].isna() | raw['CheckTotal'].eq(0))).sum()
    print(f"✓ Final zeros after fill: {final_zeros}")

Replacing zero values with 30-day rolling average...
✓ Zero replacement complete
✓ Remaining zeros: 0


In [6]:
# Persist the cleaned frame next to the raw input (index column omitted)
preprocessed_name = f"{CENTER_NAME}_preprocessed.csv"
output_path = os.path.join("..", DATA_DIR, preprocessed_name)
raw.to_csv(output_path, index=False)

In [7]:
# Preview the first five rows of the cleaned frame
raw.head(n=5)

Unnamed: 0,Date,MealPeriod,RevenueCenterName,DayOfWeek,Month,Year,CheckTotal,is_zero,IslamicPeriod,IsRamadan,...,IsNationalDay,IsNewYear,IsMarathon,IsGITEX,IsAirshow,IsFoodFestival,IsPreEvent,IsPostEvent,TourismIntensity,RevenueImpact
0,2023-01-01,Breakfast,RevenueCenter_1,6,1,2023,1499.4,0,New-Year-Celebrations,0,...,0,1,0,0,0,0,0,0,3,1
1,2023-01-01,Dinner,RevenueCenter_1,6,1,2023,4374.5,0,New-Year-Celebrations,0,...,0,1,0,0,0,0,0,0,3,1
2,2023-01-01,Lunch,RevenueCenter_1,6,1,2023,1260.0,0,New-Year-Celebrations,0,...,0,1,0,0,0,0,0,0,3,1
3,2023-01-02,Breakfast,RevenueCenter_1,0,1,2023,771.0,0,Post-New-Year-Celebrations,0,...,0,1,0,0,0,0,0,1,1,0
4,2023-01-02,Dinner,RevenueCenter_1,0,1,2023,3460.0,0,Post-New-Year-Celebrations,0,...,0,1,0,0,0,0,0,1,1,0


In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

def transform_to_cnn_lstm_format(df):
    """
    Build the standardised feature matrix for the CNN-LSTM model.

    Steps, in order: copy the input; set aside the identifier/target columns;
    drop ``Date`` and ``RevenueCenterName``; one-hot encode ``MealPeriod``,
    ``IslamicPeriod``, ``TourismIntensity`` and ``RevenueImpact``; replace
    ``Month`` and ``DayOfWeek`` with sine/cosine pairs; concatenate everything;
    standardise every column with a freshly fitted ``StandardScaler``.
    Progress is reported on stdout throughout.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every column named above plus ``CheckTotal``.
        The input frame is not modified.

    Returns
    -------
    df_scaled : pandas.DataFrame
        Scaled features, same index as ``df``.
    scaler : StandardScaler
        The scaler fitted on the combined feature frame.
    target_data : pandas.DataFrame
        Unscaled copy of ``Date``, ``RevenueCenterName``, ``MealPeriod`` and
        ``CheckTotal``.

    Notes
    -----
    NOTE(review): ``CheckTotal`` is not removed from the feature frame, so a
    scaled copy of it is part of ``df_scaled``; and the scaler is fitted on
    all rows passed in. Confirm both are intended before any train/validation
    split is applied downstream.
    """
    def _section(title, width=40, lead="\n"):
        # Three-line banner: rule, title, rule
        print(lead + "=" * width)
        print(title)
        print("=" * width)

    _section("TRANSFORMING DATASET FOR CNN-LSTM MODEL", width=60, lead="")

    # Work on a private copy so the caller's frame stays untouched
    frame = df.copy()

    print(f"✓ Original shape: {frame.shape}")
    print(f"✓ Original columns: {len(frame.columns)}")

    # Identifier columns are not features; the target is kept aside unscaled
    id_columns = ['Date', 'RevenueCenterName']
    target_data = frame[id_columns + ['MealPeriod', 'CheckTotal']].copy()

    print(f"\n✓ Dropping columns: {id_columns}")
    frame = frame.drop(columns=id_columns)

    # ---- One-hot encoding -------------------------------------------------
    _section("CATEGORICAL ENCODING")

    categorical_spec = (
        ('MealPeriod', 'Meal'),
        ('IslamicPeriod', 'Event'),
        ('TourismIntensity', 'Tourism'),
        ('RevenueImpact', 'Impact'),
    )
    dummy_blocks = []
    for column, prefix in categorical_spec:
        dummies = pd.get_dummies(frame[column], prefix=prefix)
        if column == 'IslamicPeriod':
            # Too many levels to list; report the count only
            print(f"✓ IslamicPeriod encoded to: {len(dummies.columns)} event columns")
        else:
            print(f"✓ {column} encoded to: {list(dummies.columns)}")
        dummy_blocks.append(dummies)

    frame = frame.drop(columns=[column for column, _ in categorical_spec])

    # ---- Cyclical encoding ------------------------------------------------
    _section("CYCLICAL FEATURE ENCODING")

    # (column, period): Month runs 1-12, DayOfWeek runs 0-6
    for column, period in (('Month', 12), ('DayOfWeek', 7)):
        angle = 2 * np.pi * frame[column] / period
        frame[f'{column}_sin'] = np.sin(angle)
        frame[f'{column}_cos'] = np.cos(angle)
        print(f"✓ {column} encoded to cyclical features")

    frame = frame.drop(columns=['Month', 'DayOfWeek'])

    # ---- Assemble ---------------------------------------------------------
    _section("COMBINING ALL FEATURES")

    # Remaining numeric columns first, then the dummy blocks in encoding order
    df_final = pd.concat([frame, *dummy_blocks], axis=1)

    print(f"✓ Combined shape: {df_final.shape}")
    print(f"✓ Total features: {len(df_final.columns)}")

    # ---- Scale ------------------------------------------------------------
    _section("FEATURE SCALING")

    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(df_final)
    df_scaled = pd.DataFrame(scaled_values, columns=df_final.columns, index=df_final.index)

    print("✓ All features scaled using StandardScaler")
    print(f"✓ Final dataset shape: {df_scaled.shape}")

    # ---- Preview ----------------------------------------------------------
    _section("SAMPLE OF TRANSFORMED DATA")
    print("First 5 rows, first 10 columns:")
    print(df_scaled.iloc[:5, :10].round(3))

    print("\nColumn names (first 20):")
    print(list(df_scaled.columns[:20]))

    # ---- Summary ----------------------------------------------------------
    _section("FEATURE BREAKDOWN")

    names = list(df_scaled.columns)
    derived_markers = ['Meal_', 'Event_', 'Tourism_', 'Impact_', '_sin', '_cos']

    def _count(predicate):
        # Number of column names satisfying the predicate
        return sum(1 for name in names if predicate(name))

    feature_counts = {
        'Numeric Features': _count(lambda name: not any(marker in name for marker in derived_markers)),
        'Meal Features': _count(lambda name: name.startswith('Meal_')),
        'Event Features': _count(lambda name: name.startswith('Event_')),
        'Tourism Features': _count(lambda name: name.startswith('Tourism_')),
        'Impact Features': _count(lambda name: name.startswith('Impact_')),
        'Cyclical Features': _count(lambda name: name.endswith(('_sin', '_cos'))),
    }

    for feature_type, count in feature_counts.items():
        print(f"✓ {feature_type}: {count}")

    print(f"\n✓ TOTAL FEATURES: {sum(feature_counts.values())}")

    return df_scaled, scaler, target_data

In [9]:
# Run the feature transformation on the cleaned frame
df_transformed, scaler, target_data = transform_to_cnn_lstm_format(raw)

# Both files are written without the index; rows stay in the same order,
# so features and targets line up positionally.
features_csv = 'cnn_lstm_ready_dataset.csv'
targets_csv = 'target_data_for_sequences.csv'

# Scaled feature matrix
df_transformed.to_csv(features_csv, index=False)
print(f"\n✓ Transformed dataset saved as '{features_csv}'")

# Unscaled identifiers + target, used later for sequence creation
target_data.to_csv(targets_csv, index=False)
print(f"✓ Target data saved as '{targets_csv}'")

TRANSFORMING DATASET FOR CNN-LSTM MODEL
✓ Original shape: (1458, 26)
✓ Original columns: 26

✓ Dropping columns: ['Date', 'RevenueCenterName']

CATEGORICAL ENCODING
✓ MealPeriod encoded to: ['Meal_Breakfast', 'Meal_Dinner', 'Meal_Lunch']
✓ IslamicPeriod encoded to: 33 event columns
✓ TourismIntensity encoded to: ['Tourism_0', 'Tourism_1', 'Tourism_2', 'Tourism_3']
✓ RevenueImpact encoded to: ['Impact_-1', 'Impact_0', 'Impact_1']

CYCLICAL FEATURE ENCODING
✓ Month encoded to cyclical features
✓ DayOfWeek encoded to cyclical features

COMBINING ALL FEATURES
✓ Combined shape: (1458, 65)
✓ Total features: 65

FEATURE SCALING
✓ All features scaled using StandardScaler
✓ Final dataset shape: (1458, 65)

SAMPLE OF TRANSFORMED DATA
First 5 rows, first 10 columns:
    Year  CheckTotal  is_zero  IsRamadan  IsEid  IsPreRamadan  IsPostRamadan  \
0 -0.576       0.016   -0.083     -0.646 -0.185        -0.308         -0.308   
1 -0.576       2.073   -0.083     -0.646 -0.185        -0.308         -0.3