In [None]:
# ============================================================================
# STEP 1: PROPER DATA LOADING & INITIAL EXPLORATION
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning raised
# after this point (from any library) is suppressed — consider narrowing it to
# specific categories or modules.
warnings.filterwarnings('ignore')

def load_and_explore_data(data_dir='../revenue_center_data', center_ids=range(1, 10)):
    """
    Load all revenue center data and perform comprehensive exploration
    WITHOUT any premature transformations or outlier removal

    One CSV named ``RevenueCenter_<id>_data.csv`` is read from ``data_dir`` for
    every id in ``center_ids``. The frames are concatenated, ``Date`` is parsed
    to datetime, and a summary report is printed (data quality, revenue
    distribution, per-meal-period statistics, monthly variation).

    Parameters
    ----------
    data_dir : str or path-like, default '../revenue_center_data'
        Directory holding the per-center CSV files. A relative path is
        resolved against the current working directory.
    center_ids : iterable of int, default range(1, 10)
        Revenue center ids to load.

    Returns
    -------
    pandas.DataFrame
        The combined rows of all loaded files with three added columns:
        ``RevenueCenterID``, ``DayOfYear`` and ``WeekOfYear``.

    Raises
    ------
    ValueError
        If ``center_ids`` is empty.
    FileNotFoundError
        If one or more expected CSV files are missing; the message lists all
        of them together with the resolved directory.
    KeyError
        If the combined data lacks the ``Date``, ``CheckTotal`` or
        ``MealPeriod`` column.
    """
    # Local import: the file-level import block does not provide pathlib.
    from pathlib import Path

    print("="*80)
    print("STEP 1: LOADING AND EXPLORING RAW DATA")
    print("="*80)

    # Materialize once so generators/iterators can be traversed twice.
    center_ids = list(center_ids)
    if not center_ids:
        raise ValueError("center_ids is empty: no revenue centers to load")

    # Check every file before reading anything, so a wrong working directory
    # is reported once with the full list instead of failing on the first file.
    base_dir = Path(data_dir)
    csv_paths = {cid: base_dir / f'RevenueCenter_{cid}_data.csv' for cid in center_ids}
    missing = [path.name for path in csv_paths.values() if not path.is_file()]
    if missing:
        raise FileNotFoundError(
            f"Missing revenue center file(s) in '{base_dir.resolve()}': "
            f"{', '.join(missing)}"
        )

    # Load all revenue centers
    revenue_centers = []
    for cid, path in csv_paths.items():
        df = pd.read_csv(path)
        df['RevenueCenterID'] = cid
        revenue_centers.append(df)

    # Combine all revenue centers
    df_all = pd.concat(revenue_centers, ignore_index=True)

    # Fail early with a descriptive message rather than a bare KeyError deep
    # inside the report below.
    required = ('Date', 'CheckTotal', 'MealPeriod')
    absent = [col for col in required if col not in df_all.columns]
    if absent:
        raise KeyError(f"Required column(s) missing from revenue data: {absent}")

    df_all['Date'] = pd.to_datetime(df_all['Date'])

    print(f"✓ Loaded data for {len(revenue_centers)} revenue centers")
    print(f"✓ Total records: {len(df_all):,}")
    print(f"✓ Date range: {df_all['Date'].min()} to {df_all['Date'].max()}")
    print(f"✓ Total days: {(df_all['Date'].max() - df_all['Date'].min()).days + 1}")

    # Basic data quality checks
    print(f"\n📊 Data Quality Overview:")
    print(f"  Missing values: {df_all.isnull().sum().sum()}")
    print(f"  Duplicate records: {df_all.duplicated().sum()}")
    print(f"  Zero revenue records: {(df_all['CheckTotal'] == 0).sum()}")
    print(f"  Negative revenue records: {(df_all['CheckTotal'] < 0).sum()}")

    # Revenue distribution analysis (HONEST - no manipulation)
    revenue = df_all['CheckTotal']
    print(f"\n💰 Revenue Distribution (RAW - NO MANIPULATION):")
    print(f"  Overall range: ${revenue.min():.2f} - ${revenue.max():.2f}")
    print(f"  Mean: ${revenue.mean():.2f}")
    print(f"  Median: ${revenue.median():.2f}")
    print(f"  Std: ${revenue.std():.2f}")

    # Per meal period analysis
    print(f"\n🍽️ By Meal Period (RAW):")
    for meal in ['Breakfast', 'Dinner', 'Lunch']:
        meal_data = df_all[df_all['MealPeriod'] == meal]['CheckTotal']
        print(f"  {meal}:")
        if meal_data.empty:
            # Statistics of an empty selection are all NaN; say so explicitly.
            print("    No records")
            continue
        print(f"    Range: ${meal_data.min():.2f} - ${meal_data.max():.2f}")
        print(f"    Mean: ${meal_data.mean():.2f}, CV: {meal_data.std()/meal_data.mean():.3f}")

    # Temporal patterns
    print(f"\n📅 Temporal Patterns:")
    df_all['DayOfYear'] = df_all['Date'].dt.dayofyear
    df_all['WeekOfYear'] = df_all['Date'].dt.isocalendar().week

    # Check for seasonal patterns: average of the per-calendar-month
    # coefficient of variation (std / mean) of CheckTotal.
    monthly_revenue = df_all.groupby(df_all['Date'].dt.month)['CheckTotal'].agg(['mean', 'std'])
    print(f"  Monthly revenue variation (CV): {(monthly_revenue['std'] / monthly_revenue['mean']).mean():.3f}")

    return df_all

# Execute data loading: runs when the cell executes, prints the exploration
# report, and binds the returned combined DataFrame to df_raw.
df_raw = load_and_explore_data()

STEP 1: LOADING AND EXPLORING RAW DATA


FileNotFoundError: [Errno 2] No such file or directory: 'notebooks/revenue_center_data/RevenueCenter_1_data.csv'