# In [None]:
"""
Financial Close Agent - Complete Pipeline
Processes Raw GL Export through all 10 tasks without human intervention
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import logging
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION AND SETUP
# ============================================================================

class Config:
    """Static settings shared by every task in the pipeline.

    Directory settings keep their trailing slash: several tasks build file
    paths by plain string concatenation (``f"{Config.X}file.csv"``).
    """

    # --- Input locations -------------------------------------------------
    RAW_DATA_PATH: str = "Raw_GL_Export.csv"
    MASTER_DATA_PATH: str = "Master_Data/"
    REFERENCE_PATH: str = "Reference/"
    BUDGET_PATH: str = "Budget/"

    # --- Output locations ------------------------------------------------
    OUTPUT_PATH: str = "working/"
    REPORTS_PATH: str = "reports/"

    # --- Fiscal period being closed --------------------------------------
    CURRENT_YEAR: int = 2026
    CURRENT_MONTH: int = 2
    # Same "YYYY-MM" text as before, derived so the three values cannot drift.
    CURRENT_FISCAL_PERIOD: str = f"{CURRENT_YEAR}-{CURRENT_MONTH:02d}"

    # --- Anomaly thresholds ----------------------------------------------
    # Amounts above this are flagged by the high-value exception rule.
    HIGH_VALUE_THRESHOLD: int = 50_000
    # NOTE(review): not referenced in the visible code; presumably a multiple
    # of a typical amount beyond which a value is an outlier -- confirm.
    EXTREME_OUTLIER_MULTIPLIER: int = 5
    # NOTE(review): presumably 24h clock hours bounding an overnight window
    # (22:00 -> 06:00) -- confirm against the rule that consumes them.
    SUSPICIOUS_HOUR_START: int = 22
    SUSPICIOUS_HOUR_END: int = 6

# ============================================================================
# T001: WRANGLE RAW GL DATA
# ============================================================================

class T001_DataWrangler:
    """Task 1: Parse and standardize raw GL export data.

    The steps are meant to be called in the order used by ``run``: load,
    rename columns, parse dates, parse amounts, scan for embedded
    exceptions, add metadata, save.  Every step works on
    ``self.standardized_df`` in place and records findings in
    ``self.anomaly_log`` as plain dicts whose first four keys are always
    ``transaction_id``, ``anomaly_type``, ``severity`` and ``description``.
    """

    # Raw export header -> snake_case column name.
    _COLUMN_MAPPING = {
        'Txn_ID': 'transaction_id',
        'Posting_Date_Raw': 'posting_date_raw',
        'Invoice_Date_Raw': 'invoice_date_raw',
        'Fiscal_Period': 'fiscal_period',
        'Entity': 'entity_code',
        'Account_Code_Raw': 'account_code_raw',
        'Cost_Center_Raw': 'cost_center_raw',
        'Vendor_Name_Raw': 'vendor_name_raw',
        'Invoice_Number': 'invoice_number',
        'PO_Number': 'po_number',
        'Currency': 'currency_code',
        'Amount': 'amount_raw',
        'Tax_Code': 'tax_code',
        'Narrative': 'narrative',
        'Source_System': 'source_system',
    }

    # Literal values rejected as dates without attempting to parse them.
    _INVALID_DATE_TOKENS = ('INVALID', '99/99/9999', '32/13/2026', '2026-13-45')

    # Tried in this order; the first format that parses wins, so an ambiguous
    # value such as 03/04/2026 is read day-first.
    _DATE_FORMATS = (
        '%d-%m-%Y', '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y',
        '%d/%m/%y', '%m/%d/%y', '%d-%m-%y', '%y-%m-%d',
    )

    # Substrings (matched case-insensitively) that mark a narrative as suspect.
    _EXCEPTION_KEYWORDS = ('error', 'flag', 'review', 'urgent', 'exception', 'invalid')

    _PLACEHOLDER_VENDORS = (
        'Unlisted Company', 'Unknown Vendor LLC', 'New Vendor XYZ',
        'Unregistered Supplier', 'Mystery Corp',
    )

    # Header written to the anomaly file when there is nothing to report.
    _ANOMALY_COLUMNS = ('transaction_id', 'anomaly_type', 'severity', 'description')

    def __init__(self):
        self.raw_df = None
        self.standardized_df = None
        self.anomaly_log = []
        # Replaced by the real file name once load_raw_data() has run.
        self.source_file = 'Raw_GL_Export.csv'

    def _log_anomaly(self, transaction_id, anomaly_type, severity, description, **extra):
        """Append one anomaly record; ``extra`` holds type-specific fields."""
        self.anomaly_log.append({
            'transaction_id': transaction_id,
            'anomaly_type': anomaly_type,
            'severity': severity,
            'description': description,
            **extra,
        })

    def load_raw_data(self, filepath):
        """Load the raw CSV at ``filepath`` into ``self.raw_df``.

        Returns ``self`` so calls can be chained.
        """
        print("üìÇ T001: Loading raw GL data...")
        self.raw_df = pd.read_csv(filepath)
        # Remember which file was read so the metadata reflects reality.
        self.source_file = os.path.basename(str(filepath))
        print(f"   Loaded {len(self.raw_df)} rows")
        return self

    def standardize_column_names(self):
        """Rename known export columns to snake_case; others are left alone."""
        self.standardized_df = self.raw_df.rename(columns=self._COLUMN_MAPPING)
        print("   ‚úì Column names standardized")
        return self

    def _parse_date(self, date_str, txn_id, column_name):
        """Return ``date_str`` as a ``datetime``, or ``None`` after logging why not."""
        if pd.isna(date_str) or date_str in self._INVALID_DATE_TOKENS:
            self._log_anomaly(txn_id, 'INVALID_DATE', 'CRITICAL',
                              f"Invalid date value: {date_str}",
                              column=column_name)
            return None

        text = str(date_str)
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                # Wrong format for this value -- try the next one.
                continue

        self._log_anomaly(txn_id, 'UNPARSABLE_DATE', 'CRITICAL',
                          f"Cannot parse date: {date_str}",
                          column=column_name)
        return None

    def standardize_dates(self):
        """Parse posting/invoice dates and check them against the fiscal period.

        Adds ``posting_date``, ``invoice_date``, ``fiscal_year`` and
        ``fiscal_month``.  Unparseable dates become missing values and are
        logged; a posting month that differs from the fiscal-period month is
        logged as ``FISCAL_PERIOD_MISMATCH``.
        """
        df = self.standardized_df
        txn_ids = df['transaction_id']

        # All posting dates are parsed before any invoice date, which keeps
        # the anomaly log in a stable, column-by-column order.
        for raw_col, parsed_col in (('posting_date_raw', 'posting_date'),
                                    ('invoice_date_raw', 'invoice_date')):
            parsed = [self._parse_date(value, txn_id, raw_col)
                      for value, txn_id in zip(df[raw_col], txn_ids)]
            # A list-built Series also works for an empty frame, where a
            # row-wise DataFrame.apply would not return a Series.
            df[parsed_col] = pd.Series(parsed, index=df.index)

        # Fiscal period is expected to look like "YYYY-MM".
        df['fiscal_year'] = df['fiscal_period'].str[:4]
        df['fiscal_month'] = df['fiscal_period'].str[-2:]

        for txn_id, posted, period, month_text in zip(
                txn_ids, df['posting_date'], df['fiscal_period'], df['fiscal_month']):
            if pd.isna(posted) or pd.isna(month_text):
                continue
            try:
                fiscal_month = int(month_text)
            except (TypeError, ValueError):
                # A malformed period must not abort the whole run.
                self._log_anomaly(txn_id, 'INVALID_FISCAL_PERIOD', 'HIGH',
                                  f"Cannot read month from fiscal period: {period}",
                                  fiscal_period=period)
                continue

            posting_month = posted.month
            if fiscal_month and posting_month != fiscal_month:
                self._log_anomaly(
                    txn_id, 'FISCAL_PERIOD_MISMATCH', 'HIGH',
                    f"Posting date month ({posting_month}) != fiscal period month ({fiscal_month})",
                    posting_date=posted,
                    fiscal_period=period,
                )

        invalid_dates = sum(df['posting_date'].isna())
        print(f"   ‚úì Dates standardized. Invalid dates: {invalid_dates}")
        return self

    def _parse_amount(self, amt_str, txn_id):
        """Return ``amt_str`` as a float, or ``None`` (logged) if it is not numeric.

        Accepts ``$`` signs, thousands separators and accounting-style
        negatives written in parentheses.  Missing values return ``None``
        without being logged.
        """
        if pd.isna(amt_str):
            return None

        cleaned = str(amt_str).replace('$', '').replace(',', '').strip()
        if cleaned.startswith('(') and cleaned.endswith(')'):
            cleaned = '-' + cleaned[1:-1]

        try:
            return float(cleaned)
        except ValueError:
            self._log_anomaly(txn_id, 'INVALID_AMOUNT', 'HIGH',
                              f"Cannot parse amount: {amt_str}")
            return None

    def clean_amounts(self):
        """Convert ``amount_raw`` to a float ``amount`` column and flag negatives."""
        df = self.standardized_df

        parsed = [self._parse_amount(raw, txn_id)
                  for raw, txn_id in zip(df['amount_raw'], df['transaction_id'])]
        # Forcing float64 turns None into NaN so the comparison below is
        # always numeric, even when no amount could be parsed.
        df['amount'] = pd.Series(parsed, index=df.index, dtype='float64')

        df['amount_is_negative'] = df['amount'] < 0
        negatives = df[df['amount_is_negative']]
        for txn_id, amount in zip(negatives['transaction_id'], negatives['amount']):
            self._log_anomaly(txn_id, 'NEGATIVE_AMOUNT', 'MEDIUM',
                              f"Negative amount: {amount}",
                              amount=amount)

        print(f"   ‚úì Amounts cleaned. Negative amounts: {df['amount_is_negative'].sum()}")
        return self

    def detect_embedded_exceptions(self):
        """Flag suspicious narratives and placeholder vendor names.

        Adds a ``narrative_lower`` helper column (empty string where the
        narrative is missing).
        """
        df = self.standardized_df

        # Element-wise conversion also copes with a narrative column that is
        # entirely empty, which the .str accessor would reject.
        df['narrative_lower'] = df['narrative'].map(
            lambda text: '' if pd.isna(text) else str(text).lower()
        )

        narrative_hits = 0
        for txn_id, narrative, lowered, vendor in zip(
                df['transaction_id'], df['narrative'],
                df['narrative_lower'], df['vendor_name_raw']):
            if any(keyword in lowered for keyword in self._EXCEPTION_KEYWORDS):
                narrative_hits += 1
                self._log_anomaly(
                    txn_id, 'NARRATIVE_SUGGESTS_EXCEPTION', 'MEDIUM',
                    f"Narrative contains exception keywords: {narrative}",
                    narrative=narrative,
                )

            if vendor in self._PLACEHOLDER_VENDORS:
                self._log_anomaly(txn_id, 'PLACEHOLDER_VENDOR', 'HIGH',
                                  f"Placeholder vendor name: {vendor}",
                                  vendor=vendor)

        print(f"   ‚úì Embedded exceptions detected: {narrative_hits}")
        return self

    def add_metadata(self):
        """Add processing timestamp, source file, quality score and anomaly counts.

        ``data_quality_score`` is one file-wide value:
        ``100 - anomalies per row * 100``, floored at 0 so that a file with
        more anomalies than rows does not report a negative score.
        """
        df = self.standardized_df
        row_count = len(df)

        df['processing_timestamp'] = datetime.now()
        df['source_file'] = self.source_file

        if row_count > 0:
            score = max(0.0, 100 - (len(self.anomaly_log) / row_count * 100))
        else:
            score = 100
        df['data_quality_score'] = score

        # Tally once instead of rescanning the whole log for every row.
        per_transaction = {}
        for anomaly in self.anomaly_log:
            txn_id = anomaly.get('transaction_id')
            if pd.isna(txn_id):
                continue  # cannot be attributed to a row
            per_transaction[txn_id] = per_transaction.get(txn_id, 0) + 1

        df['anomaly_count'] = [
            0 if pd.isna(txn_id) else per_transaction.get(txn_id, 0)
            for txn_id in df['transaction_id']
        ]
        return self

    def save_output(self):
        """Write the standardized data and the anomaly log to disk.

        Returns:
            ``(standardized_df, anomaly_log)``
        """
        os.makedirs(Config.OUTPUT_PATH, exist_ok=True)
        os.makedirs(Config.REPORTS_PATH, exist_ok=True)

        standardized_path = os.path.join(Config.OUTPUT_PATH, "GL_Standardized.csv")
        anomalies_path = os.path.join(Config.REPORTS_PATH, "Input_Anomalies_Detected.csv")

        output_cols = ['transaction_id', 'posting_date_raw', 'posting_date', 'invoice_date_raw',
                       'invoice_date', 'fiscal_period', 'fiscal_year', 'fiscal_month',
                       'entity_code', 'account_code_raw', 'cost_center_raw', 'vendor_name_raw',
                       'invoice_number', 'po_number', 'currency_code', 'amount_raw', 'amount',
                       'amount_is_negative', 'tax_code', 'narrative', 'source_system',
                       'processing_timestamp', 'data_quality_score', 'anomaly_count']

        # The export may lack some optional columns; write what is there.
        available_cols = [col for col in output_cols if col in self.standardized_df.columns]
        self.standardized_df[available_cols].to_csv(standardized_path, index=False)

        # Always rewrite the anomaly file, even when empty: otherwise a file
        # left over from an earlier run would be merged into later logs.
        if self.anomaly_log:
            anomalies_df = pd.DataFrame(self.anomaly_log)
        else:
            anomalies_df = pd.DataFrame(columns=list(self._ANOMALY_COLUMNS))
        anomalies_df.to_csv(anomalies_path, index=False)

        print(f"   üíæ Saved {len(self.standardized_df)} rows to {standardized_path}")
        print(f"   üíæ Saved {len(self.anomaly_log)} anomalies to {anomalies_path}")

        return self.standardized_df, self.anomaly_log

    def run(self, filepath):
        """Execute all T001 steps on ``filepath``.

        Returns:
            ``(standardized_df, anomaly_log)`` as produced by ``save_output``.
        """
        print("\n" + "="*60)
        print("üöÄ T001: Wrangling Raw GL Data")
        print("="*60)

        self.load_raw_data(filepath)
        self.standardize_column_names()
        self.standardize_dates()
        self.clean_amounts()
        self.detect_embedded_exceptions()
        self.add_metadata()
        df, anomalies = self.save_output()

        print(f"\n‚úÖ T001 Complete. Processed {len(df)} rows, found {len(anomalies)} anomalies.")
        return df, anomalies


# ============================================================================
# T002: MAP ENTITIES AND ACCOUNTS
# ============================================================================

class T002_EntityAccountMapper:
    """Task 2: Resolve entity codes and account codes against master data.

    Works on a private copy of the frame it is given.  Each ``map_*`` step
    adds a ``*_valid`` flag and a ``*_mapped`` column (the raw value where
    valid, ``None`` otherwise) and records failures in
    ``self.mapping_anomalies``.

    Codes are compared as trimmed text, so ``'5000'`` in the GL matches
    ``5000`` in a master file that pandas happened to read as integers.
    """

    def __init__(self, working_df):
        self.df = working_df.copy()
        self.entity_master = None
        self.account_master = None
        self.cost_center_master = None
        self.mapping_anomalies = []

    @staticmethod
    def _normalise_codes(values):
        """Return ``values`` as an object Series of trimmed strings.

        Missing values become ``None``; whole-number floats (an integer
        column that contained blanks) lose their ``.0`` suffix.
        """
        def as_code(value):
            if pd.isna(value):
                return None
            if isinstance(value, float) and value.is_integer():
                return str(int(value))
            return str(value).strip()

        return pd.Series(values).map(as_code).astype(object)

    def load_master_data(self):
        """Load the entity, account and cost-centre masters.

        A master that cannot be read is replaced by a small built-in
        default so the pipeline can continue unattended.
        """
        print("\nüìÇ T002: Loading master data...")

        # OSError covers a missing/unreadable file; ValueError covers
        # pandas' empty-file and parse errors.
        unreadable = (OSError, ValueError)

        try:
            self.entity_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_Entity.csv")
            print(f"   Loaded {len(self.entity_master)} entities")
        except unreadable:
            print("   ‚ö†Ô∏è Entity master not found, creating default")
            self.entity_master = pd.DataFrame({'entity_code': ['AUS01']})

        try:
            self.account_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_COA.csv")
            print(f"   Loaded {len(self.account_master)} accounts")
        except unreadable:
            print("   ‚ö†Ô∏è Account master not found, creating default")
            self.account_master = pd.DataFrame({'account_code': [f"{i:04d}" for i in range(5000, 5029)]})

        try:
            self.cost_center_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_CostCenters.csv")
            print(f"   Loaded {len(self.cost_center_master)} cost centers")
        except unreadable:
            print("   ‚ö†Ô∏è Cost center master not found")
            self.cost_center_master = pd.DataFrame({'cost_center': ['CC' + str(i).zfill(4) for i in range(1000, 1010)]})

        return self

    def _valid_codes(self, master, column, default=()):
        """Return the normalised, non-missing codes in ``master[column]``.

        Falls back to ``default`` when the master has no such column.
        """
        if column not in master.columns:
            return set(default)
        return set(self._normalise_codes(master[column]).dropna())

    def map_entities(self):
        """Flag entity codes that are not in the entity master."""
        valid_entities = self._valid_codes(self.entity_master, 'entity_code', default=['AUS01'])
        entity_codes = self._normalise_codes(self.df['entity_code'])

        self.df['entity_valid'] = entity_codes.isin(valid_entities)
        self.df['entity_code_mapped'] = np.where(
            self.df['entity_valid'],
            self.df['entity_code'],
            None
        )

        rejected = self.df[~self.df['entity_valid']]
        for txn_id, entity in zip(rejected['transaction_id'], rejected['entity_code']):
            self.mapping_anomalies.append({
                'transaction_id': txn_id,
                'anomaly_type': 'INVALID_ENTITY',
                'severity': 'CRITICAL',
                'description': f"Entity code '{entity}' not in master",
                'original_value': entity
            })

        print(f"   ‚úì Entities mapped. Invalid: {(~self.df['entity_valid']).sum()}")
        return self

    def map_accounts(self):
        """Flag account codes that are not in the Chart of Accounts.

        Adds ``account_description`` when the master provides one.  The
        literal code ``INVALID_ACCT`` is logged as CRITICAL, any other
        unknown code as HIGH.
        """
        valid_accounts = self._valid_codes(self.account_master, 'account_code')
        account_codes = self._normalise_codes(self.df['account_code_raw'])

        self.df['account_valid'] = account_codes.isin(valid_accounts)
        self.df['account_code_mapped'] = np.where(
            self.df['account_valid'],
            self.df['account_code_raw'],
            None
        )

        if 'account_description' in self.account_master.columns:
            # Keyed by normalised code so the lookup agrees with the
            # validity check above.
            account_desc_map = dict(zip(
                self._normalise_codes(self.account_master['account_code']),
                self.account_master['account_description']
            ))
            self.df['account_description'] = (
                account_codes.where(self.df['account_valid']).map(account_desc_map)
            )

        rejected = self.df[~self.df['account_valid']]
        for txn_id, account, amount in zip(
                rejected['transaction_id'], rejected['account_code_raw'], rejected['amount']):
            severity = 'CRITICAL' if account == 'INVALID_ACCT' else 'HIGH'
            self.mapping_anomalies.append({
                'transaction_id': txn_id,
                'anomaly_type': 'INVALID_ACCOUNT',
                'severity': severity,
                'description': f"Account code '{account}' not in Chart of Accounts",
                'original_value': account,
                'amount': amount
            })

        print(f"   ‚úì Accounts mapped. Invalid: {(~self.df['account_valid']).sum()}")
        return self

    def map_cost_centers(self):
        """Flag missing and unknown cost centres.

        A cost centre counts as missing when it is null, empty or only
        whitespace.  If the master offers no codes to check against, every
        present cost centre is accepted.
        """
        valid_centers = self._valid_codes(self.cost_center_master, 'cost_center')
        cost_centers = self._normalise_codes(self.df['cost_center_raw'])

        self.df['cost_center_present'] = cost_centers.notna() & (cost_centers != '')
        if valid_centers:
            self.df['cost_center_valid'] = cost_centers.isin(valid_centers)
        else:
            self.df['cost_center_valid'] = self.df['cost_center_present']
        self.df['cost_center_mapped'] = np.where(
            self.df['cost_center_valid'],
            self.df['cost_center_raw'],
            None
        )

        missing_mask = ~self.df['cost_center_present']
        invalid_mask = self.df['cost_center_present'] & ~self.df['cost_center_valid']

        missing = self.df[missing_mask]
        for txn_id, amount in zip(missing['transaction_id'], missing['amount']):
            self.mapping_anomalies.append({
                'transaction_id': txn_id,
                'anomaly_type': 'MISSING_COST_CENTER',
                'severity': 'MEDIUM',
                'description': "Cost center is missing",
                'amount': amount
            })

        unknown = self.df[invalid_mask]
        for txn_id, cost_center in zip(unknown['transaction_id'], unknown['cost_center_raw']):
            self.mapping_anomalies.append({
                'transaction_id': txn_id,
                'anomaly_type': 'INVALID_COST_CENTER',
                'severity': 'HIGH',
                'description': f"Cost center '{cost_center}' not in master",
                'original_value': cost_center
            })

        print(f"   ‚úì Cost centers mapped. Missing: {missing_mask.sum()}, Invalid: {invalid_mask.sum()}")
        return self

    def save_output(self):
        """Write the enriched data and start the consolidated exceptions log.

        ``Exceptions_Log.csv`` is rebuilt from the T001 anomaly file plus
        this task's anomalies.  Returns the enriched frame.
        """
        os.makedirs(Config.OUTPUT_PATH, exist_ok=True)
        os.makedirs(Config.REPORTS_PATH, exist_ok=True)

        input_anomalies_path = os.path.join(Config.REPORTS_PATH, "Input_Anomalies_Detected.csv")
        exceptions_path = os.path.join(Config.REPORTS_PATH, "Exceptions_Log.csv")
        mapped_path = os.path.join(Config.OUTPUT_PATH, "GL_WithMappings.csv")

        try:
            existing_anomalies = pd.read_csv(input_anomalies_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No earlier anomalies (or a zero-byte file): start from nothing.
            existing_anomalies = pd.DataFrame()

        all_anomalies = pd.concat([
            existing_anomalies,
            pd.DataFrame(self.mapping_anomalies)
        ], ignore_index=True)
        all_anomalies.to_csv(exceptions_path, index=False)

        self.df.to_csv(mapped_path, index=False)

        print(f"   üíæ Saved to {mapped_path}")
        print(f"   üíæ Updated exceptions log with {len(self.mapping_anomalies)} new anomalies")

        return self.df

    def run(self):
        """Execute all T002 steps and return the enriched frame."""
        print("\n" + "="*60)
        print("üöÄ T002: Mapping Entities and Accounts")
        print("="*60)

        self.load_master_data()
        self.map_entities()
        self.map_accounts()
        self.map_cost_centers()
        df = self.save_output()

        print(f"\n‚úÖ T002 Complete. Mapped {len(df)} transactions.")
        return df


# ============================================================================
# T003: RESOLVE VENDOR NAMES
# ============================================================================

class T003_VendorResolver:
    """Task 3: Map vendor aliases to canonical vendor names.

    Adds ``vendor_canonical`` and ``vendor_resolution_status`` to a private
    copy of the frame.  Status is one of ``CANONICAL`` (already the master
    spelling), ``MAPPED`` (found via alias or case-insensitive match),
    ``FUZZY_MATCHED`` (substring match), ``UNMAPPED`` or ``MISSING``.
    """

    def __init__(self, working_df):
        self.df = working_df.copy()
        self.vendor_master = None
        self.alias_map = None
        self.vendor_anomalies = []

    def load_vendor_data(self):
        """Load the vendor master and alias map, with defaults if unreadable."""
        print("\nüìÇ T003: Loading vendor data...")

        # OSError covers a missing/unreadable file; ValueError covers
        # pandas' empty-file and parse errors.
        unreadable = (OSError, ValueError)

        try:
            self.vendor_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_Vendors.csv")
            print(f"   Loaded {len(self.vendor_master)} canonical vendors")
        except unreadable:
            print("   ‚ö†Ô∏è Vendor master not found, creating default")
            self.vendor_master = pd.DataFrame({'canonical_vendor': ['Unknown']})

        try:
            self.alias_map = pd.read_csv(f"{Config.MASTER_DATA_PATH}Vendor_Alias_Map.csv")
            print(f"   Loaded {len(self.alias_map)} alias mappings")
        except unreadable:
            print("   ‚ö†Ô∏è Alias map not found")
            self.alias_map = pd.DataFrame({'alias': [], 'canonical_vendor': []})

        return self

    def build_alias_dict(self):
        """Return a ``{lower-cased name: canonical vendor}`` lookup.

        Canonical names are added last, so a canonical name always maps to
        itself even if the alias file says otherwise.  Rows with a blank
        alias or blank canonical name are skipped.
        """
        alias_dict = {}

        if self.alias_map is not None and len(self.alias_map) > 0:
            for alias, canonical in zip(self.alias_map['alias'],
                                        self.alias_map['canonical_vendor']):
                if pd.isna(alias) or pd.isna(canonical):
                    continue
                key = str(alias).strip().lower()
                if key:
                    alias_dict[key] = canonical

        if self.vendor_master is not None and 'canonical_vendor' in self.vendor_master.columns:
            for vendor in self.vendor_master['canonical_vendor']:
                if pd.isna(vendor):
                    continue
                # Strip so the key matches the stripped lookup in resolve_vendors.
                key = str(vendor).strip().lower()
                if key:
                    alias_dict[key] = vendor

        return alias_dict

    def resolve_vendors(self):
        """Resolve every ``vendor_name_raw`` and log the ones that fail."""
        alias_dict = self.build_alias_dict()

        canonical_values = []
        if 'canonical_vendor' in self.vendor_master.columns:
            canonical_values = [
                vendor for vendor in self.vendor_master['canonical_vendor']
                if not pd.isna(vendor) and str(vendor).strip()
            ]
        exact_lookup = {str(vendor).strip(): vendor for vendor in canonical_values}
        # Lower-cased once here rather than once per transaction.
        fuzzy_candidates = [(vendor, str(vendor).strip().lower()) for vendor in canonical_values]

        def resolve(vendor_raw):
            if pd.isna(vendor_raw):
                return None, 'MISSING'

            vendor_clean = str(vendor_raw).strip()
            if not vendor_clean:
                # Blank after trimming.  Must stop here: an empty string is
                # a substring of everything and would "match" any vendor.
                return None, 'MISSING'

            # Exact canonical spelling is checked first; the alias lookup
            # below also contains every canonical name and would otherwise
            # report these rows as MAPPED.
            if vendor_clean in exact_lookup:
                return exact_lookup[vendor_clean], 'CANONICAL'

            vendor_lower = vendor_clean.lower()
            if vendor_lower in alias_dict:
                return alias_dict[vendor_lower], 'MAPPED'

            # NOTE(review): plain substring matching is loose -- a short raw
            # name can match an unrelated vendor.  First match wins.
            for canonical, canonical_lower in fuzzy_candidates:
                if canonical_lower in vendor_lower or vendor_lower in canonical_lower:
                    return canonical, 'FUZZY_MATCHED'

            return None, 'UNMAPPED'

        results = [resolve(vendor) for vendor in self.df['vendor_name_raw']]
        self.df['vendor_canonical'] = [canonical for canonical, _ in results]
        self.df['vendor_resolution_status'] = [status for _, status in results]

        for txn_id, vendor_raw, status, amount in zip(
                self.df['transaction_id'], self.df['vendor_name_raw'],
                self.df['vendor_resolution_status'], self.df['amount']):
            if status == 'MISSING':
                self.vendor_anomalies.append({
                    'transaction_id': txn_id,
                    'anomaly_type': 'MISSING_VENDOR',
                    'severity': 'HIGH',
                    'description': 'Vendor name is missing',
                    'amount': amount
                })
            elif status == 'UNMAPPED':
                self.vendor_anomalies.append({
                    'transaction_id': txn_id,
                    'anomaly_type': 'UNMAPPED_VENDOR',
                    'severity': 'HIGH',
                    'description': f"Vendor '{vendor_raw}' not found in alias map",
                    'original_value': vendor_raw,
                    'amount': amount
                })

        statuses = self.df['vendor_resolution_status']
        mapped_count = statuses.isin(['MAPPED', 'CANONICAL', 'FUZZY_MATCHED']).sum()
        unmapped_count = (statuses == 'UNMAPPED').sum()
        missing_count = (statuses == 'MISSING').sum()

        print(f"   ‚úì Vendors resolved. Mapped: {mapped_count}, Unmapped: {unmapped_count}, Missing: {missing_count}")
        return self

    def save_output(self):
        """Append this task's anomalies to the exceptions log and save the data.

        Returns the vendor-resolved frame.
        """
        os.makedirs(Config.OUTPUT_PATH, exist_ok=True)
        os.makedirs(Config.REPORTS_PATH, exist_ok=True)

        exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"
        new_exceptions = pd.DataFrame(self.vendor_anomalies)
        try:
            existing = pd.read_csv(exceptions_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No log yet (or a zero-byte one): this task's anomalies start it.
            all_exceptions = new_exceptions
        else:
            all_exceptions = pd.concat([existing, new_exceptions], ignore_index=True)

        all_exceptions.to_csv(exceptions_path, index=False)

        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_VendorsResolved.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_VendorsResolved.csv")

        return self.df

    def run(self):
        """Execute all T003 steps and return the vendor-resolved frame."""
        print("\n" + "="*60)
        print("üöÄ T003: Resolving Vendor Names")
        print("="*60)

        self.load_vendor_data()
        self.resolve_vendors()
        df = self.save_output()

        print(f"\n‚úÖ T003 Complete. Processed {len(df)} transactions.")
        return df


# ============================================================================
# T004: APPLY FX CONVERSION
# ============================================================================

class T004_FXConverter:
    """Task 4: Convert all transactions to AUD.

    Rates are looked up by ``"<fiscal_period>_<currency_code>"`` and applied
    as ``amount * rate``.  Adds ``fx_key``, ``fx_rate``, ``amount_aud``
    (float, NaN where no rate was available) and ``conversion_status``
    (``DOMESTIC`` / ``CONVERTED`` / ``FAILED``).
    """

    # Used only when the FX rate file cannot be loaded.
    # NOTE(review): hard-coded approximations; amounts converted with them
    # are not flagged anywhere -- confirm this is acceptable for a close.
    _FALLBACK_RATES = {'AUD': 1.0, 'USD': 1.5, 'GBP': 1.9, 'NZD': 0.95, 'EUR': 1.6}

    def __init__(self, working_df):
        self.df = working_df.copy()
        self.fx_rates = None
        self.fx_anomalies = []

    def load_fx_rates(self):
        """Load the FX rate table, or build one from the fallback rates.

        The fallback covers every period/currency pair present in the data;
        currencies without a fallback rate get a missing rate.
        """
        print("\nüìÇ T004: Loading FX rates...")

        try:
            self.fx_rates = pd.read_csv(f"{Config.REFERENCE_PATH}FX_Rates.csv")
            print(f"   Loaded {len(self.fx_rates)} FX rates")

            # Period must be text so it can be joined into the lookup key.
            self.fx_rates['period'] = self.fx_rates['period'].astype(str)

        except (OSError, ValueError, KeyError) as e:
            # Missing/unreadable file, unparsable content, or no 'period' column.
            print(f"   ‚ö†Ô∏è FX rates not found: {e}")

            periods = self.df['fiscal_period'].unique()
            currencies = self.df['currency_code'].unique()

            rates_data = [
                {'period': period, 'currency': currency,
                 'rate': self._FALLBACK_RATES.get(currency)}
                for period in periods
                for currency in currencies
            ]

            # Explicit columns keep the table usable when there are no rows.
            self.fx_rates = pd.DataFrame(rates_data, columns=['period', 'currency', 'rate'])
            print(f"   Created default rates for {len(self.fx_rates)} currency-period combinations")

        return self

    def convert_to_aud(self):
        """Look up a rate for every row and compute ``amount_aud``.

        AUD rows always use a rate of 1.0.  A rate that is absent from the
        table, or present but null, is logged as ``MISSING_FX_RATE`` and
        leaves ``amount_aud`` empty.
        """
        self.df['fx_key'] = self.df['fiscal_period'] + '_' + self.df['currency_code']
        self.fx_rates['fx_key'] = self.fx_rates['period'].astype(str) + '_' + self.fx_rates['currency']

        rate_dict = dict(zip(
            self.fx_rates['fx_key'],
            pd.to_numeric(self.fx_rates['rate'], errors='coerce')
        ))

        rates = []
        for txn_id, currency, period, key, amount in zip(
                self.df['transaction_id'], self.df['currency_code'],
                self.df['fiscal_period'], self.df['fx_key'], self.df['amount']):
            if currency == 'AUD':
                rates.append(1.0)
                continue

            rate = rate_dict.get(key)
            if rate is None or pd.isna(rate):
                # A null rate is as unusable as an absent one; log both.
                self.fx_anomalies.append({
                    'transaction_id': txn_id,
                    'anomaly_type': 'MISSING_FX_RATE',
                    'severity': 'CRITICAL',
                    'description': f"No FX rate found for {currency} in period {period}",
                    'currency': currency,
                    'period': period,
                    'amount': amount
                })
                rates.append(None)
            else:
                rates.append(rate)

        # Float columns (NaN for "no value") stay usable in numeric
        # comparisons; an object column holding None would raise there.
        self.df['fx_rate'] = pd.Series(rates, index=self.df.index, dtype='float64')
        self.df['amount_aud'] = (
            pd.to_numeric(self.df['amount'], errors='coerce') * self.df['fx_rate']
        )

        self.df['conversion_status'] = np.where(
            self.df['currency_code'] == 'AUD', 'DOMESTIC',
            np.where(self.df['fx_rate'].notna(), 'CONVERTED', 'FAILED')
        )

        converted = (self.df['conversion_status'] == 'CONVERTED').sum()
        failed = (self.df['conversion_status'] == 'FAILED').sum()
        domestic = (self.df['conversion_status'] == 'DOMESTIC').sum()

        print(f"   ‚úì FX conversion complete. Domestic: {domestic}, Converted: {converted}, Failed: {failed}")
        return self

    def save_output(self):
        """Append this task's anomalies to the exceptions log and save the data.

        Returns the converted frame.
        """
        os.makedirs(Config.OUTPUT_PATH, exist_ok=True)
        os.makedirs(Config.REPORTS_PATH, exist_ok=True)

        exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"
        new_exceptions = pd.DataFrame(self.fx_anomalies)
        try:
            existing = pd.read_csv(exceptions_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No log yet (or a zero-byte one): this task's anomalies start it.
            all_exceptions = new_exceptions
        else:
            all_exceptions = pd.concat([existing, new_exceptions], ignore_index=True)

        all_exceptions.to_csv(exceptions_path, index=False)

        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_Converted.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_Converted.csv")

        return self.df

    def run(self):
        """Execute all T004 steps and return the converted frame."""
        print("\n" + "="*60)
        print("üöÄ T004: Applying FX Conversion")
        print("="*60)

        self.load_fx_rates()
        self.convert_to_aud()
        df = self.save_output()

        print(f"\n‚úÖ T004 Complete. Processed {len(df)} transactions.")
        return df


# ============================================================================
# T005: DETECT EXCEPTIONS
# ============================================================================

class T005_ExceptionDetector:
    """Task 5: Run exception rules and flag violations"""
    
    def __init__(self, working_df):
        self.df = working_df.copy()
        self.rulebook = None
        self.exception_results = []
        
    def load_rulebook(self):
        """Load the exception rulebook, or fall back to ten built-in rules.

        The rulebook is read from ``Exception_Rulebook.csv`` under
        ``Config.REFERENCE_PATH``. If reading (or the follow-up ``rule_id``
        check) raises anything, a default rulebook EX001-EX010 is built
        instead. Afterwards the columns ``rule_id``, ``rule_name``,
        ``severity`` and ``description`` are guaranteed to exist; missing ones
        are filled with generated defaults.

        Returns:
            self, for chaining.
        """
        print("\nüìÇ T005: Loading exception rulebook...")

        try:
            self.rulebook = pd.read_csv(f"{Config.REFERENCE_PATH}Exception_Rulebook.csv")
            print(f"   Loaded {len(self.rulebook)} exception rules")

            # A rulebook without identifiers gets sequential EX001, EX002, ...
            if 'rule_id' not in self.rulebook.columns:
                self.rulebook['rule_id'] = [
                    f'EX{n:03d}' for n in range(1, len(self.rulebook) + 1)
                ]
                print(f"   Added default rule_id column")

        except Exception as err:
            print(f"   ‚ö†Ô∏è Rulebook not found or error loading: {err}")
            # Built-in fallback: (rule_id, rule_name, severity, logic, description)
            default_rules = [
                ('EX001', 'Missing PO Number', 'HIGH',
                 'po_number is None or po_number == ""',
                 'Transaction has no purchase order number'),
                ('EX002', 'Missing Cost Center', 'MEDIUM',
                 'cost_center_mapped is None',
                 'Transaction has no cost center allocation'),
                ('EX003', 'Invalid Account', 'CRITICAL',
                 'account_code_mapped is None',
                 'Account code not in Chart of Accounts'),
                ('EX004', 'High Value Transaction', 'MEDIUM',
                 f'amount_aud > {Config.HIGH_VALUE_THRESHOLD}',
                 f'Transaction exceeds ${Config.HIGH_VALUE_THRESHOLD:,}'),
                ('EX005', 'Negative Amount', 'MEDIUM',
                 'amount_is_negative == True',
                 'Transaction has negative amount'),
                ('EX006', 'Unmapped Vendor', 'HIGH',
                 'vendor_resolution_status == "UNMAPPED"',
                 'Vendor not found in master data'),
                ('EX007', 'Future Dated Transaction', 'HIGH',
                 'posting_date > current_date and fiscal_period == current_period',
                 'Transaction date is in future but in current period'),
                ('EX008', 'Invalid Date', 'CRITICAL',
                 'posting_date is None',
                 'Posting date is invalid or missing'),
                ('EX009', 'Missing Tax Code', 'MEDIUM',
                 'tax_code is None or tax_code == ""',
                 'Tax code is missing'),
                ('EX010', 'Extreme Outlier', 'MEDIUM',
                 'is_outlier == True',
                 'Amount is significantly outside normal range'),
            ]
            self.rulebook = pd.DataFrame(
                default_rules,
                columns=['rule_id', 'rule_name', 'severity', 'logic', 'description'],
            )
            print(f"   Created {len(self.rulebook)} default exception rules")

        # Default builders, evaluated lazily and in this order so that the
        # 'description' default can see a freshly created 'rule_name' column.
        fillers = {
            'rule_id': lambda: [f'EX{n:03d}' for n in range(1, len(self.rulebook) + 1)],
            'rule_name': lambda: [f'Rule {n}' for n in range(1, len(self.rulebook) + 1)],
            'severity': lambda: 'MEDIUM',
            'description': lambda: self.rulebook.get('rule_name', 'No description'),
        }
        for column, make_default in fillers.items():
            if column not in self.rulebook.columns:
                self.rulebook[column] = make_default()

        print(f"   Ready with {len(self.rulebook)} rules")
        return self
    
    def detect_outliers(self):
        """Flag amounts that lie far from the mean of their account.

        For each ``account_code_mapped`` group the mean, standard deviation
        and count of ``amount_aud`` are attached to ``self.df`` as
        ``mean_amount``, ``std_amount`` and ``txn_count``. A row gets
        ``is_outlier = True`` when its amount differs from the group mean by
        more than ``Config.EXTREME_OUTLIER_MULTIPLIER`` standard deviations.
        Rows with a missing amount, and rows whose group has a zero or NaN
        standard deviation, are never flagged.

        The method can be called repeatedly: statistics columns from an
        earlier call are replaced instead of being duplicated by the merge.

        Returns:
            self, for chaining.
        """
        stat_columns = ['mean_amount', 'std_amount', 'txn_count']

        # Remove stale statistics so the merge below cannot produce
        # '_x'/'_y' suffixed columns (which would break the lookups after it).
        self.df = self.df.drop(columns=stat_columns, errors='ignore')

        # amount_aud can arrive as object dtype (failed conversions are stored
        # as None); aggregate on a numeric view without altering the column.
        amounts = pd.to_numeric(self.df['amount_aud'], errors='coerce')
        account_stats = (
            self.df[['account_code_mapped']]
            .assign(amount_aud=amounts)
            .groupby('account_code_mapped')['amount_aud']
            .agg(['mean', 'std', 'count'])
            .reset_index()
        )
        account_stats.columns = ['account_code_mapped'] + stat_columns

        # Merge stats back (left join keeps every transaction, in order)
        self.df = self.df.merge(account_stats, on='account_code_mapped', how='left')

        # The merge resets the index, so rebuild the numeric view to stay aligned.
        amounts = pd.to_numeric(self.df['amount_aud'], errors='coerce')
        distance = (amounts - self.df['mean_amount']).abs()
        limit = Config.EXTREME_OUTLIER_MULTIPLIER * self.df['std_amount']

        self.df['is_outlier'] = (
            (self.df['std_amount'] > 0) & amounts.notna() & (distance > limit)
        )

        print(f"   ‚úì Outlier detection complete. Found {self.df['is_outlier'].sum()} outliers")
        return self
    
    def detect_temporal_anomalies(self):
        """Add posting-time columns and mark postings made at unusual hours.

        Adds to ``self.df``:
          * ``posting_hour`` - hour of ``posting_date``
          * ``posting_day`` - weekday name of ``posting_date``
          * ``posting_weekend`` - True when the weekday index is 5 or 6
          * ``suspicious_hour`` - True when the hour is known and is at or
            after ``Config.SUSPICIOUS_HOUR_START`` or at or before
            ``Config.SUSPICIOUS_HOUR_END``

        Requires ``posting_date`` to be a datetime column (uses ``.dt``).

        Returns:
            self, for chaining.
        """
        when = self.df['posting_date'].dt

        self.df['posting_hour'] = when.hour
        self.df['posting_day'] = when.day_name()
        self.df['posting_weekend'] = when.dayofweek.isin([5, 6])

        hour = self.df['posting_hour']
        hour_known = hour.notna()
        # The window is expressed as two open-ended ranges joined with OR,
        # i.e. it is meant to wrap around midnight (start later than end).
        late = hour >= Config.SUSPICIOUS_HOUR_START
        early = hour <= Config.SUSPICIOUS_HOUR_END
        self.df['suspicious_hour'] = hour_known & (late | early)

        return self
    
    def apply_rules(self):
        """Evaluate the rulebook against every transaction and record the hits.

        Each rulebook row is matched by ``rule_id`` to one of the built-in
        checks EX001-EX010; rules without a built-in check are skipped (the
        rulebook's ``logic`` text is not evaluated). Every hit appends a dict
        with ``transaction_id``, ``rule_id``, ``rule_name``, ``severity``,
        ``description``, ``amount``, ``vendor`` and ``account`` to
        ``self.exception_results``. A check that raises for a row is reported
        on stdout and that row is skipped for that rule.

        Afterwards every row flagged ``amount_is_negative`` is guaranteed an
        EX005 entry, even when the rulebook does not contain EX005.

        Returns:
            self, for chaining.
        """
        import calendar  # local import: only needed for the month length

        # Last calendar day of the configured period. A fixed "28" is only
        # correct for a non-leap February.
        last_day = calendar.monthrange(Config.CURRENT_YEAR, Config.CURRENT_MONTH)[1]
        current_date = datetime(Config.CURRENT_YEAR, Config.CURRENT_MONTH, last_day)

        def flag_is_set(row, column):
            """Return True only for a present, non-missing, truthy flag value.

            NaN is truthy in Python, so a missing flag must be filtered out
            explicitly or it would be reported as an exception.
            """
            value = row.get(column, False)
            return bool(value) if pd.notna(value) else False

        def exception_record(row, rule_id, rule_name, severity, description):
            """Build the result dict for one (transaction, rule) hit."""
            return {
                'transaction_id': row['transaction_id'],
                'rule_id': rule_id,
                'rule_name': rule_name,
                'severity': severity,
                'description': description,
                'amount': row.get('amount_aud', 0),
                'vendor': row.get('vendor_name_raw', ''),
                'account': row.get('account_code_raw', '')
            }

        # Built-in check for each supported rule id
        rule_functions = {
            'EX001': lambda row: pd.isna(row['po_number']) or row['po_number'] == '',
            'EX002': lambda row: pd.isna(row['cost_center_mapped']),
            'EX003': lambda row: pd.isna(row['account_code_mapped']),
            'EX004': lambda row: row['amount_aud'] > Config.HIGH_VALUE_THRESHOLD if pd.notna(row['amount_aud']) else False,
            'EX005': lambda row: flag_is_set(row, 'amount_is_negative'),
            'EX006': lambda row: row.get('vendor_resolution_status') == 'UNMAPPED',
            'EX007': lambda row: (pd.notna(row['posting_date']) and
                                  row['posting_date'] > current_date and
                                  row['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD),
            'EX008': lambda row: pd.isna(row['posting_date']),
            'EX009': lambda row: pd.isna(row['tax_code']) or row['tax_code'] == '',
            'EX010': lambda row: flag_is_set(row, 'is_outlier'),
        }

        for _, rule in self.rulebook.iterrows():
            rule_id = rule['rule_id']
            rule_name = rule.get('rule_name', f'Rule {rule_id}')
            severity = rule.get('severity', 'MEDIUM')
            description = rule.get('description', rule_name)

            rule_func = rule_functions.get(rule_id)
            if rule_func is None:
                # Skip rules we don't have logic for
                continue

            for _, row in self.df.iterrows():
                try:
                    if rule_func(row):
                        self.exception_results.append(
                            exception_record(row, rule_id, rule_name, severity, description)
                        )
                except Exception as e:
                    # Log rule application error but continue
                    print(f"   ‚ö†Ô∏è Error applying rule {rule_id} to transaction {row['transaction_id']}: {e}")
                    continue

        # Make sure negative amounts flagged by earlier steps are reported even
        # if EX005 is absent from the rulebook. Track ids in a set so the
        # duplicate check is O(1) instead of rescanning all results per row.
        already_reported = {
            e['transaction_id'] for e in self.exception_results if e['rule_id'] == 'EX005'
        }
        for _, row in self.df.iterrows():
            if not flag_is_set(row, 'amount_is_negative'):
                continue
            txn_id = row['transaction_id']
            if txn_id in already_reported:
                continue
            already_reported.add(txn_id)
            self.exception_results.append(
                exception_record(
                    row, 'EX005', 'Negative Amount', 'MEDIUM',
                    'Transaction has negative amount'
                )
            )

        print(f"   ‚úì Applied rules, found {len(self.exception_results)} exceptions")
        return self
    
    def save_output(self):
        """Flag transactions with exceptions and write all exception outputs.

        Adds two columns to ``self.df``: ``has_exception`` (bool) and
        ``exception_rules`` (the rule ids hit by the transaction joined with
        ``;``, empty string when none). Then writes:

          * ``GL_WithExceptions.csv`` under ``Config.OUTPUT_PATH``
          * ``Exceptions_Detailed.csv`` under ``Config.REPORTS_PATH`` (only
            when there is at least one exception)
          * ``Exceptions_Log.csv`` under ``Config.REPORTS_PATH``, appending to
            the existing log when there is one

        Returns:
            tuple: ``(self.df, self.exception_results)``.
        """
        # Collect the rule ids per transaction, in detection order
        rules_by_txn = {}
        for e in self.exception_results:
            rules_by_txn.setdefault(e['transaction_id'], []).append(e['rule_id'])

        self.df['has_exception'] = self.df['transaction_id'].isin(set(rules_by_txn))
        self.df['exception_rules'] = self.df['transaction_id'].map(
            lambda txn: ';'.join(rules_by_txn.get(txn, []))
        )

        # Save data with flags
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_WithExceptions.csv", index=False)

        # Save exception log
        if self.exception_results:
            exceptions_df = pd.DataFrame(self.exception_results)
            exceptions_df.to_csv(f"{Config.REPORTS_PATH}Exceptions_Detailed.csv", index=False)

        # Update master exceptions log
        master_exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        # The master log uses a reduced schema with the rule id as anomaly type
        new_exceptions = pd.DataFrame([
            {
                'transaction_id': e['transaction_id'],
                'anomaly_type': e['rule_id'],
                'severity': e['severity'],
                'description': e['description'],
                'amount': e.get('amount', 0)
            }
            for e in self.exception_results
        ])

        try:
            existing = pd.read_csv(master_exceptions_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # No log yet, or an earlier step saved a log without any columns
            # (an empty DataFrame), which read_csv cannot parse.
            all_exceptions = new_exceptions
        else:
            all_exceptions = pd.concat([existing, new_exceptions], ignore_index=True)

        all_exceptions.to_csv(master_exceptions_path, index=False)

        print(f"   üíæ Saved exception data")

        return self.df, self.exception_results
    
    def run(self):
        """Run the T005 stage and print an exception count per severity.

        Steps, in order: load the rulebook, detect outliers, detect temporal
        anomalies, apply the rules, save the outputs.

        Returns:
            tuple: the ``(df, exceptions)`` pair returned by ``save_output``.
        """
        banner = "=" * 60
        print("\n" + banner)
        print("üöÄ T005: Detecting Exceptions")
        print(banner)

        self.load_rulebook()
        self.detect_outliers()
        self.detect_temporal_anomalies()
        self.apply_rules()
        df, exceptions = self.save_output()

        if not exceptions:
            print(f"\n‚úÖ T005 Complete. No exceptions found.")
            return df, exceptions

        # Tally by severity, keeping first-seen order for the printout
        tally = {}
        for item in exceptions:
            label = item.get('severity', 'UNKNOWN')
            tally.setdefault(label, 0)
            tally[label] += 1

        print(f"\n‚úÖ T005 Complete. Exceptions by severity:")
        for label, total in tally.items():
            print(f"   {label}: {total}")

        return df, exceptions

# ============================================================================
# T006: REVIEW HIGH SEVERITY EXCEPTIONS (Automated version - no human review)
# ============================================================================

class T006_ExceptionReviewer:
    """Task 6: Review and categorize exceptions (automated)"""
    
    def __init__(self, df, exceptions):
        self.df = df.copy()
        self.exceptions = exceptions
        self.critical_exceptions = []
        self.high_exceptions = []
        
    def categorize_exceptions(self):
        """Split exceptions by severity"""
        for e in self.exceptions:
            if e['severity'] == 'CRITICAL':
                self.critical_exceptions.append(e)
            elif e['severity'] == 'HIGH':
                self.high_exceptions.append(e)
        
        print(f"\nüìä T006: Exception Summary")
        print(f"   Critical: {len(self.critical_exceptions)}")
        print(f"   High: {len(self.high_exceptions)}")
        print(f"   Medium/Low: {len(self.exceptions) - len(self.critical_exceptions) - len(self.high_exceptions)}")
        
        return self
    
    def create_review_package(self):
        """Summarise critical exceptions and write the review files.

        Critical exceptions are grouped by ``anomaly_type`` (falling back to
        ``rule_id``, then ``'UNKNOWN'``). Per group the count, the summed
        amount and up to three example transactions are recorded. The summary
        is written as ``Exception_Review_Summary.json`` and
        ``Exception_Review_Summary.txt`` under ``Config.REPORTS_PATH``; both
        are marked as auto-approved.

        Amounts that are missing, ``None``, NaN or otherwise not a finite
        number count as 0, so a single unconverted transaction can neither
        crash the summation nor turn the group total into NaN.

        Returns:
            dict: the review data that was written to the JSON file.
        """
        import json
        import math

        def amount_of(exception):
            """Return the exception's amount as a finite float, else 0.0."""
            try:
                value = float(exception.get('amount', 0))
            except (TypeError, ValueError):
                return 0.0
            return value if math.isfinite(value) else 0.0

        # Group critical exceptions by type
        critical_summary = {}
        for e in self.critical_exceptions:
            e_type = e.get('anomaly_type', e.get('rule_id', 'UNKNOWN'))
            group = critical_summary.setdefault(
                e_type, {'count': 0, 'total_amount': 0, 'examples': []}
            )
            amount = amount_of(e)

            group['count'] += 1
            group['total_amount'] += amount

            # Keep only the first three hits per type as examples
            if len(group['examples']) < 3:
                group['examples'].append({
                    'transaction_id': e['transaction_id'],
                    'amount': amount,
                    'description': e.get('description', '')
                })

        review_data = {
            'timestamp': datetime.now(),
            'total_critical': len(self.critical_exceptions),
            'total_high': len(self.high_exceptions),
            'critical_summary': critical_summary,
            'auto_approved': True,
            'note': 'Automated processing - no human review required'
        }

        # default=str serialises the datetime (and any other non-JSON value)
        with open(f"{Config.REPORTS_PATH}Exception_Review_Summary.json", 'w', encoding='utf-8') as f:
            json.dump(review_data, f, indent=2, default=str)

        # Explicit UTF-8: the text contains non-ASCII characters, which the
        # platform default encoding is not guaranteed to support
        with open(f"{Config.REPORTS_PATH}Exception_Review_Summary.txt", 'w', encoding='utf-8') as f:
            f.write("EXCEPTION REVIEW SUMMARY (Automated)\n")
            f.write("="*50 + "\n\n")
            f.write(f"Review Date: {datetime.now()}\n")
            f.write(f"Status: AUTO-APPROVED\n\n")

            f.write(f"CRITICAL EXCEPTIONS: {len(self.critical_exceptions)}\n")
            for e_type, data in critical_summary.items():
                f.write(f"  ‚Ä¢ {e_type}: {data['count']} occurrences, ${data['total_amount']:,.2f}\n")

            f.write(f"\nHIGH EXCEPTIONS: {len(self.high_exceptions)}\n")

        print(f"   üíæ Saved review summary to {Config.REPORTS_PATH}Exception_Review_Summary.txt")

        return review_data
    
    def run(self):
        """Run the T006 stage: bucket the exceptions, then write the review files.

        No pause for human review takes place.

        Returns:
            tuple: ``(self.df, review_data)`` where ``review_data`` is the
            dict returned by ``create_review_package``.
        """
        banner = "=" * 60
        print("\n" + banner)
        print("üöÄ T006: Reviewing High Severity Exceptions")
        print(banner)
        print("   ‚ö° Automated mode - no human review required")

        self.categorize_exceptions()
        summary = self.create_review_package()

        print(f"\n‚úÖ T006 Complete. Proceeding with pipeline.")

        return self.df, summary


# ============================================================================
# T007: COMPUTE BUDGET VARIANCE
# ============================================================================

class T007_BudgetVariance:
    """Task 7: Calculate actual vs budget variance"""
    
    def __init__(self, df):
        self.df = df.copy()
        self.budget_data = None
        self.variance_results = {}
        
    def load_budget(self):
        """Load the budget file and normalise it to three standard columns.

        Reads ``Budget_2026.csv`` under ``Config.BUDGET_PATH``, lower-cases
        and strips the column names, and then makes sure the columns
        ``period``, ``account_code`` and ``budget_amount`` exist: the first
        matching alias is renamed, otherwise a substitute is generated (the
        current fiscal period, sequential codes from 5000, random amounts).
        If anything in that process raises, a random sample budget for
        accounts 5000-5028 in the current fiscal period is created instead.
        ``period`` is always converted to string at the end.

        NOTE(review): the generated amounts come from an unseeded
        ``np.random.randint`` and are placeholders, not real budget figures.

        Returns:
            self, for chaining.
        """
        print("\nüìÇ T007: Loading budget data...")

        def first_present(candidates):
            """Return the first candidate that is a budget column, else None."""
            return next(
                (name for name in candidates if name in self.budget_data.columns),
                None,
            )

        try:
            self.budget_data = pd.read_csv(f"{Config.BUDGET_PATH}Budget_2026.csv")
            print(f"   Loaded budget data with {len(self.budget_data)} rows")

            # Standardize column names
            self.budget_data.columns = [name.lower().strip() for name in self.budget_data.columns]
            print(f"   Budget columns: {list(self.budget_data.columns)}")

            # Period column
            period_col = first_present(
                ('period', 'month', 'fiscal_period', 'fiscal_month', 'reporting_period')
            )
            if period_col is None:
                # Without a period column every row is taken as current period
                print(f"   ‚ö†Ô∏è No period column found, assuming all rows are for {Config.CURRENT_FISCAL_PERIOD}")
                self.budget_data['period'] = Config.CURRENT_FISCAL_PERIOD
            else:
                self.budget_data.rename(columns={period_col: 'period'}, inplace=True)
                print(f"   Using '{period_col}' as period column")

            # Account column
            account_col = first_present(
                ('account', 'account_code', 'account_id', 'gl_account', 'coa')
            )
            if account_col is None:
                print(f"   ‚ö†Ô∏è No account column found, creating default account codes")
                first_code = 5000
                self.budget_data['account_code'] = [
                    f"{code:04d}"
                    for code in range(first_code, first_code + len(self.budget_data))
                ]
            else:
                self.budget_data.rename(columns={account_col: 'account_code'}, inplace=True)
                print(f"   Using '{account_col}' as account column")

            # Budget amount column
            budget_col = first_present(
                ('budget', 'budget_amount', 'amount', 'budget_amt', 'planned')
            )
            if budget_col is None:
                print(f"   ‚ö†Ô∏è No budget amount column found, creating sample data")
                self.budget_data['budget_amount'] = np.random.randint(50000, 200000, size=len(self.budget_data))
            else:
                self.budget_data.rename(columns={budget_col: 'budget_amount'}, inplace=True)
                print(f"   Using '{budget_col}' as budget amount column")

        except Exception as err:
            print(f"   ‚ö†Ô∏è Budget data not found or error loading: {err}")
            # Fall back to a generated sample budget, one row per account
            sample_rows = [
                {
                    'account_code': f"{code:04d}",
                    'period': Config.CURRENT_FISCAL_PERIOD,
                    'budget_amount': np.random.randint(50000, 200000)
                }
                for code in range(5000, 5029)
            ]
            self.budget_data = pd.DataFrame(sample_rows)
            print(f"   Created sample budget for {len(self.budget_data)} accounts")

        # Ensure period is string
        self.budget_data['period'] = self.budget_data['period'].astype(str)

        return self
    
    def calculate_variance(self):
        """Compute actual-versus-budget figures for the current fiscal period.

        Only rows of ``self.df`` in ``Config.CURRENT_FISCAL_PERIOD`` with a
        numeric ``amount_aud`` are used. The results are stored in
        ``self.variance_results``:

          * ``by_account`` - actual, budget, variance and variance % per
            account (outer join, so budget-only and actual-only accounts both
            appear; variance % is NaN when the budget is not positive)
          * ``by_cost_center`` - actual amount and transaction count per
            cost center
          * ``suspense_amount`` - total of rows without a mapped account
          * ``future_dated_amount`` - total of rows posted after the last day
            of the configured month
          * overall totals, transaction count and exception count

        When the budget has no rows for the current period, the whole budget
        table is used and a warning is printed.

        Returns:
            self, for chaining.
        """
        import calendar  # local import: only needed for the month length

        period = Config.CURRENT_FISCAL_PERIOD

        # amount_aud can arrive as object dtype (failed conversions stored as
        # None); work on a numeric view so sums are plain float arithmetic.
        amounts = pd.to_numeric(self.df['amount_aud'], errors='coerce')
        in_period = (self.df['fiscal_period'] == period) & amounts.notna()
        current_period_df = self.df[in_period].copy()
        current_period_df['amount_aud'] = amounts[in_period].to_numpy()

        print(f"   Processing {len(current_period_df)} transactions for {Config.CURRENT_FISCAL_PERIOD}")

        totals_spec = {'amount_aud': 'sum', 'transaction_id': 'count'}
        totals_names = {'amount_aud': 'actual_amount', 'transaction_id': 'transaction_count'}

        # 1. Variance by Account (valid accounts only)
        account_actuals = (
            current_period_df.groupby('account_code_mapped')
            .agg(totals_spec)
            .rename(columns=totals_names)
            .reset_index()
        )
        account_actuals = account_actuals[account_actuals['account_code_mapped'].notna()]

        # Budget rows for the current period
        period_budget = self.budget_data[self.budget_data['period'] == period].copy()

        if period_budget.empty:
            print(f"   ‚ö†Ô∏è No budget found for period {Config.CURRENT_FISCAL_PERIOD}, using all budget data")
            period_budget = self.budget_data.copy()

        # Collapse the budget to one row per account before joining. With
        # several budget rows per account the merge would repeat the account's
        # full actual amount on each of them and distort every variance.
        budget_by_account = (
            period_budget.groupby('account_code', as_index=False, dropna=False)['budget_amount']
            .sum()
        )

        account_variance = account_actuals.merge(
            budget_by_account[['account_code', 'budget_amount']],
            left_on='account_code_mapped',
            right_on='account_code',
            how='outer'
        )

        account_variance['budget_amount'] = account_variance['budget_amount'].fillna(0)
        account_variance['actual_amount'] = account_variance['actual_amount'].fillna(0)
        account_variance['variance'] = account_variance['actual_amount'] - account_variance['budget_amount']
        account_variance['variance_pct'] = np.where(
            account_variance['budget_amount'] > 0,
            (account_variance['variance'] / account_variance['budget_amount']) * 100,
            np.nan
        )

        # 2. Actuals by Cost Center
        cc_actuals = (
            current_period_df.groupby('cost_center_mapped')
            .agg(totals_spec)
            .rename(columns=totals_names)
            .reset_index()
        )
        cc_actuals = cc_actuals[cc_actuals['cost_center_mapped'].notna()]

        # 3. Suspense amounts (invalid accounts)
        suspense_amount = current_period_df.loc[
            current_period_df['account_code_mapped'].isna(), 'amount_aud'
        ].sum()

        # 4. Future dated amounts: posted after the last calendar day of the
        # configured month (a fixed "28" is only right for non-leap Februaries)
        last_day = calendar.monthrange(Config.CURRENT_YEAR, Config.CURRENT_MONTH)[1]
        current_date = datetime(Config.CURRENT_YEAR, Config.CURRENT_MONTH, last_day)
        future_amount = current_period_df.loc[
            current_period_df['posting_date'] > current_date, 'amount_aud'
        ].sum()

        # 5. Total actual and budget
        total_actual = current_period_df['amount_aud'].sum()
        total_budget = period_budget['budget_amount'].sum() if not period_budget.empty else 0
        total_variance = total_actual - total_budget
        total_variance_pct = (total_variance / total_budget * 100) if total_budget > 0 else np.nan

        # Store results
        self.variance_results = {
            'by_account': account_variance.to_dict('records'),
            'by_cost_center': cc_actuals.to_dict('records'),
            'suspense_amount': suspense_amount,
            'future_dated_amount': future_amount,
            'total_actual': total_actual,
            'total_budget': total_budget,
            'total_variance': total_variance,
            'total_variance_pct': total_variance_pct,
            'transaction_count': len(current_period_df),
            'exception_count': current_period_df['has_exception'].sum() if 'has_exception' in current_period_df.columns else 0
        }

        print(f"\n   Variance Summary:")
        print(f"   Total Actual: ${total_actual:,.2f}")
        print(f"   Total Budget: ${total_budget:,.2f}")
        print(f"   Variance: ${total_variance:,.2f} ({total_variance_pct:.1f}%)")
        print(f"   Suspense (invalid accounts): ${suspense_amount:,.2f}")
        print(f"   Future dated: ${future_amount:,.2f}")

        return self
    
    def save_output(self):
        """Write the variance results as CSV reports under ``Config.REPORTS_PATH``.

        Files written:
          * ``Budget_Variance_By_Account.csv`` - only when there are account rows
          * ``Budget_Variance_By_CostCenter.csv`` - only when there are cost
            center rows
          * ``Budget_Variance_Summary.csv`` - always; one metric/value row per
            headline figure

        Returns:
            dict: ``self.variance_results``.
        """
        results = self.variance_results

        # Detailed breakdowns; an empty record list produces no file
        account_rows = results['by_account']
        if account_rows:
            account_report = pd.DataFrame(account_rows)
            account_report.to_csv(
                f"{Config.REPORTS_PATH}Budget_Variance_By_Account.csv", index=False
            )

        cost_center_rows = results['by_cost_center']
        if cost_center_rows:
            cost_center_report = pd.DataFrame(cost_center_rows)
            cost_center_report.to_csv(
                f"{Config.REPORTS_PATH}Budget_Variance_By_CostCenter.csv", index=False
            )

        # Headline figures: (label in the report, key in the results dict)
        headline = (
            ('Total Actual', 'total_actual'),
            ('Total Budget', 'total_budget'),
            ('Variance', 'total_variance'),
            ('Variance %', 'total_variance_pct'),
            ('Suspense Amount', 'suspense_amount'),
            ('Future Dated Amount', 'future_dated_amount'),
            ('Transaction Count', 'transaction_count'),
            ('Exception Count', 'exception_count'),
        )
        summary_df = pd.DataFrame(
            [{'metric': label, 'value': results[key]} for label, key in headline]
        )
        summary_df.to_csv(f"{Config.REPORTS_PATH}Budget_Variance_Summary.csv", index=False)

        print(f"   üíæ Saved variance reports to {Config.REPORTS_PATH}")

        return self.variance_results
    
    def run(self):
        """Run the T007 stage: load the budget, compute variance, save reports.

        Returns:
            dict: the variance results returned by ``save_output``.
        """
        banner = "=" * 60
        print("\n" + banner)
        print("üöÄ T007: Computing Budget Variance")
        print(banner)

        self.load_budget()
        self.calculate_variance()
        variance = self.save_output()

        print(f"\n‚úÖ T007 Complete.")

        return variance


# ============================================================================
# T008: GENERATE CLOSE PACK REPORT
# ============================================================================

class T008_ClosePackReport:
    """Task 8: Create comprehensive month-end close report.

    ``generate_report`` filters the ledger to ``Config.CURRENT_FISCAL_PERIOD``
    and fills ``self.report_data`` with one entry per report section;
    ``save_report`` writes those sections under ``Config.REPORTS_PATH`` as CSV
    files plus one plain-text close pack.
    """

    # Month abbreviations for the period label embedded in the text report's
    # file name (kept literal so the name never depends on the locale).
    _MONTH_ABBR = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    # Tabular sections written by save_report: (report_data key, file name).
    _CSV_SECTIONS = (
        ('top_vendors', 'Close_Pack_Top_Vendors.csv'),
        ('account_summary', 'Close_Pack_Account_Summary.csv'),
        ('cost_center_summary', 'Close_Pack_Cost_Center_Summary.csv'),
        ('currency_summary', 'Close_Pack_Currency_Summary.csv'),
        ('source_summary', 'Close_Pack_Source_Summary.csv'),
    )

    def __init__(self, df, variance_results, exceptions):
        """Store the inputs for the report.

        Args:
            df: Ledger DataFrame; it is copied, so the caller's frame is not
                modified. Must contain a ``fiscal_period`` column.
            variance_results: Mapping read with ``.get`` for the keys
                ``total_actual``, ``total_budget``, ``total_variance`` and
                ``total_variance_pct``.
            exceptions: Sequence of dicts; ``severity``, ``anomaly_type``,
                ``rule_id`` and ``amount`` are read when present.
        """
        self.df = df.copy()
        self.variance = variance_results
        self.exceptions = exceptions
        self.report_data = {}

    @classmethod
    def _period_label(cls, period):
        """Turn a 'YYYY-MM' period into 'MonYYYY' (e.g. '2026-02' -> 'Feb2026').

        Any value that does not parse that way is returned as ``str(period)``.
        """
        try:
            year_text, month_text = str(period).split("-")
            year, month = int(year_text), int(month_text)
        except ValueError:
            return str(period)
        if not 1 <= month <= 12:
            return str(period)
        return f"{cls._MONTH_ABBR[month - 1]}{year}"

    @staticmethod
    def _summarize(frame, keys, sum_columns=('amount_aud',), sort_by='amount_aud'):
        """Group ``frame`` by ``keys``; sum ``sum_columns`` and count transactions.

        The result has the key column(s), the summed columns and a
        ``transaction_id`` count, sorted descending by ``sort_by`` unless
        ``sort_by`` is None.
        """
        agg_spec = {column: 'sum' for column in sum_columns}
        agg_spec['transaction_id'] = 'count'
        summary = frame.groupby(keys).agg(agg_spec).reset_index()
        if sort_by is not None:
            summary = summary.sort_values(sort_by, ascending=False)
        return summary

    def _top_exceptions(self, limit=10):
        """Return up to ``limit`` exception types ranked by total amount."""
        totals = {}
        for exc in self.exceptions:
            kind = exc.get('anomaly_type', exc.get('rule_id', 'UNKNOWN'))
            bucket = totals.setdefault(kind, {'count': 0, 'total_amount': 0})
            amount = exc.get('amount', 0)
            # A None/NaN amount would raise or poison both the running total
            # and the sort below, so it is counted but contributes nothing.
            if amount is None or pd.isna(amount):
                amount = 0
            bucket['count'] += 1
            bucket['total_amount'] += amount

        ranked = sorted(
            ({'type': kind, **stats} for kind, stats in totals.items()),
            key=lambda row: row['total_amount'],
            reverse=True
        )
        return ranked[:limit]

    def generate_report(self):
        """Build every report section into ``self.report_data``.

        Sections whose grouping column is absent from the data become empty
        lists instead of raising.

        Returns:
            self, so calls can be chained.
        """
        print("\nüìù T008: Generating Close Pack Report")

        # Filter to current period
        current_df = self.df[self.df['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD].copy()
        columns = current_df.columns

        # 1. Executive Summary
        has_quality_score = 'data_quality_score' in columns and len(current_df) > 0
        self.report_data['executive_summary'] = {
            'period': Config.CURRENT_FISCAL_PERIOD,
            'generated_date': datetime.now(),
            'total_transactions': len(current_df),
            'total_spend': self.variance.get('total_actual', 0),
            'total_budget': self.variance.get('total_budget', 0),
            'variance': self.variance.get('total_variance', 0),
            'variance_pct': self.variance.get('total_variance_pct', 0),
            'exception_count': len(self.exceptions),
            'critical_exception_count': sum(
                1 for e in self.exceptions if e.get('severity') == 'CRITICAL'
            ),
            # 85 is the placeholder used when no score column is available.
            'data_quality_score': current_df['data_quality_score'].iloc[0] if has_quality_score else 85
        }

        # 2. Top exceptions
        self.report_data['top_exceptions'] = self._top_exceptions()

        # 3. Top vendors by spend (canonical name preferred, raw name as fallback)
        if 'vendor_canonical' in columns:
            vendor_spend = self._summarize(current_df, 'vendor_canonical').head(20)
        elif 'vendor_name_raw' in columns:
            vendor_spend = self._summarize(current_df, 'vendor_name_raw').head(20)
            vendor_spend = vendor_spend.rename(columns={'vendor_name_raw': 'vendor_canonical'})
        else:
            vendor_spend = pd.DataFrame(columns=['vendor_canonical', 'amount_aud', 'transaction_id'])
        self.report_data['top_vendors'] = vendor_spend.to_dict('records')

        # 4. Account summary (description is optional)
        if 'account_code_mapped' not in columns:
            account_summary = pd.DataFrame(
                columns=['account_code_mapped', 'account_description', 'amount_aud', 'transaction_id']
            )
        elif 'account_description' in columns:
            account_summary = self._summarize(current_df, ['account_code_mapped', 'account_description'])
        else:
            account_summary = self._summarize(current_df, 'account_code_mapped')
            # Add placeholder description
            account_summary['account_description'] = 'Unknown'
        self.report_data['account_summary'] = account_summary.to_dict('records')

        # 5. Cost center summary
        if 'cost_center_mapped' in columns:
            cc_summary = self._summarize(current_df, 'cost_center_mapped')
        else:
            cc_summary = pd.DataFrame(columns=['cost_center_mapped', 'amount_aud', 'transaction_id'])
        self.report_data['cost_center_summary'] = cc_summary.to_dict('records')

        # 6. Currency exposure
        if 'currency_code' in columns and 'amount_aud' in columns:
            # Only sum the original-currency amount when that column exists;
            # the text report already treats it as optional.
            sum_columns = [c for c in ('amount', 'amount_aud') if c in columns]
            currency_summary = self._summarize(current_df, 'currency_code', sum_columns, sort_by=None)
        else:
            currency_summary = pd.DataFrame(columns=['currency_code', 'amount', 'amount_aud', 'transaction_id'])
        self.report_data['currency_summary'] = currency_summary.to_dict('records')

        # 7. Source system breakdown
        if 'source_system' in columns:
            source_summary = self._summarize(current_df, 'source_system')
        else:
            source_summary = pd.DataFrame(columns=['source_system', 'amount_aud', 'transaction_id'])
        self.report_data['source_summary'] = source_summary.to_dict('records')

        print(f"   Generated report with {len(self.report_data)} sections")
        return self

    def _write_text_report(self, path):
        """Write the plain-text close pack to ``path`` (UTF-8)."""
        summary = self.report_data['executive_summary']

        # Explicit encoding: the report contains non-ASCII bullet characters.
        with open(path, 'w', encoding='utf-8') as f:
            f.write("="*80 + "\n")
            f.write(f"MONTH-END CLOSE PACK - {Config.CURRENT_FISCAL_PERIOD}\n")
            f.write("="*80 + "\n\n")

            # Executive Summary
            f.write("EXECUTIVE SUMMARY\n")
            f.write("-"*40 + "\n")
            f.write(f"Period: {summary['period']}\n")
            f.write(f"Generated: {summary['generated_date']}\n")
            f.write(f"Total Transactions: {summary['total_transactions']:,}\n")
            f.write(f"Total Spend: ${summary['total_spend']:,.2f}\n")
            f.write(f"Total Budget: ${summary['total_budget']:,.2f}\n")
            f.write(f"Variance: ${summary['variance']:,.2f} ")
            f.write(f"({summary['variance_pct']:.1f}%)\n")
            f.write(f"Data Quality Score: {summary['data_quality_score']:.1f}/100\n\n")

            # Top Exceptions
            f.write("TOP EXCEPTIONS BY VALUE\n")
            f.write("-"*40 + "\n")
            for e in self.report_data['top_exceptions'][:5]:
                f.write(f"‚Ä¢ {e['type']}: {e['count']} occurrences, ${e['total_amount']:,.2f}\n")
            f.write("\n")

            # Top Vendors
            f.write("TOP 10 VENDORS\n")
            f.write("-"*40 + "\n")
            for v in self.report_data['top_vendors'][:10]:
                vendor_name = v.get('vendor_canonical', v.get('vendor_name_raw', 'Unknown'))
                f.write(f"‚Ä¢ {vendor_name}: ${v['amount_aud']:,.2f} ({v['transaction_id']} txns)\n")
            f.write("\n")

            # Currency Exposure
            f.write("CURRENCY EXPOSURE\n")
            f.write("-"*40 + "\n")
            for c in self.report_data['currency_summary']:
                f.write(f"‚Ä¢ {c['currency_code']}: {c['transaction_id']} txns, ")
                f.write(f"Original: ${c.get('amount', 0):,.2f}, AUD: ${c['amount_aud']:,.2f}\n")

            # Source Systems
            if self.report_data.get('source_summary'):
                f.write("\nSOURCE SYSTEMS\n")
                f.write("-"*40 + "\n")
                for s in self.report_data['source_summary'][:5]:
                    f.write(f"‚Ä¢ {s['source_system']}: ${s['amount_aud']:,.2f} ({s['transaction_id']} txns)\n")

    def save_report(self):
        """Write all report sections to ``Config.REPORTS_PATH``.

        The executive summary CSV is always written; each other CSV is
        written only when its section is non-empty. The text report's file
        name carries the configured fiscal period (``..._Feb2026.txt`` for
        ``2026-02``).

        Returns:
            The ``report_data`` dictionary built by ``generate_report``.
        """
        # Save as CSV (tabular)
        pd.DataFrame([self.report_data['executive_summary']]).to_csv(
            f"{Config.REPORTS_PATH}Close_Pack_Executive_Summary.csv", index=False
        )

        for section, file_name in self._CSV_SECTIONS:
            records = self.report_data.get(section)
            if records:
                pd.DataFrame(records).to_csv(f"{Config.REPORTS_PATH}{file_name}", index=False)

        # Save as text report
        period_label = self._period_label(Config.CURRENT_FISCAL_PERIOD)
        self._write_text_report(f"{Config.REPORTS_PATH}MonthEnd_Close_Pack_{period_label}.txt")

        print(f"   üíæ Saved reports to {Config.REPORTS_PATH}")

        return self.report_data

    def run(self):
        """Execute T008 steps: build the report, save it, return its data."""
        print("\n" + "="*60)
        print("üöÄ T008: Generating Close Pack Report")
        print("="*60)

        self.generate_report()
        report = self.save_report()

        print(f"\n‚úÖ T008 Complete. Report saved.")

        return report


# ============================================================================
# T009: GENERATE EXECUTIVE NARRATIVE (Rule-based, no LLM)
# ============================================================================

class T009_ExecutiveNarrative:
    """Task 9: Create natural language summary (rule-based, no LLM).

    ``generate_narrative`` assembles a fixed sequence of text sections from
    the variance results, the close-pack report data and the exception list;
    ``save_narrative`` writes the text under ``Config.REPORTS_PATH``.
    """

    # Month abbreviations for the period label embedded in the output file
    # name (kept literal so the name never depends on the locale).
    _MONTH_ABBR = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    def __init__(self, variance_results, report_data, exceptions):
        """Store the inputs for the narrative.

        Args:
            variance_results: Mapping with ``total_actual``, ``total_budget``,
                ``total_variance`` and ``total_variance_pct``; ``by_account``,
                ``suspense_amount`` and ``future_dated_amount`` are optional.
            report_data: Mapping whose optional ``currency_summary`` entry is
                a list of dicts with ``currency_code`` and ``amount_aud``.
            exceptions: Sequence of dicts; ``severity``, ``anomaly_type`` and
                ``rule_id`` are read when present.
        """
        self.variance = variance_results
        self.report = report_data
        self.exceptions = exceptions
        self.narrative = ""

    @classmethod
    def _period_label(cls, period):
        """Turn a 'YYYY-MM' period into 'MonYYYY' (e.g. '2026-02' -> 'Feb2026').

        Any value that does not parse that way is returned as ``str(period)``.
        """
        try:
            year_text, month_text = str(period).split("-")
            year, month = int(year_text), int(month_text)
        except ValueError:
            return str(period)
        if not 1 <= month <= 12:
            return str(period)
        return f"{cls._MONTH_ABBR[month - 1]}{year}"

    @staticmethod
    def _describe_variance(variance_pct):
        """Map a variance percentage to its wording relative to budget.

        Within +/-2% is "in line with"; beyond +/-10% is "significantly",
        anything in between is "moderately".
        """
        if abs(variance_pct) < 2:
            return "in line with"
        direction = "above" if variance_pct > 0 else "below"
        magnitude = "significantly" if abs(variance_pct) > 10 else "moderately"
        return f"{magnitude} {direction}"

    @staticmethod
    def _account_label(account):
        """Return the account identifier of a ``by_account`` record.

        Accepts either ``account_code`` or ``account_code_mapped`` so the
        label does not silently degrade to 'Unknown' when the record uses
        the mapped key.
        """
        return account.get('account_code', account.get('account_code_mapped', 'Unknown'))

    def _count_severity(self, level):
        """Count exceptions whose ``severity`` equals ``level``."""
        return sum(1 for e in self.exceptions if e.get('severity') == level)

    def generate_narrative(self):
        """Generate the narrative text using templates and rules.

        Returns:
            self, with the joined text stored in ``self.narrative``.
        """
        print("\nüìù T009: Generating Executive Narrative")

        section_rule = "-"*40
        total_actual = self.variance['total_actual']
        lines = []

        # Header
        lines.append("="*80)
        lines.append(f"EXECUTIVE NARRATIVE - {Config.CURRENT_FISCAL_PERIOD}")
        lines.append("="*80)
        lines.append("")

        # Financial Summary
        lines.append("FINANCIAL SUMMARY")
        lines.append(section_rule)

        variance_pct = self.variance['total_variance_pct']
        variance_desc = self._describe_variance(variance_pct)

        lines.append(f"Total spend for {Config.CURRENT_FISCAL_PERIOD} was ${total_actual:,.2f}, "
                    f"which is {variance_desc} budget of ${self.variance['total_budget']:,.2f}. "
                    f"The variance is ${abs(self.variance['total_variance']):,.2f} ({variance_pct:.1f}%).")
        lines.append("")

        # Key Drivers
        lines.append("KEY VARIANCE DRIVERS")
        lines.append(section_rule)

        # Three largest over-budget and three largest under-budget accounts
        account_variances = self.variance.get('by_account') or []
        top_pos = sorted([a for a in account_variances if a.get('variance', 0) > 0],
                         key=lambda x: x['variance'], reverse=True)[:3]
        top_neg = sorted([a for a in account_variances if a.get('variance', 0) < 0],
                         key=lambda x: x['variance'])[:3]

        if top_pos:
            lines.append("Positive variances (over budget):")
            for a in top_pos:
                lines.append(f"  ‚Ä¢ {self._account_label(a)}: +${a['variance']:,.2f} ({a['variance_pct']:.1f}%)")

        if top_neg:
            lines.append("Negative variances (under budget):")
            for a in top_neg:
                lines.append(f"  ‚Ä¢ {self._account_label(a)}: ${a['variance']:,.2f} ({a['variance_pct']:.1f}%)")
        lines.append("")

        # Exception Summary
        lines.append("EXCEPTION SUMMARY")
        lines.append(section_rule)

        critical_count = self._count_severity('CRITICAL')
        high_count = self._count_severity('HIGH')
        medium_count = self._count_severity('MEDIUM')

        lines.append(f"Total exceptions: {len(self.exceptions)}")
        lines.append(f"  ‚Ä¢ Critical: {critical_count}")
        lines.append(f"  ‚Ä¢ High: {high_count}")
        lines.append(f"  ‚Ä¢ Medium: {medium_count}")

        # Top exception types
        exception_types = {}
        for e in self.exceptions:
            e_type = e.get('anomaly_type', e.get('rule_id', 'UNKNOWN'))
            exception_types[e_type] = exception_types.get(e_type, 0) + 1

        top_types = sorted(exception_types.items(), key=lambda x: x[1], reverse=True)[:3]
        if top_types:
            lines.append("\nMost common exceptions:")
            for e_type, count in top_types:
                lines.append(f"  ‚Ä¢ {e_type}: {count} occurrences")
        lines.append("")

        # Data Quality Impact
        lines.append("DATA QUALITY IMPACT")
        lines.append(section_rule)

        suspense_amount = self.variance.get('suspense_amount', 0)
        future_amount = self.variance.get('future_dated_amount', 0)
        total_impact = suspense_amount + future_amount
        # Percentages are reported as 0 when there is no positive spend to divide by.
        impact_pct = (total_impact / total_actual * 100) if total_actual > 0 else 0

        lines.append(f"Transactions with data quality issues: ${total_impact:,.2f} ({impact_pct:.1f}% of total)")
        if suspense_amount > 0:
            lines.append(f"  ‚Ä¢ Invalid accounts (in suspense): ${suspense_amount:,.2f}")
        if future_amount > 0:
            lines.append(f"  ‚Ä¢ Future-dated transactions: ${future_amount:,.2f}")
        lines.append("")

        # Currency Impact
        lines.append("CURRENCY EXPOSURE")
        lines.append(section_rule)

        currency_rows = self.report.get('currency_summary') or []
        non_aud_total = sum(c['amount_aud'] for c in currency_rows
                           if c['currency_code'] != 'AUD')
        non_aud_pct = (non_aud_total / total_actual * 100) if total_actual > 0 else 0

        lines.append(f"Foreign currency exposure: ${non_aud_total:,.2f} ({non_aud_pct:.1f}% of total)")

        # Non-AUD currencies with a positive AUD amount
        for c in currency_rows:
            if c['currency_code'] != 'AUD' and c['amount_aud'] > 0:
                lines.append(f"  ‚Ä¢ {c['currency_code']}: ${c['amount_aud']:,.2f}")
        lines.append("")

        # Recommendations
        lines.append("RECOMMENDATIONS")
        lines.append(section_rule)

        if suspense_amount > 10000:
            lines.append("‚Ä¢ Review and remap transactions with invalid account codes")
        if future_amount > 10000:
            lines.append("‚Ä¢ Reclassify future-dated transactions to correct period")
        if critical_count > 0:
            lines.append("‚Ä¢ Investigate critical exceptions before next close")
        if len(self.exceptions) > 100:
            lines.append("‚Ä¢ Schedule data quality workshop to address root causes")

        # Join all lines
        self.narrative = "\n".join(lines)

        print(f"   Generated {len(lines)} lines of narrative")
        return self

    def save_narrative(self):
        """Write the narrative to a UTF-8 text file and return the text.

        The file name carries the configured fiscal period
        (``Executive_Narrative_Feb2026.txt`` for ``2026-02``).
        """
        period_label = self._period_label(Config.CURRENT_FISCAL_PERIOD)
        path = f"{Config.REPORTS_PATH}Executive_Narrative_{period_label}.txt"

        # Explicit encoding: the narrative contains non-ASCII bullet characters.
        with open(path, 'w', encoding='utf-8') as f:
            f.write(self.narrative)

        print(f"   üíæ Saved narrative to {path}")

        return self.narrative

    def run(self):
        """Execute T009 steps: build the narrative, save it, return the text."""
        print("\n" + "="*60)
        print("üöÄ T009: Generating Executive Narrative")
        print("="*60)

        self.generate_narrative()
        narrative = self.save_narrative()

        print(f"\n‚úÖ T009 Complete.")

        return narrative


# ============================================================================
# T010: FORECAST NEXT PERIOD
# ============================================================================

class T010_Forecast:
    """Task 10: Generate forecast for next period based on historical trends.

    The point forecast blends a trend-based estimate (recent 3-period
    average, adjusted by recent growth and a seasonal factor) with the
    current period's actual spend, and is written under
    ``Config.REPORTS_PATH``.
    """

    # Month abbreviations for the period label embedded in the forecast file
    # name (kept literal so the name never depends on the locale).
    _MONTH_ABBR = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    def __init__(self, df, variance_results):
        """Store the inputs for the forecast.

        Args:
            df: Ledger DataFrame; stored as-is and not read by this class.
            variance_results: Mapping with ``total_actual``;
                ``transaction_count`` is needed only for the synthetic
                history fallback and ``by_account`` only for the
                account-level breakdown.
        """
        self.df = df
        self.variance = variance_results
        self.historical_data = None
        self.forecast = {}

    @classmethod
    def _period_label(cls, period):
        """Turn a 'YYYY-MM' period into 'MonYYYY' (e.g. '2026-03' -> 'Mar2026').

        Any value that does not parse that way is returned as ``str(period)``.
        """
        try:
            year_text, month_text = str(period).split("-")
            year, month = int(year_text), int(month_text)
        except ValueError:
            return str(period)
        if not 1 <= month <= 12:
            return str(period)
        return f"{cls._MONTH_ABBR[month - 1]}{year}"

    @staticmethod
    def _next_period():
        """Return the period following the configured one as 'YYYY-MM'."""
        if Config.CURRENT_MONTH < 12:
            return f"{Config.CURRENT_YEAR}-{Config.CURRENT_MONTH + 1:02d}"
        return f"{Config.CURRENT_YEAR + 1}-01"

    def load_historical(self):
        """Load historical KPI data, or synthesize it when loading fails.

        Reads ``KPI_Monthly_History.csv`` from ``Config.REFERENCE_PATH``. On
        any failure, twelve months for the year before
        ``Config.CURRENT_YEAR`` are generated by scaling the current actuals
        with a random factor between 0.8 and 1.2.

        Returns:
            self, so calls can be chained.
        """
        print("\nüìÇ T010: Loading historical data...")

        try:
            self.historical_data = pd.read_csv(f"{Config.REFERENCE_PATH}KPI_Monthly_History.csv")
        except Exception as e:
            # NOTE(review): deliberately broad best-effort fallback. The
            # synthetic history is random and unseeded, so a forecast built
            # on it is not reproducible between runs.
            print(f"   ‚ö†Ô∏è Historical data not found: {e}")
            history_year = Config.CURRENT_YEAR - 1
            months = [
                {
                    'period': f"{history_year}-{month:02d}",
                    'total_spend': self.variance['total_actual'] * (0.8 + 0.4 * np.random.random()),
                    'transaction_count': int(self.variance['transaction_count'] * (0.8 + 0.4 * np.random.random()))
                }
                for month in range(1, 13)
            ]
            self.historical_data = pd.DataFrame(months)
            print(f"   Created synthetic historical data for {len(self.historical_data)} months")
        else:
            print(f"   Loaded {len(self.historical_data)} months of historical data")

        return self

    def calculate_trends(self):
        """Compute the next-period forecast and store it in ``self.forecast``.

        Steps: sort the history by period, derive a 3-period moving average
        and period-over-period growth, compute a seasonal factor from the
        first history row whose period ends with the current month, then
        blend 70% trend-based estimate with 30% current actual (grown 5%).
        The bounds are forecast +/- 1.96 * std / sqrt(n) of historical spend.

        Returns:
            self, so calls can be chained.
        """
        # Sort by period
        self.historical_data = self.historical_data.sort_values('period')
        history = self.historical_data
        n_months = len(history)

        # Calculate moving averages
        if n_months >= 3:
            history['spend_ma_3'] = history['total_spend'].rolling(3).mean()
        else:
            history['spend_ma_3'] = history['total_spend']

        # Calculate growth rate
        if n_months >= 2:
            history['growth_rate'] = history['total_spend'].pct_change()
            avg_growth = history['growth_rate'].mean()
        else:
            avg_growth = 0.02  # Default 2% growth

        # Recent trend (last 3 months)
        recent_data = history.tail(3)
        recent_avg = recent_data['total_spend'].mean()
        recent_growth = recent_data['growth_rate'].mean() if len(recent_data) >= 2 else avg_growth

        # Seasonal adjustment (if history has a period ending in the current month).
        # NOTE(review): the factor uses the *current* month's history although
        # the forecast is for the next period - confirm this is intended.
        same_month = history[history['period'].str.endswith(f"{Config.CURRENT_MONTH:02d}")]

        # Zero recent spend would make the ratio infinite/undefined and turn
        # the whole forecast into NaN; fall back to a neutral factor instead.
        if not same_month.empty and recent_avg != 0:
            seasonal_factor = same_month['total_spend'].iloc[0] / recent_avg
        else:
            seasonal_factor = 1.0

        # Calculate forecast for next period
        next_period = self._next_period()

        # Base forecast on recent average with growth and seasonal adjustment
        base_forecast = recent_avg * (1 + recent_growth) * seasonal_factor

        # Blend current and historical (70% recent trend, 30% current month)
        current_actual = self.variance['total_actual']
        blended_forecast = 0.7 * base_forecast + 0.3 * current_actual * 1.05  # Assume 5% growth

        # Calculate confidence interval
        std_dev = history['total_spend'].std() if n_months > 1 else blended_forecast * 0.1
        # max(..., 1) keeps the divisor non-zero when the history is empty.
        margin = 1.96 * std_dev / np.sqrt(max(n_months, 1))
        lower_bound = blended_forecast - margin
        upper_bound = blended_forecast + margin

        self.forecast = {
            'next_period': next_period,
            'forecast_amount': blended_forecast,
            'lower_bound': max(0, lower_bound),
            'upper_bound': upper_bound,
            'confidence_level': 0.95,
            'method': 'Blended (70% trend, 30% current)',
            'historical_months_used': n_months,
            'avg_growth_rate': avg_growth,
            'seasonal_factor': seasonal_factor,
            'current_actual': current_actual,
            'recent_avg': recent_avg
        }

        print(f"\n   Forecast for {next_period}:")
        print(f"   Point forecast: ${self.forecast['forecast_amount']:,.2f}")
        print(f"   95% CI: (${self.forecast['lower_bound']:,.2f} - ${self.forecast['upper_bound']:,.2f})")

        return self

    def save_forecast(self):
        """Write the forecast (and an account-level split) to CSV.

        The main file is named after the forecast period
        (``Forecast_Mar2026.csv`` for ``2026-03``). When ``by_account`` is
        present and non-empty, ``Forecast_By_Account.csv`` allocates the
        forecast to accounts with a positive actual, in proportion to their
        share of ``total_actual``.

        Returns:
            The ``forecast`` dictionary built by ``calculate_trends``.
        """
        period_label = self._period_label(self.forecast.get('next_period') or self._next_period())
        forecast_path = f"{Config.REPORTS_PATH}Forecast_{period_label}.csv"

        # Save as CSV
        pd.DataFrame([self.forecast]).to_csv(forecast_path, index=False)

        # Save detailed forecast with account-level breakdown
        # (Simplified - just allocate based on current proportions)
        by_account = self.variance.get('by_account')
        if by_account:
            total_actual = self.variance.get('total_actual', 0)
            account_rows = []
            for a in by_account:
                actual = a.get('actual_amount', 0)
                # A proportion is undefined when the total is zero.
                if actual > 0 and total_actual != 0:
                    proportion = actual / total_actual
                    account_rows.append({
                        # Records may carry the code under either key.
                        'account_code': a.get('account_code_mapped', a.get('account_code', 'UNKNOWN')),
                        'current_actual': actual,
                        'forecast_proportion': proportion,
                        'forecast_amount': proportion * self.forecast['forecast_amount']
                    })

            # Explicit columns so an empty result still has a header row.
            pd.DataFrame(
                account_rows,
                columns=['account_code', 'current_actual', 'forecast_proportion', 'forecast_amount']
            ).to_csv(f"{Config.REPORTS_PATH}Forecast_By_Account.csv", index=False)

        print(f"   üíæ Saved forecast to {forecast_path}")

        return self.forecast

    def run(self):
        """Execute T010 steps: load history, forecast, save, return the forecast."""
        print("\n" + "="*60)
        print("üöÄ T010: Forecasting Next Period")
        print("="*60)

        self.load_historical()
        self.calculate_trends()
        forecast = self.save_forecast()

        print(f"\n‚úÖ T010 Complete.")

        return forecast


# ============================================================================
# MAIN PIPELINE EXECUTION
# ============================================================================

class FinancialCloseAgent:
    """Top-level driver that chains tasks T001 to T010 and keeps their outputs."""

    def __init__(self):
        # The clock starts at construction time, not when run_pipeline is called.
        self.start_time = datetime.now()
        self.results = {}

    def run_pipeline(self):
        """Run every task once, in order, feeding each the previous output.

        Intermediate frames and task results are stored in ``self.results``
        as soon as each task finishes.

        Returns:
            The ``results`` dictionary.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("üöÄ FINANCIAL CLOSE AGENT PIPELINE")
        print(f"   Started: {self.start_time}")
        print(rule + "\n")

        outputs = self.results

        # T001 - wrangle the raw export
        frame, anomalies = T001_DataWrangler().run(Config.RAW_DATA_PATH)
        outputs['df_t001'] = frame
        outputs['anomalies'] = anomalies

        # T002 - map entities and accounts
        frame = T002_EntityAccountMapper(frame).run()
        outputs['df_t002'] = frame

        # T003 - resolve vendors
        frame = T003_VendorResolver(frame).run()
        outputs['df_t003'] = frame

        # T004 - FX conversion
        frame = T004_FXConverter(frame).run()
        outputs['df_t004'] = frame

        # T005 - detect exceptions
        frame, exceptions = T005_ExceptionDetector(frame).run()
        outputs['df_t005'] = frame
        outputs['exceptions'] = exceptions

        # T006 - automated exception review
        frame, review = T006_ExceptionReviewer(frame, exceptions).run()
        outputs['df_t006'] = frame
        outputs['review'] = review

        # T007 - budget variance
        variance_results = T007_BudgetVariance(frame).run()
        outputs['variance'] = variance_results

        # T008 - close pack report
        report_data = T008_ClosePackReport(frame, variance_results, exceptions).run()
        outputs['report'] = report_data

        # T009 - executive narrative
        outputs['narrative'] = T009_ExecutiveNarrative(
            variance_results, report_data, exceptions
        ).run()

        # T010 - forecast
        outputs['forecast'] = T010_Forecast(frame, variance_results).run()

        # Completion
        finished_at = datetime.now()
        elapsed_seconds = (finished_at - self.start_time).total_seconds()

        print("\n" + rule)
        print("‚úÖ PIPELINE COMPLETE")
        print(f"   Finished: {finished_at}")
        print(f"   Duration: {elapsed_seconds:.2f} seconds")
        print(rule)

        return self.results


# ============================================================================
# EXECUTE THE PIPELINE
# ============================================================================

if __name__ == "__main__":
    # Both output locations must exist before any task writes to them.
    for path in (Config.OUTPUT_PATH, Config.REPORTS_PATH):
        os.makedirs(path, exist_ok=True)

    # Run the agent
    agent = FinancialCloseAgent()
    results = agent.run_pipeline()

    # Print final summary
    summary_rule = "=" * 80
    found_exceptions = results['exceptions']
    variance_summary = results['variance']
    critical_total = sum(1 for e in found_exceptions if e.get('severity') == 'CRITICAL')
    high_total = sum(1 for e in found_exceptions if e.get('severity') == 'HIGH')

    print("\n" + summary_rule)
    print("üìä FINAL SUMMARY")
    print(summary_rule)
    print(f"Total transactions processed: {len(results['df_t001'])}")
    print(f"Total exceptions found: {len(found_exceptions)}")
    print(f"Critical exceptions: {critical_total}")
    print(f"High exceptions: {high_total}")
    print(f"Total spend: ${variance_summary['total_actual']:,.2f}")
    print(f"Budget variance: ${variance_summary['total_variance']:,.2f} ({variance_summary['total_variance_pct']:.1f}%)")
    print(f"Suspense amount (invalid accounts): ${variance_summary['suspense_amount']:,.2f}")
    print(f"Forecast for next period: ${results['forecast']['forecast_amount']:,.2f}")
    print("\nOutput files saved to:")
    print(f"  ‚Ä¢ Working data: {Config.OUTPUT_PATH}")
    print(f"  ‚Ä¢ Reports: {Config.REPORTS_PATH}")
    print(summary_rule)


üöÄ FINANCIAL CLOSE AGENT PIPELINE
   Started: 2026-02-20 22:41:42.667251


üöÄ T001: Wrangling Raw GL Data
üìÇ T001: Loading raw GL data...
   Loaded 4080 rows
   ‚úì Column names standardized
   ‚úì Dates standardized. Invalid dates: 48
   ‚úì Amounts cleaned. Negative amounts: 96
   ‚úì Embedded exceptions detected: 0
   üíæ Saved 4080 rows to working/GL_Standardized.csv
   üíæ Saved 656 anomalies to reports/Input_Anomalies_Detected.csv

‚úÖ T001 Complete. Processed 4080 rows, found 656 anomalies.

üöÄ T002: Mapping Entities and Accounts

üìÇ T002: Loading master data...
   Loaded 1 entities
   Loaded 28 accounts
   Loaded 10 cost centers
   ‚úì Entities mapped. Invalid: 0
   ‚úì Accounts mapped. Invalid: 4080
   ‚úì Cost centers mapped. Missing: 200, Invalid: 0
   üíæ Saved to working/GL_WithMappings.csv
   üíæ Updated exceptions log with 4280 new anomalies

‚úÖ T002 Complete. Mapped 4080 transactions.

üöÄ T003: Resolving Vendor Names

üìÇ T003: Loading vendor data...


In [None]:
"""
Financial Close Agent - Complete Pipeline
Processes Raw GL Export through all 10 tasks without human intervention
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import logging
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION AND SETUP
# ============================================================================

class Config:
    """Settings shared by the pipeline tasks: file locations, period, thresholds."""

    # --- File and directory locations (directories end with "/") ---
    RAW_DATA_PATH = "Raw_GL_Export.csv"
    MASTER_DATA_PATH = "Master_Data/"
    REFERENCE_PATH = "Reference/"
    BUDGET_PATH = "Budget/"
    OUTPUT_PATH = "working/"
    REPORTS_PATH = "reports/"

    # --- Fiscal period being closed ---
    CURRENT_YEAR = 2026
    CURRENT_MONTH = 2
    # "YYYY-MM" form of the year and month above.
    CURRENT_FISCAL_PERIOD = f"{CURRENT_YEAR}-{CURRENT_MONTH:02d}"

    # --- Anomaly thresholds ---
    HIGH_VALUE_THRESHOLD = 50000
    EXTREME_OUTLIER_MULTIPLIER = 5
    # NOTE(review): presumably hours of the day (22:00 to 06:00) - confirm
    # against the code that reads them.
    SUSPICIOUS_HOUR_START = 22
    SUSPICIOUS_HOUR_END = 6

# ============================================================================
# T001: WRANGLE RAW GL DATA
# ============================================================================

class T001_DataWrangler:
    """Task 1: Parse and standardize raw GL export data"""
    
    def __init__(self):
        self.raw_df = None
        self.standardized_df = None
        self.anomaly_log = []
        
    def load_raw_data(self, filepath):
        """Load raw CSV file"""
        print("üìÇ T001: Loading raw GL data...")
        self.raw_df = pd.read_csv(filepath)
        print(f"   Loaded {len(self.raw_df)} rows")
        return self
    
    def standardize_column_names(self):
        """Convert column names to snake_case"""
        column_mapping = {
            'Txn_ID': 'transaction_id',
            'Posting_Date_Raw': 'posting_date_raw',
            'Invoice_Date_Raw': 'invoice_date_raw',
            'Fiscal_Period': 'fiscal_period',
            'Entity': 'entity_code',
            'Account_Code_Raw': 'account_code_raw',
            'Cost_Center_Raw': 'cost_center_raw',
            'Vendor_Name_Raw': 'vendor_name_raw',
            'Invoice_Number': 'invoice_number',
            'PO_Number': 'po_number',
            'Currency': 'currency_code',
            'Amount': 'amount_raw',
            'Tax_Code': 'tax_code',
            'Narrative': 'narrative',
            'Source_System': 'source_system'
        }
        self.standardized_df = self.raw_df.rename(columns=column_mapping)
        print("   ‚úì Column names standardized")
        return self
    
    def standardize_dates(self):
        """Convert all dates to consistent format YYYY-MM-DD"""
        df = self.standardized_df
        
        def parse_date(date_str, txn_id, column_name):
            if pd.isna(date_str) or date_str in ['INVALID', '99/99/9999', '32/13/2026', '2026-13-45']:
                self.anomaly_log.append({
                    'transaction_id': txn_id,
                    'anomaly_type': 'INVALID_DATE',
                    'severity': 'CRITICAL',
                    'description': f"Invalid date value: {date_str}",
                    'column': column_name
                })
                return None
            
            # Try different date formats
            formats = [
                '%d-%m-%Y', '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y',
                '%d/%m/%y', '%m/%d/%y', '%d-%m-%y', '%y-%m-%d'
            ]
            
            for fmt in formats:
                try:
                    return datetime.strptime(str(date_str), fmt)
                except:
                    continue
            
            # If all formats fail
            self.anomaly_log.append({
                'transaction_id': txn_id,
                'anomaly_type': 'UNPARSABLE_DATE',
                'severity': 'CRITICAL',
                'description': f"Cannot parse date: {date_str}",
                'column': column_name
            })
            return None
        
        # Apply date parsing with transaction_id
        df['posting_date'] = df.apply(
            lambda row: parse_date(row['posting_date_raw'], row['transaction_id'], 'posting_date_raw'), 
            axis=1
        )
        df['invoice_date'] = df.apply(
            lambda row: parse_date(row['invoice_date_raw'], row['transaction_id'], 'invoice_date_raw'), 
            axis=1
        )
        
        # Extract fiscal year and month
        df['fiscal_year'] = df['fiscal_period'].str[:4]
        df['fiscal_month'] = df['fiscal_period'].str[-2:]
        
        # Check fiscal period consistency
        for idx, row in df.iterrows():
            if pd.notna(row['posting_date']):
                posting_month = row['posting_date'].month
                fiscal_month = int(row['fiscal_month']) if pd.notna(row['fiscal_month']) else None
                
                if fiscal_month and posting_month != fiscal_month:
                    self.anomaly_log.append({
                        'transaction_id': row['transaction_id'],
                        'anomaly_type': 'FISCAL_PERIOD_MISMATCH',
                        'severity': 'HIGH',
                        'description': f"Posting date month ({posting_month}) != fiscal period month ({fiscal_month})",
                        'posting_date': row['posting_date'],
                        'fiscal_period': row['fiscal_period']
                    })
        
        print(f"   ‚úì Dates standardized. Invalid dates: {sum(df['posting_date'].isna())}")
        return self
    
    def clean_amounts(self):
        """Convert amount strings to floats"""
        df = self.standardized_df
        
        def parse_amount(amt_str, txn_id):
            if pd.isna(amt_str):
                return None
            
            # Remove currency symbols, commas, spaces
            cleaned = str(amt_str).replace('$', '').replace(',', '').strip()
            
            # Handle negative numbers in parentheses
            if cleaned.startswith('(') and cleaned.endswith(')'):
                cleaned = '-' + cleaned[1:-1]
            
            try:
                return float(cleaned)
            except:
                self.anomaly_log.append({
                    'transaction_id': txn_id,
                    'anomaly_type': 'INVALID_AMOUNT',
                    'severity': 'HIGH',
                    'description': f"Cannot parse amount: {amt_str}"
                })
                return None
        
        df['amount'] = df.apply(
            lambda row: parse_amount(row['amount_raw'], row['transaction_id']), 
            axis=1
        )
        
        # Flag negative amounts
        df['amount_is_negative'] = df['amount'] < 0
        for idx, row in df[df['amount_is_negative']].iterrows():
            self.anomaly_log.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'NEGATIVE_AMOUNT',
                'severity': 'MEDIUM',
                'description': f"Negative amount: {row['amount']}",
                'amount': row['amount']
            })
        
        print(f"   ‚úì Amounts cleaned. Negative amounts: {df['amount_is_negative'].sum()}")
        return self
    
    def detect_embedded_exceptions(self):
        """Look for obvious exceptions in raw data"""
        df = self.standardized_df
        keywords = ['error', 'flag', 'review', 'urgent', 'exception', 'invalid']
        
        df['narrative_lower'] = df['narrative'].str.lower().fillna('')
        
        for idx, row in df.iterrows():
            # Check narrative for keywords
            if any(keyword in str(row['narrative_lower']) for keyword in keywords):
                self.anomaly_log.append({
                    'transaction_id': row['transaction_id'],
                    'anomaly_type': 'NARRATIVE_SUGGESTS_EXCEPTION',
                    'severity': 'MEDIUM',
                    'description': f"Narrative contains exception keywords: {row['narrative']}",
                    'narrative': row['narrative']
                })
            
            # Check for placeholder vendor names
            if row['vendor_name_raw'] in ['Unlisted Company', 'Unknown Vendor LLC', 
                                           'New Vendor XYZ', 'Unregistered Supplier', 
                                           'Mystery Corp']:
                self.anomaly_log.append({
                    'transaction_id': row['transaction_id'],
                    'anomaly_type': 'PLACEHOLDER_VENDOR',
                    'severity': 'HIGH',
                    'description': f"Placeholder vendor name: {row['vendor_name_raw']}",
                    'vendor': row['vendor_name_raw']
                })
        
        print(f"   ‚úì Embedded exceptions detected: {len([a for a in self.anomaly_log if a['anomaly_type'] == 'NARRATIVE_SUGGESTS_EXCEPTION'])}")
        return self
    
    def add_metadata(self):
        """Add processing metadata"""
        df = self.standardized_df
        df['processing_timestamp'] = datetime.now()
        df['source_file'] = 'Raw_GL_Export.csv'
        df['data_quality_score'] = 100 - (len(self.anomaly_log) / len(df) * 100) if len(df) > 0 else 100
        df['anomaly_count'] = df.apply(lambda row: len([a for a in self.anomaly_log 
                                                          if a.get('transaction_id') == row['transaction_id']]), axis=1)
        return self
    
    def save_output(self):
        """Write the standardized data and the anomaly log to disk.

        Creates ``Config.OUTPUT_PATH`` and ``Config.REPORTS_PATH`` if needed,
        writes ``GL_Standardized.csv`` (restricted to the known output columns
        that are actually present) and ``Input_Anomalies_Detected.csv``.

        The anomaly file is always rewritten, even when no anomalies were
        found. Otherwise a file left over from an earlier run would stay on
        disk and be merged into the exceptions log by the next task.

        Returns:
            ``(standardized_df, anomaly_log)``.
        """
        os.makedirs(Config.OUTPUT_PATH, exist_ok=True)
        os.makedirs(Config.REPORTS_PATH, exist_ok=True)

        standardized_path = f"{Config.OUTPUT_PATH}GL_Standardized.csv"
        anomalies_path = f"{Config.REPORTS_PATH}Input_Anomalies_Detected.csv"

        # Save standardized data
        output_cols = ['transaction_id', 'posting_date_raw', 'posting_date', 'invoice_date_raw',
                       'invoice_date', 'fiscal_period', 'fiscal_year', 'fiscal_month',
                       'entity_code', 'account_code_raw', 'cost_center_raw', 'vendor_name_raw',
                       'invoice_number', 'po_number', 'currency_code', 'amount_raw', 'amount',
                       'amount_is_negative', 'tax_code', 'narrative', 'source_system',
                       'processing_timestamp', 'data_quality_score', 'anomaly_count']

        # Only include columns that exist
        available_cols = [col for col in output_cols if col in self.standardized_df.columns]
        self.standardized_df[available_cols].to_csv(standardized_path, index=False)

        # Save anomaly log. With nothing logged, write a header-only file: it
        # replaces any stale file and stays readable by pd.read_csv, which
        # raises on a completely empty file.
        if self.anomaly_log:
            anomaly_frame = pd.DataFrame(self.anomaly_log)
        else:
            anomaly_frame = pd.DataFrame(
                columns=['transaction_id', 'anomaly_type', 'severity', 'description']
            )
        anomaly_frame.to_csv(anomalies_path, index=False)

        print(f"   üíæ Saved {len(self.standardized_df)} rows to {Config.OUTPUT_PATH}GL_Standardized.csv")
        print(f"   üíæ Saved {len(self.anomaly_log)} anomalies to {Config.REPORTS_PATH}Input_Anomalies_Detected.csv")

        return self.standardized_df, self.anomaly_log
    
    def run(self, filepath):
        """Run every T001 step in order on the file at *filepath*.

        Returns:
            ``(standardized_df, anomaly_log)`` as produced by ``save_output``.
        """
        rule = "=" * 60
        print("\n" + rule)
        print("üöÄ T001: Wrangling Raw GL Data")
        print(rule)

        self.load_raw_data(filepath)
        # The remaining steps take no arguments and must run in this order.
        steps = (
            self.standardize_column_names,
            self.standardize_dates,
            self.clean_amounts,
            self.detect_embedded_exceptions,
            self.add_metadata,
        )
        for step in steps:
            step()
        df, anomalies = self.save_output()

        print(f"\n‚úÖ T001 Complete. Processed {len(df)} rows, found {len(anomalies)} anomalies.")
        return df, anomalies


# ============================================================================
# T002: MAP ENTITIES AND ACCOUNTS
# ============================================================================

class T002_EntityAccountMapper:
    """Task 2: Resolve entity codes and account codes against master data"""
    
    def __init__(self, working_df):
        self.df = working_df.copy()
        self.entity_master = None
        self.account_master = None
        self.cost_center_master = None
        self.mapping_anomalies = []
        
    def load_master_data(self):
        """Load master reference files"""
        print("\nüìÇ T002: Loading master data...")
        
        try:
            self.entity_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_Entity.csv")
            print(f"   Loaded {len(self.entity_master)} entities")
            print(f"   Entity columns: {list(self.entity_master.columns)}")
        except:
            print("   ‚ö†Ô∏è Entity master not found, creating default")
            self.entity_master = pd.DataFrame({'entity_code': ['AUS01']})
        
        try:
            self.account_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_COA.csv")
            print(f"   Loaded {len(self.account_master)} accounts")
            print(f"   Account columns: {list(self.account_master.columns)}")
            
            # Standardize column names - convert to lowercase for easier matching
            self.account_master.columns = [col.lower().strip() for col in self.account_master.columns]
            
            # Map the account code column (which might be 'account_code' or 'account_code' after lowercasing)
            if 'account_code' not in self.account_master.columns:
                # Check for alternative names
                if 'account_code' in self.account_master.columns:
                    self.account_master.rename(columns={'account_code': 'account_code'}, inplace=True)
                elif 'account' in self.account_master.columns:
                    self.account_master.rename(columns={'account': 'account_code'}, inplace=True)
                elif 'code' in self.account_master.columns:
                    self.account_master.rename(columns={'code': 'account_code'}, inplace=True)
                else:
                    print(f"   ‚ö†Ô∏è Could not find account code column. Using first column as account_code")
                    first_col = self.account_master.columns[0]
                    self.account_master.rename(columns={first_col: 'account_code'}, inplace=True)
            
            print(f"   Using '{self.account_master.columns[0]}' as account code column")
            
        except Exception as e:
            print(f"   ‚ö†Ô∏è Account master not found or error: {e}")
            print("   Creating default account master")
            self.account_master = pd.DataFrame({'account_code': [f"{i:04d}" for i in range(5000, 5029)]})
        
        try:
            self.cost_center_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_CostCenters.csv")
            print(f"   Loaded {len(self.cost_center_master)} cost centers")
            print(f"   Cost center columns: {list(self.cost_center_master.columns)}")
            
            # Standardize cost center column
            self.cost_center_master.columns = [col.lower().strip() for col in self.cost_center_master.columns]
            
            if 'cost_center' not in self.cost_center_master.columns:
                if 'costcenter' in self.cost_center_master.columns:
                    self.cost_center_master.rename(columns={'costcenter': 'cost_center'}, inplace=True)
                elif 'cc' in self.cost_center_master.columns:
                    self.cost_center_master.rename(columns={'cc': 'cost_center'}, inplace=True)
                else:
                    # Use first column as cost center
                    first_col = self.cost_center_master.columns[0]
                    self.cost_center_master.rename(columns={first_col: 'cost_center'}, inplace=True)
                    
        except Exception as e:
            print(f"   ‚ö†Ô∏è Cost center master not found or error: {e}")
            print("   Creating default cost center master")
            self.cost_center_master = pd.DataFrame({'cost_center': ['CC' + str(i).zfill(4) for i in range(1000, 1010)]})
        
        return self
    
    def map_entities(self):
        """Map entity codes against master"""
        # Handle entity master columns
        if 'entity_code' not in self.entity_master.columns:
            # Try to find entity code column
            for col in self.entity_master.columns:
                if 'entity' in col.lower() or 'code' in col.lower():
                    self.entity_master.rename(columns={col: 'entity_code'}, inplace=True)
                    break
        
        valid_entities = self.entity_master['entity_code'].tolist() if 'entity_code' in self.entity_master.columns else ['AUS01']
        
        self.df['entity_valid'] = self.df['entity_code'].isin(valid_entities)
        self.df['entity_code_mapped'] = np.where(
            self.df['entity_valid'], 
            self.df['entity_code'], 
            None
        )
        
        for idx, row in self.df[~self.df['entity_valid']].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_ENTITY',
                'severity': 'CRITICAL',
                'description': f"Entity code '{row['entity_code']}' not in master",
                'original_value': row['entity_code']
            })
        
        print(f"   ‚úì Entities mapped. Invalid: {(~self.df['entity_valid']).sum()}")
        return self
    
    def map_accounts(self):
        """Map account codes against master with better matching"""
        
        # Get valid account codes from master
        if 'account_code' in self.account_master.columns:
            # Convert master account codes to strings and strip
            valid_accounts = [str(acct).strip() for acct in self.account_master['account_code'].tolist()]
            
            # Also try without leading/trailing spaces
            valid_accounts.extend([acct for acct in valid_accounts if acct != acct.strip()])
            valid_accounts = list(set(valid_accounts))  # Remove duplicates
            
            print(f"   Sample valid accounts: {valid_accounts[:5]}")
        else:
            print("   ‚ö†Ô∏è No account_code column found in master")
            valid_accounts = []
        
        # Clean raw account codes for comparison
        self.df['account_code_clean'] = self.df['account_code_raw'].astype(str).str.strip()
        
        # Try different matching strategies
        self.df['account_valid'] = False
        
        # Strategy 1: Direct match
        direct_match = self.df['account_code_raw'].isin(valid_accounts)
        self.df.loc[direct_match, 'account_valid'] = True
        
        # Strategy 2: Clean match
        clean_match = (~direct_match) & self.df['account_code_clean'].isin(valid_accounts)
        self.df.loc[clean_match, 'account_valid'] = True
        
        # Strategy 3: Numeric match (if both are numbers)
        if not self.df[~self.df['account_valid']].empty:
            # Convert valid accounts to numeric where possible
            numeric_valid = []
            for acct in valid_accounts:
                try:
                    numeric_valid.append(float(acct))
                except:
                    pass
            
            for idx, row in self.df[~self.df['account_valid']].iterrows():
                try:
                    raw_num = float(row['account_code_raw'])
                    if raw_num in numeric_valid:
                        self.df.at[idx, 'account_valid'] = True
                except:
                    pass
        
        # Assign mapped account codes
        def find_matching_account(row):
            if row['account_valid']:
                # Return the original if it's valid
                if row['account_code_raw'] in valid_accounts:
                    return row['account_code_raw']
                elif row['account_code_clean'] in valid_accounts:
                    return row['account_code_clean']
                else:
                    # Try to find numeric match
                    try:
                        raw_num = float(row['account_code_raw'])
                        for acct in valid_accounts:
                            try:
                                if float(acct) == raw_num:
                                    return acct
                            except:
                                continue
                    except:
                        pass
                    return row['account_code_raw']  # Return original if can't find better match
            return None
        
        self.df['account_code_mapped'] = self.df.apply(find_matching_account, axis=1)
        
        # Get account names/descriptions if available
        if 'account_name' in self.account_master.columns:
            # Create mapping dictionary
            account_desc_map = {}
            for _, row in self.account_master.iterrows():
                acct = str(row['account_code']).strip()
                desc = row['account_name']
                account_desc_map[acct] = desc
                # Also add without leading zeros
                if acct.isdigit():
                    account_desc_map[str(int(acct))] = desc
            
            self.df['account_description'] = self.df['account_code_mapped'].map(account_desc_map)
            print(f"   Added account descriptions")
        
        # Log anomalies for invalid accounts
        invalid_count = (~self.df['account_valid']).sum()
        for idx, row in self.df[~self.df['account_valid']].iterrows():
            severity = 'CRITICAL' if str(row['account_code_raw']) == 'INVALID_ACCT' else 'HIGH'
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_ACCOUNT',
                'severity': severity,
                'description': f"Account code '{row['account_code_raw']}' not in Chart of Accounts",
                'original_value': row['account_code_raw'],
                'amount': row['amount']
            })
        
        print(f"   ‚úì Accounts mapped. Valid: {self.df['account_valid'].sum()}, Invalid: {invalid_count}")
        return self
    
    def map_cost_centers(self):
        """Map cost centers against master"""
        if 'cost_center' in self.cost_center_master.columns:
            valid_centers = self.cost_center_master['cost_center'].tolist()
        else:
            valid_centers = []
        
        # Handle missing cost centers
        self.df['cost_center_present'] = self.df['cost_center_raw'].notna() & (self.df['cost_center_raw'] != '')
        self.df['cost_center_valid'] = self.df['cost_center_raw'].isin(valid_centers) if valid_centers else self.df['cost_center_present']
        self.df['cost_center_mapped'] = np.where(
            self.df['cost_center_valid'],
            self.df['cost_center_raw'],
            None
        )
        
        for idx, row in self.df[~self.df['cost_center_present']].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'MISSING_COST_CENTER',
                'severity': 'MEDIUM',
                'description': "Cost center is missing",
                'amount': row['amount']
            })
        
        for idx, row in self.df[self.df['cost_center_present'] & ~self.df['cost_center_valid']].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_COST_CENTER',
                'severity': 'HIGH',
                'description': f"Cost center '{row['cost_center_raw']}' not in master",
                'original_value': row['cost_center_raw']
            })
        
        print(f"   ‚úì Cost centers mapped. Missing: {(~self.df['cost_center_present']).sum()}, Invalid: {(self.df['cost_center_present'] & ~self.df['cost_center_valid']).sum()}")
        return self
    
    def save_output(self):
        """Write the exceptions log and the mapped data to disk.

        ``Exceptions_Log.csv`` is rebuilt from the anomalies file on disk
        (``Input_Anomalies_Detected.csv``, if it exists) followed by this
        task's mapping anomalies; the full working frame is written to
        ``GL_WithMappings.csv``. Returns the working frame.
        """
        input_log_path = f"{Config.REPORTS_PATH}Input_Anomalies_Detected.csv"
        if os.path.exists(input_log_path):
            existing_anomalies = pd.read_csv(input_log_path)
        else:
            existing_anomalies = pd.DataFrame()

        # Earlier anomalies first, then the ones found by this task.
        new_anomalies = pd.DataFrame(self.mapping_anomalies)
        all_anomalies = pd.concat([existing_anomalies, new_anomalies], ignore_index=True)
        all_anomalies.to_csv(f"{Config.REPORTS_PATH}Exceptions_Log.csv", index=False)

        # Save enriched data
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_WithMappings.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_WithMappings.csv")
        print(f"   üíæ Updated exceptions log with {len(self.mapping_anomalies)} new anomalies")

        return self.df
    
    def run(self):
        """Run every T002 step in order and return the mapped frame."""
        rule = "=" * 60
        print("\n" + rule)
        print("üöÄ T002: Mapping Entities and Accounts")
        print(rule)

        # Master data must be loaded before any of the mapping steps.
        steps = (
            self.load_master_data,
            self.map_entities,
            self.map_accounts,
            self.map_cost_centers,
        )
        for step in steps:
            step()
        df = self.save_output()

        print(f"\n‚úÖ T002 Complete. Mapped {len(df)} transactions.")
        return df


# ============================================================================
# T003: RESOLVE VENDOR NAMES
# ============================================================================

class T003_VendorResolver:
    """Task 3: Map vendor aliases to canonical vendor names"""
    
    def __init__(self, working_df):
        self.df = working_df.copy()
        self.vendor_master = None
        self.alias_map = None
        self.vendor_anomalies = []
        
    def load_vendor_data(self):
        """Load vendor master and alias mapping"""
        print("\nüìÇ T003: Loading vendor data...")
        
        try:
            self.vendor_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_Vendors.csv")
            print(f"   Loaded {len(self.vendor_master)} canonical vendors")
        except:
            print("   ‚ö†Ô∏è Vendor master not found, creating default")
            self.vendor_master = pd.DataFrame({'canonical_vendor': ['Unknown']})
        
        try:
            self.alias_map = pd.read_csv(f"{Config.MASTER_DATA_PATH}Vendor_Alias_Map.csv")
            print(f"   Loaded {len(self.alias_map)} alias mappings")
        except:
            print("   ‚ö†Ô∏è Alias map not found")
            self.alias_map = pd.DataFrame({'alias': [], 'canonical_vendor': []})
        
        return self
    
    def build_alias_dict(self):
        """Create lookup dictionary from aliases to canonical names"""
        alias_dict = {}
        
        if self.alias_map is not None and len(self.alias_map) > 0:
            for _, row in self.alias_map.iterrows():
                alias_dict[row['alias'].strip().lower()] = row['canonical_vendor']
        
        # Add self-mappings for exact matches
        if self.vendor_master is not None and 'canonical_vendor' in self.vendor_master.columns:
            for vendor in self.vendor_master['canonical_vendor']:
                alias_dict[vendor.lower()] = vendor
        
        return alias_dict
    
    def resolve_vendors(self):
        """Apply vendor mapping"""
        alias_dict = self.build_alias_dict()
        canonical_list = self.vendor_master['canonical_vendor'].tolist() if 'canonical_vendor' in self.vendor_master.columns else []
        
        def resolve(vendor_raw):
            if pd.isna(vendor_raw) or vendor_raw == '':
                return None, 'MISSING'
            
            vendor_lower = str(vendor_raw).strip().lower()
            
            # Direct alias match
            if vendor_lower in alias_dict:
                return alias_dict[vendor_lower], 'MAPPED'
            
            # Check if it's already a canonical name
            if vendor_raw in canonical_list:
                return vendor_raw, 'CANONICAL'
            
            # Try partial matching (simple contains)
            for canonical in canonical_list:
                if canonical.lower() in vendor_lower or vendor_lower in canonical.lower():
                    return canonical, 'FUZZY_MATCHED'
            
            return None, 'UNMAPPED'
        
        # Apply resolution
        results = self.df['vendor_name_raw'].apply(resolve)
        self.df['vendor_canonical'] = [r[0] for r in results]
        self.df['vendor_resolution_status'] = [r[1] for r in results]
        
        # Log anomalies
        for idx, row in self.df.iterrows():
            if row['vendor_resolution_status'] == 'MISSING':
                self.vendor_anomalies.append({
                    'transaction_id': row['transaction_id'],
                    'anomaly_type': 'MISSING_VENDOR',
                    'severity': 'HIGH',
                    'description': 'Vendor name is missing',
                    'amount': row['amount']
                })
            elif row['vendor_resolution_status'] == 'UNMAPPED':
                self.vendor_anomalies.append({
                    'transaction_id': row['transaction_id'],
                    'anomaly_type': 'UNMAPPED_VENDOR',
                    'severity': 'HIGH',
                    'description': f"Vendor '{row['vendor_name_raw']}' not found in alias map",
                    'original_value': row['vendor_name_raw'],
                    'amount': row['amount']
                })
        
        mapped_count = self.df['vendor_resolution_status'].isin(['MAPPED', 'CANONICAL', 'FUZZY_MATCHED']).sum()
        unmapped_count = (self.df['vendor_resolution_status'] == 'UNMAPPED').sum()
        missing_count = (self.df['vendor_resolution_status'] == 'MISSING').sum()
        
        print(f"   ‚úì Vendors resolved. Mapped: {mapped_count}, Unmapped: {unmapped_count}, Missing: {missing_count}")
        return self
    
    def save_output(self):
        """Append the vendor anomalies to the exceptions log and save the data.

        If ``Exceptions_Log.csv`` already exists, its rows are kept and this
        task's anomalies are added after them; otherwise the file is created
        from this task's anomalies alone. The working frame is written to
        ``GL_VendorsResolved.csv`` and returned.
        """
        exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        all_exceptions = pd.DataFrame(self.vendor_anomalies)
        if os.path.exists(exceptions_path):
            earlier = pd.read_csv(exceptions_path)
            all_exceptions = pd.concat([earlier, all_exceptions], ignore_index=True)
        all_exceptions.to_csv(exceptions_path, index=False)

        # Save data
        resolved_path = f"{Config.OUTPUT_PATH}GL_VendorsResolved.csv"
        self.df.to_csv(resolved_path, index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_VendorsResolved.csv")

        return self.df
    
    def run(self):
        """Run every T003 step in order and return the vendor-resolved frame."""
        rule = "=" * 60
        print("\n" + rule)
        print("üöÄ T003: Resolving Vendor Names")
        print(rule)

        # Vendor data must be loaded before names can be resolved.
        for step in (self.load_vendor_data, self.resolve_vendors):
            step()
        df = self.save_output()

        print(f"\n‚úÖ T003 Complete. Processed {len(df)} transactions.")
        return df


# ============================================================================
# T004: APPLY FX CONVERSION
# ============================================================================

class T004_FXConverter:
    """Task 4: Convert all transactions to AUD.

    Works on a private copy of the incoming frame. ``convert_to_aud`` adds the
    columns ``fx_key``, ``fx_rate``, ``amount_aud`` and ``conversion_status``;
    rows whose rate cannot be determined are recorded in ``fx_anomalies``.
    """

    # Rates applied only when FX_Rates.csv cannot be loaded.
    # NOTE(review): these are hard-coded placeholders, not market rates;
    # confirm that a close run is allowed to proceed on them.
    _FALLBACK_RATES = {'AUD': 1.0, 'USD': 1.5, 'GBP': 1.9, 'NZD': 0.95, 'EUR': 1.6}
    _RATE_COLUMNS = ['period', 'currency', 'rate']
    _LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description', 'amount']

    def __init__(self, working_df):
        self.df = working_df.copy()
        self.fx_rates = None
        self.fx_anomalies = []

    def load_fx_rates(self):
        """Load FX rates from the reference folder, or build fallback rates.

        The fallback table has one row per (fiscal period, currency) pair seen
        in the data; currencies without a fallback rate get a missing rate and
        are later reported as MISSING_FX_RATE.

        Returns:
            self, to allow call chaining.
        """
        print("\nüìÇ T004: Loading FX rates...")

        try:
            fx_rates = pd.read_csv(f"{Config.REFERENCE_PATH}FX_Rates.csv")
            absent = set(self._RATE_COLUMNS) - set(fx_rates.columns)
            if absent:
                # Without these columns the lookup cannot be built at all.
                raise KeyError(f"missing columns {sorted(absent)}")

            # Ensure period is string for joining
            fx_rates['period'] = fx_rates['period'].astype(str)
            self.fx_rates = fx_rates
            print(f"   Loaded {len(self.fx_rates)} FX rates")

        except (OSError, ValueError, KeyError) as e:
            # OSError: file missing/unreadable; ValueError: empty or
            # unparseable CSV (pandas parser errors derive from it).
            print(f"   ‚ö†Ô∏è FX rates not found: {e}")
            periods = self.df['fiscal_period'].unique()
            currencies = self.df['currency_code'].unique()

            rates_data = [
                {
                    'period': period,
                    'currency': currency,
                    'rate': self._FALLBACK_RATES.get(currency),
                }
                for period in periods
                for currency in currencies
            ]

            # Explicit columns keep the table usable even when the ledger is empty.
            self.fx_rates = pd.DataFrame(rates_data, columns=self._RATE_COLUMNS)
            print(f"   Created default rates for {len(self.fx_rates)} currency-period combinations")

        return self

    def convert_to_aud(self):
        """Convert ``amount`` to AUD using the period/currency rate table.

        AUD rows always use a rate of 1.0. Any other row whose rate is absent
        from the table, or present but empty/non-numeric, gets a NaN rate and
        amount, status ``FAILED`` and a CRITICAL ``MISSING_FX_RATE`` anomaly.
        ``amount_aud`` is a float column so later stages can aggregate it.

        Returns:
            self, to allow call chaining.
        """
        # Create lookup key
        self.df['fx_key'] = self.df['fiscal_period'] + '_' + self.df['currency_code']
        self.fx_rates['fx_key'] = self.fx_rates['period'].astype(str) + '_' + self.fx_rates['currency']

        # Only usable (numeric, non-missing) rates go into the lookup, so an
        # empty rate is handled exactly like a rate that is not listed.
        numeric_rates = pd.to_numeric(self.fx_rates['rate'], errors='coerce')
        usable = numeric_rates.notna() & self.fx_rates['fx_key'].notna()
        rate_lookup = dict(zip(self.fx_rates.loc[usable, 'fx_key'], numeric_rates[usable]))

        is_domestic = self.df['currency_code'] == 'AUD'
        looked_up = self.df['fx_key'].map(rate_lookup).astype(float)
        self.df['fx_rate'] = np.where(is_domestic, 1.0, looked_up)

        # NaN rates propagate to NaN amounts while keeping the column numeric.
        self.df['amount_aud'] = self.df['amount'] * self.df['fx_rate']

        unconverted = self.df[self.df['fx_rate'].isna()]
        for txn_id, currency, period, amount in zip(
            unconverted['transaction_id'],
            unconverted['currency_code'],
            unconverted['fiscal_period'],
            unconverted['amount'],
        ):
            self.fx_anomalies.append({
                'transaction_id': txn_id,
                'anomaly_type': 'MISSING_FX_RATE',
                'severity': 'CRITICAL',
                'description': f"No FX rate found for {currency} in period {period}",
                'currency': currency,
                'period': period,
                'amount': amount
            })

        # Flag conversion issues
        self.df['conversion_status'] = np.where(
            is_domestic, 'DOMESTIC',
            np.where(self.df['fx_rate'].notna(), 'CONVERTED', 'FAILED')
        )

        converted = (self.df['conversion_status'] == 'CONVERTED').sum()
        failed = (self.df['conversion_status'] == 'FAILED').sum()
        domestic = (self.df['conversion_status'] == 'DOMESTIC').sum()

        print(f"   ‚úì FX conversion complete. Domestic: {domestic}, Converted: {converted}, Failed: {failed}")
        return self

    def save_output(self):
        """Append FX anomalies to the shared exceptions log and save the data.

        Returns:
            The converted working DataFrame (``self.df``).
        """
        exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        # A log written while no anomalies existed has no header row, and
        # pd.read_csv raises EmptyDataError on it; treat that like "no log yet".
        try:
            existing = pd.read_csv(exceptions_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            existing = pd.DataFrame()

        all_exceptions = pd.concat(
            [existing, pd.DataFrame(self.fx_anomalies)], ignore_index=True
        )
        if len(all_exceptions.columns) == 0:
            # Always leave a readable (header-only) log behind.
            all_exceptions = pd.DataFrame(columns=self._LOG_COLUMNS)

        all_exceptions.to_csv(exceptions_path, index=False)

        # Save data
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_Converted.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_Converted.csv")

        return self.df

    def run(self):
        """Execute all T004 steps and return the converted DataFrame."""
        print("\n" + "="*60)
        print("üöÄ T004: Applying FX Conversion")
        print("="*60)

        self.load_fx_rates()
        self.convert_to_aud()
        df = self.save_output()

        print(f"\n‚úÖ T004 Complete. Processed {len(df)} transactions.")
        return df


# ============================================================================
# T005: DETECT EXCEPTIONS
# ============================================================================

class T005_ExceptionDetector:
    """Task 5: Run exception rules and flag violations.

    Works on a private copy of the incoming frame. Detection adds statistical
    and timing columns, rule application fills ``exception_results`` with one
    record per (transaction, violated rule), and ``save_output`` writes the
    flagged data plus the exception logs.
    """

    _STAT_COLUMNS = ['mean_amount', 'std_amount', 'txn_count']
    _LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description', 'amount']

    def __init__(self, working_df):
        self.df = working_df.copy()
        self.rulebook = None
        self.exception_results = []

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _next_period_start(year, month):
        """Return midnight on the first day of the month after (year, month)."""
        if month == 12:
            return datetime(year + 1, 1, 1)
        return datetime(year, month + 1, 1)

    @staticmethod
    def _flag_is_set(value):
        """Return True only for a truthy flag; None/NaN/NA count as not set.

        A plain truth test would treat NaN as True, flagging rows whose flag
        is simply unknown.
        """
        if value is None:
            return False
        try:
            if pd.isna(value):
                return False
        except (TypeError, ValueError):
            # Non-scalar cell: fall through to the ordinary truth test.
            pass
        return bool(value)

    @staticmethod
    def _exception_record(row, rule_id, rule_name, severity, description):
        """Build one exception record for ``row`` under the given rule."""
        return {
            'transaction_id': row['transaction_id'],
            'rule_id': rule_id,
            'rule_name': rule_name,
            'severity': severity,
            'description': description,
            'amount': row.get('amount_aud', 0),
            'vendor': row.get('vendor_name_raw', ''),
            'account': row.get('account_code_raw', '')
        }

    @staticmethod
    def _default_rules():
        """Return the built-in rule definitions used when no rulebook loads."""
        return [
            {'rule_id': 'EX001', 'rule_name': 'Missing PO Number',
             'severity': 'HIGH', 'logic': 'po_number is None or po_number == ""',
             'description': 'Transaction has no purchase order number'},
            {'rule_id': 'EX002', 'rule_name': 'Missing Cost Center',
             'severity': 'MEDIUM', 'logic': 'cost_center_mapped is None',
             'description': 'Transaction has no cost center allocation'},
            {'rule_id': 'EX003', 'rule_name': 'Invalid Account',
             'severity': 'CRITICAL', 'logic': 'account_code_mapped is None',
             'description': 'Account code not in Chart of Accounts'},
            {'rule_id': 'EX004', 'rule_name': 'High Value Transaction',
             'severity': 'MEDIUM', 'logic': f'amount_aud > {Config.HIGH_VALUE_THRESHOLD}',
             'description': f'Transaction exceeds ${Config.HIGH_VALUE_THRESHOLD:,}'},
            {'rule_id': 'EX005', 'rule_name': 'Negative Amount',
             'severity': 'MEDIUM', 'logic': 'amount_is_negative == True',
             'description': 'Transaction has negative amount'},
            {'rule_id': 'EX006', 'rule_name': 'Unmapped Vendor',
             'severity': 'HIGH', 'logic': 'vendor_resolution_status == "UNMAPPED"',
             'description': 'Vendor not found in master data'},
            {'rule_id': 'EX007', 'rule_name': 'Future Dated Transaction',
             'severity': 'HIGH', 'logic': 'posting_date > current_date and fiscal_period == current_period',
             'description': 'Transaction date is in future but in current period'},
            {'rule_id': 'EX008', 'rule_name': 'Invalid Date',
             'severity': 'CRITICAL', 'logic': 'posting_date is None',
             'description': 'Posting date is invalid or missing'},
            {'rule_id': 'EX009', 'rule_name': 'Missing Tax Code',
             'severity': 'MEDIUM', 'logic': 'tax_code is None or tax_code == ""',
             'description': 'Tax code is missing'},
            {'rule_id': 'EX010', 'rule_name': 'Extreme Outlier',
             'severity': 'MEDIUM', 'logic': 'is_outlier == True',
             'description': 'Amount is significantly outside normal range'},
        ]

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------

    def load_rulebook(self):
        """Load the exception rulebook, falling back to the built-in rules.

        After loading, the columns ``rule_id``, ``rule_name``, ``severity`` and
        ``description`` are guaranteed to exist (missing ones get defaults).

        Returns:
            self, to allow call chaining.
        """
        print("\nüìÇ T005: Loading exception rulebook...")

        try:
            self.rulebook = pd.read_csv(f"{Config.REFERENCE_PATH}Exception_Rulebook.csv")
            print(f"   Loaded {len(self.rulebook)} exception rules")

            # Check if required columns exist, if not, create default rule IDs
            if 'rule_id' not in self.rulebook.columns:
                self.rulebook['rule_id'] = [f'EX{i:03d}' for i in range(1, len(self.rulebook) + 1)]
                print(f"   Added default rule_id column")

        except (OSError, ValueError) as e:
            # OSError: file missing/unreadable; ValueError: empty or
            # unparseable CSV (pandas parser errors derive from it).
            print(f"   ‚ö†Ô∏è Rulebook not found or error loading: {e}")
            self.rulebook = pd.DataFrame(self._default_rules())
            print(f"   Created {len(self.rulebook)} default exception rules")

        # Ensure the remaining required columns exist. Order matters: the
        # description default copies rule_name, so rule_name is filled first.
        if 'rule_name' not in self.rulebook.columns:
            self.rulebook['rule_name'] = [f'Rule {i}' for i in range(1, len(self.rulebook) + 1)]
        if 'severity' not in self.rulebook.columns:
            self.rulebook['severity'] = 'MEDIUM'
        if 'description' not in self.rulebook.columns:
            self.rulebook['description'] = self.rulebook['rule_name']

        print(f"   Ready with {len(self.rulebook)} rules")
        return self

    def detect_outliers(self):
        """Flag amounts far from their account's mean.

        Adds per-account ``mean_amount``, ``std_amount`` and ``txn_count``
        columns and a boolean ``is_outlier`` that is True when the amount lies
        more than ``Config.EXTREME_OUTLIER_MULTIPLIER`` standard deviations
        from the account mean. Rows without an account, without an amount, or
        in accounts with zero/undefined spread are never outliers.

        Returns:
            self, to allow call chaining.
        """
        # Drop stats from an earlier call so the merge cannot create _x/_y columns.
        self.df = self.df.drop(columns=self._STAT_COLUMNS, errors='ignore')
        # An object-dtype amount column (e.g. containing None) cannot be aggregated.
        self.df['amount_aud'] = pd.to_numeric(self.df['amount_aud'], errors='coerce')

        # Group by account to find normal ranges
        account_stats = self.df.groupby('account_code_mapped')['amount_aud'].agg(['mean', 'std', 'count']).reset_index()
        account_stats.columns = ['account_code_mapped'] + self._STAT_COLUMNS

        # Merge stats back
        self.df = self.df.merge(account_stats, on='account_code_mapped', how='left')

        deviation = (self.df['amount_aud'] - self.df['mean_amount']).abs()
        self.df['is_outlier'] = (
            (self.df['std_amount'] > 0) &
            self.df['amount_aud'].notna() &
            (deviation > Config.EXTREME_OUTLIER_MULTIPLIER * self.df['std_amount'])
        )

        print(f"   ‚úì Outlier detection complete. Found {self.df['is_outlier'].sum()} outliers")
        return self

    def detect_temporal_anomalies(self):
        """Derive timing columns from ``posting_date``.

        Adds ``posting_hour``, ``posting_day``, ``posting_weekend`` and
        ``suspicious_hour`` (hour at/after ``Config.SUSPICIOUS_HOUR_START`` or
        at/before ``Config.SUSPICIOUS_HOUR_END``).

        NOTE(review): date-only values have hour 0 and are therefore all
        marked suspicious; confirm posting dates carry a time of day.

        Returns:
            self, to allow call chaining.
        """
        posting = self.df['posting_date']
        if not pd.api.types.is_datetime64_any_dtype(posting):
            # The .dt accessor needs datetimes; unparseable values become NaT.
            # The stored posting_date column itself is left untouched.
            posting = pd.to_datetime(posting, errors='coerce')

        self.df['posting_hour'] = posting.dt.hour
        self.df['posting_day'] = posting.dt.day_name()
        self.df['posting_weekend'] = posting.dt.dayofweek.isin([5, 6])

        # Flag suspicious hours (late night/early morning)
        self.df['suspicious_hour'] = (
            self.df['posting_hour'].notna() &
            ((self.df['posting_hour'] >= Config.SUSPICIOUS_HOUR_START) |
             (self.df['posting_hour'] <= Config.SUSPICIOUS_HOUR_END))
        )

        return self

    def apply_rules(self):
        """Evaluate every rulebook rule that has built-in logic on every row.

        Rules whose ``rule_id`` is not one of EX001-EX010 are skipped. A rule
        that fails on a row is reported once per rule (first error plus a
        count) instead of once per row. Negative amounts are always reported
        under EX005, even when the rulebook does not list that rule.

        Returns:
            self, to allow call chaining.
        """
        # "Future" means on or after the first day of the next month, so the
        # real month length and the time of day on the last day are respected.
        period_cutoff = self._next_period_start(Config.CURRENT_YEAR, Config.CURRENT_MONTH)
        flag = self._flag_is_set

        rule_functions = {
            'EX001': lambda row: pd.isna(row['po_number']) or row['po_number'] == '',
            'EX002': lambda row: pd.isna(row['cost_center_mapped']),
            'EX003': lambda row: pd.isna(row['account_code_mapped']),
            'EX004': lambda row: row['amount_aud'] > Config.HIGH_VALUE_THRESHOLD if pd.notna(row['amount_aud']) else False,
            'EX005': lambda row: flag(row.get('amount_is_negative', False)),
            'EX006': lambda row: row.get('vendor_resolution_status') == 'UNMAPPED',
            'EX007': lambda row: (pd.notna(row['posting_date']) and
                                  row['posting_date'] >= period_cutoff and
                                  row['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD),
            'EX008': lambda row: pd.isna(row['posting_date']),
            'EX009': lambda row: pd.isna(row['tax_code']) or row['tax_code'] == '',
            'EX010': lambda row: flag(row.get('is_outlier', False)),
        }

        for _, rule in self.rulebook.iterrows():
            rule_id = rule['rule_id']
            rule_func = rule_functions.get(rule_id)
            if rule_func is None:
                # Skip rules we don't have logic for
                continue

            rule_name = rule.get('rule_name', f'Rule {rule_id}')
            severity = rule.get('severity', 'MEDIUM')
            description = rule.get('description', rule_name)

            failures = 0
            for _, row in self.df.iterrows():
                try:
                    if rule_func(row):
                        self.exception_results.append(
                            self._exception_record(row, rule_id, rule_name, severity, description)
                        )
                except Exception as e:
                    # Best effort: a broken rule must not stop the run. Only
                    # the first failure is printed to avoid one line per row.
                    failures += 1
                    if failures == 1:
                        print(f"   ‚ö†Ô∏è Error applying rule {rule_id} to transaction {row.get('transaction_id')}: {e}")
            if failures > 1:
                print(f"      ... rule {rule_id} failed on {failures} rows in total")

        # Also add negative-amount anomalies that EX005 has not already covered.
        already_flagged = {
            e['transaction_id'] for e in self.exception_results if e['rule_id'] == 'EX005'
        }
        for _, row in self.df.iterrows():
            if not flag(row.get('amount_is_negative', False)):
                continue
            if row['transaction_id'] in already_flagged:
                continue
            already_flagged.add(row['transaction_id'])
            self.exception_results.append(
                self._exception_record(
                    row, 'EX005', 'Negative Amount', 'MEDIUM',
                    'Transaction has negative amount'
                )
            )

        print(f"   ‚úì Applied rules, found {len(self.exception_results)} exceptions")
        return self

    def save_output(self):
        """Attach exception flags to the data and write all exception files.

        Adds ``has_exception`` and ``exception_rules`` (';'-joined rule IDs) to
        the frame, writes ``GL_WithExceptions.csv``, writes
        ``Exceptions_Detailed.csv`` when there are results, and appends the
        results to the shared ``Exceptions_Log.csv``.

        Returns:
            Tuple of (flagged DataFrame, list of exception records).
        """
        # Group exceptions by transaction
        rules_by_txn = {}
        for e in self.exception_results:
            rules_by_txn.setdefault(e['transaction_id'], []).append(e['rule_id'])

        self.df['has_exception'] = self.df['transaction_id'].isin(list(rules_by_txn))
        self.df['exception_rules'] = self.df['transaction_id'].map(
            lambda x: ';'.join(rules_by_txn.get(x, []))
        )

        # Save data with flags
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_WithExceptions.csv", index=False)

        # Save exception log
        if self.exception_results:
            exceptions_df = pd.DataFrame(self.exception_results)
            exceptions_df.to_csv(f"{Config.REPORTS_PATH}Exceptions_Detailed.csv", index=False)

        # Update master exceptions log
        master_exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        # Convert new exceptions to simple format
        new_exceptions = [
            {
                'transaction_id': e['transaction_id'],
                'anomaly_type': e['rule_id'],
                'severity': e['severity'],
                'description': e['description'],
                'amount': e.get('amount', 0)
            }
            for e in self.exception_results
        ]

        # A log written while no anomalies existed has no header row, and
        # pd.read_csv raises EmptyDataError on it; treat that like "no log yet".
        try:
            existing = pd.read_csv(master_exceptions_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            existing = pd.DataFrame()

        all_exceptions = pd.concat([existing, pd.DataFrame(new_exceptions)], ignore_index=True)
        if len(all_exceptions.columns) == 0:
            # Always leave a readable (header-only) log behind.
            all_exceptions = pd.DataFrame(columns=self._LOG_COLUMNS)

        all_exceptions.to_csv(master_exceptions_path, index=False)

        print(f"   üíæ Saved exception data")

        return self.df, self.exception_results

    def run(self):
        """Execute all T005 steps.

        Returns:
            Tuple of (flagged DataFrame, list of exception records).
        """
        print("\n" + "="*60)
        print("üöÄ T005: Detecting Exceptions")
        print("="*60)

        self.load_rulebook()
        self.detect_outliers()
        self.detect_temporal_anomalies()
        self.apply_rules()
        df, exceptions = self.save_output()

        # Severity counts
        if exceptions:
            severity_counts = {}
            for e in exceptions:
                sev = e.get('severity', 'UNKNOWN')
                severity_counts[sev] = severity_counts.get(sev, 0) + 1

            print(f"\n‚úÖ T005 Complete. Exceptions by severity:")
            for severity, count in severity_counts.items():
                print(f"   {severity}: {count}")
        else:
            print(f"\n‚úÖ T005 Complete. No exceptions found.")

        return df, exceptions

# ============================================================================
# T006: REVIEW HIGH SEVERITY EXCEPTIONS (Automated version - no human review)
# ============================================================================

class T006_ExceptionReviewer:
    """Task 6: Review and categorize exceptions (automated).

    Splits the exception records by severity and writes a JSON and a text
    summary of the critical ones. Nothing in this class pauses for a human;
    the summary is always marked as auto-approved.
    """

    def __init__(self, df, exceptions):
        self.df = df.copy()
        self.exceptions = exceptions
        self.critical_exceptions = []
        self.high_exceptions = []

    def categorize_exceptions(self):
        """Split ``self.exceptions`` into CRITICAL and HIGH lists and print counts.

        The lists are rebuilt on every call, so calling this twice does not
        double-count.

        Returns:
            self, to allow call chaining.
        """
        self.critical_exceptions = [e for e in self.exceptions if e['severity'] == 'CRITICAL']
        self.high_exceptions = [e for e in self.exceptions if e['severity'] == 'HIGH']

        other_count = len(self.exceptions) - len(self.critical_exceptions) - len(self.high_exceptions)
        print(f"\nüìä T006: Exception Summary")
        print(f"   Critical: {len(self.critical_exceptions)}")
        print(f"   High: {len(self.high_exceptions)}")
        print(f"   Medium/Low: {other_count}")

        return self

    @staticmethod
    def _known_amount(value):
        """Return ``value``, or None when the amount is missing (None/NaN)."""
        if value is None or pd.isna(value):
            return None
        return value

    def _summarize_critical(self):
        """Group critical exceptions by type.

        Returns:
            Dict keyed by exception type (``anomaly_type``, else ``rule_id``,
            else 'UNKNOWN') with ``count``, ``total_amount`` and up to three
            ``examples``. Missing amounts are left out of the total (one NaN
            would otherwise turn the whole total into NaN) and appear as None
            in the examples.
        """
        critical_summary = {}
        for e in self.critical_exceptions:
            e_type = e.get('anomaly_type', e.get('rule_id', 'UNKNOWN'))
            entry = critical_summary.setdefault(
                e_type, {'count': 0, 'total_amount': 0, 'examples': []}
            )
            amount = self._known_amount(e.get('amount', 0))

            entry['count'] += 1
            if amount is not None:
                entry['total_amount'] += amount

            if len(entry['examples']) < 3:
                entry['examples'].append({
                    'transaction_id': e['transaction_id'],
                    'amount': amount,
                    'description': e.get('description', '')
                })
        return critical_summary

    def create_review_package(self):
        """Write the automated review summary (JSON and text) to the reports folder.

        Returns:
            The review data dictionary that was written to the JSON file.
        """
        critical_summary = self._summarize_critical()
        # One timestamp for both files so the two reports agree.
        review_time = datetime.now()

        review_data = {
            'timestamp': review_time,
            'total_critical': len(self.critical_exceptions),
            'total_high': len(self.high_exceptions),
            'critical_summary': critical_summary,
            'auto_approved': True,
            'note': 'Automated processing - no human review required'
        }

        # Local import: json is only needed by this method.
        import json
        # Explicit UTF-8: the report contains non-ASCII characters and the
        # platform default encoding may not be able to represent them.
        with open(f"{Config.REPORTS_PATH}Exception_Review_Summary.json", 'w', encoding='utf-8') as f:
            # default=str covers the datetime and any numpy scalar values.
            json.dump(review_data, f, indent=2, default=str)

        # Create a simple text summary
        with open(f"{Config.REPORTS_PATH}Exception_Review_Summary.txt", 'w', encoding='utf-8') as f:
            f.write("EXCEPTION REVIEW SUMMARY (Automated)\n")
            f.write("="*50 + "\n\n")
            f.write(f"Review Date: {review_time}\n")
            f.write(f"Status: AUTO-APPROVED\n\n")

            f.write(f"CRITICAL EXCEPTIONS: {len(self.critical_exceptions)}\n")
            for e_type, data in critical_summary.items():
                f.write(f"  ‚Ä¢ {e_type}: {data['count']} occurrences, ${data['total_amount']:,.2f}\n")

            f.write(f"\nHIGH EXCEPTIONS: {len(self.high_exceptions)}\n")

        print(f"   üíæ Saved review summary to {Config.REPORTS_PATH}Exception_Review_Summary.txt")

        return review_data

    def run(self):
        """Execute T006 steps.

        Returns:
            Tuple of (working DataFrame, review data dictionary).
        """
        print("\n" + "="*60)
        print("üöÄ T006: Reviewing High Severity Exceptions")
        print("="*60)
        print("   ‚ö° Automated mode - no human review required")

        self.categorize_exceptions()
        review_data = self.create_review_package()

        print(f"\n‚úÖ T006 Complete. Proceeding with pipeline.")

        return self.df, review_data


# ============================================================================
# T007: COMPUTE BUDGET VARIANCE (FIXED DIVISION BY ZERO)
# ============================================================================

class T007_BudgetVariance:
    """Task 7: Calculate actual vs budget variance.

    Works on a private copy of the incoming frame. ``calculate_variance``
    fills ``variance_results`` with per-account and per-cost-center figures
    plus overall totals for ``Config.CURRENT_FISCAL_PERIOD``.
    """

    # Sentinel percentage used when there are actuals against a zero budget.
    _INFINITE_VARIANCE_PCT = 999999
    _BUDGET_COLUMNS = ['account_code', 'period', 'budget_amount']

    def __init__(self, df):
        self.df = df.copy()
        self.budget_data = None
        self.variance_results = {}

    # ------------------------------------------------------------------
    # Small helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _first_present(columns, candidates):
        """Return the first candidate name found in ``columns``, else None."""
        return next((name for name in candidates if name in columns), None)

    @staticmethod
    def _clean_budget_amounts(raw):
        """Parse budget amounts written as text into non-negative numbers.

        '$' and ',' are removed as literal characters (``regex=False``; as a
        regular expression '$' matches end-of-string and strips nothing).
        Unparseable values become NaN. Negative budgets are clipped to 0.

        NOTE(review): clipping keeps the original policy of not carrying
        negative budgets forward; confirm that is the intended treatment.
        """
        text = raw.astype(str).str.replace('$', '', regex=False).str.replace(',', '', regex=False)
        return pd.to_numeric(text, errors='coerce').clip(lower=0)

    @classmethod
    def _variance_pct(cls, actual, budget):
        """Return variance as a percentage of budget, guarding a zero budget."""
        if budget > 0:
            return ((actual - budget) / budget) * 100
        if actual > 0:
            # Actuals against no budget: report the "infinite" sentinel.
            return cls._INFINITE_VARIANCE_PCT
        return 0

    @staticmethod
    def _next_period_start(year, month):
        """Return midnight on the first day of the month after (year, month)."""
        if month == 12:
            return datetime(year + 1, 1, 1)
        return datetime(year, month + 1, 1)

    @staticmethod
    def _actuals_by(frame, key):
        """Sum ``amount_aud`` and count transactions per value of ``key``."""
        return frame.groupby(key).agg({
            'amount_aud': 'sum',
            'transaction_id': 'count'
        }).rename(columns={
            'amount_aud': 'actual_amount',
            'transaction_id': 'transaction_count'
        }).reset_index()

    @classmethod
    def _variance_by_account(cls, account_actuals, budget):
        """Join per-account actuals with budget and compute the variance.

        Args:
            account_actuals: frame with ``account_code_mapped`` (str),
                ``actual_amount`` and ``transaction_count``.
            budget: frame with ``account_code`` (str) and ``budget_amount``;
                several rows per account are summed first so they cannot
                duplicate the actuals in the join.

        Returns:
            Frame with ``account_code``, ``actual_amount``,
            ``transaction_count``, ``budget_amount``, ``variance`` and
            ``variance_pct``. Accounts present on only one side are kept,
            with 0 for the missing figures.
        """
        budget_by_account = budget.groupby('account_code', as_index=False)['budget_amount'].sum()

        merged = pd.merge(
            account_actuals,
            budget_by_account,
            left_on='account_code_mapped',
            right_on='account_code',
            how='outer'
        )

        # Budget-only accounts have no actuals key; take the code from the
        # budget side so the row keeps its account identifier.
        merged['account_code_mapped'] = merged['account_code_mapped'].fillna(merged['account_code'])
        merged['budget_amount'] = merged['budget_amount'].fillna(0)
        merged['actual_amount'] = merged['actual_amount'].fillna(0)
        merged['transaction_count'] = merged['transaction_count'].fillna(0).astype(int)
        merged['variance'] = merged['actual_amount'] - merged['budget_amount']

        has_budget = merged['budget_amount'] > 0
        # where() blanks non-positive budgets so no division by zero happens.
        pct_of_budget = merged['variance'] / merged['budget_amount'].where(has_budget) * 100
        merged['variance_pct'] = np.where(
            has_budget,
            pct_of_budget,
            np.where(merged['actual_amount'] > 0, cls._INFINITE_VARIANCE_PCT, 0)
        )

        # Clean up columns
        merged = merged.drop(columns=['account_code'], errors='ignore')
        return merged.rename(columns={'account_code_mapped': 'account_code'})

    # ------------------------------------------------------------------
    # Pipeline steps
    # ------------------------------------------------------------------

    def load_budget(self):
        """Load budget data and normalise it to account_code/period/budget_amount.

        Column names are lower-cased and matched against known aliases. If the
        file cannot be loaded, or has no account column, a sample budget is
        generated instead.

        NOTE(review): the sample budget and the "no budget amount column"
        fallback use random numbers, so variance figures produced from them
        are neither real nor reproducible.

        Returns:
            self, to allow call chaining.
        """
        print("\nüìÇ T007: Loading budget data...")

        try:
            budget = pd.read_csv(f"{Config.BUDGET_PATH}Budget_2026.csv")
            print(f"   Loaded budget data with {len(budget)} rows")

            # Standardize column names
            budget.columns = [col.lower().strip() for col in budget.columns]
            print(f"   Budget columns: {list(budget.columns)}")

            # Map period column
            period_col = self._first_present(
                budget.columns, ['fiscal_period', 'period', 'month', 'reporting_period']
            )
            if period_col:
                budget = budget.rename(columns={period_col: 'period'})
                print(f"   Using '{period_col}' as period column")
            else:
                print(f"   ‚ö†Ô∏è No period column found, assuming all rows are for {Config.CURRENT_FISCAL_PERIOD}")
                budget['period'] = Config.CURRENT_FISCAL_PERIOD

            # Map account column
            account_col = self._first_present(
                budget.columns, ['account_code', 'account', 'gl_account', 'coa']
            )
            if account_col:
                budget = budget.rename(columns={account_col: 'account_code'})
                print(f"   Using '{account_col}' as account column")

            # Map budget amount column
            budget_col = self._first_present(
                budget.columns,
                ['budget_amount_aud', 'budget_amount', 'budget', 'amount', 'planned_amount']
            )
            if budget_col:
                budget = budget.rename(columns={budget_col: 'budget_amount'})
                print(f"   Using '{budget_col}' as budget amount column")
                budget['budget_amount'] = self._clean_budget_amounts(budget['budget_amount'])
            else:
                print(f"   ‚ö†Ô∏è No budget amount column found, using synthetic data")
                budget['budget_amount'] = np.random.randint(50000, 200000, size=len(budget))

            # Ensure all key columns are string type for merging. A missing
            # account column raises KeyError here and triggers the fallback.
            budget['period'] = budget['period'].astype(str)
            budget['account_code'] = budget['account_code'].astype(str)

            self.budget_data = budget

        except (OSError, ValueError, KeyError, AttributeError) as e:
            # OSError: file missing/unreadable; ValueError: empty or
            # unparseable CSV; KeyError: no account column; AttributeError:
            # non-string column labels.
            print(f"   ‚ö†Ô∏è Budget data not found or error loading: {e}")
            # Create sample budget
            accounts = self.df['account_code_mapped'].dropna().unique() if 'account_code_mapped' in self.df.columns else ['5000']

            budget_rows = [
                {
                    'account_code': str(account),
                    'period': Config.CURRENT_FISCAL_PERIOD,
                    'budget_amount': np.random.randint(50000, 200000)
                }
                for account in accounts[:30]
            ]

            # Explicit columns keep the frame usable when there are no accounts.
            self.budget_data = pd.DataFrame(budget_rows, columns=self._BUDGET_COLUMNS)
            print(f"   Created sample budget for {len(self.budget_data)} accounts")

        return self

    def calculate_variance(self):
        """Calculate variance by account, by cost center, and overall.

        Only current-period rows with a usable ``amount_aud`` are included.
        Results are stored in ``self.variance_results``.

        Returns:
            self, to allow call chaining.
        """
        # Filter to current period only. Amounts are coerced to numbers first
        # so an object-dtype column (e.g. containing None) can still be summed.
        amount_aud = pd.to_numeric(self.df['amount_aud'], errors='coerce')
        in_scope = (self.df['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD) & amount_aud.notna()
        current_period_df = self.df[in_scope].copy()
        current_period_df['amount_aud'] = amount_aud[in_scope]

        print(f"   Processing {len(current_period_df)} transactions for {Config.CURRENT_FISCAL_PERIOD}")

        # 1. Variance by Account
        account_actuals = self._actuals_by(current_period_df, 'account_code_mapped')
        # Convert account codes to string for merging
        account_actuals['account_code_mapped'] = account_actuals['account_code_mapped'].astype(str)

        # Get budget for current period
        period_budget = self.budget_data[self.budget_data['period'] == Config.CURRENT_FISCAL_PERIOD].copy()

        if period_budget.empty:
            print(f"   ‚ö†Ô∏è No budget found for period {Config.CURRENT_FISCAL_PERIOD}, using all budget data")
            period_budget = self.budget_data.copy()

        # Ensure budget account codes are strings
        period_budget['account_code'] = period_budget['account_code'].astype(str)

        if not account_actuals.empty and not period_budget.empty:
            account_variance = self._variance_by_account(account_actuals, period_budget)
        else:
            account_variance = pd.DataFrame()

        # 2. Variance by Cost Center
        if 'cost_center_mapped' in current_period_df.columns:
            cc_actuals = self._actuals_by(current_period_df, 'cost_center_mapped')
            cc_actuals = cc_actuals[cc_actuals['cost_center_mapped'].notna()]
        else:
            cc_actuals = pd.DataFrame()

        # 3. Suspense amounts (invalid accounts)
        suspense_amount = current_period_df[
            current_period_df['account_code_mapped'].isna()
        ]['amount_aud'].sum()

        # 4. Future dated amounts: posted on or after the first day of the
        # next month, so the real month length and the time of day on the
        # last day are respected.
        period_cutoff = self._next_period_start(Config.CURRENT_YEAR, Config.CURRENT_MONTH)
        future_amount = current_period_df[
            current_period_df['posting_date'] >= period_cutoff
        ]['amount_aud'].sum()

        # 5. Total actual and budget
        total_actual = current_period_df['amount_aud'].sum()
        total_budget = period_budget['budget_amount'].sum() if not period_budget.empty else 0

        total_variance = total_actual - total_budget
        total_variance_pct = self._variance_pct(total_actual, total_budget)

        # Store results
        self.variance_results = {
            'by_account': account_variance.to_dict('records') if not account_variance.empty else [],
            'by_cost_center': cc_actuals.to_dict('records') if not cc_actuals.empty else [],
            'suspense_amount': suspense_amount,
            'future_dated_amount': future_amount,
            'total_actual': total_actual,
            'total_budget': total_budget,
            'total_variance': total_variance,
            'total_variance_pct': total_variance_pct,
            'transaction_count': len(current_period_df),
            'exception_count': current_period_df['has_exception'].sum() if 'has_exception' in current_period_df.columns else 0
        }

        print(f"\n   Variance Summary:")
        print(f"   Total Actual: ${total_actual:,.2f}")
        print(f"   Total Budget: ${total_budget:,.2f}")
        print(f"   Variance: ${total_variance:,.2f} ({total_variance_pct:.1f}%)")
        print(f"   Suspense (invalid accounts): ${suspense_amount:,.2f}")
        print(f"   Future dated: ${future_amount:,.2f}")

        return self

    def save_output(self):
        """Write the variance reports to the reports folder.

        The per-account and per-cost-center files are written only when they
        have rows; the summary file is always written.

        Returns:
            The variance results dictionary.
        """
        # Save detailed variance by account
        if self.variance_results['by_account']:
            pd.DataFrame(self.variance_results['by_account']).to_csv(
                f"{Config.REPORTS_PATH}Budget_Variance_By_Account.csv", index=False
            )

        # Save variance by cost center
        if self.variance_results['by_cost_center']:
            pd.DataFrame(self.variance_results['by_cost_center']).to_csv(
                f"{Config.REPORTS_PATH}Budget_Variance_By_CostCenter.csv", index=False
            )

        # Save summary: (label in the report, key in variance_results)
        summary_metrics = [
            ('Total Actual', 'total_actual'),
            ('Total Budget', 'total_budget'),
            ('Variance', 'total_variance'),
            ('Variance %', 'total_variance_pct'),
            ('Suspense Amount', 'suspense_amount'),
            ('Future Dated Amount', 'future_dated_amount'),
            ('Transaction Count', 'transaction_count'),
            ('Exception Count', 'exception_count'),
        ]
        summary_df = pd.DataFrame([
            {'metric': label, 'value': self.variance_results[key]}
            for label, key in summary_metrics
        ])

        summary_df.to_csv(f"{Config.REPORTS_PATH}Budget_Variance_Summary.csv", index=False)

        print(f"   üíæ Saved variance reports to {Config.REPORTS_PATH}")

        return self.variance_results

    def run(self):
        """Execute T007 steps and return the variance results dictionary."""
        print("\n" + "="*60)
        print("üöÄ T007: Computing Budget Variance")
        print("="*60)

        self.load_budget()
        self.calculate_variance()
        results = self.save_output()

        print(f"\n‚úÖ T007 Complete.")

        return results


# ============================================================================
# T008: GENERATE CLOSE PACK REPORT
# ============================================================================

class T008_ClosePackReport:
    """Task 8: Create comprehensive month-end close report.

    Builds a dictionary of report sections from the current-period
    transactions, the variance results and the exception list, then writes
    those sections as CSV files plus one plain-text close pack under
    ``Config.REPORTS_PATH``.
    """

    def __init__(self, df, variance_results, exceptions):
        """Store the report inputs.

        Args:
            df: Transaction DataFrame; a copy is kept so the caller's frame
                is not modified.
            variance_results: Mapping read with ``.get`` for the
                ``total_actual``/``total_budget``/``total_variance``/
                ``total_variance_pct`` keys.
            exceptions: Sized iterable of dicts read with ``.get``
                (``severity``, ``anomaly_type``, ``rule_id``, ``amount``).
        """
        self.df = df.copy()
        self.variance = variance_results
        self.exceptions = exceptions
        self.report_data = {}

    @staticmethod
    def _numeric_or_zero(value):
        """Return ``value`` as a float, or 0.0 for None, NaN or non-numeric input."""
        try:
            number = float(value)
        except (TypeError, ValueError):
            return 0.0
        # NaN is the only float that differs from itself.
        return 0.0 if number != number else number

    @staticmethod
    def _spend_by(frame, keys):
        """Sum ``amount_aud`` and count ``transaction_id`` per group, largest spend first."""
        return (
            frame.groupby(keys)
            .agg({'amount_aud': 'sum', 'transaction_id': 'count'})
            .reset_index()
            .sort_values('amount_aud', ascending=False)
        )

    def generate_report(self):
        """Populate ``self.report_data`` with every close-pack section.

        Only rows whose ``fiscal_period`` equals
        ``Config.CURRENT_FISCAL_PERIOD`` are summarised.

        Returns:
            ``self``, so calls can be chained.
        """
        print("\nüìù T008: Generating Close Pack Report")

        # Filter to current period
        current_df = self.df[self.df['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD].copy()
        columns = current_df.columns

        # 1. Executive Summary
        if 'data_quality_score' in columns and len(current_df) > 0:
            quality_score = current_df['data_quality_score'].iloc[0]
        else:
            # Fallback score used when the column is absent or there are no rows.
            quality_score = 85

        self.report_data['executive_summary'] = {
            'period': Config.CURRENT_FISCAL_PERIOD,
            'generated_date': datetime.now(),
            'total_transactions': len(current_df),
            'total_spend': self.variance.get('total_actual', 0),
            'total_budget': self.variance.get('total_budget', 0),
            'variance': self.variance.get('total_variance', 0),
            'variance_pct': self.variance.get('total_variance_pct', 0),
            'exception_count': len(self.exceptions),
            'critical_exception_count': sum(
                1 for exc in self.exceptions if exc.get('severity') == 'CRITICAL'
            ),
            'data_quality_score': quality_score
        }

        # 2. Top exceptions (ten largest by total amount)
        exception_totals = {}
        for exc in self.exceptions:
            exc_type = exc.get('anomaly_type', exc.get('rule_id', 'UNKNOWN'))
            bucket = exception_totals.setdefault(exc_type, {'count': 0, 'total_amount': 0})
            bucket['count'] += 1
            # A missing, None or NaN amount counts as zero instead of
            # raising or turning the whole total into NaN.
            bucket['total_amount'] += self._numeric_or_zero(exc.get('amount', 0))

        self.report_data['top_exceptions'] = sorted(
            [{'type': exc_type, **totals} for exc_type, totals in exception_totals.items()],
            key=lambda item: item['total_amount'],
            reverse=True
        )[:10]

        # 3. Top vendors by spend - fall back to vendor_name_raw when the
        #    canonical name is not available
        vendor_key = 'vendor_canonical' if 'vendor_canonical' in columns else 'vendor_name_raw'
        vendor_spend = self._spend_by(current_df, vendor_key).head(20)
        if vendor_key != 'vendor_canonical':
            vendor_spend = vendor_spend.rename(columns={'vendor_name_raw': 'vendor_canonical'})

        self.report_data['top_vendors'] = vendor_spend.to_dict('records')

        # 4. Account summary
        if 'account_description' in columns:
            account_summary = self._spend_by(
                current_df, ['account_code_mapped', 'account_description']
            )
        else:
            # Group by account code only and add a placeholder description
            account_summary = self._spend_by(current_df, 'account_code_mapped')
            account_summary['account_description'] = 'Unknown'

        self.report_data['account_summary'] = account_summary.to_dict('records')

        # 5. Cost center summary
        if 'cost_center_mapped' in columns:
            cc_summary = self._spend_by(current_df, 'cost_center_mapped')
        else:
            cc_summary = pd.DataFrame(columns=['cost_center_mapped', 'amount_aud', 'transaction_id'])

        self.report_data['cost_center_summary'] = cc_summary.to_dict('records')

        # 6. Currency exposure
        if 'currency_code' in columns and 'amount_aud' in columns:
            currency_aggs = {'amount_aud': 'sum', 'transaction_id': 'count'}
            # The original-currency amount is optional; aggregating a column
            # that does not exist would raise, and save_report already
            # tolerates its absence.
            if 'amount' in columns:
                currency_aggs = {'amount': 'sum', **currency_aggs}
            currency_summary = current_df.groupby('currency_code').agg(currency_aggs).reset_index()
        else:
            currency_summary = pd.DataFrame(columns=['currency_code', 'amount', 'amount_aud', 'transaction_id'])

        self.report_data['currency_summary'] = currency_summary.to_dict('records')

        # 7. Source system breakdown
        if 'source_system' in columns:
            source_summary = self._spend_by(current_df, 'source_system')
        else:
            source_summary = pd.DataFrame(columns=['source_system', 'amount_aud', 'transaction_id'])

        self.report_data['source_summary'] = source_summary.to_dict('records')

        print(f"   Generated report with {len(self.report_data)} sections")
        return self

    def save_report(self):
        """Write the report sections to CSV files and a text close pack.

        The executive summary CSV is always written; each other section CSV
        is written only when that section has rows.

        Returns:
            The ``report_data`` dictionary.
        """
        summary = self.report_data['executive_summary']

        # Save as CSV (tabular)
        pd.DataFrame([summary]).to_csv(
            f"{Config.REPORTS_PATH}Close_Pack_Executive_Summary.csv", index=False
        )

        section_files = (
            ('top_vendors', 'Close_Pack_Top_Vendors.csv'),
            ('account_summary', 'Close_Pack_Account_Summary.csv'),
            ('cost_center_summary', 'Close_Pack_Cost_Center_Summary.csv'),
            ('currency_summary', 'Close_Pack_Currency_Summary.csv'),
            ('source_summary', 'Close_Pack_Source_Summary.csv'),
        )
        for section, filename in section_files:
            records = self.report_data.get(section)
            if records:
                pd.DataFrame(records).to_csv(f"{Config.REPORTS_PATH}{filename}", index=False)

        # Save as text report
        with open(f"{Config.REPORTS_PATH}MonthEnd_Close_Pack_Feb2026.txt", 'w') as f:
            f.write("="*80 + "\n")
            f.write(f"MONTH-END CLOSE PACK - {Config.CURRENT_FISCAL_PERIOD}\n")
            f.write("="*80 + "\n\n")

            # Executive Summary
            f.write("EXECUTIVE SUMMARY\n")
            f.write("-"*40 + "\n")
            f.write(f"Period: {summary['period']}\n")
            f.write(f"Generated: {summary['generated_date']}\n")
            f.write(f"Total Transactions: {summary['total_transactions']:,}\n")
            f.write(f"Total Spend: ${summary['total_spend']:,.2f}\n")
            f.write(f"Total Budget: ${summary['total_budget']:,.2f}\n")
            f.write(f"Variance: ${summary['variance']:,.2f} ")
            f.write(f"({summary['variance_pct']:.1f}%)\n")
            f.write(f"Data Quality Score: {summary['data_quality_score']:.1f}/100\n\n")

            # Top Exceptions
            f.write("TOP EXCEPTIONS BY VALUE\n")
            f.write("-"*40 + "\n")
            for e in self.report_data['top_exceptions'][:5]:
                f.write(f"‚Ä¢ {e['type']}: {e['count']} occurrences, ${e['total_amount']:,.2f}\n")
            f.write("\n")

            # Top Vendors
            f.write("TOP 10 VENDORS\n")
            f.write("-"*40 + "\n")
            for v in self.report_data['top_vendors'][:10]:
                vendor_name = v.get('vendor_canonical', v.get('vendor_name_raw', 'Unknown'))
                f.write(f"‚Ä¢ {vendor_name}: ${v['amount_aud']:,.2f} ({v['transaction_id']} txns)\n")
            f.write("\n")

            # Currency Exposure
            f.write("CURRENCY EXPOSURE\n")
            f.write("-"*40 + "\n")
            for c in self.report_data['currency_summary']:
                f.write(f"‚Ä¢ {c['currency_code']}: {c['transaction_id']} txns, ")
                f.write(f"Original: ${c.get('amount', 0):,.2f}, AUD: ${c['amount_aud']:,.2f}\n")

            # Source Systems
            if self.report_data.get('source_summary'):
                f.write("\nSOURCE SYSTEMS\n")
                f.write("-"*40 + "\n")
                for s in self.report_data['source_summary'][:5]:
                    f.write(f"‚Ä¢ {s['source_system']}: ${s['amount_aud']:,.2f} ({s['transaction_id']} txns)\n")

        print(f"   üíæ Saved reports to {Config.REPORTS_PATH}")

        return self.report_data

    def run(self):
        """Execute T008 steps: build the report, then save it.

        Returns:
            The ``report_data`` dictionary returned by ``save_report``.
        """
        print("\n" + "="*60)
        print("üöÄ T008: Generating Close Pack Report")
        print("="*60)

        self.generate_report()
        report = self.save_report()

        print(f"\n‚úÖ T008 Complete. Report saved.")

        return report


# ============================================================================
# T009: GENERATE EXECUTIVE NARRATIVE (Rule-based, no LLM)
# ============================================================================

class T009_ExecutiveNarrative:
    """Task 9: Create natural language summary (rule-based, no LLM).

    The narrative is assembled from fixed text templates and simple
    thresholds applied to the variance results, the close-pack report data
    and the exception list.
    """

    def __init__(self, variance_results, report_data, exceptions):
        """Store the narrative inputs.

        Args:
            variance_results: Mapping with ``total_actual``, ``total_budget``,
                ``total_variance`` and ``total_variance_pct``; ``by_account``,
                ``suspense_amount`` and ``future_dated_amount`` are optional.
            report_data: Mapping whose ``currency_summary`` entry is a list of
                dicts with ``currency_code`` and ``amount_aud``.
            exceptions: Sized iterable of dicts read with ``.get``.
        """
        self.variance = variance_results
        self.report = report_data
        self.exceptions = exceptions
        self.narrative = ""

    @staticmethod
    def _describe_variance(variance_pct):
        """Map a signed variance percentage to the phrase used in the summary."""
        if abs(variance_pct) < 2:
            return "in line with"
        if variance_pct > 0:
            return "significantly above" if variance_pct > 10 else "moderately above"
        return "significantly below" if variance_pct < -10 else "moderately below"

    @staticmethod
    def _account_label(entry):
        """Return the account code of a by-account entry.

        The mapped code is preferred, then the plain code, mirroring the
        lookup the forecast task applies to the same entries.
        """
        return entry.get('account_code_mapped', entry.get('account_code', 'Unknown'))

    def generate_narrative(self):
        """Build the narrative text and store it in ``self.narrative``.

        Returns:
            ``self``, so calls can be chained.
        """
        print("\nüìù T009: Generating Executive Narrative")

        lines = []
        total_actual = self.variance['total_actual']

        # Header
        lines.append("="*80)
        lines.append(f"EXECUTIVE NARRATIVE - {Config.CURRENT_FISCAL_PERIOD}")
        lines.append("="*80)
        lines.append("")

        # Financial Summary
        lines.append("FINANCIAL SUMMARY")
        lines.append("-"*40)

        variance_pct = self.variance['total_variance_pct']
        variance_desc = self._describe_variance(variance_pct)

        lines.append(f"Total spend for {Config.CURRENT_FISCAL_PERIOD} was ${total_actual:,.2f}, "
                    f"which is {variance_desc} budget of ${self.variance['total_budget']:,.2f}. "
                    f"The variance is ${abs(self.variance['total_variance']):,.2f} ({variance_pct:.1f}%).")
        lines.append("")

        # Key Drivers
        lines.append("KEY VARIANCE DRIVERS")
        lines.append("-"*40)

        # Find largest variances from account data; a missing or empty
        # breakdown simply produces no driver lines.
        account_variances = self.variance.get('by_account') or []
        top_pos = sorted([a for a in account_variances if a.get('variance', 0) > 0],
                         key=lambda x: x['variance'], reverse=True)[:3]
        top_neg = sorted([a for a in account_variances if a.get('variance', 0) < 0],
                         key=lambda x: x['variance'])[:3]

        if top_pos:
            lines.append("Positive variances (over budget):")
            for a in top_pos:
                lines.append(f"  ‚Ä¢ {self._account_label(a)}: +${a['variance']:,.2f} ({a.get('variance_pct', 0):.1f}%)")

        if top_neg:
            lines.append("Negative variances (under budget):")
            for a in top_neg:
                lines.append(f"  ‚Ä¢ {self._account_label(a)}: ${a['variance']:,.2f} ({a.get('variance_pct', 0):.1f}%)")
        lines.append("")

        # Exception Summary
        lines.append("EXCEPTION SUMMARY")
        lines.append("-"*40)

        # One pass collects both the severity tallies and the type tallies.
        severity_counts = {}
        exception_types = {}
        for e in self.exceptions:
            severity = e.get('severity')
            severity_counts[severity] = severity_counts.get(severity, 0) + 1
            e_type = e.get('anomaly_type', e.get('rule_id', 'UNKNOWN'))
            exception_types[e_type] = exception_types.get(e_type, 0) + 1

        critical_count = severity_counts.get('CRITICAL', 0)
        high_count = severity_counts.get('HIGH', 0)
        medium_count = severity_counts.get('MEDIUM', 0)

        lines.append(f"Total exceptions: {len(self.exceptions)}")
        lines.append(f"  ‚Ä¢ Critical: {critical_count}")
        lines.append(f"  ‚Ä¢ High: {high_count}")
        lines.append(f"  ‚Ä¢ Medium: {medium_count}")

        # Top exception types (ties keep first-seen order because sorted is stable)
        top_types = sorted(exception_types.items(), key=lambda x: x[1], reverse=True)[:3]
        if top_types:
            lines.append("\nMost common exceptions:")
            for e_type, count in top_types:
                lines.append(f"  ‚Ä¢ {e_type}: {count} occurrences")
        lines.append("")

        # Data Quality Impact
        lines.append("DATA QUALITY IMPACT")
        lines.append("-"*40)

        suspense_amount = self.variance.get('suspense_amount', 0)
        future_amount = self.variance.get('future_dated_amount', 0)
        total_impact = suspense_amount + future_amount
        # Percentages are reported as 0 when there is no positive total to divide by.
        impact_pct = (total_impact / total_actual * 100) if total_actual > 0 else 0

        lines.append(f"Transactions with data quality issues: ${total_impact:,.2f} ({impact_pct:.1f}% of total)")
        if suspense_amount > 0:
            lines.append(f"  ‚Ä¢ Invalid accounts (in suspense): ${suspense_amount:,.2f}")
        if future_amount > 0:
            lines.append(f"  ‚Ä¢ Future-dated transactions: ${future_amount:,.2f}")
        lines.append("")

        # Currency Impact
        lines.append("CURRENCY EXPOSURE")
        lines.append("-"*40)

        foreign = [c for c in self.report['currency_summary'] if c['currency_code'] != 'AUD']
        non_aud_total = sum(c['amount_aud'] for c in foreign)
        non_aud_pct = (non_aud_total / total_actual * 100) if total_actual > 0 else 0

        lines.append(f"Foreign currency exposure: ${non_aud_total:,.2f} ({non_aud_pct:.1f}% of total)")

        # Non-AUD currencies with a positive AUD amount
        for c in foreign:
            if c['amount_aud'] > 0:
                lines.append(f"  ‚Ä¢ {c['currency_code']}: ${c['amount_aud']:,.2f}")
        lines.append("")

        # Recommendations
        lines.append("RECOMMENDATIONS")
        lines.append("-"*40)

        if suspense_amount > 10000:
            lines.append("‚Ä¢ Review and remap transactions with invalid account codes")
        if future_amount > 10000:
            lines.append("‚Ä¢ Reclassify future-dated transactions to correct period")
        if critical_count > 0:
            lines.append("‚Ä¢ Investigate critical exceptions before next close")
        if len(self.exceptions) > 100:
            lines.append("‚Ä¢ Schedule data quality workshop to address root causes")

        # Join all lines
        self.narrative = "\n".join(lines)

        print(f"   Generated {len(lines)} lines of narrative")
        return self

    def save_narrative(self):
        """Write the narrative to the reports directory.

        Returns:
            The narrative text.
        """
        with open(f"{Config.REPORTS_PATH}Executive_Narrative_Feb2026.txt", 'w') as f:
            f.write(self.narrative)

        print(f"   üíæ Saved narrative to {Config.REPORTS_PATH}Executive_Narrative_Feb2026.txt")

        return self.narrative

    def run(self):
        """Execute T009 steps: generate the narrative, then save it.

        Returns:
            The narrative text returned by ``save_narrative``.
        """
        print("\n" + "="*60)
        print("üöÄ T009: Generating Executive Narrative")
        print("="*60)

        self.generate_narrative()
        narrative = self.save_narrative()

        print(f"\n‚úÖ T009 Complete.")

        return narrative


# ============================================================================
# T010: FORECAST NEXT PERIOD
# ============================================================================

class T010_Forecast:
    """Task 10: Generate forecast for next period based on historical trends.

    Historical monthly spend is read from ``KPI_Monthly_History.csv`` under
    ``Config.REFERENCE_PATH``. When that file cannot be used, a random
    synthetic twelve-month history is generated from the current actuals so
    the task can still produce a forecast.
    """

    def __init__(self, df, variance_results):
        """Store the inputs.

        Args:
            df: Transaction DataFrame (kept as passed; not read by this class's
                own methods).
            variance_results: Mapping read with ``.get`` for ``total_actual``,
                ``transaction_count`` and ``by_account``.
        """
        self.df = df
        self.variance = variance_results
        self.historical_data = None
        self.forecast = {}

    @staticmethod
    def _first_present(candidates, columns):
        """Return the first name in ``candidates`` found in ``columns``, else None."""
        return next((name for name in candidates if name in columns), None)

    def load_historical(self):
        """Load historical KPI data into ``self.historical_data``.

        The loaded frame is normalised to have a ``period`` and a
        ``total_spend`` column. Any failure while loading (including a file
        with no data rows) falls back to synthetic history.

        Returns:
            ``self``, so calls can be chained.
        """
        print("\nüìÇ T010: Loading historical data...")

        try:
            self.historical_data = pd.read_csv(f"{Config.REFERENCE_PATH}KPI_Monthly_History.csv")
            if self.historical_data.empty:
                # A header-only file cannot drive a forecast; raising routes
                # it to the synthetic fallback below.
                raise ValueError("historical KPI file contains no rows")
            print(f"   Loaded {len(self.historical_data)} rows of historical data")

            # Standardize column names
            self.historical_data.columns = [col.lower().strip() for col in self.historical_data.columns]

            # Check for period column and rename if needed
            period_col = self._first_present(
                ['period', 'month', 'fiscal_period', 'reporting_period', 'date', 'year_month'],
                self.historical_data.columns,
            )

            if period_col:
                if period_col != 'period':
                    self.historical_data.rename(columns={period_col: 'period'}, inplace=True)
                print(f"   Using '{period_col}' as period column")
            else:
                # Create a synthetic period column if none exists. Months
                # start at 2025-01 and roll into the following year after
                # December, so histories longer than twelve rows stay valid.
                print(f"   ‚ö†Ô∏è No period column found, creating synthetic periods")
                self.historical_data['period'] = [
                    f"{2025 + idx // 12}-{idx % 12 + 1:02d}"
                    for idx in range(len(self.historical_data))
                ]

            # Check for spend column and rename if needed
            spend_col = self._first_present(
                ['total_spend', 'spend', 'amount', 'actual', 'value', 'total'],
                self.historical_data.columns,
            )

            if spend_col:
                if spend_col != 'total_spend':
                    self.historical_data.rename(columns={spend_col: 'total_spend'}, inplace=True)
                print(f"   Using '{spend_col}' as spend column")
            else:
                # Create synthetic spend data: random values between 80% and
                # 120% of the current actual.
                print(f"   ‚ö†Ô∏è No spend column found, creating synthetic data")
                base_spend = self.variance.get('total_actual', 1000000)
                self.historical_data['total_spend'] = [
                    base_spend * (0.8 + 0.4 * np.random.random())
                    for _ in range(len(self.historical_data))
                ]

            print(f"   Historical data columns: {list(self.historical_data.columns)}")

        except Exception as e:
            # Deliberate best-effort: the forecast still runs on synthetic data.
            print(f"   ‚ö†Ô∏è Historical data not found or error loading: {e}")
            # Create synthetic history from current data
            months = []
            base_spend = self.variance.get('total_actual', 1000000)
            base_count = self.variance.get('transaction_count', 1000)

            # Twelve consecutive months ending with the current month.
            for i in range(1, 13):
                month_num = Config.CURRENT_MONTH - (12 - i)
                year = Config.CURRENT_YEAR
                if month_num <= 0:
                    month_num += 12
                    year -= 1

                month = f"{year}-{month_num:02d}"
                months.append({
                    'period': month,
                    'total_spend': base_spend * (0.8 + 0.4 * np.random.random()),
                    'transaction_count': int(base_count * (0.8 + 0.4 * np.random.random()))
                })
            self.historical_data = pd.DataFrame(months)
            print(f"   Created synthetic historical data for {len(self.historical_data)} months")

        # Ensure period is string type for sorting
        self.historical_data['period'] = self.historical_data['period'].astype(str)

        return self

    def calculate_trends(self):
        """Compute the next-period forecast and store it in ``self.forecast``.

        The point forecast blends 70% of a trend-based estimate (recent
        average x recent growth x seasonal factor) with 30% of the current
        actual uplifted by 5%.

        Returns:
            ``self``, so calls can be chained.
        """

        # Sort by period
        try:
            self.historical_data = self.historical_data.sort_values('period')
        except Exception as e:
            # If sorting fails, assume data is already in order
            print(f"   ‚ö†Ô∏è Error sorting by period: {e}")

        history = self.historical_data
        n_months = len(history)

        # Calculate moving averages
        if n_months >= 3:
            history['spend_ma_3'] = history['total_spend'].rolling(3, min_periods=1).mean()
        else:
            history['spend_ma_3'] = history['total_spend']

        # Calculate growth rate
        if n_months >= 2:
            # A zero-spend month makes the following month's change infinite;
            # treat such observations as missing so they cannot blow up the
            # averages (and through them the forecast).
            history['growth_rate'] = (
                history['total_spend'].pct_change().replace([np.inf, -np.inf], np.nan)
            )
            avg_growth = history['growth_rate'].mean()
            # Handle NaN
            if pd.isna(avg_growth):
                avg_growth = 0.02
        else:
            avg_growth = 0.02  # Default 2% growth

        # Recent trend (last 3 months)
        recent_data = history.tail(min(3, n_months))
        recent_avg = recent_data['total_spend'].mean()

        if len(recent_data) >= 2:
            recent_growth = recent_data['growth_rate'].mean()
            # No usable recent observation: fall back to the overall average.
            if pd.isna(recent_growth):
                recent_growth = avg_growth
        else:
            recent_growth = avg_growth

        # Seasonal adjustment (if we have same month last year)
        # NOTE(review): this matches on the *current* month and takes the
        # earliest matching row, which may be the current period itself when
        # the history includes it - confirm whether the forecast month of the
        # prior year was intended.
        current_month_str = f"{Config.CURRENT_MONTH:02d}"
        last_year_data = history[history['period'].str.endswith(current_month_str)]

        if not last_year_data.empty and recent_avg > 0:
            seasonal_factor = last_year_data['total_spend'].iloc[0] / recent_avg
        else:
            seasonal_factor = 1.0

        # Calculate forecast for next period
        if Config.CURRENT_MONTH < 12:
            next_period = f"{Config.CURRENT_YEAR}-{Config.CURRENT_MONTH+1:02d}"
            next_month_num = Config.CURRENT_MONTH + 1
            next_year = Config.CURRENT_YEAR
        else:
            next_period = f"{Config.CURRENT_YEAR+1}-01"
            next_month_num = 1
            next_year = Config.CURRENT_YEAR + 1

        # Base forecast on recent average with growth and seasonal adjustment
        base_forecast = recent_avg * (1 + recent_growth) * seasonal_factor

        # Adjust based on current month actual
        current_actual = self.variance.get('total_actual', base_forecast)
        # Only affects the 'recent_avg' value reported in the forecast dict;
        # base_forecast was computed above from the unadjusted average.
        recent_avg = recent_avg if recent_avg > 0 else current_actual

        # Blend current and historical (70% recent trend, 30% current month with growth)
        blended_forecast = 0.7 * base_forecast + 0.3 * current_actual * 1.05  # Assume 5% growth

        # Calculate confidence interval
        if n_months > 1:
            std_dev = history['total_spend'].std()
            margin = 1.96 * std_dev / np.sqrt(n_months)
        else:
            # Too little history for a standard deviation: use +/-20%.
            margin = blended_forecast * 0.2

        lower_bound = max(0, blended_forecast - margin)
        upper_bound = blended_forecast + margin

        self.forecast = {
            'next_period': next_period,
            'next_month': next_month_num,
            'next_year': next_year,
            'forecast_amount': blended_forecast,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'confidence_level': 0.95,
            'method': 'Blended (70% trend, 30% current)',
            'historical_months_used': n_months,
            'avg_growth_rate': avg_growth,
            'seasonal_factor': seasonal_factor,
            'current_actual': current_actual,
            'recent_avg': recent_avg
        }

        print(f"\n   Forecast for {next_period}:")
        print(f"   Point forecast: ${self.forecast['forecast_amount']:,.2f}")
        print(f"   95% CI: (${self.forecast['lower_bound']:,.2f} - ${self.forecast['upper_bound']:,.2f})")

        return self

    def save_forecast(self):
        """Write the forecast (and an account-level split, when possible) to CSV.

        The account-level file allocates the point forecast in proportion to
        each account's share of the current actual; it is written only when
        the variance results carry a non-empty ``by_account`` list and a
        positive ``total_actual``.

        Returns:
            The ``forecast`` dictionary.
        """
        period_tag = self.forecast['next_period'].replace('-', '')

        # Save as CSV
        forecast_df = pd.DataFrame([self.forecast])
        forecast_df.to_csv(f"{Config.REPORTS_PATH}Forecast_{period_tag}.csv", index=False)

        # Save detailed forecast with account-level breakdown
        if 'by_account' in self.variance and self.variance['by_account']:
            account_proportions = []
            total_actual = self.variance.get('total_actual', 0)

            if total_actual > 0:
                for a in self.variance['by_account']:
                    # Accounts without positive actuals get no allocation.
                    if a.get('actual_amount', 0) > 0:
                        proportion = a['actual_amount'] / total_actual
                        account_proportions.append({
                            'account_code': a.get('account_code_mapped', a.get('account_code', 'UNKNOWN')),
                            'account_description': a.get('account_description', 'Unknown'),
                            'current_actual': a['actual_amount'],
                            'forecast_proportion': proportion,
                            'forecast_amount': proportion * self.forecast['forecast_amount']
                        })

                if account_proportions:
                    pd.DataFrame(account_proportions).to_csv(
                        f"{Config.REPORTS_PATH}Forecast_By_Account_{period_tag}.csv",
                        index=False
                    )

        print(f"   üíæ Saved forecast to {Config.REPORTS_PATH}Forecast_{period_tag}.csv")

        return self.forecast

    def run(self):
        """Execute T010 steps: load history, compute the forecast, save it.

        Returns:
            The ``forecast`` dictionary returned by ``save_forecast``.
        """
        print("\n" + "="*60)
        print("üöÄ T010: Forecasting Next Period")
        print("="*60)

        self.load_historical()
        self.calculate_trends()
        forecast = self.save_forecast()

        print(f"\n‚úÖ T010 Complete.")

        return forecast
    

# ============================================================================
# DATA VALIDATOR
# ============================================================================

class DataValidator:
    """Checks that the required master-data and budget CSV files exist and load."""

    @staticmethod
    def validate_all():
        """Check every required CSV file and report the findings on stdout.

        Each file must exist and be readable by ``pd.read_csv``; the chart of
        accounts must additionally expose one of the known account-code
        column names.

        Returns:
            True when no problem was recorded, False otherwise.
        """
        expected_files = {
            f"{Config.MASTER_DATA_PATH}Master_COA.csv": "Chart of Accounts",
            f"{Config.MASTER_DATA_PATH}Master_Entity.csv": "Entity Master",
            f"{Config.MASTER_DATA_PATH}Master_CostCenters.csv": "Cost Center Master",
            f"{Config.BUDGET_PATH}Budget_2026.csv": "Budget Data"
        }
        # Accepted spellings of the account-code column, in lookup order.
        account_code_names = ['Account_Code', 'account_code', 'AccountCode', 'Account', 'CODE']
        problems = []

        print("\nüìä DATA VALIDATION")
        print("-" * 40)

        for path, label in expected_files.items():
            if not os.path.exists(path):
                problems.append(f"‚ùå Missing {label}: {path}")
                continue

            try:
                frame = pd.read_csv(path)
                print(f"‚úÖ {label}: {len(frame)} rows")
                print(f"   Columns: {list(frame.columns)}")

                # Only the chart of accounts gets the extra column check.
                if "Master_COA.csv" not in path:
                    continue

                matched = next(
                    (name for name in account_code_names if name in frame.columns),
                    None,
                )
                if matched:
                    print(f"   ‚úì Found account code column: '{matched}'")
                else:
                    problems.append(f"   ‚ùå No account code column found in {path}. Found: {list(frame.columns)}")

            except Exception as e:
                problems.append(f"‚ùå Cannot read {label}: {e}")

        if not problems:
            print("\n‚úÖ All master data files validated successfully.\n")
            return True

        print("\n‚ö†Ô∏è DATA VALIDATION ISSUES FOUND:")
        for problem in problems:
            print(problem)
        print("\n‚úÖ Pipeline will continue but may use synthetic data where needed.\n")
        return False


# ============================================================================
# MAIN PIPELINE EXECUTION
# ============================================================================

class FinancialCloseAgent:
    """Main agent orchestrating all tasks.

    ``run_pipeline`` executes the validator and tasks T001-T010 in order,
    feeding each task the DataFrame produced by the previous one, and
    collects every intermediate result in ``self.results``.
    """

    def __init__(self):
        # Populated by run_pipeline(); keys are 'df_t001' ... 'forecast'.
        self.results = {}
        self.start_time = datetime.now()

    @staticmethod
    def _normalise_account_code(value):
        """Return an account code as a stripped string.

        Integral floats (e.g. 5000.0, which pandas produces when a numeric
        column contains blanks) are rendered without the trailing ``.0`` so
        they compare equal to the string codes used in the GL data.
        """
        if isinstance(value, float) and value.is_integer():
            value = int(value)
        return str(value).strip()

    @staticmethod
    def _budget_coverage(df, budget_data, period):
        """Compare accounts with GL activity against accounts with a budget.

        Args:
            df: GL DataFrame with 'fiscal_period' and 'account_code_mapped'.
            budget_data: Budget DataFrame. Column names are matched
                case-insensitively; the period column may be called
                'period' or 'fiscal_period', the account column
                'account_code'.
            period: Fiscal period to analyse, e.g. '2026-02'.

        Returns:
            Tuple ``(active, budgeted, missing)`` of sets of account-code
            strings, where ``missing = active - budgeted``.

        Raises:
            KeyError: if a required column is absent from either frame.
        """
        budget = budget_data.rename(columns=lambda name: str(name).strip().lower())
        period_col = next(
            (col for col in ('period', 'fiscal_period') if col in budget.columns),
            None,
        )
        if period_col is None or 'account_code' not in budget.columns:
            raise KeyError(
                "budget data needs a period and an account_code column, "
                f"found {list(budget_data.columns)}"
            )

        normalise = FinancialCloseAgent._normalise_account_code

        gl_in_period = df['fiscal_period'] == period
        active = {
            normalise(code)
            for code in df.loc[gl_in_period, 'account_code_mapped'].dropna()
        }

        budget_in_period = budget[period_col].astype(str).str.strip() == period
        budgeted = {
            normalise(code)
            for code in budget.loc[budget_in_period, 'account_code'].dropna()
        }
        return active, budgeted, active - budgeted

    def _print_budget_coverage(self, df):
        """Print which active accounts have no budget for the current period.

        This is a diagnostic only: if the budget file cannot be read or a
        required column is missing, a note is printed and the pipeline
        carries on.
        """
        print("\nüìä BUDGET COVERAGE ANALYSIS")
        print("-" * 40)

        budget_file = f"{Config.BUDGET_PATH}Budget_2026.csv"
        try:
            budget_data = pd.read_csv(budget_file)
            active_accounts, budget_accounts, missing_budget = self._budget_coverage(
                df, budget_data, Config.CURRENT_FISCAL_PERIOD
            )
        except (OSError, KeyError, ValueError) as exc:
            # OSError: file missing/unreadable; ValueError: empty or
            # unparsable CSV; KeyError: expected column not present.
            print(f"   Budget coverage analysis skipped: {exc}")
            return

        print(f"Active accounts in Feb: {len(active_accounts)}")
        print(f"Accounts with budget: {len(budget_accounts)}")

        if missing_budget:
            print(f"‚ö†Ô∏è {len(missing_budget)} active accounts have no budget")
            # Sorted so the sample is stable from run to run.
            print(f"Sample: {sorted(missing_budget)[:5]}")

    def run_pipeline(self):
        """Execute all tasks in sequence and return the results dict."""
        print("\n" + "="*80)
        print("üöÄ FINANCIAL CLOSE AGENT PIPELINE")
        print(f"   Started: {self.start_time}")
        print("="*80 + "\n")

        # Validate data files (validation problems are reported, not fatal)
        validator = DataValidator()
        if not validator.validate_all():
            print("‚ö†Ô∏è Some data validation issues were found. Proceeding with synthetic data generation.")

        # Task 001: Wrangle Raw Data
        wrangler = T001_DataWrangler()
        df, anomalies = wrangler.run(Config.RAW_DATA_PATH)
        self.results['df_t001'] = df
        self.results['anomalies'] = anomalies

        # Task 002: Map Entities and Accounts
        mapper = T002_EntityAccountMapper(df)
        df = mapper.run()
        self.results['df_t002'] = df

        # Task 003: Resolve Vendors
        resolver = T003_VendorResolver(df)
        df = resolver.run()
        self.results['df_t003'] = df

        # Task 004: FX Conversion
        converter = T004_FXConverter(df)
        df = converter.run()
        self.results['df_t004'] = df

        # Task 005: Detect Exceptions
        detector = T005_ExceptionDetector(df)
        df, exceptions = detector.run()
        self.results['df_t005'] = df
        self.results['exceptions'] = exceptions

        # Task 006: Review Exceptions (Automated)
        reviewer = T006_ExceptionReviewer(df, exceptions)
        df, review = reviewer.run()
        self.results['df_t006'] = df
        self.results['review'] = review

        # Task 007: Budget Variance
        variance = T007_BudgetVariance(df)
        variance_results = variance.run()
        self.results['variance'] = variance_results

        # Diagnostic: which accounts with activity this period lack a budget
        self._print_budget_coverage(df)

        # Task 008: Close Pack Report
        report = T008_ClosePackReport(df, variance_results, exceptions)
        report_data = report.run()
        self.results['report'] = report_data

        # Task 009: Executive Narrative
        narrative = T009_ExecutiveNarrative(variance_results, report_data, exceptions)
        narrative_text = narrative.run()
        self.results['narrative'] = narrative_text

        # Task 010: Forecast
        forecast = T010_Forecast(df, variance_results)
        forecast_data = forecast.run()
        self.results['forecast'] = forecast_data

        # Completion
        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds()

        print("\n" + "="*80)
        print("‚úÖ PIPELINE COMPLETE")
        print(f"   Finished: {end_time}")
        print(f"   Duration: {duration:.2f} seconds")
        print("="*80)

        return self.results


# ============================================================================
# EXECUTE THE PIPELINE
# ============================================================================

if __name__ == "__main__":
    # The tasks write into these folders, so make sure both are present
    for path in (Config.OUTPUT_PATH, Config.REPORTS_PATH):
        os.makedirs(path, exist_ok=True)

    # Run the whole pipeline once
    agent = FinancialCloseAgent()
    results = agent.run_pipeline()

    # Headline figures taken from the collected results
    print("\n" + "="*80)
    print("üìä FINAL SUMMARY")
    print("="*80)
    print(f"Total transactions processed: {len(results['df_t001'])}")
    print(f"Total exceptions found: {len(results['exceptions'])}")
    print(f"Critical exceptions: {sum(1 for e in results['exceptions'] if e.get('severity') == 'CRITICAL')}")
    print(f"High exceptions: {sum(1 for e in results['exceptions'] if e.get('severity') == 'HIGH')}")
    print(f"Total spend: ${results['variance']['total_actual']:,.2f}")
    print(
        f"Budget variance: ${results['variance']['total_variance']:,.2f} "
        f"({results['variance']['total_variance_pct']:.1f}%)"
    )
    print(f"Suspense amount (invalid accounts): ${results['variance']['suspense_amount']:,.2f}")
    print(f"Forecast for next period: ${results['forecast']['forecast_amount']:,.2f}")

    # Where the artefacts ended up
    print("\nOutput files saved to:")
    print(f"  ‚Ä¢ Working data: {Config.OUTPUT_PATH}")
    print(f"  ‚Ä¢ Reports: {Config.REPORTS_PATH}")
    print("="*80)


🚀 FINANCIAL CLOSE AGENT PIPELINE
   Started: 2026-02-22 22:58:45.484493


📊 DATA VALIDATION
----------------------------------------
✅ Chart of Accounts: 28 rows
   Columns: ['Account_Code', 'Account_Name', 'Account_Type', 'Category', 'Active']
   ✓ Found account code column: 'Account_Code'
✅ Entity Master: 1 rows
   Columns: ['Entity', 'Entity_Name', 'Country', 'Currency', 'Active']
✅ Cost Center Master: 10 rows
   Columns: ['Cost_Center', 'Cost_Center_Name', 'Department', 'Manager', 'Active']
✅ Budget Data: 60 rows
   Columns: ['Fiscal_Period', 'Entity', 'Account_Code', 'Cost_Center', 'Budget_Amount_AUD', 'Budget_Type', 'Notes']

✅ All master data files validated successfully.


🚀 T001: Wrangling Raw GL Data
📂 T001: Loading raw GL data...
   Loaded 4080 rows
   ✓ Column names standardized
   ✓ Dates standardized. Invalid dates: 48
   ✓ Amounts cleaned. Negative amounts: 96
   ✓ Embedded exceptions detected: 0
   💾 Saved 4080 rows to working/GL_Stan

In [13]:
"""
Financial Close Agent - Complete Pipeline
Processes Raw GL Export through all 10 tasks without human intervention
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import logging
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION AND SETUP
# ============================================================================

class Config:
    """Central settings shared by every task in the pipeline."""

    # --- Fiscal period being closed ---
    CURRENT_YEAR = 2026
    CURRENT_MONTH = 2
    # "YYYY-MM" label built from the two values above ("2026-02")
    CURRENT_FISCAL_PERIOD = f"{CURRENT_YEAR}-{CURRENT_MONTH:02d}"

    # --- Input locations ---
    RAW_DATA_PATH = "Raw_GL_Export.csv"
    MASTER_DATA_PATH = "Master_Data/"
    REFERENCE_PATH = "Reference/"
    BUDGET_PATH = "Budget/"

    # --- Output locations ---
    OUTPUT_PATH = "working/"
    REPORTS_PATH = "reports/"

    # --- Anomaly thresholds ---
    HIGH_VALUE_THRESHOLD = 50000
    EXTREME_OUTLIER_MULTIPLIER = 5
    SUSPICIOUS_HOUR_START = 22
    SUSPICIOUS_HOUR_END = 6

# ============================================================================
# T001: WRANGLE RAW GL DATA
# ============================================================================

class T001_DataWrangler:
    """Task 1: Parse and standardize raw GL export data.

    The individual steps return ``self`` so they can be chained; ``run``
    drives them in the intended order. Every data-quality problem found is
    appended to ``self.anomaly_log`` as a dict carrying at least
    'transaction_id', 'anomaly_type', 'severity' and 'description'.
    """

    # Literal values treated as invalid dates without trying to parse them.
    _INVALID_DATE_TOKENS = ('INVALID', '99/99/9999', '32/13/2026', '2026-13-45')

    # Tried in order; an ambiguous value resolves to the first format that
    # accepts it (day-first variants are listed before month-first ones).
    _DATE_FORMATS = (
        '%d-%m-%Y', '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y',
        '%d/%m/%y', '%m/%d/%y', '%d-%m-%y', '%y-%m-%d',
    )

    # Narrative substrings (lower-case) that hint at a known problem.
    _EXCEPTION_KEYWORDS = ('error', 'flag', 'review', 'urgent', 'exception', 'invalid')

    # Vendor names that are placeholders rather than real counterparties.
    _PLACEHOLDER_VENDORS = (
        'Unlisted Company', 'Unknown Vendor LLC', 'New Vendor XYZ',
        'Unregistered Supplier', 'Mystery Corp',
    )

    def __init__(self):
        self.raw_df = None
        self.standardized_df = None
        self.anomaly_log = []

    def load_raw_data(self, filepath):
        """Load raw CSV file"""
        print("üìÇ T001: Loading raw GL data...")
        self.raw_df = pd.read_csv(filepath)
        print(f"   Loaded {len(self.raw_df)} rows")
        return self

    def standardize_column_names(self):
        """Convert column names to snake_case"""
        column_mapping = {
            'Txn_ID': 'transaction_id',
            'Posting_Date_Raw': 'posting_date_raw',
            'Invoice_Date_Raw': 'invoice_date_raw',
            'Fiscal_Period': 'fiscal_period',
            'Entity': 'entity_code',
            'Account_Code_Raw': 'account_code_raw',
            'Cost_Center_Raw': 'cost_center_raw',
            'Vendor_Name_Raw': 'vendor_name_raw',
            'Invoice_Number': 'invoice_number',
            'PO_Number': 'po_number',
            'Currency': 'currency_code',
            'Amount': 'amount_raw',
            'Tax_Code': 'tax_code',
            'Narrative': 'narrative',
            'Source_System': 'source_system'
        }
        self.standardized_df = self.raw_df.rename(columns=column_mapping)
        print("   ‚úì Column names standardized")
        return self

    def _parse_date(self, date_str, txn_id, column_name):
        """Parse one raw date value; log an anomaly and return None on failure."""
        if pd.isna(date_str) or date_str in self._INVALID_DATE_TOKENS:
            self.anomaly_log.append({
                'transaction_id': txn_id,
                'anomaly_type': 'INVALID_DATE',
                'severity': 'CRITICAL',
                'description': f"Invalid date value: {date_str}",
                'column': column_name
            })
            return None

        text = str(date_str)
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                # strptime signals a non-matching format with ValueError
                continue

        # If all formats fail
        self.anomaly_log.append({
            'transaction_id': txn_id,
            'anomaly_type': 'UNPARSABLE_DATE',
            'severity': 'CRITICAL',
            'description': f"Cannot parse date: {date_str}",
            'column': column_name
        })
        return None

    def standardize_dates(self):
        """Parse posting/invoice dates and check them against the fiscal period.

        Adds 'posting_date', 'invoice_date', 'fiscal_year' and
        'fiscal_month'. A FISCAL_PERIOD_MISMATCH anomaly is logged when the
        posting date's month differs from the fiscal period's month (only
        the month is compared, not the year).
        """
        df = self.standardized_df

        # Posting dates are parsed before invoice dates so the anomaly log
        # keeps the same ordering. Building the column from a list (rather
        # than a row-wise apply) also works on an empty frame.
        for target in ('posting_date', 'invoice_date'):
            raw_column = f"{target}_raw"
            df[target] = [
                self._parse_date(raw_value, txn_id, raw_column)
                for raw_value, txn_id in zip(df[raw_column], df['transaction_id'])
            ]

        # Extract fiscal year and month
        df['fiscal_year'] = df['fiscal_period'].str[:4]
        df['fiscal_month'] = df['fiscal_period'].str[-2:]

        # Check fiscal period consistency
        rows = zip(df['transaction_id'], df['posting_date'],
                   df['fiscal_month'], df['fiscal_period'])
        for txn_id, posting_date, fiscal_month_text, fiscal_period in rows:
            if pd.isna(posting_date) or pd.isna(fiscal_month_text):
                continue
            posting_month = posting_date.month
            fiscal_month = int(fiscal_month_text)

            # A fiscal month of 0 is treated as "unknown" and not compared
            if fiscal_month and posting_month != fiscal_month:
                self.anomaly_log.append({
                    'transaction_id': txn_id,
                    'anomaly_type': 'FISCAL_PERIOD_MISMATCH',
                    'severity': 'HIGH',
                    'description': f"Posting date month ({posting_month}) != fiscal period month ({fiscal_month})",
                    'posting_date': posting_date,
                    'fiscal_period': fiscal_period
                })

        print(f"   ‚úì Dates standardized. Invalid dates: {sum(df['posting_date'].isna())}")
        return self

    def _parse_amount(self, amt_str, txn_id):
        """Convert one raw amount to float; log an anomaly and return None on failure."""
        if pd.isna(amt_str):
            return None

        # Remove currency symbols, commas, spaces
        cleaned = str(amt_str).replace('$', '').replace(',', '').strip()

        # Accounting notation: "(123.45)" means -123.45
        if cleaned.startswith('(') and cleaned.endswith(')'):
            cleaned = '-' + cleaned[1:-1]

        try:
            return float(cleaned)
        except ValueError:
            self.anomaly_log.append({
                'transaction_id': txn_id,
                'anomaly_type': 'INVALID_AMOUNT',
                'severity': 'HIGH',
                'description': f"Cannot parse amount: {amt_str}"
            })
            return None

    def clean_amounts(self):
        """Convert amount strings to floats and flag negative amounts.

        Adds 'amount' (float64, NaN where missing or unparsable) and
        'amount_is_negative'.
        """
        df = self.standardized_df

        parsed = [
            self._parse_amount(raw_value, txn_id)
            for raw_value, txn_id in zip(df['amount_raw'], df['transaction_id'])
        ]
        # Force float64 so unparsable values become NaN and the comparison
        # below never sees a Python None.
        df['amount'] = pd.Series(parsed, index=df.index, dtype='float64')

        # Flag negative amounts
        df['amount_is_negative'] = df['amount'] < 0
        negatives = df[df['amount_is_negative']]
        for txn_id, amount in zip(negatives['transaction_id'], negatives['amount']):
            self.anomaly_log.append({
                'transaction_id': txn_id,
                'anomaly_type': 'NEGATIVE_AMOUNT',
                'severity': 'MEDIUM',
                'description': f"Negative amount: {amount}",
                'amount': amount
            })

        print(f"   ‚úì Amounts cleaned. Negative amounts: {df['amount_is_negative'].sum()}")
        return self

    def detect_embedded_exceptions(self):
        """Flag rows whose narrative or vendor name hints at a known problem.

        Adds a helper column 'narrative_lower' and logs
        NARRATIVE_SUGGESTS_EXCEPTION / PLACEHOLDER_VENDOR anomalies.
        """
        df = self.standardized_df

        df['narrative_lower'] = df['narrative'].str.lower().fillna('')

        rows = zip(df['transaction_id'], df['narrative'],
                   df['narrative_lower'], df['vendor_name_raw'])
        for txn_id, narrative, narrative_lower, vendor in rows:
            # Check narrative for keywords
            haystack = str(narrative_lower)
            if any(keyword in haystack for keyword in self._EXCEPTION_KEYWORDS):
                self.anomaly_log.append({
                    'transaction_id': txn_id,
                    'anomaly_type': 'NARRATIVE_SUGGESTS_EXCEPTION',
                    'severity': 'MEDIUM',
                    'description': f"Narrative contains exception keywords: {narrative}",
                    'narrative': narrative
                })

            # Check for placeholder vendor names
            if vendor in self._PLACEHOLDER_VENDORS:
                self.anomaly_log.append({
                    'transaction_id': txn_id,
                    'anomaly_type': 'PLACEHOLDER_VENDOR',
                    'severity': 'HIGH',
                    'description': f"Placeholder vendor name: {vendor}",
                    'vendor': vendor
                })

        print(f"   ‚úì Embedded exceptions detected: {len([a for a in self.anomaly_log if a['anomaly_type'] == 'NARRATIVE_SUGGESTS_EXCEPTION'])}")
        return self

    def add_metadata(self):
        """Add processing metadata.

        'data_quality_score' is 100 minus the number of anomalies per row
        expressed as a percentage; it is not clamped, so it goes negative
        when there are more anomalies than rows.
        """
        df = self.standardized_df
        df['processing_timestamp'] = datetime.now()
        df['source_file'] = 'Raw_GL_Export.csv'
        df['data_quality_score'] = 100 - (len(self.anomaly_log) / len(df) * 100) if len(df) > 0 else 100

        # Tally anomalies per transaction once instead of rescanning the
        # whole log for every row.
        per_transaction = {}
        for anomaly in self.anomaly_log:
            key = anomaly.get('transaction_id')
            per_transaction[key] = per_transaction.get(key, 0) + 1

        # `txn_id == txn_id` is False only for NaN, which never equals any
        # logged id, so such rows get a count of 0.
        df['anomaly_count'] = [
            per_transaction.get(txn_id, 0) if txn_id == txn_id else 0
            for txn_id in df['transaction_id']
        ]
        return self

    def save_output(self):
        """Save standardized data and anomaly log.

        Returns:
            Tuple ``(standardized_df, anomaly_log)``.
        """
        os.makedirs(Config.OUTPUT_PATH, exist_ok=True)
        os.makedirs(Config.REPORTS_PATH, exist_ok=True)

        # Save standardized data
        output_cols = ['transaction_id', 'posting_date_raw', 'posting_date', 'invoice_date_raw',
                       'invoice_date', 'fiscal_period', 'fiscal_year', 'fiscal_month',
                       'entity_code', 'account_code_raw', 'cost_center_raw', 'vendor_name_raw',
                       'invoice_number', 'po_number', 'currency_code', 'amount_raw', 'amount',
                       'amount_is_negative', 'tax_code', 'narrative', 'source_system',
                       'processing_timestamp', 'data_quality_score', 'anomaly_count']

        # Only include columns that exist
        available_cols = [col for col in output_cols if col in self.standardized_df.columns]
        self.standardized_df[available_cols].to_csv(
            f"{Config.OUTPUT_PATH}GL_Standardized.csv", index=False
        )

        # Save anomaly log (the file is only written when there is something to log)
        if self.anomaly_log:
            pd.DataFrame(self.anomaly_log).to_csv(
                f"{Config.REPORTS_PATH}Input_Anomalies_Detected.csv", index=False
            )

        print(f"   üíæ Saved {len(self.standardized_df)} rows to {Config.OUTPUT_PATH}GL_Standardized.csv")
        print(f"   üíæ Saved {len(self.anomaly_log)} anomalies to {Config.REPORTS_PATH}Input_Anomalies_Detected.csv")

        return self.standardized_df, self.anomaly_log

    def run(self, filepath):
        """Execute all T001 steps and return ``(dataframe, anomaly_log)``."""
        print("\n" + "="*60)
        print("üöÄ T001: Wrangling Raw GL Data")
        print("="*60)

        self.load_raw_data(filepath)
        self.standardize_column_names()
        self.standardize_dates()
        self.clean_amounts()
        self.detect_embedded_exceptions()
        self.add_metadata()
        df, anomalies = self.save_output()

        print(f"\n‚úÖ T001 Complete. Processed {len(df)} rows, found {len(anomalies)} anomalies.")
        return df, anomalies


# ============================================================================
# T002: MAP ENTITIES AND ACCOUNTS (FIXED FOR YOUR COLUMN NAMES)
# ============================================================================

class T002_EntityAccountMapper:
    """Task 2: Resolve entity codes and account codes against master data.

    Works on a copy of the incoming DataFrame. Each mapping step adds
    ``*_valid`` / ``*_mapped`` columns and appends anomalies (dicts with
    'transaction_id', 'anomaly_type', 'severity', 'description', ...) to
    ``self.mapping_anomalies``.
    """

    def __init__(self, working_df):
        self.df = working_df.copy()
        self.entity_master = None
        self.account_master = None
        self.cost_center_master = None
        self.mapping_anomalies = []

    @staticmethod
    def _standardize_account_master(master):
        """Return a copy of the COA frame with an 'account_code' column.

        Column names are lower-cased and stripped. If no 'account_code'
        column exists, the first of 'accountcode', 'account', 'code' that
        is present is renamed; failing that, the first column is used.
        """
        master = master.copy()
        master.columns = [col.lower().strip() for col in master.columns]

        if 'account_code' not in master.columns:
            for alias in ('accountcode', 'account', 'code'):
                if alias in master.columns:
                    master.rename(columns={alias: 'account_code'}, inplace=True)
                    break
            else:
                print(f"   ‚ö†Ô∏è Could not find account code column. Using first column as account_code")
                first_col = master.columns[0]
                master.rename(columns={first_col: 'account_code'}, inplace=True)
        return master

    def load_master_data(self):
        """Load master reference files.

        Each file is loaded independently; if one cannot be read a small
        default master is substituted so the pipeline can continue.
        """
        print("\nüìÇ T002: Loading master data...")

        try:
            self.entity_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_Entity.csv")
            print(f"   Loaded {len(self.entity_master)} entities")
            print(f"   Entity columns: {list(self.entity_master.columns)}")
        except Exception:
            # Best-effort fallback, but let KeyboardInterrupt/SystemExit through
            print("   ‚ö†Ô∏è Entity master not found, creating default")
            self.entity_master = pd.DataFrame({'entity_code': ['AUS01']})

        try:
            self.account_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_COA.csv")
            print(f"   Loaded {len(self.account_master)} accounts")
            print(f"   Account columns: {list(self.account_master.columns)}")

            # Lower-case the column names and locate the account code column
            self.account_master = self._standardize_account_master(self.account_master)

            print(f"   Using '{self.account_master.columns[0]}' as account code column")

        except Exception as e:
            print(f"   ‚ö†Ô∏è Account master not found or error: {e}")
            print("   Creating default account master")
            self.account_master = pd.DataFrame({'account_code': [f"{i:04d}" for i in range(5000, 5029)]})

        try:
            self.cost_center_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_CostCenters.csv")
            print(f"   Loaded {len(self.cost_center_master)} cost centers")
            print(f"   Cost center columns: {list(self.cost_center_master.columns)}")

            # Standardize cost center column
            self.cost_center_master.columns = [col.lower().strip() for col in self.cost_center_master.columns]

            if 'cost_center' not in self.cost_center_master.columns:
                for alias in ('costcenter', 'cc'):
                    if alias in self.cost_center_master.columns:
                        self.cost_center_master.rename(columns={alias: 'cost_center'}, inplace=True)
                        break
                else:
                    # Use first column as cost center
                    first_col = self.cost_center_master.columns[0]
                    self.cost_center_master.rename(columns={first_col: 'cost_center'}, inplace=True)

        except Exception as e:
            print(f"   ‚ö†Ô∏è Cost center master not found or error: {e}")
            print("   Creating default cost center master")
            self.cost_center_master = pd.DataFrame({'cost_center': ['CC' + str(i).zfill(4) for i in range(1000, 1010)]})

        return self

    def map_entities(self):
        """Map entity codes against master.

        Adds 'entity_valid' and 'entity_code_mapped' (None where invalid)
        and logs an INVALID_ENTITY anomaly for every unmatched row.
        """
        # The entity master's columns are not normalised on load, so look
        # for the first column whose name mentions 'entity' or 'code'.
        if 'entity_code' not in self.entity_master.columns:
            for col in self.entity_master.columns:
                if 'entity' in col.lower() or 'code' in col.lower():
                    self.entity_master.rename(columns={col: 'entity_code'}, inplace=True)
                    break

        valid_entities = self.entity_master['entity_code'].tolist() if 'entity_code' in self.entity_master.columns else ['AUS01']

        self.df['entity_valid'] = self.df['entity_code'].isin(valid_entities)
        self.df['entity_code_mapped'] = np.where(
            self.df['entity_valid'],
            self.df['entity_code'],
            None
        )

        for idx, row in self.df[~self.df['entity_valid']].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_ENTITY',
                'severity': 'CRITICAL',
                'description': f"Entity code '{row['entity_code']}' not in master",
                'original_value': row['entity_code']
            })

        print(f"   ‚úì Entities mapped. Invalid: {(~self.df['entity_valid']).sum()}")
        return self

    def map_accounts(self):
        """Map account codes against master with tolerant matching.

        A raw code is accepted when, in this order, (1) it equals a master
        code exactly, (2) it equals one after conversion to a stripped
        string, or (3) both parse as numbers and are numerically equal
        (e.g. '5002.0' vs '5002'). Adds 'account_code_clean',
        'account_valid', 'account_code_mapped' (the master's code, None
        where invalid) and, when the master has an 'account_name' column,
        'account_description'. Logs INVALID_ACCOUNT anomalies.
        """
        # Get valid account codes from master (stripped strings, de-duplicated
        # in master order so results and the printed sample are deterministic)
        if 'account_code' in self.account_master.columns:
            valid_accounts = list(dict.fromkeys(
                str(acct).strip() for acct in self.account_master['account_code'].tolist()
            ))
            print(f"   Sample valid accounts: {valid_accounts[:5]}")
        else:
            print("   ‚ö†Ô∏è No account_code column found in master")
            valid_accounts = []

        valid_set = set(valid_accounts)

        # numeric value -> master code, for strategy 3; the first master
        # code wins if two are numerically equal
        numeric_lookup = {}
        for acct in valid_accounts:
            try:
                number = float(acct)
            except ValueError:
                continue
            if number == number:  # skip NaN, it can never match
                numeric_lookup.setdefault(number, acct)

        # Clean raw account codes for comparison
        self.df['account_code_clean'] = self.df['account_code_raw'].astype(str).str.strip()

        def resolve(raw, clean):
            """Return the master code matching this row, or None."""
            if raw in valid_set:
                return raw
            if clean in valid_set:
                return clean
            try:
                return numeric_lookup.get(float(raw))
            except (TypeError, ValueError, OverflowError):
                return None

        mapped = [
            resolve(raw, clean)
            for raw, clean in zip(self.df['account_code_raw'].tolist(),
                                  self.df['account_code_clean'].tolist())
        ]
        self.df['account_valid'] = [code is not None for code in mapped]
        # dtype=object keeps None for unmatched rows instead of coercing it
        self.df['account_code_mapped'] = pd.Series(mapped, index=self.df.index, dtype=object)

        # Get account names/descriptions if available
        if 'account_name' in self.account_master.columns:
            account_desc_map = {}
            for code, name in zip(self.account_master['account_code'],
                                  self.account_master['account_name']):
                acct = str(code).strip()
                account_desc_map[acct] = name
                # Also add without leading zeros
                if acct.isdigit():
                    account_desc_map[str(int(acct))] = name

            self.df['account_description'] = self.df['account_code_mapped'].map(account_desc_map)
            print(f"   Added account descriptions")

        # Log anomalies for invalid accounts
        invalid_count = (~self.df['account_valid']).sum()
        for idx, row in self.df[~self.df['account_valid']].iterrows():
            severity = 'CRITICAL' if str(row['account_code_raw']) == 'INVALID_ACCT' else 'HIGH'
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_ACCOUNT',
                'severity': severity,
                'description': f"Account code '{row['account_code_raw']}' not in Chart of Accounts",
                'original_value': row['account_code_raw'],
                'amount': row['amount']
            })

        print(f"   ‚úì Accounts mapped. Valid: {self.df['account_valid'].sum()}, Invalid: {invalid_count}")
        return self

    def map_cost_centers(self):
        """Map cost centers against master.

        Adds 'cost_center_present', 'cost_center_valid' and
        'cost_center_mapped'. Missing values log MISSING_COST_CENTER;
        present-but-unknown values log INVALID_COST_CENTER. When the master
        has no usable codes, every present cost center counts as valid.
        """
        if 'cost_center' in self.cost_center_master.columns:
            valid_centers = self.cost_center_master['cost_center'].tolist()
        else:
            valid_centers = []

        # Handle missing cost centers
        self.df['cost_center_present'] = self.df['cost_center_raw'].notna() & (self.df['cost_center_raw'] != '')
        self.df['cost_center_valid'] = self.df['cost_center_raw'].isin(valid_centers) if valid_centers else self.df['cost_center_present']
        self.df['cost_center_mapped'] = np.where(
            self.df['cost_center_valid'],
            self.df['cost_center_raw'],
            None
        )

        missing_mask = ~self.df['cost_center_present']
        invalid_mask = self.df['cost_center_present'] & ~self.df['cost_center_valid']

        for idx, row in self.df[missing_mask].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'MISSING_COST_CENTER',
                'severity': 'MEDIUM',
                'description': "Cost center is missing",
                'amount': row['amount']
            })

        for idx, row in self.df[invalid_mask].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_COST_CENTER',
                'severity': 'HIGH',
                'description': f"Cost center '{row['cost_center_raw']}' not in master",
                'original_value': row['cost_center_raw']
            })

        print(f"   ‚úì Cost centers mapped. Missing: {missing_mask.sum()}, Invalid: {invalid_mask.sum()}")
        return self

    def save_output(self):
        """Save mapped data and the combined exceptions log.

        The anomalies file written by T001 (if present) is concatenated
        with this task's anomalies and written to Exceptions_Log.csv.
        """
        # Update anomaly log with new anomalies
        input_anomalies_path = f"{Config.REPORTS_PATH}Input_Anomalies_Detected.csv"
        if os.path.exists(input_anomalies_path):
            existing_anomalies = pd.read_csv(input_anomalies_path)
        else:
            existing_anomalies = pd.DataFrame()

        all_anomalies = pd.concat([
            existing_anomalies,
            pd.DataFrame(self.mapping_anomalies)
        ], ignore_index=True)

        all_anomalies.to_csv(f"{Config.REPORTS_PATH}Exceptions_Log.csv", index=False)

        # Save enriched data
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_WithMappings.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_WithMappings.csv")
        print(f"   üíæ Updated exceptions log with {len(self.mapping_anomalies)} new anomalies")

        return self.df

    def run(self):
        """Execute all T002 steps and return the enriched DataFrame."""
        print("\n" + "="*60)
        print("üöÄ T002: Mapping Entities and Accounts")
        print("="*60)

        self.load_master_data()
        self.map_entities()
        self.map_accounts()
        self.map_cost_centers()
        df = self.save_output()

        print(f"\n‚úÖ T002 Complete. Mapped {len(df)} transactions.")
        return df


# ============================================================================
# T003: RESOLVE VENDOR NAMES (FIXED FOR YOUR COLUMN NAMES)
# ============================================================================

class T003_VendorResolver:
    """Task 3: Map vendor aliases to canonical vendor names.

    Works on a copy of the incoming frame. Adds ``vendor_canonical`` and
    ``vendor_resolution_status`` columns and collects MISSING / UNMAPPED
    vendors in ``self.vendor_anomalies``.
    """

    # Suffixes stripped from alias-map entries to create extra lookup keys.
    _ALIAS_SUFFIXES = (' pty', ' ltd', ' inc', ' corp', ' llc', ' australia', ' usa', ' uk')
    # Suffixes stripped from vendor-master names to create extra lookup keys.
    _MASTER_SUFFIXES = (' pty', ' ltd', ' inc', ' corp', ' llc')
    # Columns written when the exceptions log would otherwise have no header.
    _LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description', 'amount']

    def __init__(self, working_df):
        """Copy ``working_df`` so the caller's frame is never mutated."""
        self.df = working_df.copy()
        self.vendor_master = None
        self.alias_map = None
        self.vendor_anomalies = []

    def load_vendor_data(self):
        """Load the vendor master and alias map from ``Config.MASTER_DATA_PATH``.

        Column names are lower-cased and stripped, then renamed so the rest of
        the class can rely on ``canonical_vendor`` and ``alias``. Any failure
        while loading is reported and replaced by a placeholder frame, so the
        pipeline keeps running.

        Returns:
            self, to allow chaining.
        """
        print("\nüìÇ T003: Loading vendor data...")

        try:
            self.vendor_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_Vendors.csv")
            print(f"   Loaded {len(self.vendor_master)} canonical vendors")
            print(f"   Vendor master columns: {list(self.vendor_master.columns)}")

            # Standardize column names
            self.vendor_master.columns = [col.lower().strip() for col in self.vendor_master.columns]

            # Map to expected column names
            if 'vendor_name_canonical' in self.vendor_master.columns:
                self.vendor_master.rename(columns={'vendor_name_canonical': 'canonical_vendor'}, inplace=True)
                print(f"   Using 'vendor_name_canonical' as canonical vendor column")
            elif 'vendor_name' in self.vendor_master.columns:
                self.vendor_master.rename(columns={'vendor_name': 'canonical_vendor'}, inplace=True)

        except Exception as e:
            # Deliberate best-effort: continue with a placeholder master.
            print(f"   ‚ö†Ô∏è Vendor master not found or error: {e}")
            print("   Creating default vendor master")
            self.vendor_master = pd.DataFrame({'canonical_vendor': ['Unknown']})

        try:
            self.alias_map = pd.read_csv(f"{Config.MASTER_DATA_PATH}Vendor_Alias_Map.csv")
            print(f"   Loaded {len(self.alias_map)} alias mappings")
            print(f"   Alias map columns: {list(self.alias_map.columns)}")

            # Standardize column names
            self.alias_map.columns = [col.lower().strip() for col in self.alias_map.columns]

            # Map to expected column names
            if 'vendor_name_raw' in self.alias_map.columns:
                self.alias_map.rename(columns={'vendor_name_raw': 'alias'}, inplace=True)

            if 'vendor_name_canonical' in self.alias_map.columns:
                self.alias_map.rename(columns={'vendor_name_canonical': 'canonical_vendor'}, inplace=True)

            print(f"   Alias map now has columns: {list(self.alias_map.columns)}")

        except Exception as e:
            # Deliberate best-effort: continue with an empty alias map.
            print(f"   ‚ö†Ô∏è Alias map not found or error: {e}")
            self.alias_map = pd.DataFrame({'alias': [], 'canonical_vendor': []})

        return self

    def build_alias_dict(self):
        """Build a lower-cased lookup from alias text to canonical vendor.

        For every alias-map row three keys may be registered: the full alias,
        the alias without a trailing suffix from ``_ALIAS_SUFFIXES``, and the
        first word when it is longer than three characters. Canonical names
        from the vendor master are then added as self-mappings (with and
        without a trailing suffix) and override alias-map keys on collision.

        Rows whose alias or canonical name is null or blank are skipped;
        otherwise ``str(NaN)`` would register a literal ``'nan'`` alias.

        Returns:
            dict mapping lower-cased key -> canonical vendor name.
        """
        alias_dict = {}

        alias_map = self.alias_map
        has_alias_columns = (
            alias_map is not None
            and len(alias_map) > 0
            and 'alias' in alias_map.columns
            and 'canonical_vendor' in alias_map.columns
        )
        if has_alias_columns:
            for alias, canonical in zip(alias_map['alias'], alias_map['canonical_vendor']):
                if pd.isna(alias) or pd.isna(canonical):
                    continue
                alias_lower = str(alias).strip().lower()
                if not alias_lower:
                    continue

                alias_dict[alias_lower] = canonical

                # Also store without common suffixes
                for suffix in self._ALIAS_SUFFIXES:
                    if alias_lower.endswith(suffix):
                        alias_dict[alias_lower[:-len(suffix)]] = canonical

                # Store first word for partial matching
                first_word = alias_lower.split()[0]
                if len(first_word) > 3:
                    alias_dict[first_word] = canonical

        # Add self-mappings for exact matches from vendor master
        if self.vendor_master is not None and 'canonical_vendor' in self.vendor_master.columns:
            for vendor in self.vendor_master['canonical_vendor'].dropna():
                vendor_lower = str(vendor).lower()
                alias_dict[vendor_lower] = vendor

                for suffix in self._MASTER_SUFFIXES:
                    if vendor_lower.endswith(suffix):
                        alias_dict[vendor_lower[:-len(suffix)]] = vendor

        print(f"   Built alias dictionary with {len(alias_dict)} entries")
        return alias_dict

    def resolve_vendors(self):
        """Resolve ``vendor_name_raw`` to a canonical vendor for every row.

        Strategies are tried in order: alias lookup, exact canonical name,
        lookup after removing punctuation, substring containment, and word
        overlap (Jaccard score above 0.5). Null, empty and whitespace-only
        names are reported as MISSING. Each distinct raw name is resolved
        once and the result reused for all rows carrying it.

        Returns:
            self, to allow chaining.
        """
        # Function-scope import: ``re`` is not imported at module level.
        import re

        alias_dict = self.build_alias_dict()

        # Get list of canonical vendor names for fuzzy matching
        if 'canonical_vendor' in self.vendor_master.columns:
            canonical_list = self.vendor_master['canonical_vendor'].dropna().unique().tolist()
        else:
            canonical_list = []

        print(f"   Canonical vendor list has {len(canonical_list)} entries")

        # Loop-invariant work hoisted out of the per-vendor resolver.
        canonical_set = set(canonical_list)
        canonical_index = [
            (name, str(name).lower(), set(str(name).lower().split()))
            for name in canonical_list
        ]
        punctuation = re.compile(r'[^\w\s]')

        def resolve(vendor_raw):
            """Return ``(canonical_or_None, status)`` for one non-null raw name."""
            vendor_original = str(vendor_raw).strip()
            if not vendor_original:
                # Whitespace-only input carries no vendor information.
                return None, 'MISSING'
            vendor_lower = vendor_original.lower()

            # STRATEGY 1: Direct alias match
            if vendor_lower in alias_dict:
                return alias_dict[vendor_lower], 'MAPPED'

            # STRATEGY 2: Check if it's already a canonical name
            if vendor_original in canonical_set:
                return vendor_original, 'CANONICAL'

            # STRATEGY 3: Check cleaned version (remove special characters)
            vendor_clean = punctuation.sub('', vendor_lower)
            if vendor_clean in alias_dict:
                return alias_dict[vendor_clean], 'CLEANED_MATCH'

            # STRATEGY 4: Try partial matching (contains)
            for canonical, canonical_lower, _ in canonical_index:
                if canonical_lower in vendor_lower:
                    return canonical, 'PARTIAL_MATCH'
                # Short raw names are not matched this way.
                if len(vendor_lower) > 5 and vendor_lower in canonical_lower:
                    return canonical, 'PARTIAL_MATCH'

            # STRATEGY 5: Word-by-word matching (Jaccard similarity)
            vendor_words = set(vendor_lower.split())
            best_match = None
            best_match_score = 0

            for canonical, _, canonical_words in canonical_index:
                union = len(vendor_words | canonical_words)
                if union > 0:
                    score = len(vendor_words & canonical_words) / union
                    if score > 0.5 and score > best_match_score:  # 50% word overlap
                        best_match = canonical
                        best_match_score = score

            if best_match:
                return best_match, f'WORD_MATCH_{best_match_score:.0%}'

            # No match found
            return None, 'UNMAPPED'

        # Apply resolution once per distinct raw value.
        print("   Resolving vendors (this may take a moment)...")
        raw_names = self.df['vendor_name_raw']
        resolved = {value: resolve(value) for value in raw_names.dropna().unique()}
        results = [
            resolved[value] if pd.notna(value) else (None, 'MISSING')
            for value in raw_names
        ]
        self.df['vendor_canonical'] = [r[0] for r in results]
        self.df['vendor_resolution_status'] = [r[1] for r in results]

        # Log anomalies (only flagged rows are visited, in row order)
        flagged = self.df[self.df['vendor_resolution_status'].isin(['MISSING', 'UNMAPPED'])]
        for _, row in flagged.iterrows():
            if row['vendor_resolution_status'] == 'MISSING':
                self.vendor_anomalies.append({
                    'transaction_id': row['transaction_id'],
                    'anomaly_type': 'MISSING_VENDOR',
                    'severity': 'HIGH',
                    'description': 'Vendor name is missing',
                    'amount': row['amount']
                })
            else:
                self.vendor_anomalies.append({
                    'transaction_id': row['transaction_id'],
                    'anomaly_type': 'UNMAPPED_VENDOR',
                    'severity': 'HIGH',
                    'description': f"Vendor '{row['vendor_name_raw']}' not found in alias map",
                    'original_value': row['vendor_name_raw'],
                    'amount': row['amount']
                })

        # Calculate statistics
        status = self.df['vendor_resolution_status']
        mapped_count = status.isin(['MAPPED', 'CANONICAL', 'CLEANED_MATCH', 'PARTIAL_MATCH']).sum()
        word_match_count = status.str.contains('WORD_MATCH', na=False).sum()
        unmapped_count = (status == 'UNMAPPED').sum()
        missing_count = (status == 'MISSING').sum()

        print(f"\n   üìä Vendor Resolution Results:")
        print(f"   ‚Ä¢ Direct matches: {mapped_count}")
        print(f"   ‚Ä¢ Word matches: {word_match_count}")
        print(f"   ‚Ä¢ Unmapped: {unmapped_count}")
        print(f"   ‚Ä¢ Missing: {missing_count}")

        # Show sample of unmapped vendors for debugging
        if unmapped_count > 0:
            unmapped_samples = self.df[status == 'UNMAPPED']['vendor_name_raw'].dropna().unique()[:10]
            print(f"\n   Sample unmapped vendors: {list(unmapped_samples)}")

        return self

    def save_output(self):
        """Append vendor anomalies to the exceptions log and save the frame.

        Returns:
            The vendor-resolved DataFrame (``self.df``).
        """
        exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        # A missing log and a header-less (empty) log are both treated as
        # "no prior exceptions"; pd.read_csv raises EmptyDataError otherwise.
        try:
            existing = pd.read_csv(exceptions_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            existing = None

        new_rows = pd.DataFrame(self.vendor_anomalies)
        if existing is None:
            all_exceptions = new_rows
        else:
            all_exceptions = pd.concat([existing, new_rows], ignore_index=True)

        # Always write a header so later tasks can read the log back.
        if len(all_exceptions.columns) == 0:
            all_exceptions = pd.DataFrame(columns=self._LOG_COLUMNS)

        all_exceptions.to_csv(exceptions_path, index=False)

        # Save data
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_VendorsResolved.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_VendorsResolved.csv")

        return self.df

    def run(self):
        """Execute all T003 steps and return the vendor-resolved DataFrame."""
        print("\n" + "="*60)
        print("üöÄ T003: Resolving Vendor Names")
        print("="*60)

        self.load_vendor_data()
        self.resolve_vendors()
        df = self.save_output()

        print(f"\n‚úÖ T003 Complete. Processed {len(df)} transactions.")
        return df


# ============================================================================
# T004: APPLY FX CONVERSION
# ============================================================================

class T004_FXConverter:
    """Task 4: Convert all transactions to AUD.

    Works on a copy of the incoming frame. Adds ``fx_key``, ``fx_rate``,
    ``amount_aud`` and ``conversion_status`` columns and records a
    MISSING_FX_RATE anomaly for every non-AUD row without a usable rate.
    """

    # Rates used only when FX_Rates.csv cannot be loaded.
    # NOTE(review): these are hard-coded placeholders, not sourced rates.
    _FALLBACK_RATES = {'AUD': 1.0, 'USD': 1.5, 'GBP': 1.9, 'NZD': 0.95, 'EUR': 1.6}
    # Columns written when the exceptions log would otherwise have no header.
    _LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description', 'amount']

    def __init__(self, working_df):
        """Copy ``working_df`` so the caller's frame is never mutated."""
        self.df = working_df.copy()
        self.fx_rates = None
        self.fx_anomalies = []

    def load_fx_rates(self):
        """Load ``FX_Rates.csv`` from ``Config.REFERENCE_PATH``.

        When loading fails for any reason, a fallback table is built for
        every period/currency combination present in the data. Currencies
        without a fallback value get a null rate, which ``convert_to_aud``
        reports as a missing rate.

        Returns:
            self, to allow chaining.
        """
        print("\nüìÇ T004: Loading FX rates...")

        try:
            self.fx_rates = pd.read_csv(f"{Config.REFERENCE_PATH}FX_Rates.csv")
            print(f"   Loaded {len(self.fx_rates)} FX rates")

            # Ensure period is string for joining
            self.fx_rates['period'] = self.fx_rates['period'].astype(str)

        except Exception as e:
            # Deliberate best-effort: keep the pipeline running on fallbacks.
            print(f"   ‚ö†Ô∏è FX rates not found: {e}")
            periods = self.df['fiscal_period'].unique()
            currencies = self.df['currency_code'].unique()

            rates_data = [
                {
                    'period': period,
                    'currency': currency,
                    'rate': self._FALLBACK_RATES.get(currency),
                }
                for period in periods
                for currency in currencies
            ]

            # Explicit columns keep the table usable even when it has no rows.
            self.fx_rates = pd.DataFrame(rates_data, columns=['period', 'currency', 'rate'])
            print(f"   Created default rates for {len(self.fx_rates)} currency-period combinations")

        return self

    def convert_to_aud(self):
        """Convert ``amount`` to AUD using the period/currency rate table.

        AUD rows always use a rate of 1.0. For other rows the rate is looked
        up by ``<fiscal_period>_<currency_code>``. A rate that is absent or
        null marks the row FAILED, leaves ``amount_aud`` as NaN and logs a
        CRITICAL anomaly. ``amount_aud`` is kept numeric (NaN, not None) so
        later aggregation works.

        Returns:
            self, to allow chaining.
        """
        # Create lookup key (astype(str) tolerates non-string period values)
        self.df['fx_key'] = (
            self.df['fiscal_period'].astype(str) + '_' + self.df['currency_code'].astype(str)
        )
        self.fx_rates['fx_key'] = (
            self.fx_rates['period'].astype(str) + '_' + self.fx_rates['currency'].astype(str)
        )

        # Create rate lookup dictionary (last entry wins on duplicate keys)
        rate_dict = dict(zip(self.fx_rates['fx_key'], self.fx_rates['rate']))

        is_domestic = self.df['currency_code'] == 'AUD'
        looked_up = pd.to_numeric(self.df['fx_key'].map(rate_dict), errors='coerce')
        # Domestic rows get 1.0 regardless of what the table says.
        fx_rate = looked_up.where(~is_domestic, 1.0)

        # Absent and null rates are both unusable and must be reported.
        missing_rate = fx_rate.isna()
        for _, row in self.df[missing_rate].iterrows():
            self.fx_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'MISSING_FX_RATE',
                'severity': 'CRITICAL',
                'description': f"No FX rate found for {row['currency_code']} in period {row['fiscal_period']}",
                'currency': row['currency_code'],
                'period': row['fiscal_period'],
                'amount': row['amount']
            })

        # Apply conversion; NaN rates propagate to NaN amounts.
        self.df['fx_rate'] = fx_rate
        self.df['amount_aud'] = self.df['amount'] * fx_rate

        # Flag conversion issues
        self.df['conversion_status'] = np.where(
            is_domestic, 'DOMESTIC',
            np.where(fx_rate.notna(), 'CONVERTED', 'FAILED')
        )

        converted = (self.df['conversion_status'] == 'CONVERTED').sum()
        failed = (self.df['conversion_status'] == 'FAILED').sum()
        domestic = (self.df['conversion_status'] == 'DOMESTIC').sum()

        print(f"   ‚úì FX conversion complete. Domestic: {domestic}, Converted: {converted}, Failed: {failed}")
        return self

    def save_output(self):
        """Append FX anomalies to the exceptions log and save the frame.

        Returns:
            The converted DataFrame (``self.df``).
        """
        exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        # A missing log and a header-less (empty) log are both treated as
        # "no prior exceptions"; pd.read_csv raises EmptyDataError otherwise.
        try:
            existing = pd.read_csv(exceptions_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            existing = None

        new_rows = pd.DataFrame(self.fx_anomalies)
        if existing is None:
            all_exceptions = new_rows
        else:
            all_exceptions = pd.concat([existing, new_rows], ignore_index=True)

        # Always write a header so later tasks can read the log back.
        if len(all_exceptions.columns) == 0:
            all_exceptions = pd.DataFrame(columns=self._LOG_COLUMNS)

        all_exceptions.to_csv(exceptions_path, index=False)

        # Save data
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_Converted.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_Converted.csv")

        return self.df

    def run(self):
        """Execute all T004 steps and return the converted DataFrame."""
        print("\n" + "="*60)
        print("üöÄ T004: Applying FX Conversion")
        print("="*60)

        self.load_fx_rates()
        self.convert_to_aud()
        df = self.save_output()

        print(f"\n‚úÖ T004 Complete. Processed {len(df)} transactions.")
        return df


# ============================================================================
# T005: DETECT EXCEPTIONS
# ============================================================================

class T005_ExceptionDetector:
    """Task 5: Run exception rules and flag violations.

    Works on a copy of the incoming frame. Adds outlier and timing columns,
    evaluates the rulebook row by row and collects one record per violation
    in ``self.exception_results``.
    """

    # Columns written when the exceptions log would otherwise have no header.
    _LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description', 'amount']
    # Per-account statistic columns added by detect_outliers.
    _STAT_COLUMNS = ['mean_amount', 'std_amount', 'txn_count']

    def __init__(self, working_df):
        """Copy ``working_df`` so the caller's frame is never mutated."""
        self.df = working_df.copy()
        self.rulebook = None
        self.exception_results = []

    @staticmethod
    def _period_cutoff(year, month):
        """Return the first instant after the given month (start of the next)."""
        if month == 12:
            return datetime(year + 1, 1, 1)
        return datetime(year, month + 1, 1)

    @staticmethod
    def _is_flag_set(value):
        """Return True only for a real truthy flag; None and NaN count as unset."""
        if value is None:
            return False
        try:
            if pd.isna(value):
                return False
        except (TypeError, ValueError):
            # Non-scalar values cannot be null-tested; fall back to truthiness.
            pass
        return bool(value)

    @staticmethod
    def _exception_record(row, rule_id, rule_name, severity, description):
        """Build the dict stored in ``exception_results`` for one violation."""
        return {
            'transaction_id': row['transaction_id'],
            'rule_id': rule_id,
            'rule_name': rule_name,
            'severity': severity,
            'description': description,
            'amount': row.get('amount_aud', 0),
            'vendor': row.get('vendor_name_raw', ''),
            'account': row.get('account_code_raw', '')
        }

    def load_rulebook(self):
        """Load ``Exception_Rulebook.csv`` or fall back to ten built-in rules.

        Missing ``rule_id``, ``rule_name``, ``severity`` or ``description``
        columns are filled with defaults so ``apply_rules`` can rely on them.

        Returns:
            self, to allow chaining.
        """
        print("\nüìÇ T005: Loading exception rulebook...")

        try:
            self.rulebook = pd.read_csv(f"{Config.REFERENCE_PATH}Exception_Rulebook.csv")
            print(f"   Loaded {len(self.rulebook)} exception rules")

            # Check if required columns exist, if not, create default rule IDs
            if 'rule_id' not in self.rulebook.columns:
                self.rulebook['rule_id'] = [f'EX{i+1:03d}' for i in range(len(self.rulebook))]
                print(f"   Added default rule_id column")

        except Exception as e:
            # Deliberate best-effort: continue with the built-in rules.
            print(f"   ‚ö†Ô∏è Rulebook not found or error loading: {e}")
            # The 'logic' strings are descriptive only; the executable logic
            # lives in apply_rules.
            self.rulebook = pd.DataFrame([
                {'rule_id': 'EX001', 'rule_name': 'Missing PO Number',
                 'severity': 'HIGH', 'logic': 'po_number is None or po_number == ""',
                 'description': 'Transaction has no purchase order number'},
                {'rule_id': 'EX002', 'rule_name': 'Missing Cost Center',
                 'severity': 'MEDIUM', 'logic': 'cost_center_mapped is None',
                 'description': 'Transaction has no cost center allocation'},
                {'rule_id': 'EX003', 'rule_name': 'Invalid Account',
                 'severity': 'CRITICAL', 'logic': 'account_code_mapped is None',
                 'description': 'Account code not in Chart of Accounts'},
                {'rule_id': 'EX004', 'rule_name': 'High Value Transaction',
                 'severity': 'MEDIUM', 'logic': f'amount_aud > {Config.HIGH_VALUE_THRESHOLD}',
                 'description': f'Transaction exceeds ${Config.HIGH_VALUE_THRESHOLD:,}'},
                {'rule_id': 'EX005', 'rule_name': 'Negative Amount',
                 'severity': 'MEDIUM', 'logic': 'amount_is_negative == True',
                 'description': 'Transaction has negative amount'},
                {'rule_id': 'EX006', 'rule_name': 'Unmapped Vendor',
                 'severity': 'HIGH', 'logic': 'vendor_resolution_status == "UNMAPPED"',
                 'description': 'Vendor not found in master data'},
                {'rule_id': 'EX007', 'rule_name': 'Future Dated Transaction',
                 'severity': 'HIGH', 'logic': 'posting_date > current_date and fiscal_period == current_period',
                 'description': 'Transaction date is in future but in current period'},
                {'rule_id': 'EX008', 'rule_name': 'Invalid Date',
                 'severity': 'CRITICAL', 'logic': 'posting_date is None',
                 'description': 'Posting date is invalid or missing'},
                {'rule_id': 'EX009', 'rule_name': 'Missing Tax Code',
                 'severity': 'MEDIUM', 'logic': 'tax_code is None or tax_code == ""',
                 'description': 'Tax code is missing'},
                {'rule_id': 'EX010', 'rule_name': 'Extreme Outlier',
                 'severity': 'MEDIUM', 'logic': 'is_outlier == True',
                 'description': 'Amount is significantly outside normal range'},
            ])
            print(f"   Created {len(self.rulebook)} default exception rules")

        # Ensure all required columns exist
        rule_count = len(self.rulebook)
        if 'rule_id' not in self.rulebook.columns:
            self.rulebook['rule_id'] = [f'EX{i+1:03d}' for i in range(rule_count)]
        if 'rule_name' not in self.rulebook.columns:
            self.rulebook['rule_name'] = [f'Rule {i+1}' for i in range(rule_count)]
        if 'severity' not in self.rulebook.columns:
            self.rulebook['severity'] = 'MEDIUM'
        if 'description' not in self.rulebook.columns:
            # rule_name is guaranteed to exist at this point.
            self.rulebook['description'] = self.rulebook.get('rule_name', 'No description')

        print(f"   Ready with {len(self.rulebook)} rules")
        return self

    def detect_outliers(self):
        """Flag amounts far from their account's mean.

        A row is an outlier when its account has a positive standard
        deviation and ``amount_aud`` differs from the account mean by more
        than ``Config.EXTREME_OUTLIER_MULTIPLIER`` standard deviations.
        Adds ``mean_amount``, ``std_amount``, ``txn_count`` and
        ``is_outlier``. The merge gives the frame a fresh positional index.

        Returns:
            self, to allow chaining.
        """
        # Drop statistics from an earlier call so the merge does not create
        # suffixed duplicate columns.
        stale = [col for col in self._STAT_COLUMNS if col in self.df.columns]
        if stale:
            self.df = self.df.drop(columns=stale)

        # Aggregate on a numeric view: amount_aud may arrive as object dtype.
        numeric_amounts = pd.to_numeric(self.df['amount_aud'], errors='coerce')
        account_stats = (
            numeric_amounts
            .groupby(self.df['account_code_mapped'])
            .agg(['mean', 'std', 'count'])
            .reset_index()
        )
        account_stats.columns = ['account_code_mapped'] + self._STAT_COLUMNS

        # Merge stats back
        self.df = self.df.merge(account_stats, on='account_code_mapped', how='left')

        # Recompute after the merge so the index lines up with self.df.
        amounts = pd.to_numeric(self.df['amount_aud'], errors='coerce')
        deviation = (amounts - self.df['mean_amount']).abs()
        threshold = Config.EXTREME_OUTLIER_MULTIPLIER * self.df['std_amount']

        # Flag outliers (beyond the configured number of standard deviations)
        self.df['is_outlier'] = (
            (self.df['std_amount'] > 0)
            & amounts.notna()
            & (deviation > threshold)
        )

        print(f"   ‚úì Outlier detection complete. Found {self.df['is_outlier'].sum()} outliers")
        return self

    def detect_temporal_anomalies(self):
        """Add posting hour, weekday, weekend and suspicious-hour columns.

        ``posting_date`` is coerced to datetime when it is not already one;
        unparseable values become NaT.

        Returns:
            self, to allow chaining.
        """
        posting = self.df['posting_date']
        if not pd.api.types.is_datetime64_any_dtype(posting):
            # The .dt accessor raises on non-datetime columns.
            posting = pd.to_datetime(posting, errors='coerce')
            self.df['posting_date'] = posting

        self.df['posting_hour'] = posting.dt.hour
        self.df['posting_day'] = posting.dt.day_name()
        self.df['posting_weekend'] = posting.dt.dayofweek.isin([5, 6])

        # Flag suspicious hours (late night/early morning).
        # NOTE(review): dates without a time component have hour 0 and are
        # therefore flagged - confirm the source carries real timestamps.
        self.df['suspicious_hour'] = (
            self.df['posting_hour'].notna() &
            ((self.df['posting_hour'] >= Config.SUSPICIOUS_HOUR_START) |
             (self.df['posting_hour'] <= Config.SUSPICIOUS_HOUR_END))
        )

        return self

    def apply_rules(self):
        """Evaluate every rulebook rule with known logic against every row.

        Results are appended to ``self.exception_results`` rule by rule, in
        row order. Rows flagged ``amount_is_negative`` that were not already
        reported under EX005 are added afterwards. Errors raised while
        evaluating a rule on a row are printed and the row is skipped.

        Returns:
            self, to allow chaining.
        """
        # A posting is "future dated" only at or after the start of the next
        # month, so timestamps during the last day of the period are not
        # flagged and months longer than 28 days are handled.
        period_cutoff = self._period_cutoff(Config.CURRENT_YEAR, Config.CURRENT_MONTH)
        is_set = self._is_flag_set

        # Create a dictionary of rule logic functions
        rule_functions = {
            'EX001': lambda row: pd.isna(row['po_number']) or row['po_number'] == '',
            'EX002': lambda row: pd.isna(row['cost_center_mapped']),
            'EX003': lambda row: pd.isna(row['account_code_mapped']),
            'EX004': lambda row: row['amount_aud'] > Config.HIGH_VALUE_THRESHOLD if pd.notna(row['amount_aud']) else False,
            'EX005': lambda row: is_set(row.get('amount_is_negative', False)),
            'EX006': lambda row: row.get('vendor_resolution_status') == 'UNMAPPED',
            'EX007': lambda row: (pd.notna(row['posting_date']) and
                                  row['posting_date'] >= period_cutoff and
                                  row['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD),
            'EX008': lambda row: pd.isna(row['posting_date']),
            'EX009': lambda row: pd.isna(row['tax_code']) or row['tax_code'] == '',
            'EX010': lambda row: is_set(row.get('is_outlier', False)),
        }

        # Materialise the rows once instead of re-iterating the frame per rule.
        rows = [row for _, row in self.df.iterrows()]
        skipped_rules = []

        for _, rule in self.rulebook.iterrows():
            rule_id = rule['rule_id']
            rule_name = rule.get('rule_name', f'Rule {rule_id}')
            severity = rule.get('severity', 'MEDIUM')
            description = rule.get('description', rule_name)

            rule_func = rule_functions.get(rule_id)
            if rule_func is None:
                # No executable logic for this id; report it below.
                skipped_rules.append(rule_id)
                continue

            for row in rows:
                try:
                    if rule_func(row):
                        self.exception_results.append(
                            self._exception_record(row, rule_id, rule_name, severity, description)
                        )
                except Exception as e:
                    # Log rule application error but continue
                    print(f"   ‚ö†Ô∏è Error applying rule {rule_id} to transaction {row['transaction_id']}: {e}")

        if skipped_rules:
            print(f"   Skipped {len(skipped_rules)} rule(s) without implemented logic: {skipped_rules}")

        # Also add negative-amount flags from previous steps that no EX005
        # rule reported. A set replaces the per-row scan of all results.
        already_reported = {
            e['transaction_id'] for e in self.exception_results if e['rule_id'] == 'EX005'
        }
        for row in rows:
            if not is_set(row.get('amount_is_negative', False)):
                continue
            txn_id = row['transaction_id']
            if txn_id in already_reported:
                continue
            already_reported.add(txn_id)
            self.exception_results.append(
                self._exception_record(
                    row, 'EX005', 'Negative Amount', 'MEDIUM',
                    'Transaction has negative amount'
                )
            )

        print(f"   ‚úì Applied rules, found {len(self.exception_results)} exceptions")
        return self

    def save_output(self):
        """Write the flagged frame, the detailed exceptions and the master log.

        Adds ``has_exception`` and ``exception_rules`` (rule ids joined by
        ``;``) to the frame before saving it.

        Returns:
            Tuple ``(self.df, self.exception_results)``.
        """
        # Group exceptions by transaction
        exception_summary = {}
        for e in self.exception_results:
            exception_summary.setdefault(e['transaction_id'], []).append(str(e['rule_id']))

        # Add exception flags to dataframe
        self.df['has_exception'] = self.df['transaction_id'].isin(set(exception_summary))
        self.df['exception_rules'] = self.df['transaction_id'].map(
            lambda x: ';'.join(exception_summary.get(x, []))
        )

        # Save data with flags
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_WithExceptions.csv", index=False)

        # Save exception log
        if self.exception_results:
            exceptions_df = pd.DataFrame(self.exception_results)
            exceptions_df.to_csv(f"{Config.REPORTS_PATH}Exceptions_Detailed.csv", index=False)

        # Update master exceptions log
        master_exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        # Convert new exceptions to simple format
        new_exceptions = [
            {
                'transaction_id': e['transaction_id'],
                'anomaly_type': e['rule_id'],
                'severity': e['severity'],
                'description': e['description'],
                'amount': e.get('amount', 0)
            }
            for e in self.exception_results
        ]

        # A missing log and a header-less (empty) log are both treated as
        # "no prior exceptions"; pd.read_csv raises EmptyDataError otherwise.
        try:
            existing = pd.read_csv(master_exceptions_path)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            existing = None

        new_rows = pd.DataFrame(new_exceptions)
        if existing is None:
            all_exceptions = new_rows
        else:
            all_exceptions = pd.concat([existing, new_rows], ignore_index=True)

        # Always write a header so the log can be read back.
        if len(all_exceptions.columns) == 0:
            all_exceptions = pd.DataFrame(columns=self._LOG_COLUMNS)

        all_exceptions.to_csv(master_exceptions_path, index=False)

        print(f"   üíæ Saved exception data")

        return self.df, self.exception_results

    def run(self):
        """Execute all T005 steps.

        Returns:
            Tuple ``(df, exceptions)`` as produced by ``save_output``.
        """
        print("\n" + "="*60)
        print("üöÄ T005: Detecting Exceptions")
        print("="*60)

        self.load_rulebook()
        self.detect_outliers()
        self.detect_temporal_anomalies()
        self.apply_rules()
        df, exceptions = self.save_output()

        # Severity counts
        if exceptions:
            severity_counts = {}
            for e in exceptions:
                sev = e.get('severity', 'UNKNOWN')
                severity_counts[sev] = severity_counts.get(sev, 0) + 1

            print(f"\n‚úÖ T005 Complete. Exceptions by severity:")
            for severity, count in severity_counts.items():
                print(f"   {severity}: {count}")
        else:
            print(f"\n‚úÖ T005 Complete. No exceptions found.")

        return df, exceptions

# ============================================================================
# T006: REVIEW HIGH SEVERITY EXCEPTIONS (Automated version - no human review)
# ============================================================================

class T006_ExceptionReviewer:
    """Task 6: Review and categorize exceptions (automated).

    Splits the exception records by severity and writes a JSON and a text
    summary of the critical ones. No human approval step is involved; the
    summary is always marked auto-approved.
    """

    def __init__(self, df, exceptions):
        """Keep a copy of ``df`` and a reference to the exception records."""
        self.df = df.copy()
        self.exceptions = exceptions
        self.critical_exceptions = []
        self.high_exceptions = []

    @staticmethod
    def _amount_or_none(value):
        """Return ``value`` as a float, or None when it is null or not numeric."""
        try:
            if value is None or pd.isna(value):
                return None
            return float(value)
        except (TypeError, ValueError):
            return None

    def categorize_exceptions(self):
        """Split ``self.exceptions`` into critical and high severity lists.

        Severity is compared case-insensitively and a record without a
        severity is counted as Medium/Low. The lists are rebuilt on every
        call, so calling this twice does not duplicate entries.

        Returns:
            self, to allow chaining.
        """
        self.critical_exceptions = []
        self.high_exceptions = []

        for e in self.exceptions:
            severity = str(e.get('severity', '')).strip().upper()
            if severity == 'CRITICAL':
                self.critical_exceptions.append(e)
            elif severity == 'HIGH':
                self.high_exceptions.append(e)

        print(f"\nüìä T006: Exception Summary")
        print(f"   Critical: {len(self.critical_exceptions)}")
        print(f"   High: {len(self.high_exceptions)}")
        print(f"   Medium/Low: {len(self.exceptions) - len(self.critical_exceptions) - len(self.high_exceptions)}")

        return self

    def _summarize_critical(self):
        """Group critical exceptions by type with count, total and examples.

        The type is ``anomaly_type`` when present, else ``rule_id``, else
        ``'UNKNOWN'``. Null or non-numeric amounts add nothing to the total
        and appear as None in the examples. At most three examples are kept
        per type.

        Returns:
            dict mapping type -> ``{'count', 'total_amount', 'examples'}``.
        """
        critical_summary = {}
        for e in self.critical_exceptions:
            e_type = e.get('anomaly_type', e.get('rule_id', 'UNKNOWN'))
            bucket = critical_summary.setdefault(
                e_type, {'count': 0, 'total_amount': 0, 'examples': []}
            )

            amount = self._amount_or_none(e.get('amount', 0))
            bucket['count'] += 1
            if amount is not None:
                bucket['total_amount'] += amount

            if len(bucket['examples']) < 3:
                bucket['examples'].append({
                    'transaction_id': e['transaction_id'],
                    'amount': amount,
                    'description': e.get('description', '')
                })

        return critical_summary

    def create_review_package(self):
        """Write the automated review summary (no human pause).

        Produces ``Exception_Review_Summary.json`` and
        ``Exception_Review_Summary.txt`` under ``Config.REPORTS_PATH``.

        Returns:
            The review dict that was serialised to JSON.
        """
        # Function-scope import: ``json`` is not imported at module level.
        import json

        critical_summary = self._summarize_critical()
        # One timestamp so both output files agree.
        reviewed_at = datetime.now()

        # Save review summary
        review_data = {
            'timestamp': reviewed_at,
            'total_critical': len(self.critical_exceptions),
            'total_high': len(self.high_exceptions),
            'critical_summary': critical_summary,
            'auto_approved': True,
            'note': 'Automated processing - no human review required'
        }

        # default=str stringifies values json cannot encode natively, such
        # as the datetime timestamp.
        with open(f"{Config.REPORTS_PATH}Exception_Review_Summary.json", 'w') as f:
            json.dump(review_data, f, indent=2, default=str)

        # Create a simple text summary
        with open(f"{Config.REPORTS_PATH}Exception_Review_Summary.txt", 'w') as f:
            f.write("EXCEPTION REVIEW SUMMARY (Automated)\n")
            f.write("="*50 + "\n\n")
            f.write(f"Review Date: {reviewed_at}\n")
            f.write(f"Status: AUTO-APPROVED\n\n")

            f.write(f"CRITICAL EXCEPTIONS: {len(self.critical_exceptions)}\n")
            for e_type, data in critical_summary.items():
                f.write(f"  ‚Ä¢ {e_type}: {data['count']} occurrences, ${data['total_amount']:,.2f}\n")

            f.write(f"\nHIGH EXCEPTIONS: {len(self.high_exceptions)}\n")

        print(f"   üíæ Saved review summary to {Config.REPORTS_PATH}Exception_Review_Summary.txt")

        return review_data

    def run(self):
        """Execute T006 steps.

        Returns:
            Tuple ``(self.df, review_data)``.
        """
        print("\n" + "="*60)
        print("üöÄ T006: Reviewing High Severity Exceptions")
        print("="*60)
        print("   ‚ö° Automated mode - no human review required")

        self.categorize_exceptions()
        review_data = self.create_review_package()

        print(f"\n‚úÖ T006 Complete. Proceeding with pipeline.")

        return self.df, review_data


# ============================================================================
# T007: COMPUTE BUDGET VARIANCE (FIXED DIVISION BY ZERO)
# ============================================================================

class T007_BudgetVariance:
    """Task 7: Calculate actual vs budget variance.

    ``run`` loads the budget file, compares it with the ledger actuals for one
    fiscal period and writes the variance reports.

    Attributes:
        df: Private copy of the transaction DataFrame passed to ``__init__``.
        budget_data: Budget DataFrame with ``account_code``, ``period`` and
            ``budget_amount`` columns once ``load_budget`` has run.
        variance_results: Dict populated by ``calculate_variance``.
    """

    # Accepted (lower-cased) header names, in priority order, for each
    # canonical budget column.
    _PERIOD_COLUMNS = ('fiscal_period', 'period', 'month', 'reporting_period')
    _ACCOUNT_COLUMNS = ('account_code', 'account', 'gl_account', 'coa')
    _BUDGET_COLUMNS = ('budget_amount_aud', 'budget_amount', 'budget', 'amount', 'planned_amount')

    # Sentinel percentage reported when there are actuals but no positive budget.
    _NO_BUDGET_PCT = 999999

    def __init__(self, df):
        """Store a copy of ``df`` so later steps never mutate the caller's frame."""
        self.df = df.copy()
        self.budget_data = None
        self.variance_results = {}

    def _adopt_column(self, candidates, target):
        """Rename the first column found in ``candidates`` to ``target``.

        If a different column already carries the ``target`` name it is dropped
        first, so the rename can never produce two columns with the same name.

        Returns:
            The original name of the adopted column, or ``None`` if no
            candidate is present.
        """
        found = next((c for c in candidates if c in self.budget_data.columns), None)
        if found is not None and found != target:
            if target in self.budget_data.columns:
                self.budget_data = self.budget_data.drop(columns=[target])
            self.budget_data = self.budget_data.rename(columns={found: target})
        return found

    @staticmethod
    def _normalize_codes(series):
        """Return account codes as trimmed strings, keeping missing values missing.

        Integer-like float renderings such as ``'5000.0'`` are reduced to
        ``'5000'`` so that numerically-typed codes match string codes.
        """
        text = (
            series.astype(str)
            .str.strip()
            .str.replace(r'^(-?\d+)\.0+$', r'\1', regex=True)
        )
        return text.where(series.notna())

    @classmethod
    def _variance_pct(cls, actual, budget):
        """Vectorised variance percentage that never divides by zero.

        Rows with a positive budget get ``(actual - budget) / budget * 100``.
        Rows without one get ``_NO_BUDGET_PCT`` when actuals are positive,
        otherwise ``0``.
        """
        has_budget = budget > 0
        pct = (actual - budget) / budget.where(has_budget) * 100
        without_budget = np.where(actual > 0, cls._NO_BUDGET_PCT, 0)
        return np.where(has_budget, pct, without_budget)

    def load_budget(self):
        """Load ``Budget_2026.csv`` and normalise it to canonical columns.

        Column headers are lower-cased and stripped, then mapped onto
        ``period``, ``account_code`` and ``budget_amount``. Budget amounts are
        parsed as numbers after removing ``$`` and ``,`` and floored at 0.01.

        If anything in that process raises (missing file, no account column,
        ...), a placeholder budget with random amounts is generated for up to
        30 accounts found in ``self.df``.

        Returns:
            self, to allow chaining.
        """
        print("\nüìÇ T007: Loading budget data...")

        try:
            self.budget_data = pd.read_csv(f"{Config.BUDGET_PATH}Budget_2026.csv")
            print(f"   Loaded budget data with {len(self.budget_data)} rows")

            # Standardize column names
            self.budget_data.columns = [col.lower().strip() for col in self.budget_data.columns]
            print(f"   Budget columns: {list(self.budget_data.columns)}")

            # Map period column
            period_col = self._adopt_column(self._PERIOD_COLUMNS, 'period')
            if period_col:
                print(f"   Using '{period_col}' as period column")
            else:
                print(f"   ‚ö†Ô∏è No period column found, assuming all rows are for {Config.CURRENT_FISCAL_PERIOD}")
                self.budget_data['period'] = Config.CURRENT_FISCAL_PERIOD

            # Map account column
            account_col = self._adopt_column(self._ACCOUNT_COLUMNS, 'account_code')
            if account_col:
                print(f"   Using '{account_col}' as account column")

            # Map budget amount column
            budget_col = self._adopt_column(self._BUDGET_COLUMNS, 'budget_amount')
            if budget_col:
                print(f"   Using '{budget_col}' as budget amount column")

                # Strip currency formatting literally ('$' must not be read as a regex).
                cleaned = (
                    self.budget_data['budget_amount'].astype(str)
                    .str.replace('$', '', regex=False)
                    .str.replace(',', '', regex=False)
                )
                self.budget_data['budget_amount'] = pd.to_numeric(cleaned, errors='coerce')
            else:
                print(f"   ‚ö†Ô∏è No budget amount column found, using synthetic data")
                self.budget_data['budget_amount'] = np.random.randint(50000, 200000, size=len(self.budget_data))

            # Key columns are compared as strings later on. A missing account
            # column raises KeyError here and triggers the fallback below.
            self.budget_data['period'] = self.budget_data['period'].astype(str)
            self.budget_data['account_code'] = self.budget_data['account_code'].astype(str)

            # Zero or negative budgets are floored at 0.01 to avoid division issues.
            self.budget_data['budget_amount'] = self.budget_data['budget_amount'].clip(lower=0.01)

        except Exception as e:
            # Deliberate best-effort: any load problem falls back to a sample budget.
            print(f"   ‚ö†Ô∏è Budget data not found or error loading: {e}")
            if 'account_code_mapped' in self.df.columns:
                accounts = self.df['account_code_mapped'].dropna().unique()
            else:
                accounts = ['5000']

            budget_rows = [
                {
                    'account_code': str(account),
                    'period': Config.CURRENT_FISCAL_PERIOD,
                    'budget_amount': np.random.randint(50000, 200000),
                }
                for account in accounts[:30]
            ]

            # Explicit columns keep the frame usable even when there are no accounts.
            self.budget_data = pd.DataFrame(
                budget_rows, columns=['account_code', 'period', 'budget_amount']
            )
            print(f"   Created sample budget for {len(self.budget_data)} accounts")

        return self

    def calculate_variance(self, fiscal_period=None):
        """Calculate variance by account, cost center, and overall.

        Args:
            fiscal_period: Period to analyse as ``"YYYY-MM"``. Defaults to
                ``Config.CURRENT_FISCAL_PERIOD`` (with the month boundary taken
                from ``Config.CURRENT_YEAR`` / ``Config.CURRENT_MONTH``).

        Returns:
            self; results are stored in ``self.variance_results``.

        Raises:
            ValueError: If an explicit ``fiscal_period`` is not ``"YYYY-MM"``.
        """
        if fiscal_period is None:
            fiscal_period = Config.CURRENT_FISCAL_PERIOD
            year, month = Config.CURRENT_YEAR, Config.CURRENT_MONTH
        else:
            parsed = datetime.strptime(str(fiscal_period), "%Y-%m")
            year, month = parsed.year, parsed.month

        # Filter to the requested period only
        current_period_df = self.df[
            (self.df['fiscal_period'] == fiscal_period) &
            (self.df['amount_aud'].notna())
        ].copy()

        print(f"   Processing {len(current_period_df)} transactions for {fiscal_period}")

        # 1. Variance by Account
        keyed = current_period_df[['amount_aud', 'transaction_id']].copy()
        keyed['account_code'] = self._normalize_codes(
            current_period_df['account_code_mapped']
        ).values
        account_actuals = keyed.groupby('account_code').agg({
            'amount_aud': 'sum',
            'transaction_id': 'count'
        }).rename(columns={
            'amount_aud': 'actual_amount',
            'transaction_id': 'transaction_count'
        }).reset_index()

        # Get budget for the period
        period_budget = self.budget_data[self.budget_data['period'] == fiscal_period].copy()

        if period_budget.empty:
            print(f"   ‚ö†Ô∏è No budget found for period {fiscal_period}, using all budget data")
            period_budget = self.budget_data.copy()

        period_budget['account_code'] = self._normalize_codes(period_budget['account_code']).values

        # One budget row per account: repeated budget lines would otherwise
        # duplicate the actuals row in the merge and double-count spend.
        # min_count=1 keeps an all-missing budget as NaN rather than 0.
        budget_by_account = (
            period_budget.groupby('account_code')['budget_amount']
            .sum(min_count=1)
            .reset_index()
        )

        if not account_actuals.empty and not budget_by_account.empty:
            # Merging on a shared key keeps the account code for rows that
            # exist on only one side (budget without actuals, or vice versa).
            account_variance = pd.merge(
                account_actuals,
                budget_by_account,
                on='account_code',
                how='outer'
            )

            account_variance['budget_amount'] = account_variance['budget_amount'].fillna(0.01)
            account_variance['actual_amount'] = account_variance['actual_amount'].fillna(0)
            account_variance['transaction_count'] = (
                account_variance['transaction_count'].fillna(0).astype(int)
            )
            account_variance['variance'] = (
                account_variance['actual_amount'] - account_variance['budget_amount']
            )
            account_variance['variance_pct'] = self._variance_pct(
                account_variance['actual_amount'], account_variance['budget_amount']
            )
        else:
            account_variance = pd.DataFrame()

        # 2. Variance by Cost Center (actuals only; no budget is joined here)
        if 'cost_center_mapped' in current_period_df.columns:
            cc_actuals = current_period_df.groupby('cost_center_mapped').agg({
                'amount_aud': 'sum',
                'transaction_id': 'count'
            }).rename(columns={
                'amount_aud': 'actual_amount',
                'transaction_id': 'transaction_count'
            }).reset_index()

            cc_actuals = cc_actuals[cc_actuals['cost_center_mapped'].notna()]
        else:
            cc_actuals = pd.DataFrame()

        # 3. Suspense amounts (rows without a mapped account)
        suspense_amount = current_period_df.loc[
            current_period_df['account_code_mapped'].isna(), 'amount_aud'
        ].sum()

        # 4. Future dated amounts: anything posted on or after the first day of
        # the following month. Comparing against the next month start (rather
        # than midnight of a fixed day 28) keeps same-day postings with a time
        # component inside the period and is correct for every month length.
        if month == 12:
            next_period_start = datetime(year + 1, 1, 1)
        else:
            next_period_start = datetime(year, month + 1, 1)

        if 'posting_date' in current_period_df.columns:
            posting = pd.to_datetime(current_period_df['posting_date'], errors='coerce')
            future_amount = current_period_df.loc[
                posting >= next_period_start, 'amount_aud'
            ].sum()
        else:
            print(f"   ‚ö†Ô∏è No posting_date column, future-dated amount reported as 0")
            future_amount = 0.0

        # 5. Total actual and budget
        total_actual = current_period_df['amount_aud'].sum()
        total_budget = period_budget['budget_amount'].sum() if not period_budget.empty else 0.01

        # Safe total variance calculation
        total_variance = total_actual - total_budget
        if total_budget > 0:
            total_variance_pct = (total_variance / total_budget) * 100
        elif total_actual > 0:
            total_variance_pct = self._NO_BUDGET_PCT
        else:
            total_variance_pct = 0

        if 'has_exception' in current_period_df.columns:
            exception_count = current_period_df['has_exception'].sum()
        else:
            exception_count = 0

        # Store results
        self.variance_results = {
            'by_account': account_variance.to_dict('records') if not account_variance.empty else [],
            'by_cost_center': cc_actuals.to_dict('records') if not cc_actuals.empty else [],
            'suspense_amount': suspense_amount,
            'future_dated_amount': future_amount,
            'total_actual': total_actual,
            'total_budget': total_budget,
            'total_variance': total_variance,
            'total_variance_pct': total_variance_pct,
            'transaction_count': len(current_period_df),
            'exception_count': exception_count
        }

        print(f"\n   Variance Summary:")
        print(f"   Total Actual: ${total_actual:,.2f}")
        print(f"   Total Budget: ${total_budget:,.2f}")
        print(f"   Variance: ${total_variance:,.2f} ({total_variance_pct:.1f}%)")
        print(f"   Suspense (invalid accounts): ${suspense_amount:,.2f}")
        print(f"   Future dated: ${future_amount:,.2f}")

        return self

    def save_output(self):
        """Write the variance CSV reports under ``Config.REPORTS_PATH``.

        The per-account and per-cost-center files are only written when the
        corresponding result list is non-empty; the summary is always written.

        Returns:
            dict: ``self.variance_results``.
        """
        results = self.variance_results

        # Save detailed variance by account
        if results['by_account']:
            pd.DataFrame(results['by_account']).to_csv(
                f"{Config.REPORTS_PATH}Budget_Variance_By_Account.csv", index=False
            )

        # Save variance by cost center
        if results['by_cost_center']:
            pd.DataFrame(results['by_cost_center']).to_csv(
                f"{Config.REPORTS_PATH}Budget_Variance_By_CostCenter.csv", index=False
            )

        # Save summary as (metric, value) rows
        summary_rows = [
            ('Total Actual', 'total_actual'),
            ('Total Budget', 'total_budget'),
            ('Variance', 'total_variance'),
            ('Variance %', 'total_variance_pct'),
            ('Suspense Amount', 'suspense_amount'),
            ('Future Dated Amount', 'future_dated_amount'),
            ('Transaction Count', 'transaction_count'),
            ('Exception Count', 'exception_count'),
        ]
        summary_df = pd.DataFrame(
            [{'metric': label, 'value': results[key]} for label, key in summary_rows]
        )

        summary_df.to_csv(f"{Config.REPORTS_PATH}Budget_Variance_Summary.csv", index=False)

        print(f"   üíæ Saved variance reports to {Config.REPORTS_PATH}")

        return results

    def run(self):
        """Execute T007 steps: load budget, calculate variance, save reports.

        Returns:
            dict: The variance results produced by ``save_output``.
        """
        print("\n" + "="*60)
        print("üöÄ T007: Computing Budget Variance")
        print("="*60)

        self.load_budget()
        self.calculate_variance()
        results = self.save_output()

        print(f"\n‚úÖ T007 Complete.")

        return results


# ============================================================================
# T008: GENERATE CLOSE PACK REPORT
# ============================================================================

class T008_ClosePackReport:
    """Task 8: Create comprehensive month-end close report.

    ``generate_report`` fills ``self.report_data`` with one entry per report
    section; ``save_report`` writes those sections as CSV files plus a text
    summary under ``Config.REPORTS_PATH``.

    Attributes:
        df: Private copy of the transaction DataFrame.
        variance: Variance results dict (read with ``.get`` for the totals).
        exceptions: Iterable of exception dicts (``severity``, ``amount``,
            ``anomaly_type`` / ``rule_id`` keys are read).
        report_data: Dict of generated report sections.
    """

    # English month abbreviations used to build the text report file name.
    _MONTH_ABBR = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    def __init__(self, df, variance_results, exceptions):
        """Keep a copy of ``df`` and references to the variance and exception data."""
        self.df = df.copy()
        self.variance = variance_results
        self.exceptions = exceptions
        self.report_data = {}

    @staticmethod
    def _amount_or_zero(value):
        """Return ``value`` as a float, or 0 when it is missing, NaN or non-numeric."""
        try:
            amount = float(value)
        except (TypeError, ValueError):
            return 0
        return 0 if np.isnan(amount) else amount

    @staticmethod
    def _spend_by(frame, keys):
        """Sum ``amount_aud`` and count ``transaction_id`` per ``keys``, largest spend first."""
        return (
            frame.groupby(keys)
            .agg({'amount_aud': 'sum', 'transaction_id': 'count'})
            .reset_index()
            .sort_values('amount_aud', ascending=False)
        )

    @classmethod
    def _period_label(cls, period):
        """Turn ``"YYYY-MM"`` into a label such as ``"Feb2026"``.

        Anything that does not parse as year-month is returned unchanged (as a
        string) so a file name can still be built.
        """
        try:
            year_text, month_text = str(period).split('-')[:2]
            year, month = int(year_text), int(month_text)
            if not 1 <= month <= 12:
                raise ValueError(month)
        except ValueError:
            return str(period)
        return f"{cls._MONTH_ABBR[month - 1]}{year}"

    def generate_report(self, fiscal_period=None):
        """Generate comprehensive close pack.

        Args:
            fiscal_period: Period to report on. Defaults to
                ``Config.CURRENT_FISCAL_PERIOD``.

        Returns:
            self; sections are stored in ``self.report_data``.
        """
        print("\nüìù T008: Generating Close Pack Report")

        period = Config.CURRENT_FISCAL_PERIOD if fiscal_period is None else fiscal_period

        # Filter to the requested period
        current_df = self.df[self.df['fiscal_period'] == period].copy()
        columns = current_df.columns

        # 1. Executive Summary
        if 'data_quality_score' in columns and len(current_df) > 0:
            # The first row's score is used for the whole period.
            quality_score = current_df['data_quality_score'].iloc[0]
        else:
            quality_score = 85

        self.report_data['executive_summary'] = {
            'period': period,
            'generated_date': datetime.now(),
            'total_transactions': len(current_df),
            'total_spend': self.variance.get('total_actual', 0),
            'total_budget': self.variance.get('total_budget', 0),
            'variance': self.variance.get('total_variance', 0),
            'variance_pct': self.variance.get('total_variance_pct', 0),
            'exception_count': len(self.exceptions),
            'critical_exception_count': sum(
                1 for e in self.exceptions if e.get('severity') == 'CRITICAL'
            ),
            'data_quality_score': quality_score
        }

        # 2. Top exceptions (by total value, at most 10 types)
        exception_counts = {}
        for e in self.exceptions:
            e_type = e.get('anomaly_type', e.get('rule_id', 'UNKNOWN'))
            bucket = exception_counts.setdefault(e_type, {'count': 0, 'total_amount': 0})
            bucket['count'] += 1
            # Missing / NaN amounts count as 0 so one bad record cannot break
            # the totals or the sort below.
            bucket['total_amount'] += self._amount_or_zero(e.get('amount', 0))

        self.report_data['top_exceptions'] = sorted(
            [{'type': k, **v} for k, v in exception_counts.items()],
            key=lambda x: x['total_amount'],
            reverse=True
        )[:10]

        # 3. Top vendors by spend - prefer the canonical name, fall back to raw
        if 'vendor_canonical' in columns:
            vendor_spend = self._spend_by(current_df, 'vendor_canonical').head(20)
        elif 'vendor_name_raw' in columns:
            vendor_spend = (
                self._spend_by(current_df, 'vendor_name_raw')
                .head(20)
                .rename(columns={'vendor_name_raw': 'vendor_canonical'})
            )
        else:
            vendor_spend = pd.DataFrame(columns=['vendor_canonical', 'amount_aud', 'transaction_id'])

        self.report_data['top_vendors'] = vendor_spend.to_dict('records')

        # 4. Account summary
        if 'account_description' in columns:
            # groupby drops NaN keys; label missing descriptions so their
            # spend is not silently left out of the summary.
            described = current_df.assign(
                account_description=current_df['account_description'].fillna('Unknown')
            )
            account_summary = self._spend_by(
                described, ['account_code_mapped', 'account_description']
            )
        else:
            account_summary = self._spend_by(current_df, 'account_code_mapped')
            # Add placeholder description
            account_summary['account_description'] = 'Unknown'

        self.report_data['account_summary'] = account_summary.to_dict('records')

        # 5. Cost center summary
        if 'cost_center_mapped' in columns:
            cc_summary = self._spend_by(current_df, 'cost_center_mapped')
        else:
            cc_summary = pd.DataFrame(columns=['cost_center_mapped', 'amount_aud', 'transaction_id'])

        self.report_data['cost_center_summary'] = cc_summary.to_dict('records')

        # 6. Currency exposure
        if 'currency_code' in columns and 'amount_aud' in columns:
            currency_spec = {'amount_aud': 'sum', 'transaction_id': 'count'}
            if 'amount' in columns:
                # The original-currency total is optional.
                currency_spec = {'amount': 'sum', **currency_spec}
            currency_summary = current_df.groupby('currency_code').agg(currency_spec).reset_index()
        else:
            currency_summary = pd.DataFrame(columns=['currency_code', 'amount', 'amount_aud', 'transaction_id'])

        self.report_data['currency_summary'] = currency_summary.to_dict('records')

        # 7. Source system breakdown
        if 'source_system' in columns:
            source_summary = self._spend_by(current_df, 'source_system')
        else:
            source_summary = pd.DataFrame(columns=['source_system', 'amount_aud', 'transaction_id'])

        self.report_data['source_summary'] = source_summary.to_dict('records')

        print(f"   Generated report with {len(self.report_data)} sections")
        return self

    def save_report(self):
        """Save report in multiple formats.

        Writes the executive summary CSV unconditionally, one CSV per
        non-empty section, and a text report named
        ``MonthEnd_Close_Pack_<label>.txt`` where the label is derived from
        the reported period (``Feb2026`` for ``2026-02``).

        Returns:
            dict: ``self.report_data``.
        """
        summary = self.report_data['executive_summary']

        # Save as CSV (tabular)
        pd.DataFrame([summary]).to_csv(
            f"{Config.REPORTS_PATH}Close_Pack_Executive_Summary.csv", index=False
        )

        section_files = (
            ('top_vendors', 'Close_Pack_Top_Vendors.csv'),
            ('account_summary', 'Close_Pack_Account_Summary.csv'),
            ('cost_center_summary', 'Close_Pack_Cost_Center_Summary.csv'),
            ('currency_summary', 'Close_Pack_Currency_Summary.csv'),
            ('source_summary', 'Close_Pack_Source_Summary.csv'),
        )
        for section, filename in section_files:
            records = self.report_data.get(section)
            if records:
                pd.DataFrame(records).to_csv(f"{Config.REPORTS_PATH}{filename}", index=False)

        # Save as text report. UTF-8 is explicit because the bullet characters
        # are non-ASCII and the platform default encoding may differ.
        label = self._period_label(summary['period'])
        report_path = f"{Config.REPORTS_PATH}MonthEnd_Close_Pack_{label}.txt"
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write("="*80 + "\n")
            f.write(f"MONTH-END CLOSE PACK - {summary['period']}\n")
            f.write("="*80 + "\n\n")

            # Executive Summary
            f.write("EXECUTIVE SUMMARY\n")
            f.write("-"*40 + "\n")
            f.write(f"Period: {summary['period']}\n")
            f.write(f"Generated: {summary['generated_date']}\n")
            f.write(f"Total Transactions: {summary['total_transactions']:,}\n")
            f.write(f"Total Spend: ${summary['total_spend']:,.2f}\n")
            f.write(f"Total Budget: ${summary['total_budget']:,.2f}\n")
            f.write(f"Variance: ${summary['variance']:,.2f} ")
            f.write(f"({summary['variance_pct']:.1f}%)\n")
            f.write(f"Data Quality Score: {summary['data_quality_score']:.1f}/100\n\n")

            # Top Exceptions
            f.write("TOP EXCEPTIONS BY VALUE\n")
            f.write("-"*40 + "\n")
            for e in self.report_data['top_exceptions'][:5]:
                f.write(f"‚Ä¢ {e['type']}: {e['count']} occurrences, ${e['total_amount']:,.2f}\n")
            f.write("\n")

            # Top Vendors
            f.write("TOP 10 VENDORS\n")
            f.write("-"*40 + "\n")
            for v in self.report_data['top_vendors'][:10]:
                vendor_name = v.get('vendor_canonical', v.get('vendor_name_raw', 'Unknown'))
                f.write(f"‚Ä¢ {vendor_name}: ${v['amount_aud']:,.2f} ({v['transaction_id']} txns)\n")
            f.write("\n")

            # Currency Exposure
            f.write("CURRENCY EXPOSURE\n")
            f.write("-"*40 + "\n")
            for c in self.report_data['currency_summary']:
                f.write(f"‚Ä¢ {c['currency_code']}: {c['transaction_id']} txns, ")
                f.write(f"Original: ${c.get('amount', 0):,.2f}, AUD: ${c['amount_aud']:,.2f}\n")

            # Source Systems
            if self.report_data.get('source_summary'):
                f.write("\nSOURCE SYSTEMS\n")
                f.write("-"*40 + "\n")
                for s in self.report_data['source_summary'][:5]:
                    f.write(f"‚Ä¢ {s['source_system']}: ${s['amount_aud']:,.2f} ({s['transaction_id']} txns)\n")

        print(f"   üíæ Saved reports to {Config.REPORTS_PATH}")

        return self.report_data

    def run(self):
        """Execute T008 steps: generate the report, then save it.

        Returns:
            dict: The report data returned by ``save_report``.
        """
        print("\n" + "="*60)
        print("üöÄ T008: Generating Close Pack Report")
        print("="*60)

        self.generate_report()
        report = self.save_report()

        print(f"\n‚úÖ T008 Complete. Report saved.")

        return report


# ============================================================================
# T009: GENERATE EXECUTIVE NARRATIVE (Rule-based, no LLM)
# ============================================================================

class T009_ExecutiveNarrative:
    """Task 9: Create natural language summary (rule-based, no LLM).

    Attributes:
        variance: Variance results dict; ``total_actual``, ``total_budget``,
            ``total_variance``, ``total_variance_pct`` and ``by_account`` are
            required, ``suspense_amount`` / ``future_dated_amount`` optional.
        report: Close-pack report dict; only ``currency_summary`` is read.
        exceptions: Iterable of exception dicts.
        narrative: The generated text (empty until ``generate_narrative`` runs).
    """

    # English month abbreviations used to build the output file name.
    _MONTH_ABBR = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

    def __init__(self, variance_results, report_data, exceptions):
        """Store the inputs; nothing is computed until ``generate_narrative``."""
        self.variance = variance_results
        self.report = report_data
        self.exceptions = exceptions
        self.narrative = ""
        # Period the narrative was generated for; None until generated.
        self._period = None

    @staticmethod
    def _describe_variance(variance_pct):
        """Map a variance percentage to the phrase used in the summary sentence."""
        if abs(variance_pct) < 2:
            return "in line with"
        if variance_pct > 0:
            return "significantly above" if variance_pct > 10 else "moderately above"
        return "significantly below" if variance_pct < -10 else "moderately below"

    @staticmethod
    def _account_label(record):
        """Return the record's account code, or ``'Unknown'`` if absent, None or NaN."""
        code = record.get('account_code', 'Unknown')
        if code is None or (isinstance(code, float) and code != code):
            return 'Unknown'
        return code

    @classmethod
    def _period_label(cls, period):
        """Turn ``"YYYY-MM"`` into a label such as ``"Feb2026"``.

        Anything that does not parse as year-month is returned unchanged.
        """
        try:
            year_text, month_text = str(period).split('-')[:2]
            year, month = int(year_text), int(month_text)
            if not 1 <= month <= 12:
                raise ValueError(month)
        except ValueError:
            return str(period)
        return f"{cls._MONTH_ABBR[month - 1]}{year}"

    def generate_narrative(self, fiscal_period=None):
        """Generate narrative using templates and rules.

        Args:
            fiscal_period: Period named in the heading and summary sentence.
                Defaults to ``Config.CURRENT_FISCAL_PERIOD``.

        Returns:
            self; the text is stored in ``self.narrative``.
        """
        from collections import Counter

        print("\nüìù T009: Generating Executive Narrative")

        period = Config.CURRENT_FISCAL_PERIOD if fiscal_period is None else fiscal_period
        self._period = period
        total_actual = self.variance['total_actual']

        lines = []

        # Header
        lines.append("="*80)
        lines.append(f"EXECUTIVE NARRATIVE - {period}")
        lines.append("="*80)
        lines.append("")

        # Financial Summary
        lines.append("FINANCIAL SUMMARY")
        lines.append("-"*40)

        variance_pct = self.variance['total_variance_pct']
        variance_desc = self._describe_variance(variance_pct)

        lines.append(f"Total spend for {period} was ${total_actual:,.2f}, "
                     f"which is {variance_desc} budget of ${self.variance['total_budget']:,.2f}. "
                     f"The variance is ${abs(self.variance['total_variance']):,.2f} ({variance_pct:.1f}%).")
        lines.append("")

        # Key Drivers: three largest over- and under-budget accounts
        lines.append("KEY VARIANCE DRIVERS")
        lines.append("-"*40)

        account_variances = self.variance['by_account']
        top_pos = sorted([a for a in account_variances if a.get('variance', 0) > 0],
                         key=lambda x: x['variance'], reverse=True)[:3]
        top_neg = sorted([a for a in account_variances if a.get('variance', 0) < 0],
                         key=lambda x: x['variance'])[:3]

        if top_pos:
            lines.append("Positive variances (over budget):")
            for a in top_pos:
                lines.append(f"  ‚Ä¢ {self._account_label(a)}: +${a['variance']:,.2f} ({a['variance_pct']:.1f}%)")

        if top_neg:
            lines.append("Negative variances (under budget):")
            for a in top_neg:
                lines.append(f"  ‚Ä¢ {self._account_label(a)}: ${a['variance']:,.2f} ({a['variance_pct']:.1f}%)")
        lines.append("")

        # Exception Summary
        lines.append("EXCEPTION SUMMARY")
        lines.append("-"*40)

        severity_counts = Counter(e.get('severity') for e in self.exceptions)
        critical_count = severity_counts['CRITICAL']
        high_count = severity_counts['HIGH']
        medium_count = severity_counts['MEDIUM']

        lines.append(f"Total exceptions: {len(self.exceptions)}")
        lines.append(f"  ‚Ä¢ Critical: {critical_count}")
        lines.append(f"  ‚Ä¢ High: {high_count}")
        lines.append(f"  ‚Ä¢ Medium: {medium_count}")

        # Top exception types (ties keep first-seen order)
        type_counts = Counter(
            e.get('anomaly_type', e.get('rule_id', 'UNKNOWN')) for e in self.exceptions
        )
        top_types = type_counts.most_common(3)
        if top_types:
            lines.append("\nMost common exceptions:")
            for e_type, count in top_types:
                lines.append(f"  ‚Ä¢ {e_type}: {count} occurrences")
        lines.append("")

        # Data Quality Impact
        lines.append("DATA QUALITY IMPACT")
        lines.append("-"*40)

        suspense_amount = self.variance.get('suspense_amount', 0)
        future_amount = self.variance.get('future_dated_amount', 0)
        total_impact = suspense_amount + future_amount
        impact_pct = (total_impact / total_actual * 100) if total_actual > 0 else 0

        lines.append(f"Transactions with data quality issues: ${total_impact:,.2f} ({impact_pct:.1f}% of total)")
        if suspense_amount > 0:
            lines.append(f"  ‚Ä¢ Invalid accounts (in suspense): ${suspense_amount:,.2f}")
        if future_amount > 0:
            lines.append(f"  ‚Ä¢ Future-dated transactions: ${future_amount:,.2f}")
        lines.append("")

        # Currency Impact
        lines.append("CURRENCY EXPOSURE")
        lines.append("-"*40)

        # A report without a currency section is treated as having no exposure.
        foreign = [c for c in self.report.get('currency_summary', [])
                   if c['currency_code'] != 'AUD']
        non_aud_total = sum(c['amount_aud'] for c in foreign)
        non_aud_pct = (non_aud_total / total_actual * 100) if total_actual > 0 else 0

        lines.append(f"Foreign currency exposure: ${non_aud_total:,.2f} ({non_aud_pct:.1f}% of total)")

        # Non-AUD currencies with positive AUD value
        for c in foreign:
            if c['amount_aud'] > 0:
                lines.append(f"  ‚Ä¢ {c['currency_code']}: ${c['amount_aud']:,.2f}")
        lines.append("")

        # Recommendations
        lines.append("RECOMMENDATIONS")
        lines.append("-"*40)

        if suspense_amount > 10000:
            lines.append("‚Ä¢ Review and remap transactions with invalid account codes")
        if future_amount > 10000:
            lines.append("‚Ä¢ Reclassify future-dated transactions to correct period")
        if critical_count > 0:
            lines.append("‚Ä¢ Investigate critical exceptions before next close")
        if len(self.exceptions) > 100:
            lines.append("‚Ä¢ Schedule data quality workshop to address root causes")

        # Join all lines
        self.narrative = "\n".join(lines)

        print(f"   Generated {len(lines)} lines of narrative")
        return self

    def save_narrative(self):
        """Save narrative to ``Executive_Narrative_<label>.txt`` under ``Config.REPORTS_PATH``.

        The label is derived from the period the narrative was generated for
        (``Feb2026`` for ``2026-02``), falling back to
        ``Config.CURRENT_FISCAL_PERIOD`` if nothing has been generated yet.

        Returns:
            str: The narrative text that was written.
        """
        period = self._period if self._period is not None else Config.CURRENT_FISCAL_PERIOD
        output_path = f"{Config.REPORTS_PATH}Executive_Narrative_{self._period_label(period)}.txt"

        # UTF-8 is explicit because the bullet characters are non-ASCII.
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(self.narrative)

        print(f"   üíæ Saved narrative to {output_path}")

        return self.narrative

    def run(self):
        """Execute T009 steps: generate the narrative, then save it.

        Returns:
            str: The narrative text.
        """
        print("\n" + "="*60)
        print("üöÄ T009: Generating Executive Narrative")
        print("="*60)

        self.generate_narrative()
        narrative = self.save_narrative()

        print(f"\n‚úÖ T009 Complete.")

        return narrative


# ============================================================================
# T010: FORECAST NEXT PERIOD
# ============================================================================

class T010_Forecast:
    """Task 10: Generate forecast for next period based on historical trends.

    ``run`` loads monthly KPI history (or fabricates one when the file is
    unusable), derives a blended point forecast with a confidence band in
    ``calculate_trends`` and writes the result to CSV in ``save_forecast``.

    Args:
        df: Working transaction frame from the earlier tasks (stored, but
            not read by the methods of this class).
        variance_results: Dict-like variance summary; the keys read here
            are ``total_actual``, ``transaction_count`` and ``by_account``.
    """

    # Lower-cased header names accepted for the period / spend columns,
    # checked in this order.
    _PERIOD_COLUMNS = ('period', 'month', 'fiscal_period', 'reporting_period', 'date', 'year_month')
    _SPEND_COLUMNS = ('total_spend', 'spend', 'amount', 'actual', 'value', 'total')

    def __init__(self, df, variance_results):
        self.df = df
        self.variance = variance_results
        self.historical_data = None
        self.forecast = {}

    @staticmethod
    def _synthetic_periods(count, start_year=2025):
        """Return ``count`` consecutive ``YYYY-MM`` labels from January of ``start_year``.

        Months roll over into the following year, so more than 12 rows never
        produce an invalid label such as ``2025-13``.
        """
        return [f"{start_year + i // 12}-{i % 12 + 1:02d}" for i in range(count)]

    @staticmethod
    def _finite_mean(values, default):
        """Return the mean of the finite entries of ``values``.

        NaN and +/-inf are ignored (``pct_change`` yields inf after a zero);
        ``default`` is returned when nothing finite remains.
        """
        cleaned = (
            pd.Series(values, dtype='float64')
            .replace([np.inf, -np.inf], np.nan)
            .dropna()
        )
        return default if cleaned.empty else float(cleaned.mean())

    def load_historical(self):
        """Load historical KPI data into ``self.historical_data``.

        Reads ``KPI_Monthly_History.csv`` from ``Config.REFERENCE_PATH``,
        lower-cases the headers and normalises the period / spend columns to
        ``period`` and ``total_spend``. Missing columns are filled with
        synthetic values. If anything in that process raises, a 12-month
        synthetic history ending at the current fiscal month is generated
        instead (random values around the current actuals).

        Returns:
            self, for chaining.
        """
        print("\nüìÇ T010: Loading historical data...")

        try:
            history = pd.read_csv(f"{Config.REFERENCE_PATH}KPI_Monthly_History.csv")
            self.historical_data = history
            print(f"   Loaded {len(history)} rows of historical data")

            # Standardize column names
            history.columns = [col.lower().strip() for col in history.columns]

            # Check for period column and rename if needed
            period_col = next((c for c in self._PERIOD_COLUMNS if c in history.columns), None)
            if period_col:
                if period_col != 'period':
                    history.rename(columns={period_col: 'period'}, inplace=True)
                print(f"   Using '{period_col}' as period column")
            else:
                # Create a synthetic period column if none exists
                print(f"   ‚ö†Ô∏è No period column found, creating synthetic periods")
                history['period'] = self._synthetic_periods(len(history))

            # Check for spend column and rename if needed
            spend_col = next((c for c in self._SPEND_COLUMNS if c in history.columns), None)
            if spend_col:
                if spend_col != 'total_spend':
                    history.rename(columns={spend_col: 'total_spend'}, inplace=True)
                print(f"   Using '{spend_col}' as spend column")
            else:
                # Create synthetic spend data
                print(f"   ‚ö†Ô∏è No spend column found, creating synthetic data")
                base_spend = self.variance.get('total_actual', 1000000)
                history['total_spend'] = [
                    base_spend * (0.8 + 0.4 * np.random.random())
                    for _ in range(len(history))
                ]

            print(f"   Historical data columns: {list(history.columns)}")

        except Exception as e:
            # Deliberate best-effort: any load/normalisation failure falls
            # back to a fabricated history so the forecast step can still run.
            print(f"   ‚ö†Ô∏è Historical data not found or error loading: {e}")
            months = []
            base_spend = self.variance.get('total_actual', 1000000)
            base_count = self.variance.get('transaction_count', 1000)

            for i in range(1, 13):
                # Walk from 11 months back up to the current month.
                month_num = Config.CURRENT_MONTH - (12 - i)
                year = Config.CURRENT_YEAR
                if month_num <= 0:
                    month_num += 12
                    year -= 1

                months.append({
                    'period': f"{year}-{month_num:02d}",
                    'total_spend': base_spend * (0.8 + 0.4 * np.random.random()),
                    'transaction_count': int(base_count * (0.8 + 0.4 * np.random.random()))
                })
            self.historical_data = pd.DataFrame(months)
            print(f"   Created synthetic historical data for {len(self.historical_data)} months")

        # Ensure period is string type for sorting
        self.historical_data['period'] = self.historical_data['period'].astype(str)

        return self

    def calculate_trends(self):
        """Derive the next-period forecast from ``self.historical_data``.

        Steps: sort by period, add a 3-period moving average and
        period-over-period growth, compute a seasonal ratio, then blend
        70% trend-based forecast with 30% of the current actual grown by 5%.
        The band is ``1.96 * std / sqrt(n)`` around the blended value (or
        +/-20% when there is at most one history row), floored at zero.

        Populates ``self.forecast`` and returns self.
        """
        # Sort by period
        try:
            self.historical_data = self.historical_data.sort_values('period')
        except Exception as e:
            # If sorting fails, assume data is already in order
            print(f"   ‚ö†Ô∏è Error sorting by period: {e}")

        history = self.historical_data
        row_count = len(history)

        # Calculate moving averages
        if row_count >= 3:
            history['spend_ma_3'] = history['total_spend'].rolling(3, min_periods=1).mean()
        else:
            history['spend_ma_3'] = history['total_spend']

        # Calculate growth rate; non-finite values (first row, division by a
        # zero-spend month) are excluded so they cannot poison the mean.
        if row_count >= 2:
            history['growth_rate'] = history['total_spend'].pct_change()
            avg_growth = self._finite_mean(history['growth_rate'], 0.02)
        else:
            avg_growth = 0.02  # Default 2% growth

        # Recent trend (last 3 months)
        recent_data = history.tail(3)
        recent_avg = recent_data['total_spend'].mean()

        if len(recent_data) >= 2:
            recent_growth = self._finite_mean(recent_data['growth_rate'], avg_growth)
        else:
            recent_growth = avg_growth

        # Seasonal adjustment: earliest history row whose period ends with
        # the current month number.
        # NOTE(review): this keys on the CURRENT month while the forecast is
        # for the NEXT period - confirm whether the next month was intended.
        current_month_str = f"{Config.CURRENT_MONTH:02d}"
        same_month_rows = history[history['period'].str.endswith(current_month_str)]

        seasonal_factor = 1.0
        if not same_month_rows.empty and recent_avg > 0:
            candidate = same_month_rows['total_spend'].iloc[0] / recent_avg
            if pd.notna(candidate):
                seasonal_factor = candidate

        # Calculate forecast for next period
        if Config.CURRENT_MONTH < 12:
            next_month_num = Config.CURRENT_MONTH + 1
            next_year = Config.CURRENT_YEAR
        else:
            next_month_num = 1
            next_year = Config.CURRENT_YEAR + 1
        next_period = f"{next_year}-{next_month_num:02d}"

        current_actual = self.variance.get('total_actual')

        # Empty / all-NaN history leaves recent_avg as NaN; fall back to the
        # current actual so the forecast is not NaN.
        if pd.isna(recent_avg):
            recent_avg = current_actual if current_actual is not None else 0.0

        # Base forecast on recent average with growth and seasonal adjustment
        base_forecast = recent_avg * (1 + recent_growth) * seasonal_factor

        # Adjust based on current month actual
        if current_actual is None:
            current_actual = base_forecast
        recent_avg = recent_avg if recent_avg > 0 else current_actual

        # Blend current and historical (70% recent trend, 30% current month with growth)
        blended_forecast = 0.7 * base_forecast + 0.3 * current_actual * 1.05  # Assume 5% growth

        # Calculate confidence interval
        if row_count > 1:
            std_dev = history['total_spend'].std()
            margin = 1.96 * std_dev / np.sqrt(row_count)
        else:
            margin = blended_forecast * 0.2

        lower_bound = max(0, blended_forecast - margin)
        upper_bound = blended_forecast + margin

        self.forecast = {
            'next_period': next_period,
            'next_month': next_month_num,
            'next_year': next_year,
            'forecast_amount': blended_forecast,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'confidence_level': 0.95,
            'method': 'Blended (70% trend, 30% current)',
            'historical_months_used': row_count,
            'avg_growth_rate': avg_growth,
            'seasonal_factor': seasonal_factor,
            'current_actual': current_actual,
            'recent_avg': recent_avg
        }

        print(f"\n   Forecast for {next_period}:")
        print(f"   Point forecast: ${self.forecast['forecast_amount']:,.2f}")
        print(f"   95% CI: (${self.forecast['lower_bound']:,.2f} - ${self.forecast['upper_bound']:,.2f})")

        return self

    def save_forecast(self):
        """Write the forecast (and an account-level split) to CSV.

        ``Forecast_<YYYYMM>.csv`` always gets the single-row summary. When
        the variance results carry a non-empty ``by_account`` list and a
        positive ``total_actual``, ``Forecast_By_Account_<YYYYMM>.csv``
        allocates the forecast across accounts in proportion to each
        account's positive ``actual_amount``.

        Returns:
            The forecast dict.
        """
        period_tag = self.forecast['next_period'].replace('-', '')
        summary_path = f"{Config.REPORTS_PATH}Forecast_{period_tag}.csv"

        # Save as CSV
        pd.DataFrame([self.forecast]).to_csv(summary_path, index=False)

        # Save detailed forecast with account-level breakdown
        if 'by_account' in self.variance and self.variance['by_account']:
            total_actual = self.variance.get('total_actual', 0)

            if total_actual > 0:
                account_proportions = []
                for a in self.variance['by_account']:
                    if a.get('actual_amount', 0) > 0:
                        proportion = a['actual_amount'] / total_actual
                        account_proportions.append({
                            'account_code': a.get('account_code_mapped', a.get('account_code', 'UNKNOWN')),
                            'account_description': a.get('account_description', 'Unknown'),
                            'current_actual': a['actual_amount'],
                            'forecast_proportion': proportion,
                            'forecast_amount': proportion * self.forecast['forecast_amount']
                        })

                if account_proportions:
                    pd.DataFrame(account_proportions).to_csv(
                        f"{Config.REPORTS_PATH}Forecast_By_Account_{period_tag}.csv",
                        index=False
                    )

        print(f"   üíæ Saved forecast to {Config.REPORTS_PATH}Forecast_{period_tag}.csv")

        return self.forecast

    def run(self):
        """Execute T010 steps: load history, compute the forecast, save it.

        Returns:
            The forecast dict returned by ``save_forecast``.
        """
        print("\n" + "="*60)
        print("üöÄ T010: Forecasting Next Period")
        print("="*60)

        self.load_historical()
        self.calculate_trends()
        forecast = self.save_forecast()

        print(f"\n‚úÖ T010 Complete.")

        return forecast
    

# Add this class before the main pipeline

# ============================================================================
# IMPROVED DATA VALIDATOR (FIXED MESSAGE)
# ============================================================================

class DataValidator:
    """Pre-flight check that the master-data and budget CSV files are present and readable."""

    @staticmethod
    def validate_all():
        """Check every required input file and report the findings on stdout.

        Each file must exist and be loadable with ``pd.read_csv``; the chart
        of accounts must additionally expose one of the recognised account
        code column names.

        Returns:
            True when no problem was recorded, False otherwise.
        """
        master_dir = Config.MASTER_DATA_PATH
        expected = [
            (f"{master_dir}Master_COA.csv", "Chart of Accounts"),
            (f"{master_dir}Master_Entity.csv", "Entity Master"),
            (f"{master_dir}Master_CostCenters.csv", "Cost Center Master"),
            (f"{Config.BUDGET_PATH}Budget_2026.csv", "Budget Data"),
        ]
        problems = []

        print("\nüìä DATA VALIDATION")
        print("-" * 40)

        for path, label in expected:
            if not os.path.exists(path):
                problems.append(f"‚ùå Missing {label}: {path}")
                continue

            try:
                frame = pd.read_csv(path)
                print(f"‚úÖ {label}: {len(frame)} rows")
                print(f"   Columns: {list(frame.columns)}")

                # The chart of accounts gets an extra header check.
                if "Master_COA.csv" in path:
                    code_candidates = ['Account_Code', 'account_code', 'AccountCode', 'Account', 'CODE']
                    match = next((c for c in code_candidates if c in frame.columns), None)
                    if match:
                        print(f"   ‚úì Found account code column: '{match}'")
                    else:
                        problems.append(f"   ‚ùå No account code column found in {path}. Found: {list(frame.columns)}")

            except Exception as e:
                problems.append(f"‚ùå Cannot read {label}: {e}")

        if not problems:
            print("\n‚úÖ All master data files validated successfully.\n")
            return True

        print("\n‚ö†Ô∏è DATA VALIDATION ISSUES FOUND:")
        for problem in problems:
            print(problem)
        print("\n‚úÖ Pipeline will continue but may use synthetic data where needed.\n")
        return False


# ============================================================================
# MAIN PIPELINE EXECUTION (WITH BUDGET ANALYSIS)
# ============================================================================

class FinancialCloseAgent:
    """Main agent orchestrating all tasks.

    ``run_pipeline`` executes T001-T010 in order, threading the working
    DataFrame from one task to the next and collecting every intermediate
    result in ``self.results``.
    """

    def __init__(self):
        self.results = {}
        self.start_time = datetime.now()

    @staticmethod
    def _pct_of_total(part, total):
        """Return ``part`` as a percentage of ``total``.

        Returns None when ``total`` is missing, NaN or zero, so callers can
        report "not available" instead of dividing by zero.
        """
        if total is None or pd.isna(total) or total == 0:
            return None
        return part / total * 100

    def run_pipeline(self):
        """Execute all tasks in sequence.

        Returns:
            ``self.results``: the per-task DataFrames (``df_t001`` ..
            ``df_t006``) plus ``anomalies``, ``exceptions``, ``review``,
            ``variance``, ``budget_data``, ``report``, ``narrative`` and
            ``forecast``.
        """
        print("\n" + "="*80)
        print("üöÄ FINANCIAL CLOSE AGENT PIPELINE")
        print(f"   Started: {self.start_time}")
        print("="*80 + "\n")

        # Validate data files (informational only; the result is not used
        # to stop the run).
        DataValidator.validate_all()

        # Task 001: Wrangle Raw Data
        wrangler = T001_DataWrangler()
        df, anomalies = wrangler.run(Config.RAW_DATA_PATH)
        self.results['df_t001'] = df
        self.results['anomalies'] = anomalies

        # Task 002: Map Entities and Accounts
        mapper = T002_EntityAccountMapper(df)
        df = mapper.run()
        self.results['df_t002'] = df

        # Task 003: Resolve Vendors
        resolver = T003_VendorResolver(df)
        df = resolver.run()
        self.results['df_t003'] = df

        # Task 004: FX Conversion
        converter = T004_FXConverter(df)
        df = converter.run()
        self.results['df_t004'] = df

        # Task 005: Detect Exceptions
        detector = T005_ExceptionDetector(df)
        df, exceptions = detector.run()
        self.results['df_t005'] = df
        self.results['exceptions'] = exceptions

        # Task 006: Review Exceptions (Automated)
        reviewer = T006_ExceptionReviewer(df, exceptions)
        df, review = reviewer.run()
        self.results['df_t006'] = df
        self.results['review'] = review

        # Task 007: Budget Variance
        variance = T007_BudgetVariance(df)
        variance_results = variance.run()
        self.results['variance'] = variance_results
        self.results['budget_data'] = variance.budget_data  # Store budget data for analysis

        # Add budget coverage analysis (reads self.results['variance'], so it
        # must run after the assignment above).
        self.analyze_budget_coverage(df, variance.budget_data)

        # Task 008: Close Pack Report
        report = T008_ClosePackReport(df, variance_results, exceptions)
        report_data = report.run()
        self.results['report'] = report_data

        # Task 009: Executive Narrative
        narrative = T009_ExecutiveNarrative(variance_results, report_data, exceptions)
        narrative_text = narrative.run()
        self.results['narrative'] = narrative_text

        # Task 010: Forecast
        forecast = T010_Forecast(df, variance_results)
        forecast_data = forecast.run()
        self.results['forecast'] = forecast_data

        # Completion
        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds()

        print("\n" + "="*80)
        print("‚úÖ PIPELINE COMPLETE")
        print(f"   Finished: {end_time}")
        print(f"   Duration: {duration:.2f} seconds")
        print("="*80)

        return self.results

    def analyze_budget_coverage(self, df, budget_data):
        """Analyze budget coverage and identify gaps.

        Compares the accounts with activity in the current fiscal period
        (``df['account_code_mapped']``) against the accounts that have a
        budget row for that period (``budget_data['account_code']``) and
        prints both directions of the mismatch.

        Args:
            df: Working transaction frame; the columns read are
                ``fiscal_period``, ``account_code_mapped`` and ``amount_aud``.
            budget_data: Budget frame with ``period`` and ``account_code``
                columns, or None / empty to skip the analysis.
        """
        print("\n" + "="*60)
        print("üìä BUDGET COVERAGE ANALYSIS")
        print("="*60)

        if budget_data is None or budget_data.empty:
            print("‚ö†Ô∏è No budget data available for analysis")
            return

        in_period = df['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD

        # Get unique accounts with activity in current period
        active_accounts = df[in_period & df['account_code_mapped'].notna()]['account_code_mapped'].unique()

        print(f"Active accounts in {Config.CURRENT_FISCAL_PERIOD}: {len(active_accounts)}")

        # Get accounts with budget in current period
        budget_accounts = budget_data[
            budget_data['period'] == Config.CURRENT_FISCAL_PERIOD
        ]['account_code'].unique()

        print(f"Accounts with budget: {len(budget_accounts)}")

        # Find accounts missing budget
        missing_budget = set(active_accounts) - set(budget_accounts)
        if missing_budget:
            print(f"\n‚ö†Ô∏è {len(missing_budget)} active accounts have no budget:")
            # Show a stable sample (raw set order varies between runs)
            sample_missing = sorted(missing_budget, key=str)[:10]
            print(f"   Sample: {sample_missing}")

            # Calculate total spend in missing budget accounts
            missing_spend = df[in_period & df['account_code_mapped'].isin(missing_budget)]['amount_aud'].sum()

            print(f"   Total spend in unbudgeted accounts: ${missing_spend:,.2f}")

            # Guard the percentage: total_actual can be zero or the variance
            # result may not have been stored yet.
            variance = self.results.get('variance') or {}
            share = self._pct_of_total(missing_spend, variance.get('total_actual'))
            if share is None:
                print("   Share of total spend not available (total actual spend is zero or missing)")
            else:
                print(f"   This represents {share:.1f}% of total spend")
        else:
            print("\n‚úÖ All active accounts have budget assigned")

        # Find budgeted accounts with no activity
        inactive_budget = set(budget_accounts) - set(active_accounts)
        if inactive_budget:
            print(f"\n‚ÑπÔ∏è {len(inactive_budget)} budgeted accounts have no activity:")
            sample_inactive = sorted(inactive_budget, key=str)[:10]
            print(f"   Sample: {sample_inactive}")

        print("\n" + "="*60)


# ============================================================================
# EXECUTE THE PIPELINE
# ============================================================================

if __name__ == "__main__":
    # Script entry point: make sure the output folders exist, run the whole
    # pipeline once and print a summary of the collected results.

    # Create directories if they don't exist
    for path in [Config.OUTPUT_PATH, Config.REPORTS_PATH]:
        os.makedirs(path, exist_ok=True)

    # Run the agent
    agent = FinancialCloseAgent()
    results = agent.run_pipeline()

    exceptions = results['exceptions']
    variance_summary = results['variance']
    critical_total = sum(1 for e in exceptions if e.get('severity') == 'CRITICAL')
    high_total = sum(1 for e in exceptions if e.get('severity') == 'HIGH')

    # Print final summary
    print("\n" + "="*80)
    print("üìä FINAL SUMMARY")
    print("="*80)
    print(f"Total transactions processed: {len(results['df_t001'])}")
    print(f"Total exceptions found: {len(exceptions)}")
    print(f"Critical exceptions: {critical_total}")
    print(f"High exceptions: {high_total}")
    print(f"Total spend: ${variance_summary['total_actual']:,.2f}")
    print(f"Budget variance: ${variance_summary['total_variance']:,.2f} ({variance_summary['total_variance_pct']:.1f}%)")
    print(f"Suspense amount (invalid accounts): ${variance_summary['suspense_amount']:,.2f}")
    print(f"Forecast for next period: ${results['forecast']['forecast_amount']:,.2f}")

    # Add budget coverage summary to final output
    budget_data = results.get('budget_data')
    if budget_data is not None:
        df = results['df_t006']

        # Work with the account SETS, not their sizes: subtracting two counts
        # does not tell how many active accounts lack a budget.
        active_codes = set(
            df[df['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD]['account_code_mapped'].dropna().unique()
        )
        budgeted_codes = set(
            budget_data[budget_data['period'] == Config.CURRENT_FISCAL_PERIOD]['account_code'].unique()
        )

        # Counts are computed outside the f-strings; a replacement field that
        # spans several lines is a SyntaxError before Python 3.12.
        with_budget = len(active_codes & budgeted_codes)
        without_budget = len(active_codes - budgeted_codes)

        print(f"\nüìä BUDGET COVERAGE:")
        print(f"   Active accounts with budget: {with_budget}")
        print(f"   Active accounts without budget: {without_budget}")

    print("\nOutput files saved to:")
    print(f"  ‚Ä¢ Working data: {Config.OUTPUT_PATH}")
    print(f"  ‚Ä¢ Reports: {Config.REPORTS_PATH}")
    print("="*80)


🚀 FINANCIAL CLOSE AGENT PIPELINE
   Started: 2026-02-22 23:13:20.397645


📊 DATA VALIDATION
----------------------------------------
✅ Chart of Accounts: 28 rows
   Columns: ['Account_Code', 'Account_Name', 'Account_Type', 'Category', 'Active']
   ✓ Found account code column: 'Account_Code'
✅ Entity Master: 1 rows
   Columns: ['Entity', 'Entity_Name', 'Country', 'Currency', 'Active']
✅ Cost Center Master: 10 rows
   Columns: ['Cost_Center', 'Cost_Center_Name', 'Department', 'Manager', 'Active']
✅ Budget Data: 60 rows
   Columns: ['Fiscal_Period', 'Entity', 'Account_Code', 'Cost_Center', 'Budget_Amount_AUD', 'Budget_Type', 'Notes']

✅ All master data files validated successfully.


🚀 T001: Wrangling Raw GL Data
📂 T001: Loading raw GL data...
   Loaded 4080 rows
   ✓ Column names standardized
   ✓ Dates standardized. Invalid dates: 48
   ✓ Amounts cleaned. Negative amounts: 96
   ✓ Embedded exceptions detected: 0
   💾 Saved 4080 rows to working/GL_Stan

In [16]:
"""
Financial Close Agent - Complete Pipeline
Processes Raw GL Export through all 10 tasks without human intervention
"""

import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import os
import logging
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION AND SETUP
# ============================================================================

class Config:
    """Static settings shared by every task of the agent."""

    # --- Input locations ---
    RAW_DATA_PATH = "Raw_GL_Export.csv"
    MASTER_DATA_PATH = "Master_Data/"
    REFERENCE_PATH = "Reference/"
    BUDGET_PATH = "Budget/"

    # --- Output locations ---
    OUTPUT_PATH = "working/"
    REPORTS_PATH = "reports/"

    # --- Fiscal period being closed ---
    CURRENT_YEAR = 2026
    CURRENT_MONTH = 2
    # "YYYY-MM" label built from the two values above ("2026-02").
    CURRENT_FISCAL_PERIOD = f"{CURRENT_YEAR}-{CURRENT_MONTH:02d}"

    # --- Anomaly thresholds ---
    HIGH_VALUE_THRESHOLD = 50000
    EXTREME_OUTLIER_MULTIPLIER = 5
    SUSPICIOUS_HOUR_START = 22
    SUSPICIOUS_HOUR_END = 6

# ============================================================================
# T001: WRANGLE RAW GL DATA
# ============================================================================

class T001_DataWrangler:
    """Task 1: Parse and standardize raw GL export data.

    ``run`` chains: load CSV -> rename columns -> parse dates -> parse
    amounts -> scan for embedded exceptions -> add metadata -> save.
    Every data problem found along the way is appended to ``anomaly_log``
    as a dict with ``transaction_id``, ``anomaly_type``, ``severity`` and
    ``description`` (plus a few type-specific extra keys).
    """

    # Raw values that are recorded as INVALID_DATE without a parse attempt.
    _INVALID_DATE_VALUES = ('INVALID', '99/99/9999', '32/13/2026', '2026-13-45')

    # Formats are tried in this order; the first one that parses wins.
    _DATE_FORMATS = (
        '%d-%m-%Y', '%Y-%m-%d', '%d/%m/%Y', '%m/%d/%Y',
        '%d/%m/%y', '%m/%d/%y', '%d-%m-%y', '%y-%m-%d'
    )

    _EXCEPTION_KEYWORDS = ('error', 'flag', 'review', 'urgent', 'exception', 'invalid')

    _PLACEHOLDER_VENDORS = ('Unlisted Company', 'Unknown Vendor LLC',
                            'New Vendor XYZ', 'Unregistered Supplier',
                            'Mystery Corp')

    def __init__(self):
        self.raw_df = None
        self.standardized_df = None
        self.anomaly_log = []
        # File name stamped on each row by add_metadata; replaced by the
        # real name once load_raw_data has been called.
        self.source_file = 'Raw_GL_Export.csv'

    def _log_anomaly(self, transaction_id, anomaly_type, severity, description, **extra):
        """Append one anomaly record; ``extra`` keys follow the four standard ones."""
        record = {
            'transaction_id': transaction_id,
            'anomaly_type': anomaly_type,
            'severity': severity,
            'description': description
        }
        record.update(extra)
        self.anomaly_log.append(record)

    def _parse_date(self, date_str, txn_id, column_name):
        """Return a ``datetime`` for ``date_str`` or None (and log an anomaly)."""
        if pd.isna(date_str) or date_str in self._INVALID_DATE_VALUES:
            self._log_anomaly(txn_id, 'INVALID_DATE', 'CRITICAL',
                              f"Invalid date value: {date_str}", column=column_name)
            return None

        text = str(date_str)
        for fmt in self._DATE_FORMATS:
            try:
                return datetime.strptime(text, fmt)
            except ValueError:
                # Wrong format for this value; try the next one.
                continue

        # If all formats fail
        self._log_anomaly(txn_id, 'UNPARSABLE_DATE', 'CRITICAL',
                          f"Cannot parse date: {date_str}", column=column_name)
        return None

    def _parse_amount(self, amt_str, txn_id):
        """Return ``amt_str`` as a float, or None when missing / unparsable."""
        if pd.isna(amt_str):
            return None

        # Remove currency symbols, commas, spaces
        cleaned = str(amt_str).replace('$', '').replace(',', '').strip()

        # Handle negative numbers in parentheses
        if cleaned.startswith('(') and cleaned.endswith(')'):
            cleaned = '-' + cleaned[1:-1]

        try:
            return float(cleaned)
        except ValueError:
            self._log_anomaly(txn_id, 'INVALID_AMOUNT', 'HIGH',
                              f"Cannot parse amount: {amt_str}")
            return None

    def load_raw_data(self, filepath):
        """Load raw CSV file into ``self.raw_df`` and remember its file name."""
        print("üìÇ T001: Loading raw GL data...")
        self.raw_df = pd.read_csv(filepath)
        self.source_file = os.path.basename(str(filepath))
        print(f"   Loaded {len(self.raw_df)} rows")
        return self

    def standardize_column_names(self):
        """Convert column names to snake_case (result in ``self.standardized_df``)."""
        column_mapping = {
            'Txn_ID': 'transaction_id',
            'Posting_Date_Raw': 'posting_date_raw',
            'Invoice_Date_Raw': 'invoice_date_raw',
            'Fiscal_Period': 'fiscal_period',
            'Entity': 'entity_code',
            'Account_Code_Raw': 'account_code_raw',
            'Cost_Center_Raw': 'cost_center_raw',
            'Vendor_Name_Raw': 'vendor_name_raw',
            'Invoice_Number': 'invoice_number',
            'PO_Number': 'po_number',
            'Currency': 'currency_code',
            'Amount': 'amount_raw',
            'Tax_Code': 'tax_code',
            'Narrative': 'narrative',
            'Source_System': 'source_system'
        }
        self.standardized_df = self.raw_df.rename(columns=column_mapping)
        print("   ‚úì Column names standardized")
        return self

    def standardize_dates(self):
        """Parse posting/invoice dates and check them against the fiscal period.

        Adds ``posting_date``, ``invoice_date``, ``fiscal_year`` and
        ``fiscal_month``. Rows whose posting month differs from the fiscal
        period month are logged as FISCAL_PERIOD_MISMATCH; a fiscal period
        whose last two characters are not a number is logged as
        INVALID_FISCAL_PERIOD instead of aborting the run.
        """
        df = self.standardized_df
        txn_ids = df['transaction_id']

        # All posting dates are parsed before any invoice date, which fixes
        # the order of entries in the anomaly log.
        df['posting_date'] = [
            self._parse_date(raw, txn_id, 'posting_date_raw')
            for raw, txn_id in zip(df['posting_date_raw'], txn_ids)
        ]
        df['invoice_date'] = [
            self._parse_date(raw, txn_id, 'invoice_date_raw')
            for raw, txn_id in zip(df['invoice_date_raw'], txn_ids)
        ]

        # Extract fiscal year and month
        df['fiscal_year'] = df['fiscal_period'].str[:4]
        df['fiscal_month'] = df['fiscal_period'].str[-2:]

        # Check fiscal period consistency
        rows = zip(txn_ids, df['posting_date'], df['fiscal_period'], df['fiscal_month'])
        for txn_id, posting_date, fiscal_period, month_text in rows:
            if pd.isna(posting_date) or pd.isna(month_text):
                continue

            try:
                fiscal_month = int(month_text)
            except ValueError:
                self._log_anomaly(txn_id, 'INVALID_FISCAL_PERIOD', 'HIGH',
                                  f"Cannot read month from fiscal period: {fiscal_period}",
                                  fiscal_period=fiscal_period)
                continue

            posting_month = posting_date.month
            if fiscal_month and posting_month != fiscal_month:
                self._log_anomaly(
                    txn_id, 'FISCAL_PERIOD_MISMATCH', 'HIGH',
                    f"Posting date month ({posting_month}) != fiscal period month ({fiscal_month})",
                    posting_date=posting_date,
                    fiscal_period=fiscal_period
                )

        print(f"   ‚úì Dates standardized. Invalid dates: {sum(df['posting_date'].isna())}")
        return self

    def clean_amounts(self):
        """Convert amount strings to floats and flag negative values."""
        df = self.standardized_df

        parsed = [
            self._parse_amount(raw, txn_id)
            for raw, txn_id in zip(df['amount_raw'], df['transaction_id'])
        ]
        # Explicit float dtype turns None into NaN, so the comparison below
        # also works when every amount is missing.
        df['amount'] = pd.Series(parsed, index=df.index, dtype='float64')

        # Flag negative amounts
        df['amount_is_negative'] = df['amount'] < 0
        negatives = df[df['amount_is_negative']]
        for txn_id, amount in zip(negatives['transaction_id'], negatives['amount']):
            self._log_anomaly(txn_id, 'NEGATIVE_AMOUNT', 'MEDIUM',
                              f"Negative amount: {amount}", amount=amount)

        print(f"   ‚úì Amounts cleaned. Negative amounts: {df['amount_is_negative'].sum()}")
        return self

    def detect_embedded_exceptions(self):
        """Look for obvious exceptions in raw data.

        Flags narratives containing one of the exception keywords and
        vendor names that match a known placeholder.
        """
        df = self.standardized_df

        df['narrative_lower'] = df['narrative'].str.lower().fillna('')

        rows = zip(df['transaction_id'], df['narrative'], df['narrative_lower'], df['vendor_name_raw'])
        for txn_id, narrative, narrative_lower, vendor in rows:
            # Check narrative for keywords
            if any(keyword in str(narrative_lower) for keyword in self._EXCEPTION_KEYWORDS):
                self._log_anomaly(txn_id, 'NARRATIVE_SUGGESTS_EXCEPTION', 'MEDIUM',
                                  f"Narrative contains exception keywords: {narrative}",
                                  narrative=narrative)

            # Check for placeholder vendor names
            if vendor in self._PLACEHOLDER_VENDORS:
                self._log_anomaly(txn_id, 'PLACEHOLDER_VENDOR', 'HIGH',
                                  f"Placeholder vendor name: {vendor}",
                                  vendor=vendor)

        print(f"   ‚úì Embedded exceptions detected: {len([a for a in self.anomaly_log if a['anomaly_type'] == 'NARRATIVE_SUGGESTS_EXCEPTION'])}")
        return self

    def add_metadata(self):
        """Add processing metadata.

        ``data_quality_score`` is one value for the whole file
        (100 minus anomalies per row, in percent); ``anomaly_count`` is the
        number of logged anomalies for each row's transaction id.
        """
        from collections import Counter

        df = self.standardized_df
        df['processing_timestamp'] = datetime.now()
        df['source_file'] = self.source_file
        df['data_quality_score'] = 100 - (len(self.anomaly_log) / len(df) * 100) if len(df) > 0 else 100

        # Count anomalies per transaction once instead of rescanning the log
        # for every row. Missing ids never matched in the old comparison, so
        # they are left out of the tally.
        per_transaction = Counter()
        for anomaly in self.anomaly_log:
            txn_id = anomaly.get('transaction_id')
            if txn_id is not None and not pd.isna(txn_id):
                per_transaction[txn_id] += 1
        df['anomaly_count'] = [
            0 if pd.isna(txn_id) else per_transaction.get(txn_id, 0)
            for txn_id in df['transaction_id']
        ]
        return self

    def save_output(self):
        """Save standardized data and anomaly log.

        Writes ``GL_Standardized.csv`` to ``Config.OUTPUT_PATH`` and, when
        the log is non-empty, ``Input_Anomalies_Detected.csv`` to
        ``Config.REPORTS_PATH``.

        Returns:
            Tuple ``(standardized_df, anomaly_log)``.
        """
        os.makedirs(Config.OUTPUT_PATH, exist_ok=True)
        os.makedirs(Config.REPORTS_PATH, exist_ok=True)

        # Save standardized data
        output_cols = ['transaction_id', 'posting_date_raw', 'posting_date', 'invoice_date_raw',
                       'invoice_date', 'fiscal_period', 'fiscal_year', 'fiscal_month',
                       'entity_code', 'account_code_raw', 'cost_center_raw', 'vendor_name_raw',
                       'invoice_number', 'po_number', 'currency_code', 'amount_raw', 'amount',
                       'amount_is_negative', 'tax_code', 'narrative', 'source_system',
                       'processing_timestamp', 'data_quality_score', 'anomaly_count']

        # Only include columns that exist
        available_cols = [col for col in output_cols if col in self.standardized_df.columns]
        self.standardized_df[available_cols].to_csv(
            f"{Config.OUTPUT_PATH}GL_Standardized.csv", index=False
        )

        # Save anomaly log
        if self.anomaly_log:
            pd.DataFrame(self.anomaly_log).to_csv(
                f"{Config.REPORTS_PATH}Input_Anomalies_Detected.csv", index=False
            )

        print(f"   üíæ Saved {len(self.standardized_df)} rows to {Config.OUTPUT_PATH}GL_Standardized.csv")
        print(f"   üíæ Saved {len(self.anomaly_log)} anomalies to {Config.REPORTS_PATH}Input_Anomalies_Detected.csv")

        return self.standardized_df, self.anomaly_log

    def run(self, filepath):
        """Execute all T001 steps.

        Args:
            filepath: Path of the raw GL export CSV.

        Returns:
            Tuple ``(standardized_df, anomaly_log)`` from ``save_output``.
        """
        print("\n" + "="*60)
        print("üöÄ T001: Wrangling Raw GL Data")
        print("="*60)

        self.load_raw_data(filepath)
        self.standardize_column_names()
        self.standardize_dates()
        self.clean_amounts()
        self.detect_embedded_exceptions()
        self.add_metadata()
        df, anomalies = self.save_output()

        print(f"\n‚úÖ T001 Complete. Processed {len(df)} rows, found {len(anomalies)} anomalies.")
        return df, anomalies


# ============================================================================
# T002: MAP ENTITIES AND ACCOUNTS (FIXED FOR YOUR COLUMN NAMES)
# ============================================================================

class T002_EntityAccountMapper:
    """Task 2: Resolve entity codes and account codes against master data.

    The mapper works on a copy of the DataFrame it is given. Each ``map_*``
    method adds validity flags and a ``*_mapped`` column to ``self.df`` and
    appends one dict per failed lookup to ``self.mapping_anomalies``.
    """

    # Header written when there is nothing to report, so the exceptions log
    # always has columns that a later ``pd.read_csv`` can parse.
    _EMPTY_LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description']

    def __init__(self, working_df):
        """Copy ``working_df`` and start with no master data and no anomalies."""
        self.df = working_df.copy()
        self.entity_master = None
        self.account_master = None
        self.cost_center_master = None
        self.mapping_anomalies = []

    @staticmethod
    def _to_float(value):
        """Return ``float(value)``, or ``None`` when it cannot be converted."""
        try:
            return float(value)
        except (TypeError, ValueError, OverflowError):
            return None

    def load_master_data(self):
        """Load the entity, account and cost center master files.

        Each file is read independently. When a file cannot be read or
        normalised, a small built-in default table is used instead so the
        remaining steps can still run.

        Returns:
            ``self``, to allow chaining.
        """
        print("\nüìÇ T002: Loading master data...")

        try:
            self.entity_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_Entity.csv")
            print(f"   Loaded {len(self.entity_master)} entities")
            print(f"   Entity columns: {list(self.entity_master.columns)}")
        except Exception:
            # Best-effort fallback; Exception (not a bare except) so that
            # KeyboardInterrupt / SystemExit still propagate.
            print("   ‚ö†Ô∏è Entity master not found, creating default")
            self.entity_master = pd.DataFrame({'entity_code': ['AUS01']})

        try:
            self.account_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_COA.csv")
            print(f"   Loaded {len(self.account_master)} accounts")
            print(f"   Account columns: {list(self.account_master.columns)}")

            # Lowercase/strip headers so the lookups below are case-insensitive.
            self.account_master.columns = [col.lower().strip() for col in self.account_master.columns]

            columns = list(self.account_master.columns)
            if 'account_code' in columns:
                source_col = 'account_code'
            else:
                # Known alternative header names, in order of preference.
                source_col = next((c for c in ('account', 'code') if c in columns), None)
                if source_col is None:
                    print(f"   ‚ö†Ô∏è Could not find account code column. Using first column as account_code")
                    source_col = columns[0]
                self.account_master.rename(columns={source_col: 'account_code'}, inplace=True)

            # Report the column that was actually selected, which is not
            # necessarily the first column of the file.
            print(f"   Using '{source_col}' as account code column")

        except Exception as e:
            print(f"   ‚ö†Ô∏è Account master not found or error: {e}")
            print("   Creating default account master")
            self.account_master = pd.DataFrame({'account_code': [f"{i:04d}" for i in range(5000, 5029)]})

        try:
            self.cost_center_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_CostCenters.csv")
            print(f"   Loaded {len(self.cost_center_master)} cost centers")
            print(f"   Cost center columns: {list(self.cost_center_master.columns)}")

            # Same header normalisation as for the account master.
            self.cost_center_master.columns = [col.lower().strip() for col in self.cost_center_master.columns]

            columns = list(self.cost_center_master.columns)
            if 'cost_center' not in columns:
                # Known alternatives first, otherwise fall back to the first column.
                source_col = next((c for c in ('costcenter', 'cc') if c in columns), columns[0])
                self.cost_center_master.rename(columns={source_col: 'cost_center'}, inplace=True)

        except Exception as e:
            print(f"   ‚ö†Ô∏è Cost center master not found or error: {e}")
            print("   Creating default cost center master")
            self.cost_center_master = pd.DataFrame({'cost_center': ['CC' + str(i).zfill(4) for i in range(1000, 1010)]})

        return self

    def map_entities(self):
        """Flag each row's ``entity_code`` as valid or not against the master.

        Adds ``entity_valid`` and ``entity_code_mapped`` (the code when valid,
        otherwise ``None``) and logs an ``INVALID_ENTITY`` anomaly per invalid
        row.

        Returns:
            ``self``, to allow chaining.
        """
        if 'entity_code' not in self.entity_master.columns:
            # Use the first column whose name mentions "entity" or "code".
            for col in self.entity_master.columns:
                col_name = str(col).lower()
                if 'entity' in col_name or 'code' in col_name:
                    self.entity_master.rename(columns={col: 'entity_code'}, inplace=True)
                    break

        if 'entity_code' in self.entity_master.columns:
            valid_entities = self.entity_master['entity_code'].tolist()
        else:
            valid_entities = ['AUS01']

        self.df['entity_valid'] = self.df['entity_code'].isin(valid_entities)
        self.df['entity_code_mapped'] = np.where(
            self.df['entity_valid'],
            self.df['entity_code'],
            None
        )

        for _, row in self.df[~self.df['entity_valid']].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_ENTITY',
                'severity': 'CRITICAL',
                'description': f"Entity code '{row['entity_code']}' not in master",
                'original_value': row['entity_code']
            })

        print(f"   ‚úì Entities mapped. Invalid: {(~self.df['entity_valid']).sum()}")
        return self

    def map_accounts(self):
        """Validate ``account_code_raw`` against the chart of accounts.

        A row is valid when, in this order, (1) the raw value is a master
        code, (2) the stripped string form of the raw value is a master code,
        or (3) the raw value and a master code are numerically equal.
        ``account_code_mapped`` holds the matched code (``None`` for invalid
        rows). Missing master codes are ignored, so a row with no account can
        never be treated as valid. When the master has an ``account_name``
        column, ``account_description`` is added as well.

        Returns:
            ``self``, to allow chaining.
        """
        if 'account_code' in self.account_master.columns:
            master_codes = self.account_master['account_code'].dropna()
            # De-duplicate while keeping master order so output is deterministic.
            valid_accounts = list(dict.fromkeys(str(code).strip() for code in master_codes))
            print(f"   Sample valid accounts: {valid_accounts[:5]}")
        else:
            print("   ‚ö†Ô∏è No account_code column found in master")
            valid_accounts = []

        # Numeric value -> master code; first master code wins on ties.
        numeric_lookup = {}
        for acct in valid_accounts:
            number = self._to_float(acct)
            if number is not None and number == number:  # skip NaN
                numeric_lookup.setdefault(number, acct)

        raw = self.df['account_code_raw']
        clean = raw.astype(str).str.strip()
        self.df['account_code_clean'] = clean

        # Strategy 1: direct match; Strategy 2: match after stripping whitespace.
        direct_match = raw.isin(valid_accounts).to_numpy(dtype=bool)
        clean_match = ~direct_match & clean.isin(valid_accounts).to_numpy(dtype=bool)

        # Strategy 3: numeric equality, only for rows still unresolved.
        unresolved = ~(direct_match | clean_match)
        numeric_code = np.full(len(self.df), None, dtype=object)
        if numeric_lookup and unresolved.any():
            numeric_code[unresolved] = [
                numeric_lookup.get(self._to_float(value)) for value in raw[unresolved]
            ]
        numeric_match = unresolved & np.array([code is not None for code in numeric_code], dtype=bool)

        self.df['account_valid'] = direct_match | clean_match | numeric_match
        self.df['account_code_mapped'] = np.where(
            direct_match,
            raw.to_numpy(dtype=object),
            np.where(
                clean_match,
                clean.to_numpy(dtype=object),
                np.where(numeric_match, numeric_code, None)
            )
        )

        if 'account_name' in self.account_master.columns:
            account_desc_map = {}
            for code, desc in zip(self.account_master['account_code'], self.account_master['account_name']):
                if pd.isna(code):
                    continue
                acct = str(code).strip()
                account_desc_map[acct] = desc
                # Also register the code without leading zeros.
                if acct.isdigit():
                    account_desc_map[str(int(acct))] = desc

            self.df['account_description'] = self.df['account_code_mapped'].map(account_desc_map)
            print(f"   Added account descriptions")

        invalid_rows = self.df[~self.df['account_valid']]
        invalid_count = len(invalid_rows)
        for _, row in invalid_rows.iterrows():
            severity = 'CRITICAL' if str(row['account_code_raw']) == 'INVALID_ACCT' else 'HIGH'
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_ACCOUNT',
                'severity': severity,
                'description': f"Account code '{row['account_code_raw']}' not in Chart of Accounts",
                'original_value': row['account_code_raw'],
                'amount': row['amount']
            })

        print(f"   ‚úì Accounts mapped. Valid: {self.df['account_valid'].sum()}, Invalid: {invalid_count}")
        return self

    def map_cost_centers(self):
        """Validate ``cost_center_raw`` against the cost center master.

        Adds ``cost_center_present`` (not null and not an empty string),
        ``cost_center_valid`` and ``cost_center_mapped``. When the master
        provides no cost centers, every present value is accepted as valid.
        Logs ``MISSING_COST_CENTER`` and ``INVALID_COST_CENTER`` anomalies.

        Returns:
            ``self``, to allow chaining.
        """
        if 'cost_center' in self.cost_center_master.columns:
            valid_centers = self.cost_center_master['cost_center'].tolist()
        else:
            valid_centers = []

        raw = self.df['cost_center_raw']
        self.df['cost_center_present'] = raw.notna() & (raw != '')
        if valid_centers:
            self.df['cost_center_valid'] = raw.isin(valid_centers)
        else:
            self.df['cost_center_valid'] = self.df['cost_center_present']
        self.df['cost_center_mapped'] = np.where(
            self.df['cost_center_valid'],
            self.df['cost_center_raw'],
            None
        )

        missing_mask = ~self.df['cost_center_present']
        invalid_mask = self.df['cost_center_present'] & ~self.df['cost_center_valid']

        for _, row in self.df[missing_mask].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'MISSING_COST_CENTER',
                'severity': 'MEDIUM',
                'description': "Cost center is missing",
                'amount': row['amount']
            })

        for _, row in self.df[invalid_mask].iterrows():
            self.mapping_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'INVALID_COST_CENTER',
                'severity': 'HIGH',
                'description': f"Cost center '{row['cost_center_raw']}' not in master",
                'original_value': row['cost_center_raw']
            })

        print(f"   ‚úì Cost centers mapped. Missing: {missing_mask.sum()}, Invalid: {invalid_mask.sum()}")
        return self

    def save_output(self):
        """Write the exceptions log and the mapped data to disk.

        The exceptions log is the input anomalies file (when it exists and
        has content) followed by this task's mapping anomalies. When there is
        nothing to report, a header-only file is written so the log can
        still be parsed as CSV.

        Returns:
            The mapped DataFrame (``self.df``).
        """
        input_path = f"{Config.REPORTS_PATH}Input_Anomalies_Detected.csv"

        frames = []
        if os.path.exists(input_path):
            try:
                frames.append(pd.read_csv(input_path))
            except pd.errors.EmptyDataError:
                # File exists but has no parseable content: nothing to carry over.
                pass
        frames.append(pd.DataFrame(self.mapping_anomalies))

        # Drop empty frames so concat never sees an all-empty input list entry.
        frames = [frame for frame in frames if not frame.empty]
        if frames:
            all_anomalies = pd.concat(frames, ignore_index=True)
        else:
            all_anomalies = pd.DataFrame(columns=self._EMPTY_LOG_COLUMNS)

        all_anomalies.to_csv(f"{Config.REPORTS_PATH}Exceptions_Log.csv", index=False)

        # Save enriched data
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_WithMappings.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_WithMappings.csv")
        print(f"   üíæ Updated exceptions log with {len(self.mapping_anomalies)} new anomalies")

        return self.df

    def run(self):
        """Execute all T002 steps and return the mapped DataFrame."""
        print("\n" + "="*60)
        print("üöÄ T002: Mapping Entities and Accounts")
        print("="*60)

        self.load_master_data()
        self.map_entities()
        self.map_accounts()
        self.map_cost_centers()
        df = self.save_output()

        print(f"\n‚úÖ T002 Complete. Mapped {len(df)} transactions.")
        return df


# ============================================================================
# T003: RESOLVE VENDOR NAMES (FIXED FOR YOUR COLUMN NAMES)
# ============================================================================

class T003_VendorResolver:
    """Task 3: Map vendor aliases to canonical vendor names.

    Works on a copy of the DataFrame it is given. ``resolve_vendors`` adds
    ``vendor_canonical`` and ``vendor_resolution_status`` to ``self.df`` and
    records missing/unmapped vendors in ``self.vendor_anomalies``.
    """

    # Trailing tokens removed to create extra lookup keys.
    _ALIAS_SUFFIXES = (' pty', ' ltd', ' inc', ' corp', ' llc', ' australia', ' usa', ' uk')
    _CANONICAL_SUFFIXES = (' pty', ' ltd', ' inc', ' corp', ' llc')

    # Header written when there is nothing to report, so the exceptions log
    # always has columns that a later ``pd.read_csv`` can parse.
    _EMPTY_LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description']

    def __init__(self, working_df):
        """Copy ``working_df`` and start with no vendor data and no anomalies."""
        self.df = working_df.copy()
        self.vendor_master = None
        self.alias_map = None
        self.vendor_anomalies = []

    def load_vendor_data(self):
        """Load the vendor master and the alias map.

        Headers are lowercased and the known column names are renamed to
        ``canonical_vendor`` / ``alias``. When a file cannot be loaded, a
        minimal default table is used instead.

        Returns:
            ``self``, to allow chaining.
        """
        print("\nüìÇ T003: Loading vendor data...")

        try:
            self.vendor_master = pd.read_csv(f"{Config.MASTER_DATA_PATH}Master_Vendors.csv")
            print(f"   Loaded {len(self.vendor_master)} canonical vendors")
            print(f"   Vendor master columns: {list(self.vendor_master.columns)}")

            # Standardize column names
            self.vendor_master.columns = [col.lower().strip() for col in self.vendor_master.columns]

            # Map to expected column names
            if 'vendor_name_canonical' in self.vendor_master.columns:
                self.vendor_master.rename(columns={'vendor_name_canonical': 'canonical_vendor'}, inplace=True)
                print(f"   Using 'vendor_name_canonical' as canonical vendor column")
            elif 'vendor_name' in self.vendor_master.columns:
                self.vendor_master.rename(columns={'vendor_name': 'canonical_vendor'}, inplace=True)

        except Exception as e:
            print(f"   ‚ö†Ô∏è Vendor master not found or error: {e}")
            print("   Creating default vendor master")
            self.vendor_master = pd.DataFrame({'canonical_vendor': ['Unknown']})

        try:
            self.alias_map = pd.read_csv(f"{Config.MASTER_DATA_PATH}Vendor_Alias_Map.csv")
            print(f"   Loaded {len(self.alias_map)} alias mappings")
            print(f"   Alias map columns: {list(self.alias_map.columns)}")

            # Standardize column names
            self.alias_map.columns = [col.lower().strip() for col in self.alias_map.columns]

            # Map to expected column names
            if 'vendor_name_raw' in self.alias_map.columns:
                self.alias_map.rename(columns={'vendor_name_raw': 'alias'}, inplace=True)

            if 'vendor_name_canonical' in self.alias_map.columns:
                self.alias_map.rename(columns={'vendor_name_canonical': 'canonical_vendor'}, inplace=True)

            print(f"   Alias map now has columns: {list(self.alias_map.columns)}")

        except Exception as e:
            print(f"   ‚ö†Ô∏è Alias map not found or error: {e}")
            self.alias_map = pd.DataFrame({'alias': [], 'canonical_vendor': []})

        return self

    def build_alias_dict(self):
        """Build the lowercase-name -> canonical-vendor lookup.

        Keys come from four sources. When the same key is produced by more
        than one source, the later source in this list wins:

        1. keys derived from aliases (suffix removed, or first word when it
           is longer than three characters),
        2. keys derived from canonical names (suffix removed),
        3. aliases exactly as listed in the alias map,
        4. canonical names themselves.

        Derived keys are guesses, so they never override an explicit alias
        or a canonical name. Alias rows with a missing alias or canonical
        value are skipped.

        Returns:
            dict mapping lowercase lookup key to canonical vendor name.
        """
        derived_from_alias = {}
        derived_from_canonical = {}
        explicit_alias = {}
        canonical_exact = {}

        alias_map = self.alias_map
        if (
            alias_map is not None
            and len(alias_map) > 0
            and 'alias' in alias_map.columns
            and 'canonical_vendor' in alias_map.columns
        ):
            for alias, canonical in zip(alias_map['alias'], alias_map['canonical_vendor']):
                if pd.isna(alias) or pd.isna(canonical):
                    continue
                alias_lower = str(alias).strip().lower()
                if not alias_lower:
                    continue

                explicit_alias[alias_lower] = canonical

                for suffix in self._ALIAS_SUFFIXES:
                    if alias_lower.endswith(suffix):
                        derived_from_alias[alias_lower[:-len(suffix)]] = canonical

                first_word = alias_lower.split()[0]
                if len(first_word) > 3:
                    derived_from_alias[first_word] = canonical

        if self.vendor_master is not None and 'canonical_vendor' in self.vendor_master.columns:
            for vendor in self.vendor_master['canonical_vendor'].dropna():
                vendor_lower = str(vendor).lower()
                canonical_exact[vendor_lower] = vendor

                for suffix in self._CANONICAL_SUFFIXES:
                    if vendor_lower.endswith(suffix):
                        derived_from_canonical[vendor_lower[:-len(suffix)]] = vendor

        # Later dicts override earlier ones: see precedence in the docstring.
        alias_dict = {
            **derived_from_alias,
            **derived_from_canonical,
            **explicit_alias,
            **canonical_exact,
        }

        print(f"   Built alias dictionary with {len(alias_dict)} entries")
        return alias_dict

    def resolve_vendors(self):
        """Resolve every ``vendor_name_raw`` to a canonical vendor.

        Strategies are tried in order: alias lookup, exact canonical name,
        lookup after removing punctuation, substring match against canonical
        names, then word-overlap (Jaccard score above 0.5). Null, empty and
        whitespace-only names get status ``MISSING``; names no strategy can
        match get ``UNMAPPED``. Both are logged as anomalies.

        Returns:
            ``self``, to allow chaining.
        """
        # Local import: the visible top-of-file imports do not include ``re``.
        import re

        alias_dict = self.build_alias_dict()

        if 'canonical_vendor' in self.vendor_master.columns:
            canonical_list = self.vendor_master['canonical_vendor'].dropna().unique().tolist()
        else:
            canonical_list = []

        print(f"   Canonical vendor list has {len(canonical_list)} entries")

        # Hoisted out of the per-name resolver: these never change per row.
        punctuation = re.compile(r'[^\w\s]')
        canonical_set = set(canonical_list)
        canonical_index = [
            (canonical, str(canonical).lower(), set(str(canonical).lower().split()))
            for canonical in canonical_list
        ]

        def resolve(vendor_raw):
            if pd.isna(vendor_raw):
                return None, 'MISSING'

            vendor_original = str(vendor_raw).strip()
            if not vendor_original:
                # Empty or whitespace-only names carry no vendor information.
                return None, 'MISSING'
            vendor_lower = vendor_original.lower()

            # STRATEGY 1: Direct alias match
            if vendor_lower in alias_dict:
                return alias_dict[vendor_lower], 'MAPPED'

            # STRATEGY 2: Exact canonical name. Lowercased canonical names
            # are already alias keys, so strategy 1 normally matches first.
            if vendor_original in canonical_set:
                return vendor_original, 'CANONICAL'

            # STRATEGY 3: Retry the alias lookup without punctuation
            vendor_clean = punctuation.sub('', vendor_lower)
            if vendor_clean in alias_dict:
                return alias_dict[vendor_clean], 'CLEANED_MATCH'

            # STRATEGY 4: Substring match in either direction
            for canonical, canonical_lower, _ in canonical_index:
                if not canonical_lower:
                    # An empty name is a substring of everything; skip it.
                    continue
                if canonical_lower in vendor_lower:
                    return canonical, 'PARTIAL_MATCH'
                if len(vendor_lower) > 5 and vendor_lower in canonical_lower:
                    return canonical, 'PARTIAL_MATCH'

            # STRATEGY 5: Best word-set overlap above 50%
            vendor_words = set(vendor_lower.split())
            best_match = None
            best_match_score = 0

            for canonical, _, canonical_words in canonical_index:
                union = len(vendor_words | canonical_words)
                if union > 0:
                    score = len(vendor_words & canonical_words) / union
                    if score > 0.5 and score > best_match_score:
                        best_match = canonical
                        best_match_score = score

            if best_match:
                return best_match, f'WORD_MATCH_{best_match_score:.0%}'

            return None, 'UNMAPPED'

        print("   Resolving vendors (this may take a moment)...")

        # Resolve each distinct name once; rows usually repeat vendors.
        resolved_cache = {}
        canonical_values = []
        status_values = []
        for name in self.df['vendor_name_raw']:
            cache_key = None if pd.isna(name) else name
            if cache_key not in resolved_cache:
                resolved_cache[cache_key] = resolve(name)
            canonical, status = resolved_cache[cache_key]
            canonical_values.append(canonical)
            status_values.append(status)

        self.df['vendor_canonical'] = canonical_values
        self.df['vendor_resolution_status'] = status_values

        # Log anomalies in row order for missing and unmapped vendors.
        flagged = self.df[self.df['vendor_resolution_status'].isin(['MISSING', 'UNMAPPED'])]
        for _, row in flagged.iterrows():
            if row['vendor_resolution_status'] == 'MISSING':
                self.vendor_anomalies.append({
                    'transaction_id': row['transaction_id'],
                    'anomaly_type': 'MISSING_VENDOR',
                    'severity': 'HIGH',
                    'description': 'Vendor name is missing',
                    'amount': row['amount']
                })
            else:
                self.vendor_anomalies.append({
                    'transaction_id': row['transaction_id'],
                    'anomaly_type': 'UNMAPPED_VENDOR',
                    'severity': 'HIGH',
                    'description': f"Vendor '{row['vendor_name_raw']}' not found in alias map",
                    'original_value': row['vendor_name_raw'],
                    'amount': row['amount']
                })

        # Calculate statistics
        status = self.df['vendor_resolution_status']
        mapped_count = status.isin(['MAPPED', 'CANONICAL', 'CLEANED_MATCH', 'PARTIAL_MATCH']).sum()
        word_match_count = sum(1 for value in status_values if 'WORD_MATCH' in value)
        unmapped_count = (status == 'UNMAPPED').sum()
        missing_count = (status == 'MISSING').sum()

        print(f"\n   üìä Vendor Resolution Results:")
        print(f"   ‚Ä¢ Direct matches: {mapped_count}")
        print(f"   ‚Ä¢ Word matches: {word_match_count}")
        print(f"   ‚Ä¢ Unmapped: {unmapped_count}")
        print(f"   ‚Ä¢ Missing: {missing_count}")

        # Show sample of unmapped vendors for debugging
        if unmapped_count > 0:
            unmapped_samples = self.df[status == 'UNMAPPED']['vendor_name_raw'].dropna().unique()[:10]
            print(f"\n   Sample unmapped vendors: {list(unmapped_samples)}")

        return self

    def save_output(self):
        """Append vendor anomalies to the exceptions log and save the data.

        An existing log that is empty or has no parseable content is treated
        as containing no rows. When there is nothing to report at all, a
        header-only log is written so it remains valid CSV.

        Returns:
            The vendor-resolved DataFrame (``self.df``).
        """
        exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        frames = []
        if os.path.exists(exceptions_path):
            try:
                frames.append(pd.read_csv(exceptions_path))
            except pd.errors.EmptyDataError:
                # Log exists but has no parseable content: nothing to keep.
                pass
        frames.append(pd.DataFrame(self.vendor_anomalies))

        frames = [frame for frame in frames if not frame.empty]
        if frames:
            all_exceptions = pd.concat(frames, ignore_index=True)
        else:
            all_exceptions = pd.DataFrame(columns=self._EMPTY_LOG_COLUMNS)

        all_exceptions.to_csv(exceptions_path, index=False)

        # Save data
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_VendorsResolved.csv", index=False)

        print(f"   üíæ Saved to {Config.OUTPUT_PATH}GL_VendorsResolved.csv")

        return self.df

    def run(self):
        """Execute all T003 steps and return the vendor-resolved DataFrame."""
        print("\n" + "="*60)
        print("üöÄ T003: Resolving Vendor Names")
        print("="*60)

        self.load_vendor_data()
        self.resolve_vendors()
        df = self.save_output()

        print(f"\n‚úÖ T003 Complete. Processed {len(df)} transactions.")
        return df


# ============================================================================
# T004: APPLY FX CONVERSION (FIXED FOR YOUR FX_RATES.CSV)
# ============================================================================

class T004_FXConverter:
    """Task 4: Convert all transactions to AUD.

    Works on a copy of the DataFrame it is given. ``convert_to_aud`` adds
    ``fx_key``, ``fx_rate``, ``amount_aud`` and ``conversion_status`` to
    ``self.df`` and records rows without a usable rate in ``fx_anomalies``.
    """

    # Rates (1 unit of currency = X AUD) used only when the FX file cannot
    # be loaded. Currencies not listed here get no default rate, so they are
    # reported as MISSING_FX_RATE instead of being converted at parity.
    _DEFAULT_RATES = {
        'AUD': 1.0,
        'USD': 1.5250,
        'GBP': 1.9550,
        'NZD': 0.9320,
        'EUR': 1.62,  # approximate
    }

    # Header written when there is nothing to report, so the exceptions log
    # always has columns that a later ``pd.read_csv`` can parse.
    _EMPTY_LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description']

    def __init__(self, working_df):
        """Copy ``working_df`` and start with no rates and no anomalies."""
        self.df = working_df.copy()
        self.fx_rates = None
        self.fx_anomalies = []

    def load_fx_rates(self):
        """Load FX rates into ``self.fx_rates`` (columns period/currency/rate).

        The file's ``fiscal_period`` and ``rate_to_aud`` columns are renamed
        to ``period`` and ``rate``; a ``currency`` column is required. If
        loading or mapping fails for any reason, default rates are generated
        for the period/currency combinations present in the data.

        Returns:
            ``self``, to allow chaining.
        """
        print("\nüìÇ T004: Loading FX rates...")

        try:
            self.fx_rates = pd.read_csv(f"{Config.REFERENCE_PATH}FX_Rates.csv")
            print(f"   Loaded {len(self.fx_rates)} FX rates")
            print(f"   Original columns: {list(self.fx_rates.columns)}")

            # Standardize column names to lowercase
            self.fx_rates.columns = [col.lower().strip() for col in self.fx_rates.columns]

            # Period column: 'fiscal_period' in the rates file
            if 'fiscal_period' in self.fx_rates.columns:
                self.fx_rates.rename(columns={'fiscal_period': 'period'}, inplace=True)
                print(f"   Using 'fiscal_period' as period column")
            else:
                print(f"   ‚ö†Ô∏è No period column found")
                self.fx_rates['period'] = 'ALL'

            # Currency column: 'currency' in the rates file
            if 'currency' in self.fx_rates.columns:
                print(f"   Using 'currency' as currency column")
            else:
                print(f"   ‚ùå No currency column found")
                raise ValueError("Cannot find currency column in FX rates")

            # Rate column: 'rate_to_aud' in the rates file
            if 'rate_to_aud' in self.fx_rates.columns:
                self.fx_rates.rename(columns={'rate_to_aud': 'rate'}, inplace=True)
                print(f"   Using 'rate_to_aud' as rate column (1 foreign currency = X AUD)")
            else:
                print(f"   ‚ùå No rate column found")
                raise ValueError("Cannot find rate column in FX rates")

            # Ensure period is string for joining
            self.fx_rates['period'] = self.fx_rates['period'].astype(str)

            print(f"\n   Final columns after mapping: {list(self.fx_rates.columns)}")
            print(f"   Sample rates for {Config.CURRENT_FISCAL_PERIOD}:")

            # Show rates for current period
            current_rates = self.fx_rates[self.fx_rates['period'] == Config.CURRENT_FISCAL_PERIOD]
            if not current_rates.empty:
                for _, row in current_rates.iterrows():
                    print(f"   ‚Ä¢ {row['currency']}: 1 {row['currency']} = {row['rate']:.4f} AUD")
            else:
                print(f"   No rates found for {Config.CURRENT_FISCAL_PERIOD}, will use most recent")

        except Exception as e:
            print(f"   ‚ö†Ô∏è Error loading FX rates: {e}")
            print("   Creating default rates based on common currencies...")

            rates_data = [
                {'period': period, 'currency': currency, 'rate': self._DEFAULT_RATES[currency]}
                for period in self.df['fiscal_period'].unique()
                for currency in self.df['currency_code'].unique()
                if currency in self._DEFAULT_RATES
            ]

            # Explicit columns so the frame is usable even with zero rows.
            self.fx_rates = pd.DataFrame(rates_data, columns=['period', 'currency', 'rate'])
            print(f"   Created default rates for {len(self.fx_rates)} currency-period combinations")

        return self

    def convert_to_aud(self):
        """Convert ``amount`` to AUD using ``self.fx_rates``.

        Rate selection per row: AUD uses 1.0; otherwise the rate for the
        row's exact period and currency; otherwise the rate for that
        currency with the greatest period string. Rows with a missing
        currency code or no usable rate get a NaN ``fx_rate`` and
        ``amount_aud``, status ``FAILED``, and a ``MISSING_FX_RATE`` anomaly.
        ``amount_aud`` is the plain product ``amount * fx_rate``.

        Returns:
            ``self``, to allow chaining.
        """
        # Cast both sides to str so keys concatenate and compare reliably.
        self.df['fx_key'] = self.df['fiscal_period'].astype(str) + '_' + self.df['currency_code'].astype(str)
        self.fx_rates['fx_key'] = self.fx_rates['period'].astype(str) + '_' + self.fx_rates['currency'].astype(str)

        # Build both lookups once: exact (period, currency) rates and, per
        # currency, the entry with the greatest key (i.e. latest period).
        rate_dict = {}
        latest_by_currency = {}
        for key, currency, rate in zip(self.fx_rates['fx_key'], self.fx_rates['currency'], self.fx_rates['rate']):
            if pd.isna(key) or pd.isna(currency) or pd.isna(rate):
                # An entry without a rate cannot convert anything.
                continue
            rate_dict[key] = rate
            current = latest_by_currency.get(currency)
            if current is None or key >= current[0]:
                latest_by_currency[currency] = (key, rate)

        announced = set()  # currencies whose fallback was already reported

        def get_rate(row):
            currency = row['currency_code']

            if not pd.isna(currency):
                if currency == 'AUD':
                    return 1.0

                # Try exact period match first
                key = row['fx_key']
                if key in rate_dict:
                    return rate_dict[key]

                # Otherwise use the most recent rate for this currency
                if currency in latest_by_currency:
                    fallback_key, rate = latest_by_currency[currency]
                    if currency not in announced:
                        announced.add(currency)
                        print(f"   ‚ÑπÔ∏è Using {fallback_key} rate for {currency} (most recent available)")
                    return rate

            # No rate found (or no currency code to look one up with)
            self.fx_anomalies.append({
                'transaction_id': row['transaction_id'],
                'anomaly_type': 'MISSING_FX_RATE',
                'severity': 'CRITICAL',
                'description': f"No FX rate found for {row['currency_code']} in period {row['fiscal_period']}",
                'currency': row['currency_code'],
                'period': row['fiscal_period'],
                'amount': row['amount']
            })
            return None

        # Apply conversion
        print("\n   Applying FX conversion...")
        rates_per_row = [get_rate(row) for _, row in self.df.iterrows()]
        self.df['fx_rate'] = pd.Series(rates_per_row, index=self.df.index, dtype='float64')

        # NaN rate -> NaN amount, keeping the column numeric.
        self.df['amount_aud'] = self.df['amount'] * self.df['fx_rate']

        # Flag conversion issues
        self.df['conversion_status'] = np.where(
            self.df['currency_code'] == 'AUD', 'DOMESTIC',
            np.where(self.df['fx_rate'].notna(), 'CONVERTED', 'FAILED')
        )

        converted = (self.df['conversion_status'] == 'CONVERTED').sum()
        failed = (self.df['conversion_status'] == 'FAILED').sum()
        domestic = (self.df['conversion_status'] == 'DOMESTIC').sum()

        print(f"\n   ‚úì FX conversion complete. Domestic: {domestic}, Converted: {converted}, Failed: {failed}")

        # Show sample of conversions
        if converted > 0:
            print("\n   Sample conversions (using your actual FX rates):")
            sample_conversions = self.df[
                (self.df['conversion_status'] == 'CONVERTED') &
                (self.df['currency_code'] != 'AUD')
            ].head(5)

            for _, row in sample_conversions.iterrows():
                print(f"   ‚Ä¢ {row['currency_code']} {row['amount']:,.2f} ‚Üí ${row['amount_aud']:,.2f} AUD (rate: {row['fx_rate']:.4f})")

        return self

    def save_output(self):
        """Append FX anomalies to the exceptions log and save the data.

        An existing log that is empty or has no parseable content is treated
        as containing no rows. When there is nothing to report at all, a
        header-only log is written so it remains valid CSV.

        Returns:
            The converted DataFrame (``self.df``).
        """
        exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        frames = []
        if os.path.exists(exceptions_path):
            try:
                frames.append(pd.read_csv(exceptions_path))
            except pd.errors.EmptyDataError:
                # Log exists but has no parseable content: nothing to keep.
                pass
        frames.append(pd.DataFrame(self.fx_anomalies))

        frames = [frame for frame in frames if not frame.empty]
        if frames:
            all_exceptions = pd.concat(frames, ignore_index=True)
        else:
            all_exceptions = pd.DataFrame(columns=self._EMPTY_LOG_COLUMNS)

        all_exceptions.to_csv(exceptions_path, index=False)

        # Save data
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_Converted.csv", index=False)

        print(f"\n   üíæ Saved to {Config.OUTPUT_PATH}GL_Converted.csv")

        return self.df

    def run(self):
        """Execute all T004 steps and return the converted DataFrame."""
        print("\n" + "="*60)
        print("üöÄ T004: Applying FX Conversion")
        print("="*60)

        self.load_fx_rates()
        self.convert_to_aud()
        df = self.save_output()

        print(f"\n‚úÖ T004 Complete. Processed {len(df)} transactions.")
        return df


# ============================================================================
# T005: DETECT EXCEPTIONS
# ============================================================================

class T005_ExceptionDetector:
    """Task 5: Run exception rules and flag violations.

    ``run`` loads the rulebook, derives outlier and timing flags, evaluates
    every implemented rule against every transaction, and writes the flagged
    data plus the exception logs.
    """

    # Column layout of the rows this task appends to Exceptions_Log.csv.
    _LOG_COLUMNS = ['transaction_id', 'anomaly_type', 'severity', 'description', 'amount']

    # Helper columns that detect_outliers merges onto the working frame.
    _STAT_COLUMNS = ['mean_amount', 'std_amount', 'txn_count']

    def __init__(self, working_df):
        """Keep a private copy of ``working_df`` so the caller's frame is not mutated."""
        self.df = working_df.copy()
        self.rulebook = None
        self.exception_results = []

    @staticmethod
    def _default_rulebook():
        """Build the built-in rule set used when no rulebook file can be loaded."""
        return pd.DataFrame([
            {'rule_id': 'EX001', 'rule_name': 'Missing PO Number',
             'severity': 'HIGH', 'logic': 'po_number is None or po_number == ""',
             'description': 'Transaction has no purchase order number'},
            {'rule_id': 'EX002', 'rule_name': 'Missing Cost Center',
             'severity': 'MEDIUM', 'logic': 'cost_center_mapped is None',
             'description': 'Transaction has no cost center allocation'},
            {'rule_id': 'EX003', 'rule_name': 'Invalid Account',
             'severity': 'CRITICAL', 'logic': 'account_code_mapped is None',
             'description': 'Account code not in Chart of Accounts'},
            {'rule_id': 'EX004', 'rule_name': 'High Value Transaction',
             'severity': 'MEDIUM', 'logic': f'amount_aud > {Config.HIGH_VALUE_THRESHOLD}',
             'description': f'Transaction exceeds ${Config.HIGH_VALUE_THRESHOLD:,}'},
            {'rule_id': 'EX005', 'rule_name': 'Negative Amount',
             'severity': 'MEDIUM', 'logic': 'amount_is_negative == True',
             'description': 'Transaction has negative amount'},
            {'rule_id': 'EX006', 'rule_name': 'Unmapped Vendor',
             'severity': 'HIGH', 'logic': 'vendor_resolution_status == "UNMAPPED"',
             'description': 'Vendor not found in master data'},
            {'rule_id': 'EX007', 'rule_name': 'Future Dated Transaction',
             'severity': 'HIGH', 'logic': 'posting_date > current_date and fiscal_period == current_period',
             'description': 'Transaction date is in future but in current period'},
            {'rule_id': 'EX008', 'rule_name': 'Invalid Date',
             'severity': 'CRITICAL', 'logic': 'posting_date is None',
             'description': 'Posting date is invalid or missing'},
            {'rule_id': 'EX009', 'rule_name': 'Missing Tax Code',
             'severity': 'MEDIUM', 'logic': 'tax_code is None or tax_code == ""',
             'description': 'Tax code is missing'},
            {'rule_id': 'EX010', 'rule_name': 'Extreme Outlier',
             'severity': 'MEDIUM', 'logic': 'is_outlier == True',
             'description': 'Amount is significantly outside normal range'},
        ])

    def load_rulebook(self):
        """Load the exception rulebook, falling back to the built-in rules.

        Reads ``Exception_Rulebook.csv`` from ``Config.REFERENCE_PATH``. Any
        load error is reported and the default rule set is used instead.
        Missing ``rule_id`` / ``rule_name`` / ``severity`` / ``description``
        columns are filled with defaults.

        Returns:
            self, for chaining.
        """
        print("\nüìÇ T005: Loading exception rulebook...")

        try:
            self.rulebook = pd.read_csv(f"{Config.REFERENCE_PATH}Exception_Rulebook.csv")
            print(f"   Loaded {len(self.rulebook)} exception rules")
        except Exception as e:
            # Deliberate best-effort: the pipeline must keep running without the file.
            print(f"   ‚ö†Ô∏è Rulebook not found or error loading: {e}")
            self.rulebook = self._default_rulebook()
            print(f"   Created {len(self.rulebook)} default exception rules")

        # Ensure all required columns exist (rule_name is filled before
        # description because description defaults to it).
        rule_count = len(self.rulebook)
        if 'rule_id' not in self.rulebook.columns:
            self.rulebook['rule_id'] = [f'EX{i+1:03d}' for i in range(rule_count)]
            print(f"   Added default rule_id column")
        if 'rule_name' not in self.rulebook.columns:
            self.rulebook['rule_name'] = [f'Rule {i+1}' for i in range(rule_count)]
        if 'severity' not in self.rulebook.columns:
            self.rulebook['severity'] = 'MEDIUM'
        if 'description' not in self.rulebook.columns:
            self.rulebook['description'] = self.rulebook['rule_name']

        print(f"   Ready with {len(self.rulebook)} rules")
        return self

    def detect_outliers(self):
        """Flag amounts far from their account's mean.

        A row is an outlier when its ``amount_aud`` deviates from the mean of
        its ``account_code_mapped`` group by more than
        ``Config.EXTREME_OUTLIER_MULTIPLIER`` standard deviations. Adds the
        ``mean_amount``, ``std_amount``, ``txn_count`` and ``is_outlier``
        columns.

        Returns:
            self, for chaining.
        """
        # Group by account to find normal ranges
        account_stats = (
            self.df.groupby('account_code_mapped')['amount_aud']
            .agg(['mean', 'std', 'count'])
            .reset_index()
        )
        account_stats.columns = ['account_code_mapped'] + self._STAT_COLUMNS

        # Drop stats left over from an earlier call so the merge cannot
        # produce suffixed duplicate columns.
        self.df = self.df.drop(columns=self._STAT_COLUMNS, errors='ignore')
        self.df = self.df.merge(account_stats, on='account_code_mapped', how='left')

        deviation = (self.df['amount_aud'] - self.df['mean_amount']).abs()
        self.df['is_outlier'] = (
            (self.df['std_amount'] > 0)
            & self.df['amount_aud'].notna()
            & (deviation > Config.EXTREME_OUTLIER_MULTIPLIER * self.df['std_amount'])
        )

        print(f"   ‚úì Outlier detection complete. Found {self.df['is_outlier'].sum()} outliers")
        return self

    def detect_temporal_anomalies(self):
        """Derive posting hour / weekday columns and flag suspicious hours.

        Adds ``posting_hour``, ``posting_day``, ``posting_weekend`` and
        ``suspicious_hour``. An hour is suspicious when it is at or after
        ``Config.SUSPICIOUS_HOUR_START`` or at or before
        ``Config.SUSPICIOUS_HOUR_END``.

        Returns:
            self, for chaining.
        """
        # Coerce so a non-datetime column yields NaT instead of raising on `.dt`.
        posting = pd.to_datetime(self.df['posting_date'], errors='coerce')

        # If every timestamp sits exactly on midnight the data carries no
        # time of day; treating hour 0 as real would flag every row.
        has_time_of_day = bool(
            (posting.notna() & (posting != posting.dt.normalize())).any()
        )
        if has_time_of_day:
            posting_hour = posting.dt.hour
        else:
            posting_hour = pd.Series(np.nan, index=self.df.index)

        self.df['posting_hour'] = posting_hour
        self.df['posting_day'] = posting.dt.day_name()
        self.df['posting_weekend'] = posting.dt.dayofweek.isin([5, 6])

        # Flag suspicious hours (late night/early morning)
        self.df['suspicious_hour'] = (
            posting_hour.notna()
            & ((posting_hour >= Config.SUSPICIOUS_HOUR_START)
               | (posting_hour <= Config.SUSPICIOUS_HOUR_END))
        )

        return self

    @staticmethod
    def _is_flag_set(value):
        """Return True only for a real truthy flag; NaN and None count as not set."""
        return bool(pd.notna(value) and value)

    @staticmethod
    def _build_exception(row, rule_id, rule_name, severity, description):
        """Create one exception record for ``row`` under the given rule."""
        return {
            'transaction_id': row['transaction_id'],
            'rule_id': rule_id,
            'rule_name': rule_name,
            'severity': severity,
            'description': description,
            'amount': row.get('amount_aud', 0),
            'vendor': row.get('vendor_name_raw', ''),
            'account': row.get('account_code_raw', '')
        }

    def apply_rules(self):
        """Evaluate every rulebook rule that has implemented logic.

        Matches are appended to ``self.exception_results`` in rulebook order.
        A rule that raises for a row is reported and skipped for that row.
        Rows flagged ``amount_is_negative`` are always logged under EX005,
        even when the rulebook has no EX005 entry.

        Returns:
            self, for chaining.
        """
        # First instant after the current fiscal month. Comparing with >=
        # keeps postings timestamped during the last day of the month out of
        # the "future" bucket and works for months of any length.
        next_period_start = datetime(
            Config.CURRENT_YEAR + Config.CURRENT_MONTH // 12,
            Config.CURRENT_MONTH % 12 + 1,
            1,
        )
        is_set = self._is_flag_set

        # Rule id -> predicate over a single row
        rule_functions = {
            'EX001': lambda row: pd.isna(row['po_number']) or row['po_number'] == '',
            'EX002': lambda row: pd.isna(row['cost_center_mapped']),
            'EX003': lambda row: pd.isna(row['account_code_mapped']),
            'EX004': lambda row: row['amount_aud'] > Config.HIGH_VALUE_THRESHOLD if pd.notna(row['amount_aud']) else False,
            # NaN is truthy in Python, so flags go through is_set.
            'EX005': lambda row: is_set(row.get('amount_is_negative', False)),
            'EX006': lambda row: row.get('vendor_resolution_status') == 'UNMAPPED',
            'EX007': lambda row: (pd.notna(row['posting_date']) and
                                  row['posting_date'] >= next_period_start and
                                  row['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD),
            'EX008': lambda row: pd.isna(row['posting_date']),
            'EX009': lambda row: pd.isna(row['tax_code']) or row['tax_code'] == '',
            'EX010': lambda row: is_set(row.get('is_outlier', False)),
        }

        # Materialise the rows once instead of re-running iterrows per rule.
        rows = [row for _, row in self.df.iterrows()]
        skipped_rules = []

        for _, rule in self.rulebook.iterrows():
            rule_id = rule['rule_id']
            rule_name = rule.get('rule_name', f'Rule {rule_id}')
            severity = rule.get('severity', 'MEDIUM')
            description = rule.get('description', rule_name)

            rule_func = rule_functions.get(rule_id)
            if rule_func is None:
                # No logic implemented for this id; reported after the loop.
                skipped_rules.append(str(rule_id))
                continue

            for row in rows:
                try:
                    if rule_func(row):
                        self.exception_results.append(
                            self._build_exception(row, rule_id, rule_name, severity, description)
                        )
                except Exception as e:
                    # Log rule application error but continue
                    print(f"   ‚ö†Ô∏è Error applying rule {rule_id} to transaction {row['transaction_id']}: {e}")
                    continue

        if skipped_rules:
            print(f"   Skipped {len(skipped_rules)} rule(s) with no implemented logic: {', '.join(skipped_rules)}")

        # Also add negative-amount anomalies from previous steps that rule
        # EX005 did not already record. A set gives O(1) membership checks.
        negative_logged = {
            e['transaction_id'] for e in self.exception_results if e['rule_id'] == 'EX005'
        }
        for row in rows:
            if not is_set(row.get('amount_is_negative', False)):
                continue
            if row['transaction_id'] in negative_logged:
                continue
            self.exception_results.append(
                self._build_exception(
                    row, 'EX005', 'Negative Amount', 'MEDIUM',
                    'Transaction has negative amount'
                )
            )
            negative_logged.add(row['transaction_id'])

        print(f"   ‚úì Applied rules, found {len(self.exception_results)} exceptions")
        return self

    def save_output(self):
        """Write the flagged data and exception logs.

        Adds ``has_exception`` and ``exception_rules`` (';'-joined rule ids)
        to ``self.df`` and writes it to ``GL_WithExceptions.csv``. Writes
        ``Exceptions_Detailed.csv`` when there are exceptions, and appends a
        simplified copy of them to ``Exceptions_Log.csv``.

        Returns:
            Tuple of (flagged DataFrame, list of exception dicts).
        """
        # Group rule ids by transaction
        rules_by_txn = {}
        for exc in self.exception_results:
            rules_by_txn.setdefault(exc['transaction_id'], []).append(str(exc['rule_id']))

        self.df['has_exception'] = self.df['transaction_id'].isin(list(rules_by_txn))
        self.df['exception_rules'] = self.df['transaction_id'].map(
            lambda txn: ';'.join(rules_by_txn.get(txn, []))
        )

        # Save data with flags
        self.df.to_csv(f"{Config.OUTPUT_PATH}GL_WithExceptions.csv", index=False)

        # Save exception log
        if self.exception_results:
            exceptions_df = pd.DataFrame(self.exception_results)
            exceptions_df.to_csv(f"{Config.REPORTS_PATH}Exceptions_Detailed.csv", index=False)

        # Update master exceptions log
        master_exceptions_path = f"{Config.REPORTS_PATH}Exceptions_Log.csv"

        # Explicit columns keep the header present even with zero exceptions,
        # so the file stays readable by pd.read_csv.
        new_log = pd.DataFrame(
            [
                {
                    'transaction_id': e['transaction_id'],
                    'anomaly_type': e['rule_id'],
                    'severity': e['severity'],
                    'description': e['description'],
                    'amount': e.get('amount', 0)
                }
                for e in self.exception_results
            ],
            columns=self._LOG_COLUMNS,
        )

        existing = None
        if os.path.exists(master_exceptions_path):
            try:
                existing = pd.read_csv(master_exceptions_path)
            except pd.errors.EmptyDataError:
                # A blank log left by an earlier step has nothing to keep.
                existing = None

        frames = [f for f in (existing, new_log) if f is not None and not f.empty]
        all_exceptions = pd.concat(frames, ignore_index=True) if frames else new_log

        all_exceptions.to_csv(master_exceptions_path, index=False)

        print(f"   üíæ Saved exception data")

        return self.df, self.exception_results

    def run(self):
        """Execute all T005 steps and print exception counts by severity.

        Returns:
            Tuple of (flagged DataFrame, list of exception dicts).
        """
        print("\n" + "="*60)
        print("üöÄ T005: Detecting Exceptions")
        print("="*60)

        self.load_rulebook()
        self.detect_outliers()
        self.detect_temporal_anomalies()
        self.apply_rules()
        df, exceptions = self.save_output()

        if exceptions:
            # Plain dict keeps first-seen order for the printout.
            severity_counts = {}
            for e in exceptions:
                sev = e.get('severity', 'UNKNOWN')
                severity_counts[sev] = severity_counts.get(sev, 0) + 1

            print(f"\n‚úÖ T005 Complete. Exceptions by severity:")
            for severity, count in severity_counts.items():
                print(f"   {severity}: {count}")
        else:
            print(f"\n‚úÖ T005 Complete. No exceptions found.")

        return df, exceptions

# ============================================================================
# T006: REVIEW HIGH SEVERITY EXCEPTIONS (Automated version - no human review)
# ============================================================================

class T006_ExceptionReviewer:
    """Task 6: Review and categorize exceptions (automated).

    Splits the exceptions from the detection step into CRITICAL and HIGH
    buckets and writes a JSON and a text summary. Nothing is paused for
    human review.
    """

    def __init__(self, df, exceptions):
        """Store a copy of ``df`` and the list of exception dicts to review."""
        self.df = df.copy()
        self.exceptions = exceptions
        self.critical_exceptions = []
        self.high_exceptions = []

    @staticmethod
    def _severity_of(exception):
        """Return the exception's severity, upper-cased and stripped."""
        return str(exception.get('severity', '')).strip().upper()

    def categorize_exceptions(self):
        """Split exceptions by severity and print the counts.

        Severity is matched ignoring case and surrounding whitespace, so a
        rulebook entry such as ``critical`` is not silently dropped into the
        Medium/Low bucket.

        Returns:
            self, for chaining.
        """
        for e in self.exceptions:
            severity = self._severity_of(e)
            if severity == 'CRITICAL':
                self.critical_exceptions.append(e)
            elif severity == 'HIGH':
                self.high_exceptions.append(e)

        other_count = (
            len(self.exceptions) - len(self.critical_exceptions) - len(self.high_exceptions)
        )
        print(f"\nüìä T006: Exception Summary")
        print(f"   Critical: {len(self.critical_exceptions)}")
        print(f"   High: {len(self.high_exceptions)}")
        print(f"   Medium/Low: {other_count}")

        return self

    def create_review_package(self):
        """Create the automated review summary (no human pause).

        Groups critical exceptions by rule, totals their amounts, keeps up to
        three examples per rule, and writes ``Exception_Review_Summary.json``
        and ``Exception_Review_Summary.txt`` under ``Config.REPORTS_PATH``.

        Returns:
            The review summary dict that was written to the JSON file.
        """
        import json

        # Group critical exceptions by type
        critical_summary = {}
        for e in self.critical_exceptions:
            e_type = e.get('anomaly_type', e.get('rule_id', 'UNKNOWN'))
            bucket = critical_summary.setdefault(
                e_type, {'count': 0, 'total_amount': 0, 'examples': []}
            )

            raw_amount = e.get('amount', 0)
            amount_missing = pd.isna(raw_amount)

            bucket['count'] += 1
            # A missing amount must not turn the whole total into NaN.
            bucket['total_amount'] += 0 if amount_missing else raw_amount

            if len(bucket['examples']) < 3:
                bucket['examples'].append({
                    'transaction_id': e['transaction_id'],
                    # None serialises as JSON null; NaN is not valid JSON.
                    'amount': None if amount_missing else raw_amount,
                    'description': e.get('description', '')
                })

        # One timestamp so the JSON and text reports agree.
        reviewed_at = datetime.now()

        review_data = {
            'timestamp': reviewed_at,
            'total_critical': len(self.critical_exceptions),
            'total_high': len(self.high_exceptions),
            'critical_summary': critical_summary,
            'auto_approved': True,
            'note': 'Automated processing - no human review required'
        }

        # default=str covers the datetime and numpy scalars json cannot encode.
        with open(f"{Config.REPORTS_PATH}Exception_Review_Summary.json", 'w', encoding='utf-8') as f:
            json.dump(review_data, f, indent=2, default=str)

        # Create a simple text summary. Explicit UTF-8 because the bullet
        # character is not encodable in every platform default encoding.
        with open(f"{Config.REPORTS_PATH}Exception_Review_Summary.txt", 'w', encoding='utf-8') as f:
            f.write("EXCEPTION REVIEW SUMMARY (Automated)\n")
            f.write("="*50 + "\n\n")
            f.write(f"Review Date: {reviewed_at}\n")
            f.write(f"Status: AUTO-APPROVED\n\n")

            f.write(f"CRITICAL EXCEPTIONS: {len(self.critical_exceptions)}\n")
            for e_type, data in critical_summary.items():
                f.write(f"  ‚Ä¢ {e_type}: {data['count']} occurrences, ${data['total_amount']:,.2f}\n")

            f.write(f"\nHIGH EXCEPTIONS: {len(self.high_exceptions)}\n")

        print(f"   üíæ Saved review summary to {Config.REPORTS_PATH}Exception_Review_Summary.txt")

        return review_data

    def run(self):
        """Execute the T006 steps.

        Returns:
            Tuple of (DataFrame copy held by this task, review summary dict).
        """
        print("\n" + "="*60)
        print("üöÄ T006: Reviewing High Severity Exceptions")
        print("="*60)
        print("   ‚ö° Automated mode - no human review required")

        self.categorize_exceptions()
        review_data = self.create_review_package()

        print(f"\n‚úÖ T006 Complete. Proceeding with pipeline.")

        return self.df, review_data


# ============================================================================
# T007: COMPUTE BUDGET VARIANCE (FIXED DIVISION BY ZERO)
# ============================================================================

class T007_BudgetVariance:
    """Task 7: Calculate actual vs budget variance"""
    
    def __init__(self, df):
        self.df = df.copy()
        self.budget_data = None
        self.variance_results = {}
        
    def load_budget(self):
        """Load budget data with proper column mapping"""
        print("\nüìÇ T007: Loading budget data...")
        
        try:
            self.budget_data = pd.read_csv(f"{Config.BUDGET_PATH}Budget_2026.csv")
            print(f"   Loaded budget data with {len(self.budget_data)} rows")
            
            # Standardize column names
            self.budget_data.columns = [col.lower().strip() for col in self.budget_data.columns]
            print(f"   Budget columns: {list(self.budget_data.columns)}")
            
            # Map period column
            period_col = None
            for col in ['fiscal_period', 'period', 'month', 'reporting_period']:
                if col in self.budget_data.columns:
                    period_col = col
                    break
            
            if period_col:
                self.budget_data.rename(columns={period_col: 'period'}, inplace=True)
                print(f"   Using '{period_col}' as period column")
            else:
                print(f"   ‚ö†Ô∏è No period column found, assuming all rows are for {Config.CURRENT_FISCAL_PERIOD}")
                self.budget_data['period'] = Config.CURRENT_FISCAL_PERIOD
            
            # Map account column
            account_col = None
            for col in ['account_code', 'account', 'gl_account', 'coa']:
                if col in self.budget_data.columns:
                    account_col = col
                    break
            
            if account_col:
                self.budget_data.rename(columns={account_col: 'account_code'}, inplace=True)
                print(f"   Using '{account_col}' as account column")
            
            # Map budget amount column
            budget_col = None
            for col in ['budget_amount_aud', 'budget_amount', 'budget', 'amount', 'planned_amount']:
                if col in self.budget_data.columns:
                    budget_col = col
                    break
            
            if budget_col:
                self.budget_data.rename(columns={budget_col: 'budget_amount'}, inplace=True)
                print(f"   Using '{budget_col}' as budget amount column")
                
                # Clean budget amounts (remove $, commas, etc.)
                self.budget_data['budget_amount'] = pd.to_numeric(
                    self.budget_data['budget_amount'].astype(str).str.replace('$', '').str.replace(',', ''),
                    errors='coerce'
                )
            else:
                print(f"   ‚ö†Ô∏è No budget amount column found, using synthetic data")
                self.budget_data['budget_amount'] = np.random.randint(50000, 200000, size=len(self.budget_data))
            
            # Ensure all key columns are string type for merging
            self.budget_data['period'] = self.budget_data['period'].astype(str)
            self.budget_data['account_code'] = self.budget_data['account_code'].astype(str)
            
            # Replace any zero or negative budget amounts with a small positive number to avoid division issues
            self.budget_data['budget_amount'] = self.budget_data['budget_amount'].replace(0, 0.01)
            self.budget_data['budget_amount'] = self.budget_data['budget_amount'].clip(lower=0.01)
            
        except Exception as e:
            print(f"   ‚ö†Ô∏è Budget data not found or error loading: {e}")
            # Create sample budget
            accounts = self.df['account_code_mapped'].dropna().unique() if 'account_code_mapped' in self.df.columns else ['5000']
            
            budget_rows = []
            for account in accounts[:30]:
                budget_rows.append({
                    'account_code': str(account),
                    'period': Config.CURRENT_FISCAL_PERIOD,
                    'budget_amount': np.random.randint(50000, 200000)
                })
            
            self.budget_data = pd.DataFrame(budget_rows)
            print(f"   Created sample budget for {len(self.budget_data)} accounts")
        
        return self
    
    def calculate_variance(self):
        """Calculate variance by account, cost center, and overall"""
        
        # Filter to current period only
        current_period_df = self.df[
            (self.df['fiscal_period'] == Config.CURRENT_FISCAL_PERIOD) &
            (self.df['amount_aud'].notna())
        ].copy()
        
        print(f"   Processing {len(current_period_df)} transactions for {Config.CURRENT_FISCAL_PERIOD}")
        
        # 1. Variance by Account
        account_actuals = current_period_df.groupby('account_code_mapped').agg({
            'amount_aud': 'sum',
            'transaction_id': 'count'
        }).rename(columns={
            'amount_aud': 'actual_amount',
            'transaction_id': 'transaction_count'
        }).reset_index()
        
        # Convert account codes to string for merging
        account_actuals['account_code_mapped'] = account_actuals['account_code_mapped'].astype(str)
        
        # Get budget for current period
        feb_budget = self.budget_data[self.budget_data['period'] == Config.CURRENT_FISCAL_PERIOD].copy()
        
        if feb_budget.empty:
            print(f"   ‚ö†Ô∏è No budget found for period {Config.CURRENT_FISCAL_PERIOD}, using all budget data")
            feb_budget = self.budget_data.copy()
        
        # Ensure budget account codes are strings
        feb_budget['account_code'] = feb_budget['account_code'].astype(str)
        
        # Merge with budget
        if not account_actuals.empty and not feb_budget.empty:
            account_variance = pd.merge(
                account_actuals,
                feb_budget[['account_code', 'budget_amount']],
                left_on='account_code_mapped',
                right_on='account_code',
                how='outer'
            )
            
            account_variance['budget_amount'] = account_variance['budget_amount'].fillna(0.01)
            account_variance['actual_amount'] = account_variance['actual_amount'].fillna(0)
            account_variance['variance'] = account_variance['actual_amount'] - account_variance['budget_amount']
            
            # Safe variance percentage calculation (handle division by zero)
            def safe_variance_pct(row):
                if row['budget_amount'] > 0:
                    return (row['variance'] / row['budget_amount']) * 100
                elif row['actual_amount'] > 0:
                    # If budget is zero but there are actuals, it's infinite variance
                    return 999999  # Large number to indicate infinite
                else:
                    return 0
            
            account_variance['variance_pct'] = account_variance.apply(safe_variance_pct, axis=1)
            
            # Clean up columns
            account_variance = account_variance.drop(columns=['account_code'], errors='ignore')
            account_variance = account_variance.rename(columns={'account_code_mapped': 'account_code'})
        else:
            account_variance = pd.DataFrame()
        
        # 2. Variance by Cost Center
        if 'cost_center_mapped' in current_period_df.columns:
            cc_actuals = current_period_df.groupby('cost_center_mapped').agg({
                'amount_aud': 'sum',
                'transaction_id': 'count'
            }).rename(columns={
                'amount_aud': 'actual_amount',
                'transaction_id': 'transaction_count'
            }).reset_index()
            
            cc_actuals = cc_actuals[cc_actuals['cost_center_mapped'].notna()]
        else:
            cc_actuals = pd.DataFrame()
        
        # 3. Suspense amounts (invalid accounts)
        suspense_amount = current_period_df[
            current_period_df['account_code_mapped'].isna()
        ]['amount_aud'].sum()
        
        # 4. Future dated amounts
        current_date = datetime(Config.CURRENT_YEAR, Config.CURRENT_MONTH, 28)
        future_amount = current_period_df[
            current_period_df['posting_date'] > current_date
        ]['amount_aud'].sum()
        
        # 5. Total actual and budget
        total_actual = current_period_df['amount_aud'].sum()
        total_budget = feb_budget['budget_amount'].sum() if not feb_budget.empty else 0.01
        
        # Safe total variance calculation
        total_variance = total_actual - total_budget
        if total_budget > 0:
            total_variance_pct = (total_variance / total_budget) * 100
        elif total_actual > 0:
            total_variance_pct = 999999  # Infinite variance
        else:
            total_variance_pct = 0
        
        # Store results
        self.variance_results = {
            'by_account': account_variance.to_dict('records') if not account_variance.empty else [],
            'by_cost_center': cc_actuals.to_dict('records') if not cc_actuals.empty else [],
            'suspense_amount': suspense_amount,
            'future_dated_amount': future_amount,
            'total_actual': total_actual,
            'total_budget': total_budget,
            'total_variance': total_variance,
            'total_variance_pct': total_variance_pct,
            'transaction_count': len(current_period_df),
            'exception_count': current_period_df['has_exception'].sum() if 'has_exception' in current_period_df.columns else 0
        }
        
        print(f"\n   Variance Summary:")
        print(f"   Total Actual: ${total_actual:,.2f}")
        print(f"   Total Budget: ${total_budget:,.2f}")
        print(f"   Variance: ${total_variance:,.2f} ({total_variance_pct:.1f}%)")
        print(f"   Suspense (invalid accounts): ${suspense_amount:,.2f}")
        print(f"   Future dated: ${future_amount:,.2f}")
        
        return self
    
    def save_output(self):
        """Save variance results"""
        
        # Save detailed variance by account
        if self.variance_results['by_account']:
            pd.DataFrame(self.variance_results['by_account']).to_csv(
                f"{Config.REPORTS_PATH}Budget_Variance_By_Account.csv", index=False
            )
        
        # Save variance by cost center
        if self.variance_results['by_cost_center']:
            pd.DataFrame(self.variance_results['by_cost_center']).to_csv(
                f"{Config.REPORTS_PATH}Budget_Variance_By_CostCenter.csv", index=False
            )
        
        # Save summary
        summary_df = pd.DataFrame([{
            'metric': 'Total Actual',
            'value': self.variance_results['total_actual']
        }, {
            'metric': 'Total Budget',
            'value': self.variance_results['total_budget']
        }, {
            'metric': 'Variance',
            'value': self.variance_results['total_variance']
        }, {
            'metric': 'Variance %',
            'value': self.variance_results['total_variance_pct']
        }, {
            'metric': 'Suspense Amount',
            'value': self.variance_results['suspense_amount']
        }, {
            'metric': 'Future Dated Amount',
            'value': self.variance_results['future_dated_amount']
        }, {
            'metric': 'Transaction Count',
            'value': self.variance_results['transaction_count']
        }, {
            'metric': 'Exception Count',
            'value': self.variance_results['exception_count']
        }])
        
        summary_df.to_csv(f"{Config.REPORTS_PATH}Budget_Variance_Summary.csv", index=False)
        
        print(f"   üíæ Saved variance reports to {Config.REPORTS_PATH}")
        
        return self.variance_results
    
    def run(self):
        """Execute T007 steps"""
        print("\n" + "="*60)
        print("üöÄ T007: Computing Budget Variance")
        print("="*60)
        
        self.load_budget()
        self.calculate_variance()
        results = self.save_output()
        
        print(f"\n‚úÖ T007 Complete.")
        
        return results


# ============================================================================
# T010: FORECAST NEXT PERIOD (FIXED FOR YOUR KPI FILE)
# ============================================================================

class T010_Forecast:
    """Task 10: Generate forecast for next period based on historical trends.

    Reads ``KPI_Monthly_History.csv`` in long format (columns ``kpi_name``,
    ``kpi_value``, ``fiscal_period`` and ``category`` after lower-casing),
    keeps the total-expense rows and projects the next fiscal period from
    them. When the file cannot be used, a synthetic 12-month history is
    built instead so the pipeline keeps running.

    NOTE(review): a second class named ``T010_Forecast`` is defined later in
    this file. Being bound later, it replaces this one at import time --
    confirm which definition is meant to be live.
    """

    # Monthly expense figures used to shape the synthetic fallback history
    # (per the original author these mirror the KPI file). They are rescaled
    # to the current actual before use.
    _KPI_EXPENSE_PATTERN = {
        '2025-01': 2145000,
        '2025-02': 2198000,
        '2025-03': 2267000,
        '2025-04': 2189000,
        '2025-05': 2234000,
        '2025-06': 2312000,
        '2025-07': 2278000,
        '2025-08': 2345000,
        '2025-09': 2289000,
        '2025-10': 2401000,
        '2025-11': 2367000,
        '2025-12': 2456000,
        '2026-01': 2523000,
    }

    def __init__(self, df, variance_results):
        """Store the pipeline inputs.

        Args:
            df: Transaction DataFrame from the pipeline (stored; not read by
                any method of this class).
            variance_results: Mapping read via ``.get`` for ``total_actual``,
                ``transaction_count`` and ``by_account``.
        """
        self.df = df
        self.variance = variance_results
        self.historical_data = None
        self.forecast = {}

    @staticmethod
    def _growth_rates(spend):
        """Return period-over-period growth with infinite values blanked out.

        ``pct_change`` yields +/-inf when the previous period's spend is zero.
        Those entries become NaN so ``mean()`` skips them instead of turning
        the forecast into inf/NaN.
        """
        return spend.pct_change().replace([np.inf, -np.inf], np.nan)

    def load_historical(self):
        """Load the expense history from the KPI file, or synthesise one.

        Any failure (missing file, missing columns, no usable rows) falls
        back to ``_create_synthetic_data``. On exit ``self.historical_data``
        has at least ``period`` (str) and ``total_spend`` columns.

        Returns:
            self, for chaining.
        """
        print("\nüìÇ T010: Loading historical data...")

        try:
            self.historical_data = pd.read_csv(f"{Config.REFERENCE_PATH}KPI_Monthly_History.csv")
            print(f"   Loaded {len(self.historical_data)} rows of historical data")
            print(f"   Original columns: {list(self.historical_data.columns)}")

            # Standardize column names to lowercase
            self.historical_data.columns = [col.lower().strip() for col in self.historical_data.columns]

            # Filter to get only Total_Expenses records
            expense_data = self.historical_data[
                self.historical_data['kpi_name'].str.contains('total_expenses|total_spend', case=False, na=False)
            ].copy()

            if expense_data.empty:
                print(f"   ‚ö†Ô∏è No Total_Expenses records found, using all KPI data")
                # Try to find any financial KPI
                expense_data = self.historical_data[
                    self.historical_data['category'].str.contains('financial', case=False, na=False)
                ].copy()

            if not expense_data.empty:
                print(f"   Found {len(expense_data)} expense records")

                # Rename columns for our use
                expense_data = expense_data.rename(columns={
                    'fiscal_period': 'period',
                    'kpi_value': 'total_spend'
                })

                # Ensure total_spend is numeric; unparsable values become NaN
                # and are dropped so they cannot distort the averages or be
                # counted as usable months.
                expense_data['total_spend'] = pd.to_numeric(expense_data['total_spend'], errors='coerce')
                expense_data = expense_data.dropna(subset=['total_spend']).sort_values('period')

            if expense_data.empty:
                print(f"   ‚ö†Ô∏è No expense records found, creating synthetic data")
                self._create_synthetic_data()
            else:
                print(f"\n   Historical Monthly Expenses:")
                for _, row in expense_data.iterrows():
                    print(f"   ‚Ä¢ {row['period']}: ${row['total_spend']:,.2f}")

                self.historical_data = expense_data

        except Exception as e:
            # Deliberate best-effort boundary: any load/shape problem switches
            # the stage to synthetic data rather than aborting the pipeline.
            print(f"   ‚ö†Ô∏è Error loading historical data: {e}")
            self._create_synthetic_data()

        # Ensure period is string type for sorting
        self.historical_data['period'] = self.historical_data['period'].astype(str)

        return self

    def _create_synthetic_data(self):
        """Create a 12-month synthetic history ending at the current fiscal month.

        Months present in ``_KPI_EXPENSE_PATTERN`` use that figure rescaled so
        the latest pattern month equals the current actual. Any other month
        uses the current actual times a ramp between 0.9 and 1.1. Transaction
        counts scale with spend, and are 0 when the current actual is 0
        (previously a division by zero).
        """
        print("   Creating synthetic historical data...")

        base_spend = self.variance.get('total_actual', 42000000)  # Your Feb actual ~42M
        base_count = self.variance.get('transaction_count', 1370)

        # NOTE(review): the figures in these messages are hard-coded and are
        # printed regardless of the actual values in play.
        print(f"   ‚ö†Ô∏è Note: Your historical expenses (~$2.1M) are much lower than current actuals (~$42M)")
        print(f"   This suggests either:")
        print(f"   ‚Ä¢ The KPI file is in thousands (add 000)")
        print(f"   ‚Ä¢ Your business has grown significantly")
        print(f"   ‚Ä¢ We're comparing different metrics")

        # Scale factor based on ratio between current actual and latest historical
        latest_historical = self._KPI_EXPENSE_PATTERN['2026-01']  # Jan 2026 value
        scale_factor = base_spend / latest_historical

        print(f"   Applying scale factor of {scale_factor:.1f}x to historical data")

        months = []
        for i in range(1, 13):
            # Walk back 12 - i months from the current month, wrapping the year.
            month_num = Config.CURRENT_MONTH - (12 - i)
            year = Config.CURRENT_YEAR
            if month_num <= 0:
                month_num += 12
                year -= 1

            month = f"{year}-{month_num:02d}"

            pattern_value = self._KPI_EXPENSE_PATTERN.get(month)
            if pattern_value is not None:
                spend = pattern_value * scale_factor
            else:
                spend = base_spend * (0.9 + 0.2 * (i / 12))

            # Guard the ratio: a zero current actual must not crash the fallback.
            spend_share = spend / base_spend if base_spend else 0

            months.append({
                'period': month,
                'total_spend': spend,
                'transaction_count': int(base_count * spend_share)
            })

        self.historical_data = pd.DataFrame(months)
        print(f"   Created synthetic history for {len(self.historical_data)} months")
        print("\n   Scaled Historical Monthly Expenses:")
        for _, row in self.historical_data.iterrows():
            print(f"   ‚Ä¢ {row['period']}: ${row['total_spend']:,.2f}")

    def calculate_trends(self):
        """Derive growth, seasonality and the blended next-period forecast.

        Populates ``self.forecast`` with the point forecast, a +/- margin and
        the inputs used. The point forecast is 70% trend (recent 3-period
        average x recent growth x seasonal factor) plus 30% of the current
        actual grown by 5%.

        Returns:
            self, for chaining.
        """
        # Sort by period
        try:
            self.historical_data = self.historical_data.sort_values('period')
        except (KeyError, TypeError) as e:
            # Best effort: keep the existing row order if sorting is impossible.
            print(f"   ‚ö†Ô∏è Error sorting by period: {e}")

        history = self.historical_data
        spend = history['total_spend']
        month_count = len(history)

        # Calculate moving averages
        if month_count >= 3:
            history['spend_ma_3'] = spend.rolling(3, min_periods=1).mean()
        else:
            history['spend_ma_3'] = spend

        # Calculate growth rate (default 2% when it cannot be measured)
        avg_growth = 0.02
        if month_count >= 2:
            history['growth_rate'] = self._growth_rates(spend)
            measured = history['growth_rate'].mean()
            if not pd.isna(measured):
                avg_growth = measured

        # Recent trend (last 3 months)
        recent_data = history.tail(3)
        recent_avg = recent_data['total_spend'].mean()

        recent_growth = avg_growth
        if len(recent_data) >= 2:
            measured_recent = recent_data['growth_rate'].mean()
            # Same NaN guard as avg_growth; fall back to the long-run rate.
            if not pd.isna(measured_recent):
                recent_growth = measured_recent

        # Seasonal adjustment (if we have same month last year)
        # NOTE(review): this keys on the CURRENT month although the forecast
        # targets the NEXT month -- confirm that is intended.
        current_month_str = f"{Config.CURRENT_MONTH:02d}"
        same_month = history[history['period'].str.endswith(current_month_str)]

        if not same_month.empty and recent_avg > 0:
            seasonal_factor = same_month['total_spend'].iloc[0] / recent_avg
        else:
            seasonal_factor = 1.0

        # Calculate forecast for next period
        if Config.CURRENT_MONTH < 12:
            next_period = f"{Config.CURRENT_YEAR}-{Config.CURRENT_MONTH+1:02d}"
            next_month_num = Config.CURRENT_MONTH + 1
            next_year = Config.CURRENT_YEAR
        else:
            next_period = f"{Config.CURRENT_YEAR+1}-01"
            next_month_num = 1
            next_year = Config.CURRENT_YEAR + 1

        # When history gives no usable level (zero, negative or NaN average),
        # anchor the trend on the current actual BEFORE computing the base
        # forecast; applying the fallback afterwards had no effect on it.
        current_actual = self.variance.get('total_actual')
        if not recent_avg > 0 and current_actual is not None:
            recent_avg = current_actual

        # Base forecast on recent average with growth and seasonal adjustment
        base_forecast = recent_avg * (1 + recent_growth) * seasonal_factor

        if current_actual is None:
            current_actual = base_forecast

        # Blend current and historical (70% recent trend, 30% current month with growth)
        blended_forecast = 0.7 * base_forecast + 0.3 * current_actual * 1.05  # Assume 5% growth

        # Margin: 1.96 x (std / sqrt(n)) of historical spend, or 20% of the
        # forecast when there are not enough months to measure spread.
        if month_count > 1:
            margin = 1.96 * history['total_spend'].std() / np.sqrt(month_count)
        else:
            margin = blended_forecast * 0.2

        lower_bound = max(0, blended_forecast - margin)
        upper_bound = blended_forecast + margin

        self.forecast = {
            'next_period': next_period,
            'next_month': next_month_num,
            'next_year': next_year,
            'forecast_amount': blended_forecast,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'confidence_level': 0.95,
            'method': 'Blended (70% trend, 30% current)',
            'historical_months_used': month_count,
            'avg_growth_rate': avg_growth,
            'seasonal_factor': seasonal_factor,
            'current_actual': current_actual,
            'recent_avg': recent_avg
        }

        print(f"\n   üìà Forecast Analysis:")
        print(f"   Historical months used: {month_count}")
        print(f"   Average growth rate: {avg_growth*100:.1f}%")
        print(f"   Seasonal factor: {seasonal_factor:.2f}")
        print(f"\n   Forecast for {next_period}:")
        print(f"   ‚Ä¢ Point forecast: ${self.forecast['forecast_amount']:,.2f}")
        print(f"   ‚Ä¢ 95% CI: (${self.forecast['lower_bound']:,.2f} - ${self.forecast['upper_bound']:,.2f})")

        return self

    def save_forecast(self):
        """Write the forecast summary CSV and, when possible, an account split.

        The account-level file allocates the point forecast in proportion to
        each account's share of the current actual. It is written only when
        ``by_account`` has rows with a positive ``actual_amount`` and the
        total actual is positive.

        Returns:
            The forecast dict.
        """
        period_tag = self.forecast['next_period'].replace('-', '')
        summary_path = f"{Config.REPORTS_PATH}Forecast_{period_tag}.csv"

        # Save as CSV
        pd.DataFrame([self.forecast]).to_csv(summary_path, index=False)

        # Save detailed forecast with account-level breakdown
        accounts = self.variance.get('by_account')
        if accounts:
            total_actual = self.variance.get('total_actual', 0)

            if total_actual > 0:
                account_rows = [
                    {
                        'account_code': a.get('account_code_mapped', a.get('account_code', 'UNKNOWN')),
                        'account_description': a.get('account_description', 'Unknown'),
                        'current_actual': a['actual_amount'],
                        'forecast_proportion': a['actual_amount'] / total_actual,
                        'forecast_amount': a['actual_amount'] / total_actual * self.forecast['forecast_amount']
                    }
                    for a in accounts
                    if a.get('actual_amount', 0) > 0
                ]

                if account_rows:
                    pd.DataFrame(account_rows).to_csv(
                        f"{Config.REPORTS_PATH}Forecast_By_Account_{period_tag}.csv",
                        index=False
                    )

        print(f"\n   üíæ Saved forecast to {summary_path}")

        return self.forecast

    def run(self):
        """Execute T010 steps: load history, compute the forecast, save it.

        Returns:
            The forecast dict produced by ``save_forecast``.
        """
        print("\n" + "="*60)
        print("üöÄ T010: Forecasting Next Period")
        print("="*60)

        self.load_historical()
        self.calculate_trends()
        forecast = self.save_forecast()

        print(f"\n‚úÖ T010 Complete.")

        return forecast


# ============================================================================
# T009: GENERATE EXECUTIVE NARRATIVE (Rule-based, no LLM)
# ============================================================================

class T009_ExecutiveNarrative:
    """Task 9: Create natural language summary (rule-based, no LLM).

    Builds a plain-text report from fixed templates: financial summary,
    variance drivers, exception counts, data-quality impact, currency
    exposure and recommendations.
    """

    def __init__(self, variance_results, report_data, exceptions):
        """Store the inputs used to build the narrative.

        Args:
            variance_results: Mapping read for ``total_actual``,
                ``total_budget``, ``total_variance``, ``total_variance_pct``
                and ``by_account`` (required), plus ``suspense_amount`` and
                ``future_dated_amount`` (optional, default 0).
            report_data: Mapping whose ``currency_summary`` entry is an
                iterable of dicts with ``currency_code`` and ``amount_aud``.
            exceptions: Sized iterable of dict-like records read via ``.get``
                for ``severity``, ``anomaly_type`` and ``rule_id``.
        """
        self.variance = variance_results
        self.report = report_data
        self.exceptions = exceptions
        self.narrative = ""

    @staticmethod
    def _describe_variance(variance_pct):
        """Map a variance percentage to the phrase used in the summary sentence.

        Within +/-2% is "in line with"; beyond +/-10% is "significantly"
        above/below; everything in between is "moderately" above/below.
        """
        if abs(variance_pct) < 2:
            return "in line with"
        magnitude = "significantly" if abs(variance_pct) > 10 else "moderately"
        direction = "above" if variance_pct > 0 else "below"
        return f"{magnitude} {direction}"

    def generate_narrative(self):
        """Generate narrative using templates and rules.

        The text is stored in ``self.narrative``.

        Returns:
            self, for chaining.
        """
        # Local import keeps this block self-contained (no file-level change).
        from collections import Counter

        print("\nüìù T009: Generating Executive Narrative")

        section_rule = "-"*40
        total_actual = self.variance['total_actual']
        lines = []

        # Header
        lines.append("="*80)
        lines.append(f"EXECUTIVE NARRATIVE - {Config.CURRENT_FISCAL_PERIOD}")
        lines.append("="*80)
        lines.append("")

        # Financial Summary
        lines.append("FINANCIAL SUMMARY")
        lines.append(section_rule)

        variance_pct = self.variance['total_variance_pct']
        variance_desc = self._describe_variance(variance_pct)

        lines.append(f"Total spend for {Config.CURRENT_FISCAL_PERIOD} was ${total_actual:,.2f}, "
                    f"which is {variance_desc} budget of ${self.variance['total_budget']:,.2f}. "
                    f"The variance is ${abs(self.variance['total_variance']):,.2f} ({variance_pct:.1f}%).")
        lines.append("")

        # Key Drivers
        lines.append("KEY VARIANCE DRIVERS")
        lines.append(section_rule)

        # Three largest over-budget and three largest under-budget accounts
        account_variances = self.variance['by_account']
        top_pos = sorted((a for a in account_variances if a.get('variance', 0) > 0),
                         key=lambda a: a['variance'], reverse=True)[:3]
        top_neg = sorted((a for a in account_variances if a.get('variance', 0) < 0),
                         key=lambda a: a['variance'])[:3]

        if top_pos:
            lines.append("Positive variances (over budget):")
            for a in top_pos:
                lines.append(f"  ‚Ä¢ {a.get('account_code', 'Unknown')}: +${a['variance']:,.2f} ({a['variance_pct']:.1f}%)")

        if top_neg:
            lines.append("Negative variances (under budget):")
            for a in top_neg:
                lines.append(f"  ‚Ä¢ {a.get('account_code', 'Unknown')}: ${a['variance']:,.2f} ({a['variance_pct']:.1f}%)")
        lines.append("")

        # Exception Summary
        lines.append("EXCEPTION SUMMARY")
        lines.append(section_rule)

        # One pass per tally instead of one pass per severity level.
        severity_counts = Counter(e.get('severity') for e in self.exceptions)
        critical_count = severity_counts['CRITICAL']
        high_count = severity_counts['HIGH']
        medium_count = severity_counts['MEDIUM']

        lines.append(f"Total exceptions: {len(self.exceptions)}")
        lines.append(f"  ‚Ä¢ Critical: {critical_count}")
        lines.append(f"  ‚Ä¢ High: {high_count}")
        lines.append(f"  ‚Ä¢ Medium: {medium_count}")

        # Top exception types (anomaly_type, else rule_id, else 'UNKNOWN')
        type_counts = Counter(
            e.get('anomaly_type', e.get('rule_id', 'UNKNOWN')) for e in self.exceptions
        )
        top_types = type_counts.most_common(3)
        if top_types:
            lines.append("\nMost common exceptions:")
            for e_type, count in top_types:
                lines.append(f"  ‚Ä¢ {e_type}: {count} occurrences")
        lines.append("")

        # Data Quality Impact
        lines.append("DATA QUALITY IMPACT")
        lines.append(section_rule)

        suspense_amount = self.variance.get('suspense_amount', 0)
        future_amount = self.variance.get('future_dated_amount', 0)
        total_impact = suspense_amount + future_amount
        # Percentages are reported as 0 when there is no positive total to divide by.
        impact_pct = (total_impact / total_actual * 100) if total_actual > 0 else 0

        lines.append(f"Transactions with data quality issues: ${total_impact:,.2f} ({impact_pct:.1f}% of total)")
        if suspense_amount > 0:
            lines.append(f"  ‚Ä¢ Invalid accounts (in suspense): ${suspense_amount:,.2f}")
        if future_amount > 0:
            lines.append(f"  ‚Ä¢ Future-dated transactions: ${future_amount:,.2f}")
        lines.append("")

        # Currency Impact
        lines.append("CURRENCY EXPOSURE")
        lines.append(section_rule)

        currency_rows = self.report['currency_summary']
        non_aud_total = sum(c['amount_aud'] for c in currency_rows
                            if c['currency_code'] != 'AUD')
        non_aud_pct = (non_aud_total / total_actual * 100) if total_actual > 0 else 0

        lines.append(f"Foreign currency exposure: ${non_aud_total:,.2f} ({non_aud_pct:.1f}% of total)")

        # Non-AUD currencies with a positive AUD amount, in input order
        for c in currency_rows:
            if c['currency_code'] != 'AUD' and c['amount_aud'] > 0:
                lines.append(f"  ‚Ä¢ {c['currency_code']}: ${c['amount_aud']:,.2f}")
        lines.append("")

        # Recommendations
        lines.append("RECOMMENDATIONS")
        lines.append(section_rule)

        if suspense_amount > 10000:
            lines.append("‚Ä¢ Review and remap transactions with invalid account codes")
        if future_amount > 10000:
            lines.append("‚Ä¢ Reclassify future-dated transactions to correct period")
        if critical_count > 0:
            lines.append("‚Ä¢ Investigate critical exceptions before next close")
        if len(self.exceptions) > 100:
            lines.append("‚Ä¢ Schedule data quality workshop to address root causes")

        # Join all lines
        self.narrative = "\n".join(lines)

        print(f"   Generated {len(lines)} lines of narrative")
        return self

    def save_narrative(self):
        """Save narrative to file.

        The file is written as UTF-8 explicitly: the narrative contains
        non-ASCII bullet characters, and the platform default encoding is
        locale-dependent.

        Returns:
            The narrative text.
        """
        # NOTE(review): the period in the file name is hard-coded rather than
        # derived from Config.CURRENT_FISCAL_PERIOD; kept as-is in case other
        # steps look the file up by this exact name.
        target = f"{Config.REPORTS_PATH}Executive_Narrative_Feb2026.txt"
        with open(target, 'w', encoding='utf-8') as f:
            f.write(self.narrative)

        print(f"   üíæ Saved narrative to {target}")

        return self.narrative

    def run(self):
        """Execute T009 steps: build the narrative, then write it to disk.

        Returns:
            The narrative text.
        """
        print("\n" + "="*60)
        print("üöÄ T009: Generating Executive Narrative")
        print("="*60)

        self.generate_narrative()
        narrative = self.save_narrative()

        print(f"\n‚úÖ T009 Complete.")

        return narrative


# ============================================================================
# T010: FORECAST NEXT PERIOD (FIXED)
# NOTE: this is the second definition of T010_Forecast in this file; it
# rebinds the name and replaces the earlier class defined above T009.
# ============================================================================

class T010_Forecast:
    """Task 10: Generate forecast for next period based on historical trends.

    Loads ``KPI_Monthly_History.csv``, detects its period and spend columns
    from a list of common names, and projects the next fiscal period. When
    the file is missing or unusable, a randomised 12-month history is
    generated so the pipeline keeps running.

    NOTE(review): an earlier class with the same name exists in this file;
    this later definition is the one bound at import time.
    """

    # Column names accepted for the period and the spend series, in priority
    # order. 'kpi_value' covers the long-format KPI file (kpi_name/kpi_value)
    # that the earlier T010_Forecast definition in this file reads.
    _PERIOD_COLUMNS = ('period', 'month', 'fiscal_period', 'reporting_period', 'date', 'year_month')
    _SPEND_COLUMNS = ('total_spend', 'spend', 'amount', 'actual', 'value', 'total', 'kpi_value')

    def __init__(self, df, variance_results):
        """Store the pipeline inputs.

        Args:
            df: Transaction DataFrame from the pipeline (stored; not read by
                any method of this class).
            variance_results: Mapping read via ``.get`` for ``total_actual``,
                ``transaction_count`` and ``by_account``.
        """
        self.df = df
        self.variance = variance_results
        self.historical_data = None
        self.forecast = {}

    @staticmethod
    def _growth_rates(spend):
        """Return period-over-period growth with infinite values blanked out.

        ``pct_change`` yields +/-inf when the previous period's spend is zero.
        Those entries become NaN so ``mean()`` skips them instead of turning
        the forecast into inf/NaN.
        """
        return spend.pct_change().replace([np.inf, -np.inf], np.nan)

    def _synthetic_history(self):
        """Build 12 months of randomised history ending at the current fiscal month.

        Each month's spend and transaction count is the current value times a
        random factor in [0.8, 1.2). The generator is unseeded, so results
        differ from run to run.
        """
        base_spend = self.variance.get('total_actual', 1000000)
        base_count = self.variance.get('transaction_count', 1000)

        months = []
        for i in range(1, 13):
            # Walk back 12 - i months from the current month, wrapping the year.
            month_num = Config.CURRENT_MONTH - (12 - i)
            year = Config.CURRENT_YEAR
            if month_num <= 0:
                month_num += 12
                year -= 1

            months.append({
                'period': f"{year}-{month_num:02d}",
                'total_spend': base_spend * (0.8 + 0.4 * np.random.random()),
                'transaction_count': int(base_count * (0.8 + 0.4 * np.random.random()))
            })
        return pd.DataFrame(months)

    def load_historical(self):
        """Load historical KPI data, or fall back to synthetic history.

        On exit ``self.historical_data`` has at least ``period`` (str) and a
        numeric ``total_spend`` column.

        Returns:
            self, for chaining.
        """
        print("\nüìÇ T010: Loading historical data...")

        try:
            history = pd.read_csv(f"{Config.REFERENCE_PATH}KPI_Monthly_History.csv")
            print(f"   Loaded {len(history)} rows of historical data")

            # Standardize column names
            history.columns = [col.lower().strip() for col in history.columns]

            # A long-format KPI file carries one row per KPI per period. Keep
            # only the total-expense series so unrelated KPIs are not treated
            # as monthly spend.
            if 'kpi_name' in history.columns:
                is_expense = history['kpi_name'].astype(str).str.contains(
                    'total_expenses|total_spend', case=False, na=False
                )
                if is_expense.any():
                    history = history[is_expense].copy()
                    print(f"   Filtered to {len(history)} total-expense KPI rows")

            # Check for period column and rename if needed
            period_col = next((c for c in self._PERIOD_COLUMNS if c in history.columns), None)

            if period_col:
                if period_col != 'period':
                    history = history.rename(columns={period_col: 'period'})
                print(f"   Using '{period_col}' as period column")
            else:
                # Create a synthetic period column if none exists. Months wrap
                # into following years so more than 12 rows never yields an
                # invalid label such as "2025-13".
                print(f"   ‚ö†Ô∏è No period column found, creating synthetic periods")
                history['period'] = [
                    f"{2025 + idx // 12}-{idx % 12 + 1:02d}" for idx in range(len(history))
                ]

            # Check for spend column and rename if needed
            spend_col = next((c for c in self._SPEND_COLUMNS if c in history.columns), None)

            if spend_col:
                if spend_col != 'total_spend':
                    history = history.rename(columns={spend_col: 'total_spend'})
                print(f"   Using '{spend_col}' as spend column")

                # Text or otherwise unparsable values would break the averages
                # later; coerce to numbers and drop what cannot be parsed.
                history['total_spend'] = pd.to_numeric(history['total_spend'], errors='coerce')
                history = history.dropna(subset=['total_spend'])
            else:
                # Create synthetic spend data
                print(f"   ‚ö†Ô∏è No spend column found, creating synthetic data")
                base_spend = self.variance.get('total_actual', 1000000)
                history['total_spend'] = [
                    base_spend * (0.8 + 0.4 * np.random.random())
                    for _ in range(len(history))
                ]

            if history.empty:
                print("   Warning: no usable historical rows, creating synthetic history")
                history = self._synthetic_history()
                print(f"   Created synthetic historical data for {len(history)} months")

            self.historical_data = history
            print(f"   Historical data columns: {list(self.historical_data.columns)}")

        except Exception as e:
            # Deliberate best-effort boundary: any load problem switches the
            # stage to synthetic data rather than aborting the pipeline.
            print(f"   ‚ö†Ô∏è Historical data not found or error loading: {e}")
            # Create synthetic history from current data
            self.historical_data = self._synthetic_history()
            print(f"   Created synthetic historical data for {len(self.historical_data)} months")

        # Ensure period is string type for sorting
        self.historical_data['period'] = self.historical_data['period'].astype(str)

        return self

    def calculate_trends(self):
        """Derive growth, seasonality and the blended next-period forecast.

        Populates ``self.forecast`` with the point forecast, a +/- margin and
        the inputs used. The point forecast is 70% trend (recent 3-period
        average x recent growth x seasonal factor) plus 30% of the current
        actual grown by 5%.

        Returns:
            self, for chaining.
        """
        # Sort by period
        try:
            self.historical_data = self.historical_data.sort_values('period')
        except (KeyError, TypeError) as e:
            # If sorting fails, assume data is already in order
            print(f"   ‚ö†Ô∏è Error sorting by period: {e}")

        history = self.historical_data
        spend = history['total_spend']
        month_count = len(history)

        # Calculate moving averages
        if month_count >= 3:
            history['spend_ma_3'] = spend.rolling(3, min_periods=1).mean()
        else:
            history['spend_ma_3'] = spend

        # Calculate growth rate (default 2% when it cannot be measured)
        avg_growth = 0.02
        if month_count >= 2:
            history['growth_rate'] = self._growth_rates(spend)
            measured = history['growth_rate'].mean()
            if not pd.isna(measured):
                avg_growth = measured

        # Recent trend (last 3 months)
        recent_data = history.tail(3)
        recent_avg = recent_data['total_spend'].mean()

        recent_growth = avg_growth
        if len(recent_data) >= 2:
            measured_recent = recent_data['growth_rate'].mean()
            # Same NaN guard as avg_growth; fall back to the long-run rate.
            if not pd.isna(measured_recent):
                recent_growth = measured_recent

        # Seasonal adjustment (if we have same month last year)
        # NOTE(review): this keys on the CURRENT month although the forecast
        # targets the NEXT month -- confirm that is intended.
        current_month_str = f"{Config.CURRENT_MONTH:02d}"
        same_month = history[history['period'].str.endswith(current_month_str)]

        if not same_month.empty and recent_avg > 0:
            seasonal_factor = same_month['total_spend'].iloc[0] / recent_avg
        else:
            seasonal_factor = 1.0

        # Calculate forecast for next period
        if Config.CURRENT_MONTH < 12:
            next_period = f"{Config.CURRENT_YEAR}-{Config.CURRENT_MONTH+1:02d}"
            next_month_num = Config.CURRENT_MONTH + 1
            next_year = Config.CURRENT_YEAR
        else:
            next_period = f"{Config.CURRENT_YEAR+1}-01"
            next_month_num = 1
            next_year = Config.CURRENT_YEAR + 1

        # When history gives no usable level (zero, negative or NaN average),
        # anchor the trend on the current actual BEFORE computing the base
        # forecast; applying the fallback afterwards had no effect on it.
        current_actual = self.variance.get('total_actual')
        if not recent_avg > 0 and current_actual is not None:
            recent_avg = current_actual

        # Base forecast on recent average with growth and seasonal adjustment
        base_forecast = recent_avg * (1 + recent_growth) * seasonal_factor

        if current_actual is None:
            current_actual = base_forecast

        # Blend current and historical (70% recent trend, 30% current month with growth)
        blended_forecast = 0.7 * base_forecast + 0.3 * current_actual * 1.05  # Assume 5% growth

        # Margin: 1.96 x (std / sqrt(n)) of historical spend, or 20% of the
        # forecast when there are not enough months to measure spread.
        if month_count > 1:
            margin = 1.96 * history['total_spend'].std() / np.sqrt(month_count)
        else:
            margin = blended_forecast * 0.2

        lower_bound = max(0, blended_forecast - margin)
        upper_bound = blended_forecast + margin

        self.forecast = {
            'next_period': next_period,
            'next_month': next_month_num,
            'next_year': next_year,
            'forecast_amount': blended_forecast,
            'lower_bound': lower_bound,
            'upper_bound': upper_bound,
            'confidence_level': 0.95,
            'method': 'Blended (70% trend, 30% current)',
            'historical_months_used': month_count,
            'avg_growth_rate': avg_growth,
            'seasonal_factor': seasonal_factor,
            'current_actual': current_actual,
            'recent_avg': recent_avg
        }

        print(f"\n   Forecast for {next_period}:")
        print(f"   Point forecast: ${self.forecast['forecast_amount']:,.2f}")
        print(f"   95% CI: (${self.forecast['lower_bound']:,.2f} - ${self.forecast['upper_bound']:,.2f})")

        return self

    def save_forecast(self):
        """Write the forecast summary CSV and, when possible, an account split.

        The account-level file allocates the point forecast in proportion to
        each account's share of the current actual. It is written only when
        ``by_account`` has rows with a positive ``actual_amount`` and the
        total actual is positive.

        Returns:
            The forecast dict.
        """
        period_tag = self.forecast['next_period'].replace('-', '')
        summary_path = f"{Config.REPORTS_PATH}Forecast_{period_tag}.csv"

        # Save as CSV
        pd.DataFrame([self.forecast]).to_csv(summary_path, index=False)

        # Save detailed forecast with account-level breakdown
        accounts = self.variance.get('by_account')
        if accounts:
            total_actual = self.variance.get('total_actual', 0)

            if total_actual > 0:
                account_rows = [
                    {
                        'account_code': a.get('account_code_mapped', a.get('account_code', 'UNKNOWN')),
                        'account_description': a.get('account_description', 'Unknown'),
                        'current_actual': a['actual_amount'],
                        'forecast_proportion': a['actual_amount'] / total_actual,
                        'forecast_amount': a['actual_amount'] / total_actual * self.forecast['forecast_amount']
                    }
                    for a in accounts
                    if a.get('actual_amount', 0) > 0
                ]

                if account_rows:
                    pd.DataFrame(account_rows).to_csv(
                        f"{Config.REPORTS_PATH}Forecast_By_Account_{period_tag}.csv",
                        index=False
                    )

        print(f"   üíæ Saved forecast to {summary_path}")

        return self.forecast

    def run(self):
        """Execute T010 steps: load history, compute the forecast, save it.

        Returns:
            The forecast dict produced by ``save_forecast``.
        """
        print("\n" + "="*60)
        print("üöÄ T010: Forecasting Next Period")
        print("="*60)

        self.load_historical()
        self.calculate_trends()
        forecast = self.save_forecast()

        print(f"\n‚úÖ T010 Complete.")

        return forecast
    

# DataValidator: pre-flight file checks run at the start of FinancialCloseAgent.run_pipeline.

# ============================================================================
# IMPROVED DATA VALIDATOR (FIXED MESSAGE)
# ============================================================================

class DataValidator:
    """Pre-flight check that the pipeline's input CSV files exist and parse."""

    @staticmethod
    def validate_all():
        """Check every required input file and report what was found.

        For each file that exists, its row count and column names are
        printed. ``Master_COA.csv`` is additionally checked for one of the
        recognised account-code column names. Problems are collected and
        printed together at the end.

        Returns:
            True when no problem was recorded, otherwise False.
        """
        problems = []

        # (path, human-readable label) for each file the pipeline expects
        expected_inputs = [
            (f"{Config.MASTER_DATA_PATH}Master_COA.csv", "Chart of Accounts"),
            (f"{Config.MASTER_DATA_PATH}Master_Entity.csv", "Entity Master"),
            (f"{Config.MASTER_DATA_PATH}Master_CostCenters.csv", "Cost Center Master"),
            (f"{Config.BUDGET_PATH}Budget_2026.csv", "Budget Data"),
        ]
        account_code_names = ('Account_Code', 'account_code', 'AccountCode', 'Account', 'CODE')

        print("\nüìä DATA VALIDATION")
        print("-" * 40)

        for location, label in expected_inputs:
            if not os.path.exists(location):
                problems.append(f"‚ùå Missing {label}: {location}")
                continue

            try:
                frame = pd.read_csv(location)
                print(f"‚úÖ {label}: {len(frame)} rows")
                print(f"   Columns: {list(frame.columns)}")

                # Only the chart of accounts gets the extra column check
                if "Master_COA.csv" not in location:
                    continue

                # First recognised spelling of the account-code column, if any
                matched = next((name for name in account_code_names if name in frame.columns), None)
                if matched is None:
                    problems.append(f"   ‚ùå No account code column found in {location}. Found: {list(frame.columns)}")
                else:
                    print(f"   ‚úì Found account code column: '{matched}'")

            except Exception as err:
                problems.append(f"‚ùå Cannot read {label}: {err}")

        if not problems:
            print("\n‚úÖ All master data files validated successfully.\n")
            return True

        print("\n‚ö†Ô∏è DATA VALIDATION ISSUES FOUND:")
        for problem in problems:
            print(problem)
        print("\n‚úÖ Pipeline will continue but may use synthetic data where needed.\n")
        return False


# ============================================================================
# MAIN PIPELINE EXECUTION (WITH BUDGET ANALYSIS)
# ============================================================================

class FinancialCloseAgent:
    """Main agent orchestrating all tasks.

    Runs the validation step and tasks T001 through T010 in a fixed order,
    passing the working DataFrame from one task to the next and collecting
    every task's output in ``self.results``.
    """

    def __init__(self):
        # Task outputs keyed by name (e.g. 'df_t001', 'exceptions', 'variance').
        self.results = {}
        # Wall-clock start; used for the duration printed on completion.
        self.start_time = datetime.now()

    def run_pipeline(self):
        """Execute all tasks in sequence.

        Returns:
            dict: ``self.results`` with the keys 'df_t001', 'anomalies',
            'df_t002', 'df_t003', 'df_t004', 'df_t005', 'exceptions',
            'df_t006', 'review', 'variance', 'budget_data', 'report',
            'narrative' and 'forecast'.
        """
        print("\n" + "="*80)
        print("üöÄ FINANCIAL CLOSE AGENT PIPELINE")
        print(f"   Started: {self.start_time}")
        print("="*80 + "\n")

        # Validate data files. The return value is not inspected here, so a
        # failed validation does not stop the pipeline.
        validator = DataValidator()
        validator.validate_all()

        # Task 001: Wrangle Raw Data
        wrangler = T001_DataWrangler()
        df, anomalies = wrangler.run(Config.RAW_DATA_PATH)
        self.results['df_t001'] = df
        self.results['anomalies'] = anomalies

        # Task 002: Map Entities and Accounts
        mapper = T002_EntityAccountMapper(df)
        df = mapper.run()
        self.results['df_t002'] = df

        # Task 003: Resolve Vendors
        resolver = T003_VendorResolver(df)
        df = resolver.run()
        self.results['df_t003'] = df

        # Task 004: FX Conversion
        converter = T004_FXConverter(df)
        df = converter.run()
        self.results['df_t004'] = df

        # Task 005: Detect Exceptions
        detector = T005_ExceptionDetector(df)
        df, exceptions = detector.run()
        self.results['df_t005'] = df
        self.results['exceptions'] = exceptions

        # Task 006: Review Exceptions (Automated)
        reviewer = T006_ExceptionReviewer(df, exceptions)
        df, review = reviewer.run()
        self.results['df_t006'] = df
        self.results['review'] = review

        # Task 007: Budget Variance
        variance = T007_BudgetVariance(df)
        variance_results = variance.run()
        self.results['variance'] = variance_results
        self.results['budget_data'] = variance.budget_data  # Store budget data for analysis

        # Budget coverage analysis; must run after 'variance' is stored because
        # it reads self.results['variance']['total_actual'].
        self.analyze_budget_coverage(df, variance.budget_data)

        # Task 008: Close Pack Report
        report = T008_ClosePackReport(df, variance_results, exceptions)
        report_data = report.run()
        self.results['report'] = report_data

        # Task 009: Executive Narrative
        narrative = T009_ExecutiveNarrative(variance_results, report_data, exceptions)
        narrative_text = narrative.run()
        self.results['narrative'] = narrative_text

        # Task 010: Forecast
        forecast = T010_Forecast(df, variance_results)
        forecast_data = forecast.run()
        self.results['forecast'] = forecast_data

        # Completion
        end_time = datetime.now()
        duration = (end_time - self.start_time).total_seconds()

        print("\n" + "="*80)
        print("‚úÖ PIPELINE COMPLETE")
        print(f"   Finished: {end_time}")
        print(f"   Duration: {duration:.2f} seconds")
        print("="*80)

        return self.results

    def analyze_budget_coverage(self, df, budget_data):
        """Analyze budget coverage and identify gaps.

        Prints, for ``Config.CURRENT_FISCAL_PERIOD``, how many accounts had
        activity, how many have a budget, which active accounts lack a budget
        (with their total spend) and which budgeted accounts had no activity.

        Args:
            df: Transactions DataFrame; the columns 'fiscal_period',
                'account_code_mapped' and 'amount_aud' are read.
            budget_data: Budget DataFrame with 'period' and 'account_code'
                columns, or None. None or an empty frame skips the analysis.

        Returns:
            None. All output goes to stdout.
        """
        print("\n" + "="*60)
        print("üìä BUDGET COVERAGE ANALYSIS")
        print("="*60)

        if budget_data is None or budget_data.empty:
            print("‚ö†Ô∏è No budget data available for analysis")
            return

        period = Config.CURRENT_FISCAL_PERIOD
        in_period = df['fiscal_period'] == period

        # Get unique accounts with activity in current period
        active_accounts = df.loc[
            in_period & df['account_code_mapped'].notna(),
            'account_code_mapped',
        ].unique()

        print(f"Active accounts in {Config.CURRENT_FISCAL_PERIOD}: {len(active_accounts)}")

        # Get accounts with budget in current period
        budget_accounts = budget_data.loc[
            budget_data['period'] == period,
            'account_code',
        ].unique()

        print(f"Accounts with budget: {len(budget_accounts)}")

        active_set = set(active_accounts)
        budget_set = set(budget_accounts)

        # Find accounts missing budget
        missing_budget = active_set - budget_set
        if missing_budget:
            print(f"\n‚ö†Ô∏è {len(missing_budget)} active accounts have no budget:")
            # Sorted so the sample is the same on every run (set order is not
            # stable); key=str keeps mixed int/str codes comparable.
            sample_missing = sorted(missing_budget, key=str)[:10]
            print(f"   Sample: {sample_missing}")

            # Calculate total spend in missing budget accounts
            missing_spend = df.loc[
                in_period & df['account_code_mapped'].isin(missing_budget),
                'amount_aud',
            ].sum()

            print(f"   Total spend in unbudgeted accounts: ${missing_spend:,.2f}")

            # The share is only meaningful against a non-zero, known total;
            # dividing by zero would otherwise print 'inf%' or 'nan%'.
            try:
                total_actual = self.results['variance']['total_actual']
            except (KeyError, TypeError):
                total_actual = None

            if pd.notna(total_actual) and total_actual != 0:
                print(f"   This represents {missing_spend/total_actual*100:.1f}% of total spend")
            else:
                print("   This represents N/A of total spend (total actual spend is zero or unavailable)")
        else:
            print("\n‚úÖ All active accounts have budget assigned")

        # Find budgeted accounts with no activity
        inactive_budget = budget_set - active_set
        if inactive_budget:
            print(f"\n‚ÑπÔ∏è {len(inactive_budget)} budgeted accounts have no activity:")
            sample_inactive = sorted(inactive_budget, key=str)[:10]
            print(f"   Sample: {sample_inactive}")

        print("\n" + "="*60)


# ============================================================================
# EXECUTE THE PIPELINE
# ============================================================================

if __name__ == "__main__":
    # Create directories if they don't exist
    for path in [Config.OUTPUT_PATH, Config.REPORTS_PATH]:
        os.makedirs(path, exist_ok=True)

    # Run the agent
    agent = FinancialCloseAgent()
    results = agent.run_pipeline()

    exceptions = results['exceptions']
    variance = results['variance']

    # Print final summary
    print("\n" + "="*80)
    print("üìä FINAL SUMMARY")
    print("="*80)
    print(f"Total transactions processed: {len(results['df_t001'])}")
    print(f"Total exceptions found: {len(exceptions)}")
    print(f"Critical exceptions: {sum(1 for e in exceptions if e.get('severity') == 'CRITICAL')}")
    print(f"High exceptions: {sum(1 for e in exceptions if e.get('severity') == 'HIGH')}")
    print(f"Total spend: ${variance['total_actual']:,.2f}")
    print(f"Budget variance: ${variance['total_variance']:,.2f} ({variance['total_variance_pct']:.1f}%)")
    print(f"Suspense amount (invalid accounts): ${variance['suspense_amount']:,.2f}")
    print(f"Forecast for next period: ${results['forecast']['forecast_amount']:,.2f}")

    # Add budget coverage summary to final output.
    # Skipped when there is no budget data (None or empty frame), matching the
    # guard used by FinancialCloseAgent.analyze_budget_coverage.
    budget_data = results.get('budget_data')
    if budget_data is not None and not budget_data.empty:
        df = results['df_t006']
        period = Config.CURRENT_FISCAL_PERIOD

        active_accounts = set(
            df.loc[df['fiscal_period'] == period, 'account_code_mapped'].dropna().unique()
        )
        budget_accounts = set(
            budget_data.loc[budget_data['period'] == period, 'account_code'].unique()
        )

        # Both figures come from set operations on the account codes. Comparing
        # only the two counts under-reports unbudgeted accounts whenever the
        # budget also lists accounts that had no activity.
        with_budget = len(active_accounts & budget_accounts)
        without_budget = len(active_accounts - budget_accounts)

        print("\nüìä BUDGET COVERAGE:")
        print(f"   Active accounts with budget: {with_budget}")
        print(f"   Active accounts without budget: {without_budget}")

    print("\nOutput files saved to:")
    print(f"  ‚Ä¢ Working data: {Config.OUTPUT_PATH}")
    print(f"  ‚Ä¢ Reports: {Config.REPORTS_PATH}")
    print("="*80)


🚀 FINANCIAL CLOSE AGENT PIPELINE
   Started: 2026-02-22 23:27:14.000803


📊 DATA VALIDATION
----------------------------------------
✅ Chart of Accounts: 28 rows
   Columns: ['Account_Code', 'Account_Name', 'Account_Type', 'Category', 'Active']
   ✓ Found account code column: 'Account_Code'
✅ Entity Master: 1 rows
   Columns: ['Entity', 'Entity_Name', 'Country', 'Currency', 'Active']
✅ Cost Center Master: 10 rows
   Columns: ['Cost_Center', 'Cost_Center_Name', 'Department', 'Manager', 'Active']
✅ Budget Data: 60 rows
   Columns: ['Fiscal_Period', 'Entity', 'Account_Code', 'Cost_Center', 'Budget_Amount_AUD', 'Budget_Type', 'Notes']

✅ All master data files validated successfully.


🚀 T001: Wrangling Raw GL Data
📂 T001: Loading raw GL data...
   Loaded 4080 rows
   ✓ Column names standardized
   ✓ Dates standardized. Invalid dates: 48
   ✓ Amounts cleaned. Negative amounts: 96
   ✓ Embedded exceptions detected: 0
   üíæ Saved 4080 rows to working/GL_Stan