In [7]:
import pandas as pd
import json
import os
from datetime import datetime

# Layer banner: title line followed by a 60-character rule.
print("LAYER 3: SOVEREIGN LOGIC - VALIDATION", "=" * 60, sep="\n")

class DataValidator:
    """Validate standardized records against business rules and write reports.

    The validator loads a CSV of records and a JSON rules file, rejects every
    record that shares a (partner_id, reporting_month) pair with another
    record, runs four per-record checks on the remainder, and writes
    accepted/rejected CSVs plus a JSON summary into ``output_dir``.

    Attributes set by ``__init__``:
        df: DataFrame of loaded records (duplicates are removed from it by
            ``check_duplicates``).
        rules: dict parsed from the rules file; the checks read
            ``rules['business_rules']`` (``program_period``,
            ``approved_partners``, ``thresholds``).
        results: list of audit entries, one per accepted record.
        accepted: list of accepted records (dicts).
        rejected: list of ``{'record': dict, 'rejection_reason': str}``.
    """

    def __init__(self,
                 input_file="data_standardized/all_standardized.csv",
                 rules_file="validation_rules.json",
                 output_dir="reports"):
        """Load the input data and the validation rules.

        Args:
            input_file: path of the CSV file to validate.
            rules_file: path of the JSON file holding the validation rules.
            output_dir: directory that receives the reports; created if it
                does not exist.

        The defaults reproduce the paths that were previously hard-coded, so
        ``DataValidator()`` behaves as before.
        """
        self.input_file = input_file
        self.rules_file = rules_file
        self.output_dir = output_dir

        os.makedirs(self.output_dir, exist_ok=True)

        # Load data
        self.df = pd.read_csv(self.input_file)
        print(f"Loaded {len(self.df)} records from standardized data")

        # Load validation rules (explicit encoding: do not depend on the
        # platform's default locale encoding)
        with open(self.rules_file, 'r', encoding='utf-8') as f:
            self.rules = json.load(f)

        self.results = []
        self.accepted = []
        self.rejected = []

    @staticmethod
    def _check_threshold(label, value, min_val, max_val):
        """Return ``(is_valid, message)`` for ``min_val <= value <= max_val``.

        A missing value (NaN/None) is rejected explicitly: every ordering
        comparison against NaN is False, so without this guard a missing
        value would slip through both bound checks and be accepted.
        A value that cannot be ordered against the bounds (e.g. a string)
        is rejected instead of aborting the whole run with a TypeError.
        """
        if pd.isna(value):
            return False, f"{label} value is missing"
        try:
            if value < min_val:
                return False, f"{label} ({value}) below minimum ({min_val})"
            if value > max_val:
                return False, f"{label} ({value}) above maximum ({max_val})"
        except TypeError:
            return False, f"{label} ({value}) is not a numeric value"
        return True, "OK"

    def validate_date_within_program(self, record: dict) -> tuple:
        """Check if reporting date is within program period.

        Args:
            record: dict with a ``'reporting_month'`` entry.

        Returns:
            ``(True, "OK")`` when the value lies between the configured start
            and end (inclusive), otherwise ``(False, reason)``.
        """
        period = self.rules['business_rules']['program_period']
        program_start = period['start']
        program_end = period['end']
        month = record['reporting_month']

        # A blank CSV cell is read as NaN (a float); comparing it with the
        # rule values would raise instead of rejecting the record.
        if pd.isna(month):
            return False, "Reporting month is missing"

        # NOTE(review): the values are compared as-is. If both sides are
        # strings this is a lexicographic comparison, which only orders dates
        # correctly when they share one ISO-style format - confirm against
        # the rules file.
        try:
            in_period = program_start <= month <= program_end
        except TypeError:
            return False, (f"Date {month} cannot be compared with program period "
                           f"({program_start} to {program_end})")

        if not in_period:
            return False, f"Date {month} outside program period ({program_start} to {program_end})"
        return True, "OK"

    def validate_partner_approved(self, record: dict) -> tuple:
        """Check if partner is in approved list.

        Args:
            record: dict with a ``'partner_id'`` entry.

        Returns:
            ``(True, "OK")`` or ``(False, reason)``.
        """
        approved = self.rules['business_rules']['approved_partners']

        if record['partner_id'] not in approved:
            return False, f"Partner {record['partner_id']} not in approved list"
        return True, "OK"

    def validate_households_positive(self, record: dict) -> tuple:
        """Check households are present and within the configured limits.

        Args:
            record: dict with a ``'households_supported'`` entry.

        Returns:
            ``(True, "OK")`` or ``(False, reason)``.
        """
        thresholds = self.rules['business_rules']['thresholds']
        return self._check_threshold(
            "Households",
            record['households_supported'],
            thresholds['min_households'],
            thresholds['max_households'],
        )

    def validate_amount_positive(self, record: dict) -> tuple:
        """Check amount is present and within the configured limits.

        Args:
            record: dict with an ``'amount_disbursed_usd'`` entry.

        Returns:
            ``(True, "OK")`` or ``(False, reason)``.
        """
        thresholds = self.rules['business_rules']['thresholds']
        return self._check_threshold(
            "Amount",
            record['amount_disbursed_usd'],
            thresholds['min_amount'],
            thresholds['max_amount'],
        )

    def check_duplicates(self):
        """Reject every record whose partner-month combination is not unique.

        All members of a duplicate group are rejected (``keep=False``), not
        only the later occurrences, and they are removed from ``self.df`` so
        the per-record checks never see them.
        """
        duplicate_mask = self.df.duplicated(subset=['partner_id', 'reporting_month'], keep=False)
        duplicates = self.df[duplicate_mask].copy()

        if duplicates.empty:
            return

        # Mark duplicates as rejected
        for _, row in duplicates.iterrows():
            self.rejected.append({
                'record': row.to_dict(),
                'rejection_reason': f"Duplicate record for partner {row['partner_id']} in {row['reporting_month']}"
            })

        # Keep only non-duplicates
        self.df = self.df[~duplicate_mask]
        print(f"Found {len(duplicates)} duplicate records")

    def validate_all(self):
        """Run the duplicate check, then all validation checks on each record.

        Records that fail at least one check go to ``self.rejected`` with all
        failure messages joined by ``"; "``; the others go to
        ``self.accepted`` and get an audit entry in ``self.results``.
        """
        print("\n Running validation checks")

        # First check for duplicates
        self.check_duplicates()

        # The checks do not depend on the record, so build the table once
        # instead of once per row.
        checks = (
            ("Date within program", self.validate_date_within_program),
            ("Partner approved", self.validate_partner_approved),
            ("Households positive", self.validate_households_positive),
            ("Amount positive", self.validate_amount_positive),
        )

        # Validate each record
        for idx, row in self.df.iterrows():
            record = row.to_dict()
            validation_errors = []

            for check_name, check_func in checks:
                is_valid, message = check_func(record)
                if not is_valid:
                    validation_errors.append(f"{check_name}: {message}")

            # Record result
            if validation_errors:
                self.rejected.append({
                    'record': record,
                    'rejection_reason': "; ".join(validation_errors)
                })
                continue

            self.accepted.append(record)
            self.results.append({
                'record_id': idx,
                'partner_id': record['partner_id'],
                'reporting_month': record['reporting_month'],
                'status': 'ACCEPTED',
                'validation_timestamp': datetime.now().isoformat()
            })

    def generate_reports(self):
        """Write the accepted/rejected CSVs and the JSON summary.

        ``accepted_data.csv`` and ``rejected_data.csv`` are only written when
        the corresponding list is non-empty; ``validation_summary.json`` is
        always written.

        Returns:
            The summary dict with keys ``total_records``, ``accepted``,
            ``rejected``, ``acceptance_rate`` (percent, one decimal) and
            ``validation_timestamp``.
        """
        print("\n Generating reports")

        # Accepted data
        if self.accepted:
            df_accepted = pd.DataFrame(self.accepted)
            accepted_path = os.path.join(self.output_dir, "accepted_data.csv")
            df_accepted.to_csv(accepted_path, index=False)
            print(f"Accepted: {len(df_accepted)} records → {accepted_path}")

        # Rejected data with reasons
        if self.rejected:
            rejected_data = [
                {**item['record'], 'rejection_reason': item['rejection_reason']}
                for item in self.rejected
            ]

            df_rejected = pd.DataFrame(rejected_data)
            rejected_path = os.path.join(self.output_dir, "rejected_data.csv")
            df_rejected.to_csv(rejected_path, index=False)
            print(f"Rejected: {len(df_rejected)} records → {rejected_path}")

            # Show rejection reasons
            print("\n Rejection Summary:")
            for reason, count in df_rejected['rejection_reason'].value_counts().items():
                print(f"   • {reason}: {count} records")

        # Every processed record ends up in exactly one of the two lists, so
        # their sum is the true total. (self.df still contains the records
        # that failed validation, so len(self.df) + len(self.rejected) would
        # count those twice.)
        accepted_count = len(self.accepted)
        rejected_count = len(self.rejected)
        total_count = accepted_count + rejected_count
        # Guard against an empty input instead of dividing by zero.
        acceptance_rate = round(accepted_count / total_count * 100, 1) if total_count else 0.0

        # Validation summary
        summary = {
            'total_records': total_count,
            'accepted': accepted_count,
            'rejected': rejected_count,
            'acceptance_rate': acceptance_rate,
            'validation_timestamp': datetime.now().isoformat()
        }

        summary_path = os.path.join(self.output_dir, "validation_summary.json")
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2)

        print(f"\n Validation Summary → {summary_path}")
        print(f"   Total: {summary['total_records']}")
        print(f"   Accepted: {summary['accepted']}")
        print(f"   Rejected: {summary['rejected']}")
        print(f"   Acceptance Rate: {summary['acceptance_rate']}%")

        return summary

    def run(self):
        """Main execution method: validate everything, then write the reports.

        Returns:
            The summary dict produced by ``generate_reports``.
        """
        self.validate_all()
        return self.generate_reports()

if __name__ == "__main__":
    # Build the validator (loads data and rules) and run the full pipeline.
    validator = DataValidator()
    validator.run()

    # Closing banner: blank line, 60-character rule, then the status lines.
    print(
        "\n" + "=" * 60,
        "Layer 3: Validation Complete!",
        "Next: Layer 4 - Metrics Calculation",
        sep="\n",
    )

LAYER 3: SOVEREIGN LOGIC - VALIDATION
Loaded 20 records from standardized data

 Running validation checks
Found 2 duplicate records

 Generating reports
Accepted: 18 records → reports\accepted_data.csv
Rejected: 2 records → reports\rejected_data.csv

 Rejection Summary:
   • Duplicate record for partner A001 in 2024-01-01: 2 records

 Validation Summary → reports\validation_summary.json
   Total: 20
   Accepted: 18
   Rejected: 2
   Acceptance Rate: 90.0%

Layer 3: Validation Complete!
Next: Layer 4 - Metrics Calculation
