# Column Statistics Analysis - Step 3: Compare PCDS and AWS Results

This notebook compares the column statistics collected from PCDS (step 1) and AWS (step 2).
It generates Excel reports, CSV summaries, and JSON results showing any differences.

## Cell 1: Import Required Libraries

In [None]:
import re
import os
import csv
import json
import pickle
import warnings
import numpy as np
import pandas as pd
import datetime as dt
import time

from upath import UPath
from loguru import logger
from tqdm import tqdm
from typing import Literal
from dataclasses import dataclass, field, fields
import xlwings as xw
from xlwings.constants import VAlign, HAlign
from PIL import ImageColor

warnings.filterwarnings('ignore', message=r'pandas only supports SQLAlchemy connectable .*', category=UserWarning)

# Note: This notebook uses Parquet format for cross-platform compatibility
# Install pyarrow if needed: pip install pyarrow or conda install -c conda-forge pyarrow
import pyarrow

# Short alias for PIL's colour parser; XS.get_color and ExcelReporter use it
# to turn colour strings (e.g. 'green', 'red') into RGB tuples.
get_rgb = ImageColor.getrgb

## Cell 2: Constants and Configuration

In [None]:
# --- Global Constants ---
# Separator of the delimited column/type lists in the metadata JSON
# (see MetaJSON); also used to join unmatched column names in SQLRecord.toJSON.
SEP = '; '
# strftime pattern that parse_date_value normalises every parsed date to.
AWS_DT_FORMAT = '%Y-%m-%d'
# Captured once when this cell runs; parse_date_value uses TODAY.year as the
# reference point of its year-window check.
TODAY = dt.datetime.now()
# Length of the '=' banner logged by start_run / end_run.
WIDTH = 80

# Allowed values for the ``partition`` field of MetaJSON and CSMeta.
TPartition = Literal['whole', 'year', 'year_month', 'empty', 'year_week', 'week', 'snapshot']

## Cell 3: Core Data Types and Classes

In [None]:
class Timer:
    """Context manager that measures elapsed time with ``time.perf_counter``.

    Usage::

        with Timer() as timer:
            ...
            elapsed = timer.pause()
    """

    def __enter__(self):
        # The clock starts when the ``with`` block is entered.
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Nothing to release; returning None lets exceptions propagate.
        pass

    @property
    def time(self):
        """Seconds elapsed since the timer was started or last paused."""
        return time.perf_counter() - self.start

    def pause(self):
        """Return the elapsed seconds and restart the clock from now."""
        elapsed = self.time
        self.start = time.perf_counter()
        return elapsed

    @staticmethod
    def to_str(value):
        """Format a duration in seconds as ``'H hours M minutes S seconds'``.

        The duration is rounded to whole seconds *before* it is split, so
        float input never renders as ``'1.0 hours'`` and a value such as 59.6
        becomes ``'0 hours 1 minutes 0 seconds'`` rather than ``'60 seconds'``.
        """
        total_seconds = int(round(value))
        minutes, seconds = divmod(total_seconds, 60)
        hours, minutes = divmod(minutes, 60)
        return f'{hours} hours {minutes} minutes {seconds} seconds'

@dataclass
class MetaOut:
    """Per-platform (PCDS or AWS) metadata of one table.

    Instances are built by ``MetaJSON.__init__`` from the ``<side>_*`` keys of
    the metadata JSON, where ``<side>`` is ``pcds`` or ``aws``.
    """
    # This side's column name -> the other side's column name
    # (built by zipping ``<side>_cols`` with ``<other>_cols``).
    col2COL: dict
    # This side's column name -> its type (``<side>_cols`` zipped with ``<side>_types``).
    col2type: dict
    # Value of ``<side>_tbl``; used as the table name in reports.
    infostr: str
    # Value of ``<side>_dt``.
    rowvar: str
    # Value of ``<side>_exclude``.
    rowexclude: list
    # Value of ``<side>_dt_type``.
    rowtype: str
    # Value of ``<side>_nrows``.
    nrows: int
    # Value of ``<side>_where``.
    where: str

@dataclass(init=False)
class MetaJSON:
    """Container for one table's metadata from the previous meta-analysis step.

    Built from the flat key/value record stored in the metadata JSON: keys
    that match a field name are copied as-is, and the ``pcds_*`` / ``aws_*``
    keys are folded into one ``MetaOut`` per platform.

    Raises:
        KeyError: if one of the required ``<side>_dt``, ``<side>_tbl``,
            ``<side>_where``, ``<side>_nrows``, ``<side>_cols``,
            ``<side>_types``, ``<side>_dt_type`` or ``<side>_exclude`` keys
            is missing.
    """
    pcds: MetaOut
    aws: MetaOut
    last_modified: str
    partition: TPartition = 'whole'
    tokenised_cols: list = field(default_factory=list)

    def __init__(self, **kwargs):
        # Function-scope import keeps this block self-contained.
        from dataclasses import MISSING

        # With ``init=False`` the generated initialiser that would apply the
        # declared defaults does not exist, and the ``default_factory`` class
        # attribute is removed by the dataclass machinery. Apply the defaults
        # explicitly so ``partition`` / ``tokenised_cols`` are always present
        # even when the JSON record omits them.
        for spec in fields(self):
            if spec.default is not MISSING:
                setattr(self, spec.name, spec.default)
            elif spec.default_factory is not MISSING:
                setattr(self, spec.name, spec.default_factory())

        # Values supplied in the record override the defaults.
        field_names = {spec.name for spec in fields(self)}
        for k, v in kwargs.items():
            if k in field_names:
                setattr(self, k, v)

        def col2col(a_str, b_str, sep=SEP):
            # Pair the two delimited lists position by position; zip stops at
            # the shorter list.
            return dict(zip(a_str.split(sep), b_str.split(sep)))

        for key, other in [('pcds', 'aws'), ('aws', 'pcds')]:
            out = MetaOut(
                rowvar=kwargs['%s_dt' % key],
                infostr=kwargs['%s_tbl' % key],
                where=kwargs['%s_where' % key],
                nrows=kwargs['%s_nrows' % key],
                col2COL=col2col(kwargs['%s_cols' % key], kwargs['%s_cols' % other]),
                col2type=col2col(kwargs['%s_cols' % key], kwargs['%s_types' % key]),
                rowtype=kwargs['%s_dt_type' % key],
                rowexclude=kwargs['%s_exclude' % key]
            )
            setattr(self, key, out)

@dataclass
class CSMeta:
    """Descriptive metadata attached to one PCDS-vs-AWS statistics comparison."""
    pcds_table: str
    aws_table: str
    partition: TPartition
    vintage: str
    pcds_time: int
    aws_time: int

    def todict(self):
        """Return a shallow ``{field name: value}`` dict of this record."""
        snapshot = {}
        for spec in fields(self):
            snapshot[spec.name] = getattr(self, spec.name)
        return snapshot

@dataclass
class CSResult:
    """Outcome of comparing one dataset vintage between PCDS and AWS.

    ``main_compare`` fills it from ``ColumnComparator.compare_statistics``;
    ``ExcelReporter`` reads it to build the report.
    """
    # Formatted PCDS statistics (second value returned by compare_statistics).
    pcds_stats: pd.DataFrame
    # Formatted AWS statistics (third value returned by compare_statistics).
    aws_stats: pd.DataFrame
    # Columns with at least one differing statistic ('mismatched_columns').
    miss_columns: set
    # Per-column differences ('mismatched_details').
    miss_details: dict
    # Table names, partition, vintage and timings of this comparison.
    meta_data: CSMeta

@dataclass
class SQLRecord:
    """Running per-dataset summary accumulated across vintages."""
    name: str
    unmatched: set = field(default_factory=set)
    nrow: int = 0
    ncol: int = 0
    pcds_time: int = 0
    aws_time: int = 0

    def update(self, **kwargs):
        """Merge new values into the record.

        ``unmatched`` is unioned, ``nrow`` / ``pcds_time`` / ``aws_time`` are
        added to the running totals, and any other field is overwritten.

        Raises:
            AttributeError: if a keyword is not a field of this record.
        """
        for k, v in kwargs.items():
            # getattr doubles as validation: unknown names fail here.
            old_v = getattr(self, k)
            if k == 'unmatched':
                self.unmatched |= set(v)
            elif k in ('nrow', 'pcds_time', 'aws_time'):
                setattr(self, k, old_v + v)
            else:
                setattr(self, k, v)

    def toJSON(self):
        """Return the summary as the dict of columns written to the CSV report."""
        return {
            'Column Stats UnMatch': 'Yes' if self.unmatched else 'No',
            # Sorted so the output is stable between runs; plain set
            # iteration order is not.
            'Stats UnMatch Details': SEP.join(sorted(self.unmatched)),
            'Compared Dataset Shape': f'Row({self.nrow}) : Col({self.ncol})',
            'Execution Time': 'PCDS({}) : AWS({})'.format(Timer.to_str(self.pcds_time), Timer.to_str(self.aws_time))
        }

## Cell 4: Utility Functions

In [None]:
def start_run():
    """Log a blank-line-prefixed banner of ``WIDTH`` '=' characters at the start of a run."""
    banner = '=' * WIDTH
    logger.info('\n\n' + banner)

def end_run():
    """Log a blank-line-prefixed banner of ``WIDTH`` '=' characters at the end of a run."""
    banner = '=' * WIDTH
    logger.info('\n\n' + banner)

class IO:
    """File I/O helpers: Parquet for DataFrames, JSON for summaries and metadata."""

    @staticmethod
    def write_dataframe(file, df):
        """Save ``df`` (including its index) as a snappy-compressed Parquet file."""
        file = UPath(file)
        df.to_parquet(file, index=True, engine='pyarrow', compression='snappy')

    @staticmethod
    def read_dataframe(file):
        """Load a DataFrame from a Parquet file."""
        file = UPath(file)
        return pd.read_parquet(file, engine='pyarrow')

    @staticmethod
    def write_json(file, data, cls=None):
        """Write ``data`` to ``file`` as indented JSON.

        Values the ``json`` module cannot encode natively are converted:
        NumPy scalars to Python scalars, arrays and sets to lists, scalar
        missing values (NaT, ``pd.NA``) to ``null`` and dates to ISO strings.

        Args:
            file: Destination path.
            data: Object to serialise.
            cls: Optional ``json.JSONEncoder`` subclass.

        Raises:
            TypeError: if a value has no JSON representation.
        """

        def convert(obj):
            if isinstance(obj, np.bool_):
                return bool(obj)
            if isinstance(obj, (np.integer, np.floating)):
                return obj.item()
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            if isinstance(obj, (set, frozenset)):
                return list(obj)
            # The NA test must run before the datetime test because NaT is a
            # datetime instance. It is limited to scalars: pd.isna on a
            # container returns an array whose truth value is ambiguous.
            if pd.api.types.is_scalar(obj) and pd.isna(obj):
                return None
            if isinstance(obj, (dt.datetime, dt.date)):
                return obj.isoformat()
            raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

        with open(file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, default=convert, cls=cls)

    @staticmethod
    def read_json(file):
        """Load and return the content of a JSON file."""
        with open(file, 'r', encoding='utf-8') as f:
            return json.load(f)

    @staticmethod
    def write_pickle(file, data):
        """Deprecated: Use write_dataframe or write_json instead"""
        with open(file, 'wb') as f:
            pickle.dump(data, f)

    @staticmethod
    def read_pickle(file):
        """Deprecated: Use read_dataframe or read_json instead"""
        # SECURITY: unpickling can execute arbitrary code. Only call this on
        # files produced by a trusted run of this pipeline.
        with open(file, 'rb') as f:
            return pickle.load(f)

    @staticmethod
    def read_meta_json(json_file):
        """Read the metadata JSON and return ``{table name: MetaJSON}``."""
        data = IO.read_json(json_file)
        return {k: MetaJSON(**v) for k, v in data.items()}

    @staticmethod
    def delete_file(file):
        """Delete ``file`` if it exists; do nothing otherwise."""
        if (filepath := UPath(file)).exists():
            filepath.unlink()

## Cell 5: Date Parsing Helper

In [None]:
def parse_date_value(x, in_pcds=False, window=20):
    """Normalise a date-like value to an ``AWS_DT_FORMAT`` string.

    With ``in_pcds=True`` a value shaped like ``DD-MM-YYYY HH:MM:SS`` is read
    day-first from its date part; a year more than ``window`` years past the
    current year is moved back by a century. Every other value goes through
    ``pd.to_datetime``. When that fails the value is returned as ``str(x)``,
    or unchanged if it is missing.
    """
    if in_pcds:
        text = str(x)
        if re.match(r'^\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}$', text):
            day_text = text.split(" ")[0]
            parsed = pd.to_datetime(day_text, format='%d-%m-%Y', dayfirst=True)
            latest_year = TODAY.year + window
            if parsed.year > latest_year:
                parsed = parsed.replace(year=parsed.year - 100)
            return parsed.strftime(AWS_DT_FORMAT)

    try:
        return pd.to_datetime(x).strftime(AWS_DT_FORMAT)
    except (AttributeError, ValueError, TypeError):
        if pd.isna(x):
            return x
        return str(x)

## Cell 6: Column Comparator Class

In [None]:
class ColumnComparator:
    """Compares per-column statistics collected from PCDS and AWS."""

    # Raw statistic column name -> display name used in the reports.
    statistics = {
        'col_type': 'Type',
        'col_count': 'N_Total',
        'col_distinct': 'N_Unique',
        'col_missing': 'N_Missing',
        'col_max': 'Max',
        'col_min': 'Min',
        'col_avg': 'Mean',
        'col_std': 'Std',
        'col_sum': 'Sum',
        'col_sum_sq': 'Sum_Square',
        'col_freq': 'Frequency'
    }

    def __init__(self):
        self.comparison_results = {}

    @staticmethod
    def get_value(row: pd.Series, column: str, is_pcds: bool = False):
        """Extract one statistic from a statistics row in comparable form.

        Args:
            row: Statistics of a single column.
            column: Raw statistic name (a key of ``statistics``).
            is_pcds: Passed to ``parse_date_value`` when ordering frequency
                entries.

        Returns:
            A float when the value is numeric; for ``col_freq`` strings a list
            of ``(label, count)`` string tuples ordered by descending count
            and then label; otherwise the raw value. Missing and falsy values
            (``0``, ``''``, ``None``) come back as NaN.
        """
        # ``or`` folds falsy values into NaN, so 0 and "missing" compare equal.
        value = row.get(column, np.nan) or np.nan

        # Parse a frequency distribution written as "label(count); label(count)".
        if column == 'col_freq' and isinstance(value, str):
            return sorted(
                [
                    m.groups() for s in value.split('; ')
                    if (m := re.search(r'([^(]*)\((\d+)\)', s.strip()))
                ],
                key=lambda x: (-int(x[1]), parse_date_value(x[0], is_pcds)),
            )

        # Try numeric conversion
        try:
            return float(value)
        except (ValueError, TypeError):
            try:
                return int(value)
            except (ValueError, TypeError):
                return value

    @staticmethod
    def contains_datelike(dtype1, dtype2):
        """Return True if either type name is exactly 'date' or 'timestamp' (case-insensitive)."""
        return bool({str(dtype1).lower(), str(dtype2).lower()} & {'date', 'timestamp'})

    def compare_statistics(
        self,
        pcds_stats: pd.DataFrame,
        aws_stats: pd.DataFrame,
        column_mapping: dict[str, str],
        tokenised_cols: list
    ) -> tuple[dict[str, object], pd.DataFrame, pd.DataFrame]:
        """Compare statistics between PCDS and AWS.

        Args:
            pcds_stats: PCDS statistics indexed by PCDS column name.
            aws_stats: AWS statistics indexed by AWS column name.
            column_mapping: PCDS column name -> AWS column name.
            tokenised_cols: PCDS columns excluded from the comparison.

        Returns:
            ``(results, pcds_formatted, aws_formatted)``. ``results`` holds
            ``mismatched_columns``, ``mismatched_details``, ``total_columns``
            and ``matched_columns``. The two frames contain, in mapping order
            and row-aligned with each other, the mapped columns present on
            both sides, with statistic columns renamed to display names.
        """
        mismatched_columns = set()
        mismatched_details = {}

        # Remove tokenised columns and rename PCDS labels to AWS labels.
        excluded = [] if tokenised_cols is None else tokenised_cols
        aligned_pcds = (
            pcds_stats
            .drop(index=excluded, errors='ignore')
            .rename(index=column_mapping)
        )
        aligned_aws = aws_stats

        # Columns present on both sides, in PCDS order so logs and the details
        # dict are reproducible.
        aws_labels = set(aligned_aws.index)
        common_columns = [c for c in dict.fromkeys(aligned_pcds.index) if c in aws_labels]

        for column in common_columns:
            pcds_row = aligned_pcds.loc[column]
            aws_row = aligned_aws.loc[column]
            column_diffs = {}

            for stat, name in self.statistics.items():
                if stat == 'col_type':
                    continue

                # Skip date frequency comparison
                if stat == 'col_freq' and self.contains_datelike(
                    pcds_row.get('col_type'), aws_row.get('col_type')
                ):
                    continue

                pcds_val = self.get_value(pcds_row, stat, True)
                aws_val = self.get_value(aws_row, stat)

                if self._values_different(pcds_val, aws_val):
                    column_diffs[name] = {'pcds': pcds_val, 'aws': aws_val}

            if column_diffs:
                mismatched_columns.add(column)
                mismatched_details[column] = column_diffs
                logger.warning(f"Mismatch found in column {column}: {column_diffs}")

        # Format output. Only mapping pairs present on BOTH sides are kept so
        # the two frames stay row-aligned and a column without statistics no
        # longer raises KeyError.
        paired = [
            (p, a) for p, a in column_mapping.items()
            if p in pcds_stats.index and a in aws_stats.index
        ]
        skipped = [p for p in column_mapping if p not in {pair[0] for pair in paired}]
        if skipped:
            logger.warning(f"Mapped columns without statistics on both sides, left out of report: {skipped}")

        # reindex (rather than plain selection) tolerates a missing statistic
        # column, the same way get_value does.
        stat_columns = list(self.statistics)
        pcds_stats_formatted = (
            pcds_stats.loc[[p for p, _ in paired]]
            .reindex(columns=stat_columns)
            .rename(columns=self.statistics)
        )
        aws_stats_formatted = (
            aws_stats.loc[[a for _, a in paired]]
            .reindex(columns=stat_columns)
            .rename(columns=self.statistics)
        )

        results = {
            'mismatched_columns': mismatched_columns,
            'mismatched_details': mismatched_details,
            'total_columns': len(common_columns),
            'matched_columns': len(common_columns) - len(mismatched_columns),
        }
        return results, pcds_stats_formatted, aws_stats_formatted

    @staticmethod
    def _values_different(val1, val2) -> bool:
        """Return True if a PCDS value and an AWS value differ.

        Order of checks:
          1. Lists (frequency distributions) are compared element-wise; a
             list against a non-list, or lists of unequal length, differ.
          2. Two missing values are equal; a missing value equals 0.
          3. If both convert to float they are compared with
             absolute/relative tolerance 1e-6.
          4. Otherwise both are normalised with ``parse_date_value`` and the
             results compared.

        Numbers are tested BEFORE dates because ``pd.to_datetime`` reads a
        bare number as an offset from the epoch, which would make any two
        ordinary numbers look like the same day.
        """
        # Handle list comparisons (frequency distributions)
        list1, list2 = isinstance(val1, list), isinstance(val2, list)
        if list1 or list2:
            if not (list1 and list2) or len(val1) != len(val2):
                return True
            return any(
                ColumnComparator._values_different(x1, x2)
                for t1, t2 in zip(val1, val2)
                for x1, x2 in zip(t1, t2)
            )

        # Handle missing values; 0 and "missing" are treated as equivalent in
        # either direction, matching get_value.
        na1, na2 = pd.isna(val1), pd.isna(val2)
        if na1 and na2:
            return False
        if na1 or na2:
            present = val2 if na1 else val1
            return not (present == 0)

        # Numeric comparison with tolerance
        try:
            num1, num2 = float(val1), float(val2)
        except (ValueError, TypeError):
            pass
        else:
            return not bool(np.isclose(num1, num2, atol=1e-6, rtol=1e-6, equal_nan=True))

        # Date comparison, falling back to plain string comparison
        try:
            dat1 = parse_date_value(val1, in_pcds=True)
            dat2 = parse_date_value(val2)
            return dat1 != dat2
        except (ValueError, TypeError):
            return str(val1) != str(val2)

## Cell 7: Excel Report Generation Classes

In [None]:
class XS:
    """Excel styling helper bound to one xlwings worksheet."""

    def __init__(self, ws: xw.Sheet):
        # ``ws`` is used through ``ws.range(...)``, i.e. as a sheet.
        self.ws = ws

    def get_color(self, color):
        """Return ``color`` as RGB: strings are parsed with ``get_rgb``, anything else is passed through."""
        if isinstance(color, str):
            return get_rgb(color)
        return color

    def apply_styles(self, pos='A1', value='', font=None, align='left', color='', border=None):
        """Apply styles to a cell or range.

        Args:
            pos: Cell or range address.
            value: Written to the range when truthy.
            font: Optional dict with any of ``family``, ``size``, ``color``,
                ``bold``.
            align: ``'left'``, ``'right'`` or ``'center'``; any other value
                leaves the alignment untouched.
            color: Fill colour (string or RGB tuple); skipped when falsy.
            border: Optional dict; its ``style`` entry sets the border line
                style.
        """
        # None defaults avoid sharing one mutable dict between calls.
        font = font or {}
        border = border or {}

        cell = self.make_cell(pos)
        if value:
            cell.value = value
        if 'family' in font:
            cell.font.name = font['family']
        if 'size' in font:
            cell.font.size = font['size']
        if 'color' in font:
            cell.font.color = self.get_color(font['color'])
        if 'bold' in font:
            cell.font.bold = font['bold']
        if color:
            cell.color = self.get_color(color)
        if 'style' in border:
            cell.api.Borders.LineStyle = border['style']
        if align == 'right':
            cell.api.HorizontalAlignment = HAlign.xlHAlignRight
        elif align == 'left':
            cell.api.HorizontalAlignment = HAlign.xlHAlignLeft
        elif align == 'center':
            cell.api.HorizontalAlignment = HAlign.xlHAlignCenter

    def make_cell(self, pos='A1', value=None):
        """Return the range at ``pos``, first writing ``value`` when it is not None."""
        cell = self.ws.range(pos)
        if value is not None:
            cell.value = value
        return cell

    def write_dataframe(self, df: pd.DataFrame, pos='A1', header=True, index=False):
        """Write ``df`` with its top-left corner at ``pos`` and return that range."""
        cell = self.make_cell(pos)
        cell.options(index=index, header=header).value = df
        return cell

class ExcelReporter:
    """Writes the column comparison results to an Excel workbook via xlwings.

    Use as a context manager: entering starts an Excel instance with a new
    workbook; leaving saves the workbook to ``workbook_path`` and then closes
    the workbook and quits that Excel instance.
    """

    def __init__(self, workbook_path: str):
        self.workbook_path = UPath(workbook_path)
        self.app = None
        self.wb = None
        # Index of the last sheet written; new sheets are added after it.
        self.ns = -1
        # Zero-based column/row of the first PCDS cell to highlight.
        self.cx, self.cy = None, None

    def __enter__(self):
        try:
            self.workbook_path.unlink(missing_ok=True)
        except PermissionError:
            # The previous report is still open in Excel: close it, then try
            # once more to remove the stale file.
            xw.Book(str(self.workbook_path)).close()
            try:
                self.workbook_path.unlink(missing_ok=True)
            except PermissionError:
                logger.warning(f"Could not remove existing report: {self.workbook_path}")
        self.app = xw.App(visible=True, add_book=False)
        try:
            self.wb = self.app.books.add()
        except Exception:
            # Do not leave an orphaned Excel process behind.
            self.app.quit()
            self.app = None
            raise
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Save whatever was written (also after an error, as before), then
        # always release the workbook and the Excel process so the file is
        # not left locked for the next run.
        try:
            if self.wb is not None:
                self.wb.save(str(self.workbook_path))
        finally:
            try:
                if self.wb is not None:
                    self.wb.close()
            finally:
                if self.app is not None:
                    self.app.quit()
                self.wb = None
                self.app = None

    def create_comparison_report(self, comparison_results: dict[str, dict[str, CSResult]]):
        """Create the summary sheet followed by one detail sheet per dataset."""
        self._create_summary_sheet(comparison_results)
        for dataset_name, dataset_data in comparison_results.items():
            self._create_dataset_sheet(dataset_name, dataset_data)

    def _create_summary_sheet(self, comparison_results):
        """Fill the first sheet with one row per dataset/vintage."""
        ws = self.wb.sheets[0]
        ws.name = 'SUMMARY'
        ws.range('A1').value = 'Column Statistics Comparison'
        ws.range('A1').font.bold = True
        ws.range('A1').font.size = 14
        headers = ['Dataset', 'Vintage', 'Total Columns', 'Matched Columns', 'Mismatched Columns', 'Match Rate %']
        ws.range('A3').value = headers
        ws.range('A3:F3').font.bold = True
        ws.range('A3:F3').color = (200, 200, 200)

        row = 4
        for dataset_name, dataset_data in comparison_results.items():
            for vintage, data in dataset_data.items():
                total_cols = len(data.pcds_stats)
                mismatched = len(data.miss_columns)
                matched = total_cols - mismatched
                match_rate = (matched / total_cols * 100) if total_cols > 0 else 0
                ws.range(f'A{row}').value = [dataset_name, vintage, total_cols, matched, mismatched, f'{match_rate:.1f}%']
                # Green at 95 % or better, red otherwise.
                if match_rate >= 95:
                    ws.range(f'A{row}:F{row}').color = (200, 255, 200)
                else:
                    ws.range(f'A{row}:F{row}').color = (255, 200, 200)
                row += 1
        ws.autofit()
        self.ns += 1

    def _create_dataset_sheet(self, name: str, result_d: dict[str, CSResult]):
        """Create the detail sheet of one dataset: per vintage a PCDS table above an AWS table."""
        wb = self.wb
        # Truncate name to 31 characters (Excel limit)
        sheet_name = name.upper()[:31]
        try:
            ws = wb.sheets.add(sheet_name, after=wb.sheets[self.ns])
        except ValueError:
            # Adding failed with ValueError: reuse the sheet of that name.
            ws = wb.sheets[sheet_name]
        ws.clear()
        xs = XS(ws)

        row = 1
        for vintage, data in result_d.items():
            # Write vintage header
            xs.make_cell(pos=f'A{row}', value='Vintage: ')
            xs.apply_styles(pos=f'B{row}', value=vintage, align='right')
            ws.range(f'B{row}:D{row}').merge()
            xs.apply_styles(f'A{row}:D{row}', font={'bold': True}, color=(190, 190, 190))
            row += 2

            # Write PCDS statistics
            pcds_tbl = data.meta_data.pcds_table.split('.')[-1]
            xs.make_cell(pos=f'A{row}', value='PCDS: ')
            xs.apply_styles(pos=f'B{row}', value=pcds_tbl, align='right')
            ws.range(f'B{row}:D{row}').merge()
            xs.apply_styles(f'A{row}:D{row}', font={'bold': True}, color=(240, 240, 240))
            row += 1

            # Reorder columns to show mismatches first. The order is computed
            # from the AWS labels and applied by position to the PCDS view,
            # which relies on both frames listing columns in the same order.
            aws_view = data.aws_stats.T.map(self._format_cell_value)
            indices = [i for i, x in enumerate(aws_view.columns) if x in data.miss_columns]
            the_rest = [i for i in range(len(aws_view.columns)) if i not in indices]
            aws_view = aws_view[aws_view.columns[indices + the_rest]]
            pcds_view = data.pcds_stats.T.map(self._format_cell_value)
            pcds_view = pcds_view[pcds_view.columns[indices + the_rest]]

            # Zero-based anchor for _highlight_differences: column C and the
            # second data row of the PCDS table (the first one is 'Type').
            self.cx, self.cy = 2, row + 1
            xs.write_dataframe(pcds_view, f'B{row}', index=True)

            # Write AWS statistics
            row += len(pcds_view) + 2
            xs.make_cell(pos=f'A{row}', value='AWS: ')
            aws_tbl = data.meta_data.aws_table.lower()
            xs.apply_styles(pos=f'B{row}', value=aws_tbl, align='right')
            ws.range(f'B{row}:D{row}').merge()
            xs.apply_styles(f'A{row}:D{row}', font={'bold': True}, color=(240, 240, 240))
            row += 1
            xs.write_dataframe(aws_view, f'B{row}', index=True)
            row += len(aws_view) + 3

            self._highlight_differences(ws, nx=len(indices), ny=len(pcds_view) - 1)
            row += 2
        ws.autofit()
        self.ns += 1

    def _highlight_differences(self, ws: xw.Sheet, nx: int, ny: int):
        """Colour the first ``nx`` columns of both tables green (equal) or red (different).

        ``ny`` is the number of statistic rows to visit; the AWS cell paired
        with a PCDS cell sits ``ny + 4`` rows below it.
        """
        ix, iy = self.cx, self.cy
        for i in range(iy, iy + ny):
            for j in range(ix, ix + nx):
                pcds, aws = ws[i, j], ws[i + ny + 4, j]
                pcds.number_format = '0.00'
                aws.number_format = '0.00'
                if pcds.value == aws.value:
                    pcds.font.color = get_rgb('green')
                    aws.font.color = get_rgb('green')
                else:
                    pcds.font.color = get_rgb('red')
                    aws.font.color = get_rgb('red')

    def _format_cell_value(self, value) -> str:
        """Render one statistic as text: '' for missing, integral floats without '.0', long text cut to 50 characters plus '...'."""
        if pd.isna(value):
            return ''
        elif isinstance(value, (int, float)):
            if isinstance(value, float) and value.is_integer():
                return str(int(value))
            return str(value)
        else:
            str_val = str(value)
            return str_val[:50] + '...' if len(str_val) > 50 else str_val

def create_comparison_report(comparison_results: dict[str, dict[str, CSResult]], output_path: UPath):
    """Write ``comparison_results`` to an Excel workbook at ``output_path`` using ``ExcelReporter``."""
    reporter_context = ExcelReporter(output_path)
    with reporter_context as active_reporter:
        active_reporter.create_comparison_report(comparison_results)

## Cell 8: Main Execution - Compare Results

In [None]:
def main_compare(
    meta_json_path='path/to/meta_analysis_output.json',
    meta_csv_path='path/to/meta_analysis.csv',
    pcds_summary_path='output/column_stats_pcds/pcds_summary.json',
    aws_summary_path='output/column_stats_aws/aws_summary.json',
    output_folder='output/column_stats_compare',
):
    """Compare the PCDS (step 1) and AWS (step 2) column statistics.

    For every dataset in the meta CSV that has metadata and results on both
    sides, each vintage common to both summaries is compared. Results go to
    ``comparison_summary.csv``, ``comparison_results.json`` and
    ``comparison_report.xlsx`` inside ``output_folder``.

    Args:
        meta_json_path: Metadata JSON from the meta-analysis step.
        meta_csv_path: Meta-analysis CSV listing the datasets.
        pcds_summary_path: Summary JSON written by step 1.
        aws_summary_path: Summary JSON written by step 2.
        output_folder: Directory for the outputs; created when missing.

    Returns:
        ``{dataset: {vintage: CSResult}}``, or ``None`` when a summary file
        is missing.
    """
    output_folder = UPath(output_folder)
    output_folder.mkdir(exist_ok=True, parents=True)

    csv_output = output_folder / 'comparison_summary.csv'
    json_output = output_folder / 'comparison_results.json'
    excel_output = output_folder / 'comparison_report.xlsx'

    start_run()

    # Load metadata from meta_analysis step
    meta_json = IO.read_meta_json(meta_json_path)
    meta_csv = pd.read_csv(meta_csv_path)

    # Load summary files from steps 1 and 2
    logger.info("Loading PCDS summary from step 1...")
    if not UPath(pcds_summary_path).exists():
        logger.error(f"PCDS summary not found: {pcds_summary_path}")
        logger.error("Please run Step 1 (column_statistics_1_pcds.ipynb) first!")
        return

    pcds_summary = IO.read_json(pcds_summary_path)

    logger.info("Loading AWS summary from step 2...")
    if not UPath(aws_summary_path).exists():
        logger.error(f"AWS summary not found: {aws_summary_path}")
        logger.error("Please run Step 2 (column_statistics_2_aws.ipynb) first!")
        return

    aws_summary = IO.read_json(aws_summary_path)

    CC = ColumnComparator()
    ALL_RESULT = {}
    HAS_HEADER = False

    # The CSV is appended to dataset by dataset, so a file left over from an
    # earlier run must go first; otherwise re-runs stack a second header and
    # duplicate rows on top of the old content.
    IO.delete_file(csv_output)

    # CSV columns for summary
    csv_columns = [
        'Dataset',
        'Vintage',
        'Column Stats UnMatch',
        'Stats UnMatch Details',
        'Compared Dataset Shape',
        'Execution Time'
    ]

    for _, row in tqdm(meta_csv.iterrows(), desc='Comparing datasets...', total=len(meta_csv)):
        name = row.get('PCDS Table Details with DB Name')
        logger.info(f"Processing dataset: {name}")

        # Load metadata for this table
        meta_info = meta_json.get(name)
        if not meta_info:
            logger.warning(f"No metadata found for {name}")
            continue

        # Check if we have results for this dataset
        if name not in pcds_summary or name not in aws_summary:
            logger.warning(f"Missing results for {name}")
            continue

        meta_pcds = meta_info.pcds
        partition = meta_info.partition

        if partition == 'empty':
            continue

        DATA_RESULT = {}
        record = SQLRecord(name=name)

        # Get vintages from summaries
        pcds_vintages = pcds_summary[name]
        aws_vintages = aws_summary[name]

        # Only vintages present on both sides can be compared; sorted so the
        # reports list them in a stable order.
        common_vintages = sorted(set(pcds_vintages) & set(aws_vintages))

        for vintage in common_vintages:
            logger.info(f"Comparing vintage: {vintage}")

            # Load statistics from parquet files
            pcds_info = pcds_vintages[vintage]
            aws_info = aws_vintages[vintage]

            pcds_stats_file = UPath(pcds_info['stats_file'])
            aws_stats_file = UPath(aws_info['stats_file'])

            if not pcds_stats_file.exists() or not aws_stats_file.exists():
                logger.warning(f"Missing stats files for {name}/{vintage}")
                continue

            pcds_stats = IO.read_dataframe(pcds_stats_file)
            aws_stats = IO.read_dataframe(aws_stats_file)

            # Get timing information
            pcds_time = pcds_info['meta_data'].get('pcds_time', 0)
            aws_time = aws_info['meta_data'].get('aws_time', 0)

            # Compare statistics
            comparison_result, pcds_stats_formatted, aws_stats_formatted = CC.compare_statistics(
                pcds_stats, aws_stats, meta_pcds.col2COL, meta_info.tokenised_cols
            )

            # Store results
            DATA_RESULT[vintage] = CSResult(
                pcds_stats=pcds_stats_formatted,
                aws_stats=aws_stats_formatted,
                miss_columns=comparison_result['mismatched_columns'],
                miss_details=comparison_result['mismatched_details'],
                meta_data=CSMeta(
                    pcds_table=meta_pcds.infostr,
                    aws_table=meta_info.aws.infostr,
                    partition=partition,
                    vintage=vintage,
                    pcds_time=pcds_time,
                    aws_time=aws_time,
                )
            )

            # Largest per-column total is used as the row count; an empty or
            # all-missing column yields NaN, which counts as 0 rows.
            n_total = pd.to_numeric(pcds_stats_formatted['N_Total'], errors='coerce').max()
            nrow = 0 if pd.isna(n_total) else int(n_total)

            # Update record
            record.update(
                unmatched=comparison_result['mismatched_columns'],
                nrow=nrow,
                ncol=comparison_result['total_columns'],
                pcds_time=pcds_time,
                aws_time=aws_time
            )

            logger.info(
                f"Vintage {vintage}: {comparison_result['matched_columns']}/{comparison_result['total_columns']} "
                f"columns matched"
            )

        ALL_RESULT[name] = DATA_RESULT

        # Write results to CSV after each dataset so finished work survives a
        # later failure. Every vintage row carries the dataset-level record.
        with open(csv_output, 'a', newline='') as fp:
            writer = csv.DictWriter(fp, fieldnames=csv_columns)
            if not HAS_HEADER:
                writer.writeheader()
                HAS_HEADER = True

            for vintage in DATA_RESULT:
                csv_row = {
                    'Dataset': name,
                    'Vintage': vintage,
                    **record.toJSON()
                }
                writer.writerow(csv_row)

    # Save results to JSON
    logger.info(f"Saving comparison results to {json_output}")
    IO.write_json(json_output, {
        dataset: {
            vintage: {
                'mismatched_columns': list(data.miss_columns),
                'mismatched_details': data.miss_details,
                'metadata': data.meta_data.todict()
            } for vintage, data in vintages.items()
        } for dataset, vintages in ALL_RESULT.items()
    })

    # Create Excel report
    logger.info(f"Creating Excel report at {excel_output}")
    create_comparison_report(ALL_RESULT, excel_output)
    logger.info(f"Excel report created: {excel_output}")

    # Summary statistics
    total_datasets = len(ALL_RESULT)
    total_comparisons = sum(len(vintages) for vintages in ALL_RESULT.values())
    datasets_with_issues = sum(
        1 for vintages in ALL_RESULT.values()
        if any(len(data.miss_columns) > 0 for data in vintages.values())
    )

    logger.info(f"\n{'='*80}")
    logger.info("Comparison Summary:")
    logger.info(f"  Total datasets compared: {total_datasets}")
    logger.info(f"  Total comparisons (including vintages): {total_comparisons}")
    logger.info(f"  Datasets with mismatches: {datasets_with_issues}")
    logger.info(f"  CSV summary: {csv_output}")
    logger.info(f"  JSON results: {json_output}")
    logger.info(f"  Excel report: {excel_output}")
    logger.info(f"{'='*80}\n")

    end_run()
    return ALL_RESULT

# Entry point: run the comparison when this code is executed as the main module.
# NOTE(review): inside a Jupyter kernel ``__name__`` is also '__main__', so
# executing this cell starts the comparison — confirm that is intended.
if __name__ == '__main__':
    results = main_compare()

## Run the Comparison

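Uncomment the line below to run the comparison. Note that the `if __name__ == '__main__':` guard at the end of Cell 8 already calls `main_compare()` when that cell is executed in a notebook, so this is only needed to re-run it: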

In [None]:
# results = main_compare()