# Column Statistics Analysis - Comparing PCDS and AWS Data

This notebook performs detailed column-level statistical analysis comparing data between PCDS (Oracle) and AWS (Athena) databases.
It computes and compares:
- Count, distinct values, min/max
- Mean, standard deviation, sum
- Frequency distributions
- Missing values

## Cell 1: Import Required Libraries

In [None]:
import re
import os
import csv
import json
import shutil
import argparse
import warnings
import numpy as np
import pandas as pd
import functools as ft
import datetime as dt
import multiprocessing as mp
import threading as td
import time

from upath import UPath
from loguru import logger
from tqdm import tqdm
from typing import get_args, Literal, Dict, List
from dataclasses import dataclass, field, fields
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed
from enum import Enum
import xlwings as xw
from xlwings.constants import VAlign, HAlign
from PIL import ImageColor

warnings.filterwarnings('ignore', message=r'pandas only supports SQLAlchemy connectable .*', category=UserWarning)

get_rgb = ImageColor.getrgb

## Cell 2: Constants and Configuration

In [None]:
# --- Global Constants ---
SEP = '; '                     # delimiter used to join/split multi-value strings
AWS_DT_FORMAT = '%Y-%m-%d'     # strftime pattern dates are normalised to before comparison
TODAY = dt.datetime.now()      # captured once, when this cell/module is executed
WIDTH = 80                     # width of the '=' banner written to the log
inWindows = os.name == 'nt'    # True when running on Windows

# Partition granularities. 'daily' is listed because both get_pcds_where and
# get_aws_where have an explicit branch for it.
TPartition = Literal[
    'whole', 'year', 'year_month', 'empty', 'year_week', 'week', 'snapshot', 'daily',
]

# --- SQL Templates for AWS Column Statistics ---
# AWS_Cont_SQL: one-row summary of a single column treated as numeric.
# Filled with str.format; placeholders are {data_type}, {column_name}, {db},
# {table} and {limit} (the WHERE predicate, which must not be empty).
# The output columns (col_type ... col_missing) use the same names as
# AWS_Catg_SQL so both result sets can be concatenated; col_freq is always ''
# here, and col_missing is COUNT(*) minus the non-NULL count.
AWS_Cont_SQL = """
SELECT
    '{data_type}' AS col_type,
    COUNT({column_name}) AS col_count,
    COUNT(DISTINCT {column_name}) AS col_distinct,
    MAX({column_name}) AS col_max,
    MIN({column_name}) AS col_min,
    AVG(CAST({column_name} AS DOUBLE)) AS col_avg,
    STDDEV_SAMP(CAST({column_name} AS DOUBLE)) AS col_std,
    SUM(CAST({column_name} AS DOUBLE)) AS col_sum,
    SUM(CAST({column_name} AS DOUBLE) * CAST({column_name} AS DOUBLE)) AS col_sum_sq,
    '' AS col_freq,
    COUNT(*) - COUNT({column_name}) AS col_missing
FROM {db}.{table}
WHERE {limit};
"""

# AWS_Catg_SQL: one-row summary of a single column treated as categorical.
# Same placeholders and output column names as AWS_Cont_SQL, but the numeric
# statistics describe the per-value frequencies (value_freq), not the values:
#   col_count    = total rows (NULL group included), col_distinct = number of groups,
#   col_max/min/avg/std/sum/sum_sq = aggregates over value_freq,
#   col_freq     = the 10 most frequent values rendered as 'value(count)' joined by '; ',
#   col_missing  = frequency of the NULL group.
# NOTE(review): the col_missing sub-select returns no row when the column has
# no NULLs, so the result is NULL rather than 0 (COALESCE sits inside the
# sub-select) — the Python comparison appears to treat 0 and NaN as equal; confirm.
AWS_Catg_SQL = """
WITH FreqTable_RAW AS (
    SELECT
        {column_name} AS p_col,
        COUNT(*) AS value_freq
    FROM  {db}.{table}
    WHERE {limit}
    GROUP BY {column_name}
),FreqTable AS (
    SELECT
        p_col, value_freq, 
        ROW_NUMBER() OVER (ORDER BY value_freq DESC, p_col ASC) AS rn
    FROM FreqTable_RAW
)
SELECT
    '{data_type}' AS col_type,
    SUM(value_freq) AS col_count,
    COUNT(value_freq) AS col_distinct,
    MAX(value_freq) AS col_max,
    MIN(value_freq) AS col_min,
    AVG(CAST(value_freq AS DOUBLE)) AS col_avg,
    STDDEV_SAMP(CAST(value_freq AS DOUBLE)) AS col_std,
    SUM(value_freq) AS col_sum,
    SUM(value_freq * value_freq) AS col_sum_sq,
    (SELECT ARRAY_JOIN(ARRAY_AGG(COALESCE(CAST(p_col AS VARCHAR), '') || '(' || CAST(value_freq AS VARCHAR) || ')' ORDER BY value_freq DESC), '; ') FROM FreqTable WHERE rn <= 10) AS col_freq, 
    (SELECT COALESCE(value_freq, 0) FROM FreqTable Where p_col is NULL) AS col_missing
FROM FreqTable
"""

# --- SQL Template for PCDS Column Statistics ---
# PCDS_SQL_COLUMN: anonymous PL/SQL block that loops over every column of
# {table} (looked up in all_tab_cols), builds one dynamic SELECT per column and
# prints the eleven statistics with DBMS_OUTPUT, one 'name: value' line each,
# followed by a '---' separator.
#   * NUMBER/FLOAT/BINARY_FLOAT/BINARY_DOUBLE columns get value statistics;
#     every other type gets frequency statistics, like AWS_Catg_SQL.
#   * TIMESTAMP columns are grouped on TRUNC(column).
#   * Both dynamic SELECTs return columns in the order of the INTO list:
#     type, count, distinct, max, min, avg, std, sum, sum_sq, missing, freq.
# Placeholders filled with str.format: {table} and {limit}. {limit} is pasted
# inside a PL/SQL string literal, so the caller must double any single quotes
# (run_pcds_column_analysis does this).
# NOTE(review): the all_tab_cols lookup filters on table_name only, not owner —
# confirm the table name is unique across the schemas visible to the session.
PCDS_SQL_COLUMN = """
DECLARE
    v_sql          VARCHAR2(20000);
    v_col_name     VARCHAR2(128);
    v_data_type    VARCHAR2(128);
    v_table_name   VARCHAR2(128) := UPPER('{table}');

    v_ret_data_type  VARCHAR2(128);
    v_count          NUMBER;
    v_distinct       NUMBER;
    v_max            VARCHAR2(4000);
    v_min            VARCHAR2(4000);
    v_missing        NUMBER;
    v_avg            NUMBER;
    v_std            NUMBER;
    v_sum            NUMBER;
    v_sum_sq         NUMBER;
    v_freq           VARCHAR2(4000);

    l_column_ref VARCHAR2(256);

BEGIN
    FOR rec IN (
        SELECT column_name, data_type
        FROM all_tab_cols
        WHERE table_name = v_table_name
        ORDER BY column_id
    ) LOOP
        v_col_name := rec.column_name;
        v_data_type := rec.data_type;

        v_ret_data_type := NULL; v_count := NULL; v_distinct := NULL; 
        v_max := NULL; v_min := NULL; v_missing := NULL; 
        v_avg := NULL; v_std := NULL; v_sum := NULL; v_sum_sq := NULL; v_freq := NULL;
        
        IF v_data_type LIKE 'TIMESTAMP%' THEN 
            l_column_ref := 'TRUNC(' || v_col_name || ')';
        ELSE
            l_column_ref := v_col_name;
        END IF;

        IF v_data_type IN ('NUMBER', 'FLOAT', 'BINARY_FLOAT', 'BINARY_DOUBLE') THEN
            v_sql := 'SELECT ';
            v_sql := v_sql || '''' || v_data_type || ''' AS col_type, ';
            v_sql := v_sql || 'COUNT(' || v_col_name || ') AS col_count, ';
            v_sql := v_sql || 'COUNT(DISTINCT ' || v_col_name || ') AS col_distinct, ';
            v_sql := v_sql || 'MAX(' || v_col_name || ') AS col_max, ';
            v_sql := v_sql || 'MIN(' || v_col_name || ') AS col_min, ';
            v_sql := v_sql || 'AVG(' || v_col_name || ') AS col_avg, ';
            v_sql := v_sql || 'STDDEV_SAMP(' || v_col_name || ') AS col_std, ';
            v_sql := v_sql || 'SUM(' || v_col_name || ') AS col_sum, ';
            v_sql := v_sql || 'SUM(' || v_col_name || ' * ' || v_col_name || ') AS col_sum_sq, ';
            v_sql := v_sql || 'COUNT(*) - COUNT(' || v_col_name || ') AS col_missing, ';
            v_sql := v_sql || 'EMPTY_CLOB() AS col_freq ';
            v_sql := v_sql || 'FROM ' || v_table_name || ' WHERE {limit} ';
        ELSE
            v_sql := 'WITH FreqTable_RAW AS ( ';
            v_sql := v_sql || 'SELECT ' || l_column_ref || ' AS p_col, COUNT(*) AS value_freq '; 
            v_sql := v_sql || 'FROM ' || v_table_name || ' WHERE {limit} '; 
            v_sql := v_sql || 'GROUP BY ' || l_column_ref || ' '; 
            v_sql := v_sql || '), FreqTable AS ( '; 
            v_sql := v_sql || 'SELECT p_col, value_freq, '; 
            v_sql := v_sql || 'ROW_NUMBER() OVER (ORDER BY value_freq DESC, p_col ASC) AS rn '; 
            v_sql := v_sql || 'FROM FreqTable_RAW), AggStats AS ( ';
            v_sql := v_sql || 'SELECT SUM(ft.value_freq) AS col_count, '; 
            v_sql := v_sql || 'COUNT(ft.value_freq) AS col_distinct, '; 
            v_sql := v_sql || 'MAX(ft.value_freq) AS col_max, '; 
            v_sql := v_sql || 'MIN(ft.value_freq) AS col_min, '; 
            v_sql := v_sql || 'AVG(ft.value_freq) AS col_avg, '; 
            v_sql := v_sql || 'STDDEV_SAMP(ft.value_freq) AS col_std, ';
            v_sql := v_sql || 'SUM(ft.value_freq) AS col_sum, ';
            v_sql := v_sql || 'SUM(ft.value_freq * ft.value_freq) AS col_sum_sq '; 
            v_sql := v_sql || 'FROM FreqTable ft) SELECT ';
            v_sql := v_sql || '''' || v_data_type || ''' AS col_type, ast.*, '; 
            v_sql := v_sql || '(SELECT NVL(value_freq, 0) FROM FreqTable WHERE p_col IS NULL) AS col_missing, '; 
            v_sql := v_sql || '(SELECT LISTAGG(p_col || ''('' || value_freq || '')'', ''; '') WITHIN GROUP (ORDER BY value_freq DESC) FROM FreqTable WHERE rn <= 10) AS col_freq ';
            v_sql := v_sql || 'FROM AggStats ast';
        END IF;

        EXECUTE IMMEDIATE v_sql INTO
           v_ret_data_type, v_count, v_distinct, v_max, v_min, 
           v_avg, v_std, v_sum, v_sum_sq, v_missing, v_freq;

        DBMS_OUTPUT.PUT_LINE('Column: ' || v_col_name);
        DBMS_OUTPUT.PUT_LINE('  col_type: ' || v_ret_data_type); 
        DBMS_OUTPUT.PUT_LINE('  col_count: ' || v_count);
        DBMS_OUTPUT.PUT_LINE('  col_distinct: ' || v_distinct);
        DBMS_OUTPUT.PUT_LINE('  col_max: ' || v_max);
        DBMS_OUTPUT.PUT_LINE('  col_min: ' || v_min);
        DBMS_OUTPUT.PUT_LINE('  col_avg: ' || v_avg);
        DBMS_OUTPUT.PUT_LINE('  col_std: ' || v_std);
        DBMS_OUTPUT.PUT_LINE('  col_sum: ' || v_sum);
        DBMS_OUTPUT.PUT_LINE('  col_sum_sq: ' || v_sum_sq);
        DBMS_OUTPUT.PUT_LINE('  col_freq: ' || v_freq);
        DBMS_OUTPUT.PUT_LINE('  col_missing: ' || v_missing);
        DBMS_OUTPUT.PUT_LINE('---');
    END LOOP;
END;
"""

## Cell 3: Core Data Types and Classes

In [None]:
class Timer:
    """Context manager that measures elapsed wall-clock time.

    The clock (``time.perf_counter``) starts when the ``with`` block is
    entered; ``time`` and ``pause`` may be read at any point afterwards,
    including after the block has exited.
    """

    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        # Nothing to release; returning None lets any exception propagate.
        return None

    @property
    def time(self):
        """Seconds elapsed since the timer was started or last paused."""
        return time.perf_counter() - self.start

    def pause(self):
        """Return the elapsed seconds and restart the clock from now."""
        elapsed = self.time
        self.start = time.perf_counter()
        return elapsed

    @staticmethod
    def to_str(value):
        """Render a duration in seconds as 'H hours M minutes S seconds'.

        The value is rounded to whole seconds *before* it is split, so float
        input no longer prints '0.0 hours 2.0 minutes' and 59.6 s becomes
        '1 minutes 0 seconds' rather than '0 minutes 60 seconds'. Integer
        input renders exactly as before.
        """
        whole_seconds = int(round(value))
        hours, remainder = divmod(whole_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f'{hours} hours {minutes} minutes {seconds} seconds'

@dataclass
class MetaOut:
    """Per-platform metadata record (one instance each for PCDS and AWS)."""
    col2COL: dict
    col2type: dict
    infostr: str
    rowvar: str
    rowexclude: list
    rowtype: str
    nrows: int
    where: str

    def update(self, **kwargs):
        """Overwrite declared fields from *kwargs*; unknown keys are ignored."""
        declared = [spec.name for spec in fields(self)]
        for key in kwargs:
            if key not in declared:
                continue
            setattr(self, key, kwargs[key])

@dataclass(init=False)
class MetaJSON:
    """Container for metadata from previous meta analysis step.

    Built from one flat JSON record. Keys that match a field name are stored
    directly; the ``pcds_*`` / ``aws_*`` keys are folded into two ``MetaOut``
    objects stored on ``pcds`` and ``aws``.

    Raises:
        KeyError: if any of the ``<platform>_dt``, ``_tbl``, ``_where``,
            ``_nrows``, ``_cols``, ``_types``, ``_dt_type`` or ``_exclude``
            keys is missing for either platform.
    """
    aws: MetaOut
    pcds: MetaOut
    last_modified: str
    partition: TPartition = 'whole'
    tokenised_cols: list = field(default_factory=list)

    def __init__(self, **kwargs):
        from dataclasses import MISSING  # local: the file imports only dataclass/field/fields

        specs = fields(self)

        # With init=False the dataclass machinery never applies defaults, and a
        # default_factory field has no class-level fallback at all, so
        # `tokenised_cols` used to be missing whenever the JSON omitted it.
        for spec in specs:
            if spec.default_factory is not MISSING:
                setattr(self, spec.name, spec.default_factory())
            elif spec.default is not MISSING:
                setattr(self, spec.name, spec.default)

        field_names = {spec.name for spec in specs}
        for k, v in kwargs.items():
            if k in field_names:
                setattr(self, k, v)

        #>>> Build MetaOut objects for PCDS and AWS <<<#
        def col2col(a_str, b_str, sep=SEP):
            # Pair the i-th entry of a_str with the i-th entry of b_str;
            # zip silently stops at the shorter list.
            return dict(zip(a_str.split(sep), b_str.split(sep)))

        for key, other in [('pcds', 'aws'), ('aws', 'pcds')]:
            out = MetaOut(
                rowvar=kwargs['%s_dt' % key],
                infostr=kwargs['%s_tbl' % key],
                where=kwargs['%s_where' % key],
                nrows=kwargs['%s_nrows' % key],
                # this platform's column name -> the other platform's name
                col2COL=col2col(kwargs['%s_cols' % key], kwargs['%s_cols' % other]),
                # this platform's column name -> its data type
                col2type=col2col(kwargs['%s_cols' % key], kwargs['%s_types' % key]),
                rowtype=kwargs['%s_dt_type' % key],
                rowexclude=kwargs['%s_exclude' % key]
            )
            setattr(self, key, out)

@dataclass
class CSMeta:
    """Identifying metadata attached to one column-statistics comparison."""
    pcds_table: str
    aws_table: str
    partition: TPartition
    vintage: str
    pcds_time: int
    aws_time: int

    def todict(self):
        """Return a shallow ``{field name: current value}`` dict in field order."""
        snapshot = {}
        for spec in fields(self):
            snapshot[spec.name] = getattr(self, spec.name)
        return snapshot

@dataclass
class CSResult:
    """Results from column statistics comparison.

    NOTE(review): nothing in this chunk constructs a CSResult; the field notes
    below are inferred from the names and from what
    ColumnComparator.compare_statistics returns — confirm at the call site.
    """
    pcds_stats: pd.DataFrame  # presumably the per-column statistics table for PCDS
    aws_stats: pd.DataFrame   # presumably the per-column statistics table for AWS
    miss_columns: set         # presumably the names of columns whose statistics differ
    miss_details: dict        # presumably column name -> the differing statistics
    meta_data: CSMeta         # tables, partition, vintage and timings of this comparison

@dataclass
class SQLRecord:
    """Running totals for one analysed table, accumulated across update() calls."""
    name: str
    unmatched: set = field(default_factory=set)
    nrow: int = 0
    ncol: int = 0
    pcds_time: int = 0
    aws_time: int = 0

    def update(self, **kwargs):
        """Merge new results into the record.

        ``unmatched`` is unioned with the given name(s); ``nrow``,
        ``pcds_time`` and ``aws_time`` are added to the current value; any
        other field is overwritten.

        Raises:
            AttributeError: if a key is not an existing attribute.
        """
        for k, v in kwargs.items():
            old_v = getattr(self, k)  # also rejects unknown keys
            if k == 'unmatched':
                # Accept any iterable of names (the old `|=` required a set);
                # a bare string is one name, not a sequence of characters.
                self.unmatched.update({v} if isinstance(v, str) else v)
            elif k in ('nrow', 'pcds_time', 'aws_time'):
                setattr(self, k, old_v + v)
            else:
                setattr(self, k, v)

    def toJSON(self):
        """Return the record as a dict of report-ready strings."""
        return {
            'Column Stats UnMatch': 'Yes' if len(self.unmatched) > 0 else 'No',
            # sorted: a set has no stable order, which made reports differ between runs
            'Stats UnMatch Details': SEP.join(sorted(self.unmatched)),
            'Compared Dataset Shape': f'Row({self.nrow}) : Col({self.ncol})',
            'Execution Time': 'PCDS({}) : AWS({})'.format(Timer.to_str(self.pcds_time), Timer.to_str(self.aws_time))
        }

## Cell 4: Configuration Reading

In [None]:
# Configuration classes would normally be loaded from confection
# For this notebook, we'll define minimal versions

@dataclass
class CSInput:
    """Input configuration.

    NOTE(review): no code in this chunk reads these fields; the notes are
    inferred from the names — confirm against the config loader.
    """
    name: str      # label for this analysis run (cf. the --name CLI option)
    step: str      # presumably the name of the current pipeline step
    prev: str      # presumably the name of the preceding step
    env: UPath     # presumably the environment file passed to load_env
    folder: UPath  # presumably the input directory
    csv: UPath     # presumably the input CSV
    json: UPath    # presumably the metadata JSON read by IO.read_meta_json

@dataclass
class MetaCSV:
    """CSV configuration: a file location plus its column list."""
    file: UPath    # path of the CSV file
    columns: list  # presumably the column names of that file — TODO confirm

@dataclass
class CSOutput:
    """Output configuration: one destination per artefact type.

    NOTE(review): no code in this chunk writes to these paths; the notes are
    inferred from the names.
    """
    folder: UPath  # presumably the output directory
    pkl: UPath     # presumably a pickle of the results
    csv: MetaCSV   # CSV destination together with its column list
    xlsx: UPath    # presumably the Excel report
    json: UPath    # presumably the JSON summary

@dataclass
class ColumnConfig:
    """Column exclusion configuration."""
    pii_cols: list[str]    # presumably PII column names to exclude — TODO confirm
    token_cols: list[str]  # presumably tokenised column names to exclude — TODO confirm

@dataclass
class CSCompare:
    """Comparison configuration."""
    n_process: int         # presumably the worker count for the AWS analysis — TODO confirm
    drop_na: bool          # NOTE(review): not referenced in this chunk; meaning unverified
    exclude: ColumnConfig  # columns to leave out of the comparison

@dataclass
class CSConfig:
    """Main configuration: groups the input, output and comparison sections."""
    input: CSInput      # where the run reads from
    output: CSOutput    # where the run writes to
    compare: CSCompare  # how the comparison is performed

#--- Parse command line arguments and load configuration ---#
def parse_config() -> CSConfig:
    """Parse command-line options and load the run configuration.

    Only ``--name`` is recognised. Unknown arguments are ignored rather than
    rejected because, inside a notebook, ``sys.argv`` holds the kernel's own
    arguments (e.g. ``-f <connection file>``) and ``parse_args()`` would end
    the session with ``SystemExit``.

    Raises:
        NotImplementedError: always — loading the config file is still a stub.
    """
    parser = argparse.ArgumentParser(description='Conduct Column Stats Analysis')
    parser.add_argument(
        '--name', type=str,
        default='test_debug',
        help='how to name this analysis (override)'
    )
    args, _unknown = parser.parse_known_args()

    # Load configuration from file
    # Implementation depends on your config file format
    raise NotImplementedError("Implement config loading")

## Cell 5: Utility Functions

In [None]:
#--- Start logging session ---#
def start_run():
    """Log two blank lines followed by a rule of WIDTH '=' characters."""
    rule = '=' * WIDTH
    logger.info('\n\n' + rule)

#--- End logging session ---#
def end_run():
    """Log the closing banner: two blank lines and a rule of WIDTH '=' characters."""
    rule = '=' * WIDTH
    logger.info('\n\n' + rule)

#--- Load environment variables ---#
def load_env(file):
    """Load variables from the dotenv *file*; does nothing unless running on Windows."""
    if not inWindows:
        return
    # Imported lazily so python-dotenv is only required on Windows.
    from dotenv import load_dotenv
    load_dotenv(file)

#--- Renew AWS credentials ---#
def aws_creds_renew(seconds=0):
    """Placeholder for AWS credential renewal; currently a no-op.

    *seconds* is accepted but not used by this stub.
    """
    # Implement AWS credential renewal
    return None

#--- Check if statistics DataFrame is empty ---#
def is_stat_empty(df: pd.DataFrame):
    """Return whether the 'col_count' column of *df* sums to zero."""
    total_count = df['col_count'].sum()
    return total_count == 0

class IO:
    """File I/O utilities.

    JSON files are always read and written as UTF-8. Without an explicit
    encoding ``open`` falls back to the platform default (e.g. cp1252 on
    Windows), which breaks on files containing non-ASCII text.
    """

    @staticmethod
    def write_json(file, data, cls=None):
        """Serialise *data* to *file* as indented JSON; *cls* is an optional JSONEncoder subclass."""
        with open(file, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, cls=cls)

    @staticmethod
    def read_json(file):
        """Return the parsed content of the JSON *file*."""
        with open(file, 'r', encoding='utf-8') as fp:
            return json.load(fp)

    @staticmethod
    def read_meta_json(json_file):
        """Read metadata JSON and convert each top-level entry to a MetaJSON object."""
        data = IO.read_json(json_file)
        return {k: MetaJSON(**v) for k, v in data.items()}

    @staticmethod
    def delete_file(file):
        """Delete *file* if it exists; a missing file is not an error."""
        if (filepath := UPath(file)).exists():
            filepath.unlink()

class S3:
    """Helpers for writing data to S3."""

    @staticmethod
    def upload(df: pd.DataFrame, s3_url: UPath):
        """Serialise *df* to parquet in memory (index dropped) and write the bytes to *s3_url*."""
        import io

        aws_creds_renew()
        with io.BytesIO() as buffer:
            df.to_parquet(buffer, index=False)
            buffer.seek(0)
            payload = buffer.getvalue()
        UPath(s3_url).write_bytes(payload)
        logger.info(f"Uploading DataFrame to {s3_url} [finished]")

## Cell 6: Date Handling Functions

In [None]:
#--- Calculate ISO week date range ---#
def get_iso_week_dates(year, week):
    """Return ('YYYY-MM-DD', 'YYYY-MM-DD') bounds of week *week* within *year*.

    Weeks run Monday to Sunday and week 1 is the week that contains 1 January.
    Both bounds are clamped to the calendar year, so the first and last week
    may be shorter than seven days.

    NOTE(review): this is not strict ISO-8601 numbering when 1 January falls
    on Friday to Sunday — confirm it agrees with the '%v' week used on the
    AWS side.
    """
    year_start = dt.datetime(year, 1, 1)
    year_end = dt.datetime(year, 12, 31)

    # Monday on or before 1 January
    week_one_monday = year_start - dt.timedelta(days=year_start.weekday())
    week_start = week_one_monday + dt.timedelta(weeks=week - 1)
    week_end = week_start + dt.timedelta(days=6)

    if week_start < year_start:
        week_start = year_start
    if week_end > year_end:
        week_end = year_end

    out_format = '%Y-%m-%d'
    return week_start.strftime(out_format), week_end.strftime(out_format)

#--- Parse date format specification from variable name ---#
def parse_format_date(str_w_format):
    """Split 'name (format)' into its parts.

    Returns the ``re.Match`` whose group 1 is the text before an optional
    trailing parenthesised part and whose group 2 is the content of those
    parentheses (``None`` when absent). Returns ``None`` for an empty string.
    """
    name_then_optional_format = re.compile(r'^(.+?)(?:\s*\(([^)]+)\))?$')
    return name_then_optional_format.match(str_w_format)

#--- Convert date exclusion clause to appropriate SQL format ---#
def parse_exclude_date(exclude_clause):
    """Rewrite a formatted-date exclusion so it compares the raw column.

    Two shapes are recognised (case-insensitively, at the start of the clause):

    1. Oracle: TO_CHAR(EFF_DT, 'YYYY-MM-DD') not in ('2025-01-22','2025-01-24')
       becomes EFF_DT not in (DATE '2025-01-22', DATE '2025-01-24')
    2. Athena: DATE_FORMAT(DATE_PARSE(dw_bus_dt, '%Y%m%d'), '%Y-%m-%d') not in ('2025-01-22', '2025-01-24')
       becomes dw_bus_dt not in ('20250122', '20250124')

    The operator is kept exactly as written. Any other clause is returned
    unchanged.

    Raises:
        ValueError: if an Athena-style date is not in YYYY-MM-DD form.
    """
    #>>> Handle Oracle format <<<#
    p1 = r"TO_CHAR\((?P<col>\w+),\s*'YYYY-MM-DD'\)\s+(?P<op>not in|in)\s+\((?P<dates>.*?)\)"
    if m := re.match(p1, exclude_clause, flags=re.I):
        col, op, dates = m.groups()
        # Each item keeps its quotes, giving an ANSI date literal: DATE '2025-01-22'
        new_dates = ', '.join(f"DATE {date.strip()}" for date in dates.split(','))
        return '%s %s (%s)' % (col, op, new_dates)

    #>>> Handle Athena format <<<#
    p2 = r"DATE_FORMAT\(DATE_PARSE\((?P<col>\w+),\s*'(?P<fmt>%Y%m%d)'\),\s*'%Y-%m-%d'\)\s+(?P<op>not in|in)\s+\((?P<dates>.*?)\)"
    if m := re.match(p2, exclude_clause, flags=re.I):
        col, fmt, op, dates = m.groups()
        # Strip whitespace before the quotes: with ", " separators the old
        # strip("'") left " '2025-01-24" behind and strptime raised.
        new_dates = ', '.join(
            "'%s'" % dt.datetime.strptime(date.strip().strip("'"), '%Y-%m-%d').strftime(fmt)
            for date in dates.split(',')
        )
        return '%s %s (%s)' % (col, op, new_dates)
    return exclude_clause

#--- Build PCDS WHERE clause with date filtering ---#
def get_pcds_where(date_var, date_type, date_partition, date_range, date_format, snapshot=None, exclude_clauses=None):
    """Build the Oracle WHERE predicate for one partition of a PCDS table.

    Args:
        date_var: name of the date column.
        date_type: Oracle type of that column; any type containing 'char' is
            wrapped in TO_DATE(..., date_format).
        date_partition: 'whole', 'year', 'year_month', 'year_week'/'week' or 'daily'.
        date_range: the partition value — 'YYYY', 'YYYY-MM', 'YYYY-Www' or
            'YYYY-MM-DD' respectively.
        date_format: Oracle format mask, used only for character date columns.
        snapshot: when truthy the date filter is skipped and only the
            exclusions are returned.
        exclude_clauses: extra predicates; empty entries are dropped and each
            one is passed through parse_exclude_date.

    Returns:
        A non-empty predicate. '1=1' is returned when nothing restricts the
        rows, so the result is always valid after WHERE.

    Raises:
        ValueError: for an unsupported date_partition (not checked in snapshot mode).
    """
    # Each exclusion gets its own parentheses, as in get_aws_where, so a
    # clause containing OR cannot change the meaning of the AND chain.
    exclusions = [x for x in (exclude_clauses or ()) if x]
    exclude_clause = ' AND '.join('(%s)' % parse_exclude_date(x) for x in exclusions)

    #>>> Handle character-based dates <<<#
    if date_type and 'char' in date_type.lower():  # also covers varchar/varchar2/nchar
        date_var = f"TO_DATE({date_var}, '{date_format}')"

    if snapshot:
        # Previously '' when there were no exclusions, i.e. an empty WHERE.
        return exclude_clause or "1=1"

    # NOTE(review): the upper bounds below are inclusive at midnight of the last
    # day, so rows carrying a time of day on that day are excluded — confirm
    # the date columns hold no time component.
    if date_partition == 'whole':
        base_clause = "1=1"
    elif date_partition == 'year':
        start_dt = f"TO_DATE('{date_range}-01-01', 'YYYY-MM-DD')"
        end_dt = f"TO_DATE('{date_range}-12-31', 'YYYY-MM-DD')"
        base_clause = f"{date_var} >= {start_dt} AND {date_var} <= {end_dt}"
    elif date_partition == 'year_month':
        start_dt = f"TO_DATE('{date_range}', 'YYYY-MM')"
        end_dt = f"LAST_DAY(TO_DATE('{date_range}', 'YYYY-MM'))"
        base_clause = f"{date_var} >= {start_dt} AND {date_var} <= {end_dt}"
    elif date_partition in ('year_week', 'week'):
        year, week = date_range.split('-W')
        start_dt, end_dt = get_iso_week_dates(int(year), int(week))
        base_clause = f"{date_var} >= DATE '{start_dt}' AND {date_var} <= DATE '{end_dt}'"
    elif date_partition == 'daily':
        target_dt = f"TO_DATE('{date_range}', 'YYYY-MM-DD')"
        base_clause = f"{date_var} = {target_dt}"
    else:
        raise ValueError(f"Unsupported partition type: {date_partition}")

    #>>> Add exclusions if provided <<<#
    if exclude_clause:
        return f"({base_clause}) AND ({exclude_clause})"
    return base_clause

#--- Build AWS WHERE clause with date filtering ---#
def get_aws_where(date_var, date_type, date_partition, date_range, date_format, snapshot=None, exclude_clauses=None):
    """Build the Athena WHERE predicate for one partition of an AWS table.

    Args:
        date_var: date column, optionally followed by its parse format in
            parentheses, e.g. 'dw_bus_dt (%Y%m%d)'.
        date_type: Athena type of the column; string/varchar columns are
            wrapped in DATE_PARSE.
        date_partition: 'whole', 'year', 'year_month', 'year_week'/'week' or 'daily'.
        date_range: the partition value, optionally prefixed 'column='. Weeks
            may be written 'YYYY-Www' or 'YYYY-ww'.
        date_format: NOTE(review): for any non-empty date_var this is replaced
            by the format embedded in date_var (None when there is none), so
            the argument is effectively ignored — confirm whether callers rely
            on it before changing this.
        snapshot: when truthy the date filter is skipped and only the
            exclusions are returned.
        exclude_clauses: extra predicates; empty entries are dropped and each
            one is passed through parse_exclude_date.

    Returns:
        A non-empty predicate; '1=1' when nothing restricts the rows.

    Raises:
        AssertionError: if the 'column=' prefix of date_range names a different column.
        ValueError: for an unsupported date_partition (not checked in snapshot mode).
    """
    exclusions = [x for x in (exclude_clauses or ()) if x]
    exclude_clause = ' AND '.join('(%s)' % parse_exclude_date(x) for x in exclusions)

    #>>> Handle variable=value format <<<#
    if '=' in date_range:
        _date_var, date_range = date_range.split('=', 1)
        assert date_var.split()[0] == _date_var, f"Date Variable Should Match: {date_var} vs {_date_var}"

    #>>> Extract format from date_var if present <<<#
    if (m := parse_format_date(date_var)):
        date_var, date_format = m.groups()

    #>>> Handle string/varchar dates that need parsing <<<#
    if date_type and re.match(r'^(string|varchar)', date_type, re.IGNORECASE):
        parse_format = date_format if date_format else '%Y%m%d'
        date_var = f"DATE_PARSE({date_var}, '{parse_format}')"

    if snapshot:
        # Previously '' when there were no exclusions, i.e. an empty WHERE.
        return exclude_clause or "1=1"

    if date_partition == 'whole':
        base_clause = "1=1"
    elif date_partition == 'year':
        base_clause = f"DATE_FORMAT({date_var}, '%Y') = '{date_range}'"
    elif date_partition == 'year_month':
        base_clause = f"DATE_FORMAT({date_var}, '%Y-%m') = '{date_range}'"
    elif date_partition in ('year_week', 'week'):
        # '%Y-%v' renders as 'YYYY-ww' (two-digit week, no 'W'), so the literal
        # must have that shape too. The 'YYYY-ww' input form used to be turned
        # into 'YYYY-Www', which could never match any row.
        if '-W' in date_range:
            year, week = date_range.split('-W')
            week = week.zfill(2)
        else:
            year, week = map(int, date_range.split('-'))
            week = f"{week:02d}"
        base_clause = f"DATE_FORMAT({date_var}, '%Y-%v') = '{year}-{week}'"
    elif date_partition == 'daily':
        base_clause = f"DATE({date_var}) = DATE('{date_range}')"
    else:
        raise ValueError(f"Unsupported partition type: {date_partition}")

    if exclude_clause:
        return f"({base_clause}) AND ({exclude_clause})"
    return base_clause

#--- Query database to get available vintages (time periods) ---#
def get_vintages_from_data(info_str, date_var, date_type, date_format, partition_type, where_clause="1=1"):
    """
    Placeholder: look up the time partitions available in the database.

    Intended to return vintage strings (e.g., '2024', '2024-01', '2024-W01')
    in reverse chronological order for the given partition type. Not
    implemented yet — every argument is ignored and None is returned.
    """
    # Implementation depends on your SQL engines
    # This would query the database to find available date ranges
    return None

## Cell 7: SQL Engine and Database Connections

In [None]:
import pandas.io.sql as psql

class SQLengine:
    """SQL query engine for PCDS and AWS.

    The platform only decides how result column names are normalised:
    upper case for 'PCDS', lower case for anything else.
    """

    def __init__(self, platform: Literal['PCDS', 'AWS']):
        self._platform = platform
        self.reset()

    def reset(self):
        """Clear the cached query context attributes."""
        self._where = None
        self._type = None
        self._date = None
        self._dateraw = None
        self._table = None

    def query(self, query, connection, **query_kwargs):
        """Execute SQL query and return a DataFrame with case-normalised column names.

        Extra keyword arguments are forwarded to ``pandas.io.sql.read_sql_query``.
        """
        df = psql.read_sql_query(query, connection, **query_kwargs)
        if self._platform == 'PCDS':
            df.columns = [x.upper() for x in df.columns]
        else:
            df.columns = [x.lower() for x in df.columns]
        return df

    def execute_PCDS(self, query, service_name):
        """Execute PCDS PL/SQL block and parse DBMS_OUTPUT.

        Raises:
            NotImplementedError: always — the Oracle connection is still a stub.
        """
        from oracledb import STRING, NUMBER
        query_stmt = query  # Cleaned query

        #>>> Connect and execute <<<#
        # with pcds_connect(service_name=service_name) as CONN:
        #     cursor = CONN.cursor()
        #     cursor.callproc("dbms_output.enable", [None])
        #     cursor.execute(query_stmt)
        #     ...
        # Parse output into dictionary
        raise NotImplementedError("Implement PCDS connection")

    def query_PCDS(self, query_stmt: str, service_name: str, **query_kwargs):
        """Execute query on PCDS.

        Raises:
            NotImplementedError: always — the Oracle connection is still a stub.
        """
        # with pcds_connect(service_name=service_name) as CONN:
        #     return self.query(query_stmt, CONN, **query_kwargs)
        raise NotImplementedError("Implement PCDS connection")

    def query_AWS(self, query_stmt: str, **query_kwargs):
        """Execute query on AWS Athena and return the result as a DataFrame.

        A new connection is opened for every call and is now closed again
        once the result has been read, even when the query fails; it used
        to be left open.
        """
        import pyathena as pa
        aws_creds_renew()
        CONN = pa.connect(
            s3_staging_dir="s3://355538383407-us-east-1-athena-output/uscb-analytics/",
            region_name="us-east-1",
        )
        try:
            return self.query(query_stmt, CONN, **query_kwargs)
        finally:
            CONN.close()

# Create global instances
# One shared engine per platform. Note that the ColumnComparator methods in
# this file build their own local SQLengine objects under the same names
# rather than using these globals.
proc_pcds = SQLengine('PCDS')
proc_aws = SQLengine('AWS')

## Cell 8: Column Comparator - Main Analysis Engine

In [None]:
#--- Parse date value with special handling for PCDS format ---#
def parse_date_value(x, in_pcds=False, window=20):
    """Normalise *x* to an AWS_DT_FORMAT date string where possible.

    With ``in_pcds`` set, a 'DD-MM-YYYY HH:MM:SS' string is parsed day-first,
    its time part is dropped, and a year more than *window* years past the
    current year is moved back one century. Anything else goes through
    ``pd.to_datetime``; when that fails the value is returned as ``str(x)``,
    or unchanged when it is NA.

    Raises:
        TypeError: when ``in_pcds`` is set and *x* is not a string. Callers
            (ColumnComparator._values_different) rely on this to fall back to
            numeric comparison, so it must not be swallowed here.
    """
    pcds_stamp = r'^\d{2}-\d{2}-\d{4} \d{2}:\d{2}:\d{2}$'
    is_pcds_stamp = in_pcds and re.match(pcds_stamp, x)

    if not is_pcds_stamp:
        try:
            return pd.to_datetime(x).strftime(AWS_DT_FORMAT)
        except (AttributeError, ValueError, TypeError):
            if pd.isna(x):
                return x
            return str(x)

    day_text = x.split(" ")[0]
    parsed = pd.to_datetime(day_text, format='%d-%m-%Y', dayfirst=True)
    latest_plausible_year = TODAY.year + window
    if parsed.year > latest_plausible_year:
        parsed = parsed.replace(year=parsed.year - 100)
    return parsed.strftime(AWS_DT_FORMAT)

class PsuedoLock:
    """No-op stand-in for a lock, used when work runs sequentially.

    Supports ``with`` like a real lock but acquires nothing and never
    suppresses exceptions.
    """

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return None

class ColumnComparator:
    """Column comparison engine for PCDS and AWS data.

    Collects per-column statistics from both platforms and reports the
    columns whose statistics disagree.

    Class attributes:
        parallel: 'process' or 'thread' — the executor used by
            run_aws_column_analysis when n_jobs is given.
        statistics: raw statistic name (SQL output column) -> report label.
            The order also fixes the column order of the formatted output.
    """

    parallel = 'process'
    statistics = {
        'col_type': 'Type',
        'col_count': 'N_Total',
        'col_distinct': 'N_Unique',
        'col_missing': 'N_Missing',
        'col_max': 'Max',
        'col_min': 'Min',
        'col_avg': 'Mean',
        'col_std': 'Std',
        'col_sum': 'Sum',
        'col_sum_sq': 'Sum_Square',
        'col_freq': 'Frequency'
    }

    def __init__(self):
        self.comparison_results = {}

    def download_pcds_snapshot(self, info_str, columns, where_clause) -> pd.DataFrame:
        """Download the selected columns of a PCDS table.

        Args:
            info_str: 'service.table'.
            columns: iterable of column names to select.
            where_clause: predicate placed after WHERE.

        Errors are logged and re-raised.
        """
        try:
            proc_pcds = SQLengine('PCDS')
            service, table_name = info_str.split('.')
            columns = ', '.join(columns)
            sql_stmt = f"SELECT {columns} FROM {table_name} WHERE {where_clause}"
            logger.info(f"Downloading PCDS table {table_name}")
            return proc_pcds.query_PCDS(sql_stmt, service_name=service)
        except Exception as e:
            logger.error(f"Error in PCDS pulling: {e}")
            raise

    def run_pcds_column_analysis(self, info_str: str, where_clause: str = "1=1") -> pd.DataFrame:
        """Run the PCDS_SQL_COLUMN block on a table and return one row per column.

        Args:
            info_str: 'service.table'.
            where_clause: row filter for the statistics.

        Errors are logged and re-raised.
        """
        try:
            proc_pcds = SQLengine('PCDS')
            service, table_name = info_str.split('.')
            # The predicate is embedded in a PL/SQL string literal, so its
            # single quotes have to be doubled.
            where_clause = where_clause.replace('\'', '\'\'')
            sql_stmt = PCDS_SQL_COLUMN.format(table=table_name, limit=where_clause)
            logger.info(f"Executing PCDS analysis for {table_name}")
            result = proc_pcds.execute_PCDS(sql_stmt, service_name=service)
            return pd.DataFrame(result).T
        except Exception as e:
            logger.error(f"Error in PCDS analysis: {e}")
            raise

    @staticmethod
    def run_aws_single(data_type, column_name, db, table, limit='', lock=None):
        """Run the statistics query for a single AWS column.

        Args:
            data_type: Athena type of the column; selects the numeric or the
                categorical template. Types starting with 'time' are cast to DATE.
            column_name: column to analyse.
            db, table: location of the table.
            limit: WHERE predicate; an empty value means no filter.
            lock: optional context manager held while credentials are
                renewed. The default None used to fail on ``with None``.

        Returns:
            The one-row DataFrame produced by the query.
        """
        continuous_types = ('tinyint', 'smallint', 'integer', 'bigint', 'float', 'double', 'decimal')

        #>>> Handle timestamp columns <<<#
        if re.match('^time', data_type, flags=re.I):
            column_name = 'CAST(%s AS DATE)' % column_name

        #>>> Choose appropriate SQL template <<<#
        lowered_type = data_type.lower()
        is_continuous = any(ct in lowered_type for ct in continuous_types)
        sql_template = AWS_Cont_SQL if is_continuous else AWS_Catg_SQL

        sql_stmt = sql_template.format(
            db=db, table=table, column_name=column_name, data_type=data_type,
            limit=limit or '1=1',  # an empty predicate would leave a bare WHERE
        )

        if lock is None:
            aws_creds_renew()
        else:
            with lock:
                aws_creds_renew()

        proc_aws = SQLengine('AWS')
        return proc_aws.query_AWS(sql_stmt)

    def run_aws_column_analysis(self, info_str: str, columns_info: dict, where_clause: str = "1=1", n_jobs=None) -> pd.DataFrame:
        """Run the column statistics for every column of an AWS table.

        Args:
            info_str: 'database.table'.
            columns_info: column name -> Athena data type.
            where_clause: row filter for the statistics.
            n_jobs: None runs the columns one after another; a number runs
                them on that many workers of the kind named by ``self.parallel``.

        Returns:
            DataFrame indexed by column name. In parallel mode a column whose
            task failed is logged and left out.

        Raises:
            ValueError: if ``self.parallel`` is neither 'thread' nor 'process'
                (previously an UnboundLocalError), or if no column produced a
                result. All errors are logged before being re-raised.
        """
        results = {}
        db_name, table_name = info_str.split('.')
        logger.info(f"Executing AWS analysis for {table_name}")
        worker = ft.partial(self.run_aws_single, db=db_name, table=table_name, limit=where_clause)

        manager = None  # owns the lock shared with worker processes
        try:
            if n_jobs is None:
                #>>> Sequential execution <<<#
                locker = PsuedoLock()
                for col_name, data_type in tqdm(columns_info.items()):
                    results[col_name] = worker(data_type, col_name, lock=locker)
            else:
                #>>> Parallel execution <<<#
                if self.parallel == "thread":
                    executor_class = ThreadPoolExecutor
                    locker = td.Lock()
                elif self.parallel == "process":
                    executor_class = ft.partial(ProcessPoolExecutor, mp_context=mp.get_context('spawn'))
                    manager = mp.Manager()
                    locker = manager.Lock()
                else:
                    raise ValueError(f"Unsupported parallel mode: {self.parallel!r}")

                with executor_class(max_workers=n_jobs) as executor:
                    futures = {}
                    for col_name, data_type in tqdm(columns_info.items()):
                        futures[executor.submit(worker, data_type, col_name, lock=locker)] = col_name

                    for future in tqdm(as_completed(futures), total=len(futures), desc='Processing ... '):
                        col_name = futures[future]
                        try:
                            results[col_name] = future.result()
                        except Exception as e:
                            # Best effort: one failed column must not abort the rest.
                            logger.error(f"Task failed: {e}")

            df = pd.concat(results.values(), keys=results).droplevel(1)
            return df
        except Exception as e:
            logger.error(f"Error in AWS analysis: {e}")
            raise
        finally:
            # The manager runs a helper process; it used to be left running.
            if manager is not None:
                manager.shutdown()

    @staticmethod
    def get_value(row: pd.Series, column: str, is_pcds: bool = False):
        """Extract one statistic from a statistics row in comparable form.

        Returns a float when the value is numeric, a list of
        ``(value, count)`` string tuples for a 'col_freq' string (sorted by
        descending count, then by normalised value), and the raw value otherwise.
        """
        # Deliberate: every falsy value (0, '', None) becomes NaN, so a 0 on
        # one platform and a NULL on the other compare as equal.
        value = row.get(column, np.nan) or np.nan

        #>>> Parse frequency distribution <<<#
        if column == 'col_freq' and isinstance(value, str):
            value = sorted([
                m.groups() for s in value.split('; ')
                if (m := re.search(r'([^(]*)\((\d+)\)', s.strip()))
            ], key=lambda x: (-int(x[1]), parse_date_value(x[0], is_pcds)))

        #>>> Try numeric conversion <<<#
        try:
            return float(value)
        except (ValueError, TypeError):
            return value

    @staticmethod
    def contains_datelike(dtype1, dtype2):
        """Return True if either type name is a date/time type.

        Matches on the prefix, case-insensitively, so parameterised types
        such as 'TIMESTAMP(6)' are recognised as well as plain 'date' and
        'timestamp'.
        """
        return any(
            re.match(r'(date|time)', str(dtype), flags=re.I) is not None
            for dtype in (dtype1, dtype2)
        )

    def compare_statistics(
        self,
        pcds_stats: pd.DataFrame,
        aws_stats: pd.DataFrame,
        column_mapping: dict[str, str],
        tokenised_cols: list
    ) -> tuple[dict, pd.DataFrame, pd.DataFrame]:
        """Compare statistics between PCDS and AWS.

        Args:
            pcds_stats: statistics indexed by PCDS column name.
            aws_stats: statistics indexed by AWS column name.
            column_mapping: PCDS column name -> AWS column name.
            tokenised_cols: PCDS columns to leave out of the comparison;
                names that are not present are ignored.

        Returns:
            ``(results, pcds_table, aws_table)``. ``results`` holds
            'mismatched_columns', 'mismatched_details', 'total_columns' and
            'matched_columns'. The two tables contain the mapped columns, in
            mapping order, with report labels as column headers.

        Raises:
            KeyError: if a mapped column is missing from either statistics table.
        """
        mismatched_columns = set()
        mismatched_details = {}

        #>>> Align columns based on mapping <<<#
        aligned_pcds = (
            pcds_stats
            .drop(index=tokenised_cols, errors='ignore')
            .rename(index=column_mapping)
        )
        aligned_aws = aws_stats

        #>>> Compare common columns <<<#
        common_columns = set(aligned_pcds.index) & set(aligned_aws.index)
        for column in sorted(common_columns):  # sorted: stable log order
            pcds_row = aligned_pcds.loc[column]
            aws_row = aligned_aws.loc[column]
            column_diffs = {}

            for stat, name in self.statistics.items():
                if stat == 'col_type':
                    continue

                #>>> Skip date frequency comparison <<<#
                if stat == 'col_freq' and self.contains_datelike(
                    pcds_row['col_type'], aws_row['col_type']
                ):
                    continue

                pcds_val = self.get_value(pcds_row, stat, True)
                aws_val = self.get_value(aws_row, stat)

                if self._values_different(pcds_val, aws_val):
                    column_diffs[name] = {'pcds': pcds_val, 'aws': aws_val}

            if column_diffs:
                mismatched_columns.add(column)
                mismatched_details[column] = column_diffs
                logger.warning(f"Mismatch found in column {column}: {column_diffs}")

        #>>> Format output <<<#
        stat_names = list(self.statistics)
        pcds_stats = (
            pcds_stats.loc[list(column_mapping)]
            [stat_names]
            .rename(columns=self.statistics)
        )
        aws_stats = (
            aws_stats.loc[list(column_mapping.values())]
            [stat_names]
            .rename(columns=self.statistics)
        )

        results = {
            'mismatched_columns': mismatched_columns,
            'mismatched_details': mismatched_details,
            'total_columns': len(common_columns),
            'matched_columns': len(common_columns) - len(mismatched_columns),
        }
        return results, pcds_stats, aws_stats

    @staticmethod
    def _values_different(val1, val2) -> bool:
        """Return True when a PCDS value (*val1*) and an AWS value (*val2*) disagree.

        Rules, in order:
          * frequency lists differ if only one side is a list, if their
            lengths differ, or if any paired element differs;
          * NaN equals NaN, and a PCDS 0 equals an AWS NaN;
          * two numbers are compared with a 1e-6 absolute/relative tolerance;
          * otherwise both sides are normalised as dates and compared, falling
            back to numeric and finally to string comparison.
        """
        #>>> Handle list comparisons (frequency distributions) <<<#
        is_list1, is_list2 = isinstance(val1, list), isinstance(val2, list)
        if is_list1 or is_list2:
            # A list against a scalar used to raise, and zip() silently
            # ignored the extra entries of the longer list.
            if not (is_list1 and is_list2) or len(val1) != len(val2):
                return True
            return any(
                ColumnComparator._values_different(x1, x2)
                for t1, t2 in zip(val1, val2)
                for x1, x2 in zip(t1, t2)
            )

        #>>> Handle NaN values <<<#
        if pd.isna(val1) and pd.isna(val2):
            return False
        if val1 == 0 and pd.isna(val2):
            return False
        if pd.isna(val1) ^ pd.isna(val2):
            return True

        #>>> Two plain numbers: compare directly <<<#
        # Same outcome as before, but no longer dependent on
        # parse_date_value raising TypeError for non-string input.
        numeric = (int, float, np.integer, np.floating)
        if isinstance(val1, numeric) and isinstance(val2, numeric):
            return bool(not np.isclose(float(val1), float(val2), atol=1e-6, rtol=1e-6))

        #>>> Try date comparison <<<#
        try:
            dat1 = parse_date_value(val1, in_pcds=True)
            dat2 = parse_date_value(val2)
            return dat1 != dat2
        except (ValueError, TypeError):
            #>>> Try numeric comparison <<<#
            try:
                num1, num2 = float(val1), float(val2)
                return bool(not np.isclose(num1, num2, atol=1e-6, rtol=1e-6))
            except (ValueError, TypeError):
                #>>> Fall back to string comparison <<<#
                return str(val1) != str(val2)

## Cell 9: Excel Report Generation

In [None]:
class XS:
    """Excel styling helper bound to one xlwings sheet.

    Positions are A1-style addresses (a single cell or a range) that are
    resolved through ``ws.range``.
    """

    def __init__(self, ws: "xw.Sheet"):
        # Only ``ws.range(address)`` is used by this helper.
        self.ws = ws

    @staticmethod
    def _has_value(value) -> bool:
        """Tell whether ``value`` should be written to a cell.

        ``None`` and empty strings/containers mean "leave the cell alone";
        everything else, including ``0`` and ``False``, is a real value.
        """
        if value is None:
            return False
        if isinstance(value, (str, bytes, list, tuple, dict, set)):
            return len(value) > 0
        return True

    def get_color(self, color):
        """Resolve a color name/spec string through ``get_rgb``.

        Non-string input (e.g. an RGB tuple) is returned unchanged.
        """
        if isinstance(color, str):
            return get_rgb(color)
        return color

    def apply_styles(self, pos='A1', value='', font=None, align='left', color='', border=None):
        """Optionally write a value to ``pos`` and style the cell/range.

        Args:
            pos: A1-style address of the target cell or range.
            value: Value to write; ``None`` or an empty string/container
                leaves the current content untouched.
            font: Optional mapping with any of the keys ``family``, ``size``,
                ``color`` and ``bold``.
            align: ``'left'``, ``'right'`` or ``'center'``; any other value
                leaves the horizontal alignment unchanged.
            color: Fill color (name string or RGB tuple); falsy means no fill
                change.
            border: Optional mapping; only the key ``style`` is read.
        """
        # None instead of {} defaults: avoids sharing one dict across calls.
        font = font or {}
        border = border or {}

        cell = self.make_cell(pos)
        if self._has_value(value):
            cell.value = value

        if 'family' in font:
            cell.font.name = font['family']
        if 'size' in font:
            cell.font.size = font['size']
        if 'color' in font:
            cell.font.color = self.get_color(font['color'])
        if 'bold' in font:
            cell.font.bold = font['bold']

        if color:
            cell.color = self.get_color(color)
        if 'style' in border:
            cell.api.Borders.LineStyle = border['style']

        if align == 'right':
            cell.api.HorizontalAlignment = HAlign.xlHAlignRight
        elif align == 'left':
            cell.api.HorizontalAlignment = HAlign.xlHAlignLeft
        elif align == 'center':
            cell.api.HorizontalAlignment = HAlign.xlHAlignCenter

    def make_cell(self, pos='A1', value=None):
        """Return the range at ``pos``, writing ``value`` first if one is given."""
        cell = self.ws.range(pos)
        if self._has_value(value):
            cell.value = value
        return cell

    def write_dataframe(self, df: pd.DataFrame, pos='A1', header=True, index=False):
        """Write ``df`` with its top-left corner at ``pos`` and return that range.

        ``header`` and ``index`` are forwarded to the range's ``options`` call.
        """
        cell = self.make_cell(pos)
        cell.options(index=index, header=header).value = df
        return cell

class ExcelReporter:
    """Excel reporter for column comparison results.

    Used as a context manager: entering starts a visible Excel application
    with a new workbook; leaving saves that workbook to ``workbook_path``.
    The Excel application itself is not closed on exit.
    """

    def __init__(self, workbook_path: str):
        self.workbook_path = UPath(workbook_path)
        self.app = None
        self.wb = None
        # Index of the last sheet written by this reporter (-1: none yet).
        self.ns = -1
        # Zero-based (column, row) anchor used by _highlight_differences.
        self.cx, self.cy = None, None

    def __enter__(self):
        try:
            self.workbook_path.unlink(missing_ok=True)
        except PermissionError:
            # Presumably the file is still open in Excel: close that workbook
            # so the save on exit can overwrite it.
            xw.Book(self.workbook_path).close()
        self.app = xw.App(visible=True, add_book=False)
        self.wb = self.app.books.add()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Saved even when the body raised, so partial results are kept;
        # the exception (if any) still propagates.
        if self.wb:
            self.wb.save(str(self.workbook_path))

    def create_comparison_report(self, comparison_results: "dict[str, dict[str, CSResult]]"):
        """Write the summary sheet followed by one sheet per dataset.

        Args:
            comparison_results: ``{dataset name: {vintage: result}}``.
        """
        self._create_summary_sheet(comparison_results)
        for dataset_name, dataset_data in comparison_results.items():
            self._create_dataset_sheet(dataset_name, dataset_data)

    def _create_summary_sheet(self, comparison_results):
        """Turn the workbook's first sheet into the per-vintage overview."""
        ws = self.wb.sheets[0]
        ws.name = 'SUMMARY'
        ws.range('A1').value = 'Column Statistics Comparison'
        ws.range('A1').font.bold = True
        headers = ['Dataset', 'Vintage', 'Total Columns', 'Matched Columns', 'Mismatched Columns', 'Match Rate %']
        ws.range('A3').value = headers
        ws.range('A3:F3').font.bold = True
        ws.range('A3:F3').color = (200, 200, 200)

        row = 4
        for dataset_name, dataset_data in comparison_results.items():
            for vintage, data in dataset_data.items():
                # One row of pcds_stats per analysed column.
                total_cols = len(data.pcds_stats)
                mismatched = len(data.miss_columns)
                matched = total_cols - mismatched
                match_rate = (matched / total_cols * 100) if total_cols > 0 else 0
                ws.range(f'A{row}').value = [dataset_name, vintage, total_cols, matched, mismatched, f'{match_rate:.1f}%']
                # Green at >= 95 % matched columns, red below.
                if match_rate >= 95:
                    ws.range(f'A{row}:F{row}').color = (200, 255, 200)
                else:
                    ws.range(f'A{row}:F{row}').color = (255, 200, 200)
                row += 1
        ws.autofit()
        self.ns += 1

    def _create_dataset_sheet(self, name: str, result_d: "dict[str, CSResult]"):
        """Create the detail sheet for one dataset.

        For every vintage the sheet gets a header line, the PCDS statistics
        table and, below it, the AWS statistics table. Columns listed in
        ``miss_columns`` are moved to the front and their cells are colored
        by ``_highlight_differences``.
        """
        wb = self.wb
        sheet_name = name.upper()
        try:
            ws = wb.sheets.add(sheet_name, after=wb.sheets[self.ns])
        except ValueError:
            # A sheet with this name already exists: reuse it.
            ws = wb.sheets[sheet_name]
        else:
            # Advance the insertion point so sheets keep the dataset order
            # instead of all being inserted right after SUMMARY.
            self.ns += 1
        ws.clear()
        xs = XS(ws)

        row = 1
        for vintage, data in result_d.items():
            #>>> Write vintage header <<<#
            xs.make_cell(pos=f'A{row}', value='Vintage: ')
            xs.apply_styles(pos=f'B{row}', value=vintage, align='right')
            ws.range(f'B{row}:D{row}').merge()
            xs.apply_styles(f'A{row}:D{row}', font={'bold': True}, color=(190, 190, 190))
            row += 2

            #>>> Write PCDS statistics <<<#
            pcds_tbl = data.meta_data.pcds_table.split('.')[-1]
            xs.make_cell(pos=f'A{row}', value='PCDS: ')
            xs.apply_styles(pos=f'B{row}', value=pcds_tbl, align='right')
            ws.range(f'B{row}:D{row}').merge()
            xs.apply_styles(f'A{row}:D{row}', font={'bold': True}, color=(240, 240, 240))
            row += 1

            #>>> Reorder columns to show mismatches first <<<#
            aws_view = data.aws_stats.T.map(self._format_cell_value)
            pcds_view = data.pcds_stats.T.map(self._format_cell_value)
            indices = [i for i, x in enumerate(aws_view) if x in data.miss_columns]
            flagged = set(indices)
            the_rest = [i for i in range(len(aws_view.columns)) if i not in flagged]
            # Positional selection: the same order is applied to both views
            # and stays correct even if column labels repeat.
            order = indices + the_rest
            aws_view = aws_view.iloc[:, order]
            pcds_view = pcds_view.iloc[:, order]

            # Zero-based anchor for highlighting: column C (first data
            # column) and the second data row of the PCDS table.
            self.cx, self.cy = 2, row + 1
            xs.write_dataframe(pcds_view, f'B{row}', index=True)

            #>>> Write AWS statistics <<<#
            row += len(pcds_view) + 2
            xs.make_cell(pos=f'A{row}', value='AWS: ')
            aws_tbl = data.meta_data.aws_table.lower()
            xs.apply_styles(pos=f'B{row}', value=aws_tbl, align='right')
            ws.range(f'B{row}:D{row}').merge()
            xs.apply_styles(f'A{row}:D{row}', font={'bold': True}, color=(240, 240, 240))
            row += 1
            xs.write_dataframe(aws_view, f'B{row}', index=True)
            row += len(aws_view) + 3

            self._highlight_differences(ws, nx=len(indices), ny=len(pcds_view) - 1)
            row += 2
        ws.autofit()

    def _highlight_differences(self, ws: "xw.Sheet", nx: int, ny: int):
        """Color compared cells green (equal) or red (different).

        Starting at the zero-based anchor ``(self.cy, self.cx)``, ``ny`` rows
        by ``nx`` columns of the PCDS table are compared with the cells
        ``ny + 4`` rows further down, which is where the AWS table written by
        ``_create_dataset_sheet`` places the same statistic.

        Args:
            ws: Sheet holding both tables.
            nx: Number of leading (mismatched) columns to color.
            ny: Number of statistic rows to color.
        """
        ix, iy = self.cx, self.cy
        for i in range(iy, iy + ny):
            for j in range(ix, ix + nx):
                pcds, aws = ws[i, j], ws[i + ny + 4, j]
                pcds.number_format = '0.00'
                aws.number_format = '0.00'
                if pcds.value == aws.value:
                    pcds.font.color = get_rgb('green')
                    aws.font.color = get_rgb('green')
                else:
                    pcds.font.color = get_rgb('red')
                    aws.font.color = get_rgb('red')

    def _format_cell_value(self, value) -> str:
        """Format one statistics cell for display.

        Missing scalars become ``''``; whole-number floats lose the ``.0``;
        other ints/floats are stringified as is; everything else (strings,
        lists such as frequency distributions, ...) is stringified and cut to
        50 characters followed by ``'...'``.
        """
        # pd.isna() on a list returns an array whose truth value is
        # ambiguous, so only scalars are tested for missingness.
        if pd.api.types.is_scalar(value):
            if pd.isna(value):
                return ''
            if isinstance(value, (int, float)):
                if isinstance(value, float) and value.is_integer():
                    return str(int(value))
                return str(value)
        str_val = str(value)
        return str_val[:50] + '...' if len(str_val) > 50 else str_val

#--- Create comparison report from results dictionary ---#
def create_comparison_report(comparison_results: dict[str, dict[str, CSResult]], output_path: UPath) -> None:
    """Write the Excel comparison report for all datasets.

    Args:
        comparison_results: ``{dataset name: {vintage: result}}``.
        output_path: Destination of the workbook; it is saved when the
            ``ExcelReporter`` context exits.
    """
    with ExcelReporter(output_path) as reporter:
        reporter.create_comparison_report(comparison_results)

## Cell 10: Main Execution Logic

In [None]:
def main() -> None:
    """Main execution function for column statistics analysis.

    For every row of the input metadata CSV the function:

    1. looks up the table's metadata and drops the columns listed in the
       ``compare.exclude`` configuration,
    2. determines the vintages to analyse from the table's partition type,
    3. computes PCDS and AWS column statistics per vintage and compares them,
    4. appends one summary row per dataset to the output CSV.

    After the loop all results are written to the output JSON file and to the
    Excel report.

    Side effects visible here: rebinds the module-level ``record``, sets the
    ``SKIP_SAVE`` environment variable, deletes and re-creates the output CSV
    and may upload a PCDS snapshot to S3.
    """
    global record
    config = parse_config()
    C_out, C_in, C_cmp = config.output, config.input, config.compare
    logger.info('Configuration:\n' + str(config))

    #>>> Load environment and setup <<<#
    load_env(C_in.env)
    # start_setup(C_out)  # Implement based on needs
    start_run()
    # Start from a clean CSV: rows are appended per dataset further down.
    IO.delete_file(C_out.csv.file)
    # Argument is 15 * 60 (presumably a 15-minute interval in seconds — confirm).
    aws_creds_renew(15 * 60)

    #>>> Load metadata from previous meta analysis <<<#
    meta_json = IO.read_meta_json(C_in.json)
    meta_csv = pd.read_csv(C_in.csv)

    HAS_HEADER = False  # CSV header is written once, with the first dataset row
    ALL_RESULT = {}     # {dataset name: {vintage: CSResult}}
    CC = ColumnComparator()

    for i, row in tqdm(meta_csv.iterrows(), desc='Processing ...', total=len(meta_csv)):
        name = row.get('PCDS Table Details with DB Name')
        logger.info(f"Processing dataset: {name}")

        #>>> Load metadata for this table <<<#
        # NOTE(review): if .get() returns None for an unknown name, the
        # attribute access on the next line raises AttributeError — confirm
        # that every CSV row has an entry in the metadata JSON.
        meta_info = meta_json.get(name)
        meta_pcds, meta_aws = meta_info.pcds, meta_info.aws

        #>>> Remove PII and tokenized columns <<<#
        avai_cols = [x for x in meta_pcds.col2COL if x not in C_cmp.exclude]
        meta_pcds.update(
            col2COL={k: meta_pcds.col2COL[k] for k in avai_cols},
            col2type={k: meta_pcds.col2type[k] for k in avai_cols}
        )
        # The AWS side gets the inverse mapping (AWS name -> PCDS name) and
        # only the types of columns that survived the filter above.
        meta_aws.update(
            col2COL={v: k for k, v in meta_pcds.col2COL.items()},
            col2type={
                k: v for k, v in meta_aws.col2type.items()
                if k in meta_pcds.col2COL.values()
            }
        )

        DATA_RESULT = {}
        record = SQLRecord(name=name)

        #>>> Determine time partitions to analyze <<<#
        # 'empty' datasets are skipped entirely: no CSV row is written and the
        # engine reset at the end of the loop body is not reached.
        if (partition := meta_info.partition) == 'empty':
            continue
        elif partition in get_args(TPartition):
            # NOTE(review): '%y-%m-%d' (two-digit year) differs from
            # AWS_DT_FORMAT ('%Y-%m-%d') — confirm this is intended.
            vintages = get_vintages_from_data(
                info_str=meta_aws.infostr,
                date_var=meta_aws.rowvar,
                date_type=meta_aws.rowtype,
                date_format='%y-%m-%d',
                partition_type=partition,
                where_clause=meta_aws.where
            )
        else:
            vintages = ['entire_dataset']

        #>>> Process each vintage/time period <<<#
        for vintage in vintages:
            logger.info(f"Processing vintage: {vintage}")

            #>>> Build WHERE clauses for this vintage <<<#
            pcds_where = get_pcds_where(
                date_var=meta_pcds.rowvar,
                date_type=meta_pcds.rowtype,
                date_partition=partition,
                date_range=vintage,
                date_format='YYYY-MM-DD',
                snapshot=partition == 'snapshot',
                exclude_clauses=[meta_pcds.where, meta_pcds.rowexclude]
            )

            rowvar = meta_aws.rowvar
            # For a real vintage the AWS date range is '<rowvar>=<vintage>',
            # with a trailing parenthesised suffix stripped from rowvar.
            aws_where = get_aws_where(
                date_var=rowvar,
                date_type=meta_aws.rowtype,
                date_partition=partition,
                date_range='%s=%s' % (re.sub(r"\s*\(.*?\)$", "", rowvar), vintage) if vintage != 'entire_dataset' else vintage,
                date_format='%Y%m%d',
                snapshot=partition == 'snapshot',
                exclude_clauses=[meta_aws.where, meta_aws.rowexclude]
            )

            # SKIP_SAVE is 'Y' only for snapshot partitions when SKIP_SNAPSHOT
            # is set to a non-empty value in the environment.
            os.environ['SKIP_SAVE'] = 'N'
            if os.environ.get('SKIP_SNAPSHOT') and partition == 'snapshot':
                os.environ['SKIP_SAVE'] = 'Y'

            #>>> Compute column statistics <<<#
            with Timer() as timer:
                pcds_stats = CC.run_pcds_column_analysis(
                    meta_pcds.infostr, pcds_where
                )
                pcds_time = timer.pause()

            with Timer() as timer:
                aws_stats = CC.run_aws_column_analysis(
                    meta_aws.infostr, meta_aws.col2type, aws_where, C_cmp.n_process
                )
                aws_time = timer.pause()

            # A vintage without statistics on either side is not compared and
            # does not appear in the results.
            if is_stat_empty(pcds_stats) or is_stat_empty(aws_stats):
                continue

            #>>> Compare statistics <<<#
            comparison_result, pcds_stats, aws_stats = CC.compare_statistics(
                pcds_stats, aws_stats, meta_pcds.col2COL, meta_info.tokenised_cols
            )

            # Align both frames row by row: PCDS names in mapping order and
            # their AWS counterparts in the same order.
            # NOTE(review): compare_statistics appears to apply the same
            # ordering before returning — this may be redundant; confirm.
            pcds_stats = pcds_stats.loc[list(meta_pcds.col2COL)]
            aws_stats = aws_stats.loc[[v for _, v in meta_pcds.col2COL.items()]]

            DATA_RESULT[vintage] = CSResult(
                pcds_stats=pcds_stats,
                aws_stats=aws_stats,
                miss_columns=comparison_result['mismatched_columns'],
                miss_details=comparison_result['mismatched_details'],
                meta_data=CSMeta(
                    pcds_table=meta_pcds.infostr,
                    aws_table=meta_aws.infostr,
                    partition=partition,
                    vintage=vintage,
                    pcds_time=pcds_time,
                    aws_time=aws_time,
                )
            )

            #>>> Upload PCDS snapshot if needed <<<#
            if partition == 'snapshot' and len(comparison_result['mismatched_columns']) > 0:
                # Mismatched names are looked up in meta_aws.col2COL, i.e. the
                # AWS -> PCDS mapping built above, to get the PCDS columns.
                df_pcds = CC.download_pcds_snapshot(
                    meta_pcds.infostr,
                    columns=[meta_aws.col2COL[x] for x in comparison_result['mismatched_columns']],
                    where_clause=pcds_where
                )
                # NOTE(review): bucket name is hard-coded here — consider
                # moving it to configuration.
                S3.upload(df_pcds, 's3://355538383407-edpss/pcds_tables/PCDS_{tbl}_snap/{part}.pq'.format(
                    tbl=name.upper(), part='today=%s' % TODAY.strftime(AWS_DT_FORMAT)
                ))
                logger.info(f"Uploading dataset with issues to S3: {name}")

            # Called once per vintage on the same record object.
            record.update(
                unmatched=comparison_result['mismatched_columns'],
                nrow=int(pcds_stats['N_Total'].max()),
                ncol=comparison_result['total_columns'],
                pcds_time=pcds_time,
                aws_time=aws_time
            )
            logger.info(f"Vintage {vintage}: {comparison_result['matched_columns']}/{comparison_result['total_columns']} columns matched")

        ALL_RESULT[name] = DATA_RESULT

        #>>> Write results to CSV <<<#
        with open(C_out.csv.file, 'a+', newline='') as fp:
            writer = csv.DictWriter(fp, fieldnames=C_out.csv.columns)
            if not HAS_HEADER:
                writer.writeheader()
                HAS_HEADER = True
            # Values from the input row win over record fields with the same
            # key, because row_dict is unpacked last.
            row_dict = {k: v for k, v in row.items() if k in C_out.csv.columns}
            writer.writerow({**record.toJSON(), **row_dict})

        #>>> Reset engines <<<#
        proc_aws.reset()
        proc_pcds.reset()

    #>>> Save results to JSON <<<#
    # Only the mismatch lists/details and the metadata are serialised; the
    # statistics frames go to the Excel report below.
    IO.write_json(C_out.json, {
        dataset: {
            vintage: {
                'mismatched_columns': list(data.miss_columns),
                'mismatched_details': data.miss_details,
                'metadata': data.meta_data.todict()
            } for vintage, data in vintages.items()
        } for dataset, vintages in ALL_RESULT.items()
    })

    #>>> Create Excel report <<<#
    create_comparison_report(ALL_RESULT, C_out.xlsx)
    logger.info(f"Excel report created: {C_out.xlsx}")

    end_run()

# Entry point: run the analysis when this code is executed as the main module.
# NOTE(review): in a notebook kernel __name__ is presumably '__main__' too, so
# executing this cell would already start main() — confirm that is intended.
if __name__ == '__main__':
    main()

## Run the Analysis

Uncomment the `main()` call in the cell below to run the full analysis:

In [None]:
# main()