# Meta Analysis Step 3: Comparison and Aggregation

This notebook compares PCDS and AWS metadata and generates final reports.

**Purpose:**
- Load results from Step 1 (PCDS) and Step 2 (AWS)
- Compare column mappings and data types
- Identify mismatches in row counts and date ranges
- Generate comprehensive CSV report
- Create JSON output for column_statistics step
- Upload results to S3 (optional)

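**Inputs:**
- `pcds_summary.json`, `pcds_metadata.json` - PCDS summary and metadata from Step 1 (read from the output folder)
- `aws_summary.json`, `aws_metadata.json` - AWS summary and metadata from Step 2 (read from the output folder)
- Per-table Parquet/JSON files (`column_file`, `date_file`, `meta_file`) at the paths recorded in the two summaries
- Note: the legacy pickle files `pcds_meta_results.pkl` / `aws_meta_results.pkl` are not read by the code in this notebook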

**Outputs:**
- CSV report with comparison results
- JSON file for next pipeline step
- S3 upload (optional)

**Note:** This step only performs comparisons and does not connect to any database.

## Cell 1: Import Required Libraries

In [None]:
import re
import os
import csv
import json
import shutil
import pickle
import argparse
import warnings
import numpy as np
import pandas as pd
import awswrangler as aws
import boto3

from upath import UPath
from loguru import logger
from tqdm import tqdm
from datetime import datetime, timedelta, timezone
from dataclasses import dataclass, field
from configparser import ConfigParser
from confection import Config
from unittest import mock
from enum import Enum
from typing import Literal, Dict, List
from collections import defaultdict, abc

warnings.filterwarnings("ignore", category=UserWarning, message='.*pandas only supports SQLAlchemy connectable.*')

# Note: This notebook uses Parquet format for cross-platform compatibility
# Install pyarrow if needed: pip install pyarrow or conda install -c conda-forge pyarrow
import pyarrow

## Cell 2: Constants and Configuration

In [None]:
# --- Global Constants ---
# Delimiter used when joining several items (columns, type mismatches) into one report cell.
SEP = '; '
# Number of '=' characters in the separator line written to the log.
WIDTH = 80
# Placeholder string for a missing date; not referenced by the code visible in this notebook.
NO_DATE = 'no_date_provided'
# Passed unchanged as `boto3_session` to awswrangler in S3.upload_multiple.
SESSION = None

class PullStatus(Enum):
    """Enumeration for data pull status codes.

    Each member's value is the human-readable label for that status.
    NOTE(review): presumably these labels feed the report's 'Status' column in
    Steps 1/2 - this notebook only reads the status from the summaries; confirm.
    """
    # Table missing on one side
    NONEXIST_PCDS = 'Nonexisting PCDS Table'
    NONEXIST_AWS = 'Nonexisting AWS Table'
    # Date variable missing on one side
    NONDATE_PCDS = 'Nonexisting Date Variable in PCDS'
    NONDATE_AWS = 'Nonexisting Date Variable in AWS'
    # Table present but empty
    EMPTY_PCDS = 'Empty PCDS Table'
    EMPTY_AWS = 'Empty AWS Table'
    # Same text that process_meta reports when no column mapping is supplied
    NO_MAPPING = 'Column Mapping Not Provided'
    SUCCESS = 'Successful Data Access'

## Cell 3: Data Types and Helper Functions

In [None]:
# --- Helper Functions ---
def read_str_lst(lst_str, sep='\n'):
    """Split a delimited string into its non-empty pieces.

    Leading/trailing whitespace of the whole string is stripped before
    splitting on ``sep``; the individual pieces are returned verbatim and
    empty pieces are discarded.
    """
    pieces = lst_str.strip().split(sep)
    return list(filter(None, pieces))

def read_dstr_lst(dct_str, sep='='):
    """Turn newline-separated ``key<sep>value`` lines into a dictionary.

    Each non-empty line is split on the first occurrence of ``sep``; keys and
    values are whitespace-stripped afterwards. A line without ``sep`` makes
    the ``dict`` construction raise ``ValueError``.
    """
    pairs = (entry.split(sep, 1) for entry in read_str_lst(dct_str))
    raw = dict(pairs)
    cleaned = {}
    for key, value in raw.items():
        cleaned[key.strip()] = value.strip()
    return cleaned

@dataclass
class MetaMerge:
    """Results from merging PCDS and AWS column metadata (built by process_merge)."""
    # Upper-cased PCDS column names that found no AWS partner in the merge
    unique_pcds: list
    # Lower-cased AWS column names that found no PCDS partner in the merge
    unique_aws: list
    # Matched columns from both sides, including a boolean `type_match` column
    col_mapping: pd.DataFrame
    # SEP-joined 'pcds_type->aws_type' pairs for matched columns whose types are incompatible
    mismatches: str
    # SEP-joined 'PCDS_COL->aws_col' pairs discovered by prefix matching rather than the documented mapping
    uncaptured: str

@dataclass
class MetaMatch:
    """Column matching configuration.

    The annotations describe the values passed to the constructor; after
    ``__post_init__`` all three attributes hold lists.
    """
    # Newline-separated string on input; list of non-empty lines afterwards
    candidates: str
    # Any iterable on input (for a dict, its keys); list afterwards
    drop_cols: dict
    # Any iterable on input (for a dict, its keys); list afterwards
    add_cols: dict
    
    def __post_init__(self):
        """Normalise the raw constructor values into lists."""
        self.candidates = read_str_lst(self.candidates)
        self.drop_cols = list(self.drop_cols)
        self.add_cols = list(self.add_cols)

#--- Start logging session with separator ---#
def start_run():
    """Write the opening banner (two newlines, then WIDTH '=' characters) at INFO level."""
    banner = '=' * WIDTH
    logger.info('\n\n' + banner)

#--- End logging session with separator ---#
def end_run():
    """Write the closing banner (two newlines, then WIDTH '=' characters) at INFO level."""
    banner = '=' * WIDTH
    logger.info('\n\n' + banner)

## Cell 4: Utility Classes

In [None]:
class IO:
    """File I/O utility class - uses Parquet/JSON for cross-platform compatibility.

    All members are static; the pickle helpers are kept only for legacy files.
    """

    @staticmethod
    def write_dataframe(file, df):
        """Save ``df`` (index included) to ``file`` as snappy-compressed Parquet via pyarrow."""
        file = UPath(file)
        df.to_parquet(file, index=True, engine='pyarrow', compression='snappy')

    @staticmethod
    def read_dataframe(file):
        """Load a DataFrame from the Parquet file at ``file`` via pyarrow."""
        file = UPath(file)
        return pd.read_parquet(file, engine='pyarrow')

    @staticmethod
    def write_json(file, data, cls=None):
        """Write ``data`` to ``file`` as indented JSON.

        Values the standard encoder cannot handle are converted as follows:
        numpy integer/floating/bool scalars -> Python scalars, numpy arrays ->
        lists, sets/frozensets -> lists, scalar missing values (NaN/NaT/NA) ->
        ``null``, dates/datetimes -> ISO-8601 strings.

        Args:
            file: Path accepted by the built-in ``open``.
            data: Object to serialise.
            cls: Optional ``json.JSONEncoder`` subclass forwarded to ``json.dump``.

        Raises:
            TypeError: If a value of an unsupported type is encountered.
        """
        import numpy as np
        import pandas as pd
        import datetime as dt

        def convert(obj):
            # np.bool_ is neither np.integer nor np.floating, so it has to be
            # listed explicitly or boolean flags coming from pandas fail.
            if isinstance(obj, (np.integer, np.floating, np.bool_)):
                return obj.item()
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            if isinstance(obj, (set, frozenset)):
                return list(obj)
            # pd.isna returns an array for array-likes (e.g. a Series); using
            # that in a boolean context raises, so treat it as "not missing"
            # and fall through to the explicit TypeError below.
            try:
                missing = bool(pd.isna(obj))
            except (TypeError, ValueError):
                missing = False
            if missing:
                # Checked before the datetime branch so that NaT becomes null.
                return None
            if isinstance(obj, (dt.datetime, dt.date)):
                return obj.isoformat()
            raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

        with open(file, 'w') as f:
            json.dump(data, f, indent=2, default=convert, cls=cls)

    @staticmethod
    def read_json(file):
        """Read the JSON file at ``file`` and return the decoded object."""
        with open(file, 'r') as fp:
            return json.load(fp)

    @staticmethod
    def write_pickle(file, data):
        """Deprecated: Use write_dataframe or write_json instead"""
        with open(file, 'wb') as f:
            pickle.dump(data, f)

    @staticmethod
    def read_pickle(file):
        """Deprecated: Use read_dataframe or read_json instead"""
        # SECURITY: unpickling executes arbitrary code embedded in the file;
        # only use this on files produced by this pipeline.
        with open(file, 'rb') as fp:
            return pickle.load(fp)

    @staticmethod
    def delete_file(file):
        """Delete file if it exists"""
        if (filepath := UPath(file)).exists():
            filepath.unlink()

class UDict(dict):
    """Case-insensitive dictionary for flexible key matching.

    Lookups (``d[key]``, ``key in d``, ``d.get``) ignore the case of string
    keys. ``update`` writes to an already stored key when one matches
    case-insensitively, so it never creates a second, shadowed entry that
    differs only by case. Non-string keys are compared by plain equality.
    """

    def __getitem__(self, key):
        return super().__getitem__(self._match(key))

    def __contains__(self, key):
        try:
            self._match(key)
        except KeyError:
            return False
        return True

    @staticmethod
    def _fold(key):
        """Return the comparison form of ``key`` (lower-cased when it is a string)."""
        return key.lower() if isinstance(key, str) else key

    def _match(self, key):
        """Return the stored key equal to ``key`` ignoring case; raise KeyError if none."""
        wanted = self._fold(key)
        for k in self:
            if self._fold(k) == wanted:
                return k
        raise KeyError(key)

    def _slot(self, key):
        """Return the stored key matching ``key``, or ``key`` itself when it is new."""
        try:
            return self._match(key)
        except KeyError:
            return key

    def update(self, other=None, **kwargs):
        """Merge ``other`` (mapping or iterable of pairs) and keyword overrides.

        Items from ``other`` may introduce new keys. Keyword arguments may
        only override keys that already exist (matched case-insensitively).

        Raises:
            KeyError: If a keyword argument matches no existing key.
        """
        if other is not None:
            pairs = other.items() if isinstance(other, abc.Mapping) else other
            for k, v in pairs:
                super().__setitem__(self._slot(k), v)
        for k, v in kwargs.items():
            # _match raises KeyError for unknown names; writing to the matched
            # key keeps the override visible to later lookups.
            super().__setitem__(self._match(k), v)

    def get(self, key, default_value=None):
        """Return the value for ``key`` (case-insensitive) or ``default_value``."""
        try:
            return self[key]
        except KeyError:
            return default_value

class Misc:
    """Miscellaneous utility functions"""

    @staticmethod
    def remove_items(input_str, delete_lst):
        """Remove whole-word items from a semicolon-separated string.

        Each entry of ``delete_lst`` is matched literally (regex
        metacharacters are escaped) together with an optional trailing ';'
        and one whitespace character. Trailing ';' / ' ' characters are
        stripped from the result.
        """
        pattern = '|'.join(r'\b%s\b;?\s?' % re.escape(x) for x in delete_lst)
        return re.sub(pattern, '', input_str).rstrip('; ')

    @staticmethod
    def prefix(a, b):
        """Check if either string is prefix of the other"""
        return a.startswith(b) or b.startswith(a)

    @staticmethod
    def common(a, b, use_prefix=False):
        """Map items of ``a`` to matching items of ``b``.

        Exact matches are recorded first. Items of ``a`` without an exact
        match (or every item when ``use_prefix`` is true) are then paired
        with items of ``b`` that share a prefix relation with them and have
        not been claimed yet.

        Returns:
            dict mapping items of ``a`` to the matched item of ``b``.
        """
        result, visited = {}, set()
        prefix_d = defaultdict(list)

        #>>> Build prefix matching dictionary <<<#
        for x in a:
            for y in b:
                if Misc.prefix(x, y):
                    prefix_d[x].append(y)

        #>>> Prioritize exact matches <<<#
        b_items = set(b)
        for x in a:
            if x in b_items and x not in visited:
                result[x] = x
                visited.add(x)

        #>>> Handle prefix matches for remaining items <<<#
        for x in a:
            if x in result and (not use_prefix):
                continue
            # NOTE(review): there is no break here, so one item of `a` claims
            # every unclaimed candidate and ends up mapped to the last one.
            # Kept as-is because the reported mappings depend on it - confirm
            # whether "first unclaimed candidate" was intended.
            for y in prefix_d[x]:
                if y not in visited:
                    result[x] = y
                    visited.add(y)
        return result

    @staticmethod
    def convert2int(a):
        """Convert ``a`` to int; return None when that is not possible.

        Covers None/non-numeric input (TypeError, ValueError, e.g. NaN) and
        infinite floats (OverflowError).
        """
        try:
            return int(a)
        except (TypeError, ValueError, OverflowError):
            return None

    @staticmethod
    def convert2datestr(a):
        """Format a ``datetime`` as 'YYYY-MM-DD'; return any other value unchanged."""
        if isinstance(a, datetime):
            return a.strftime('%Y-%m-%d')
        return a

## Cell 5: S3 Utilities

In [None]:
class S3:
    """AWS S3 utility functions"""

    @staticmethod
    def upload_multiple(s3_url, folder, prefix=''):
        """Upload every file in ``folder`` matching the glob ``<prefix>.*`` to ``s3_url``.

        The pattern requires a literal dot directly after ``prefix``. Each
        file is sent with awswrangler using the module-level ``SESSION`` and
        keeps its file name under ``s3_url``; one INFO line is logged per file.
        """
        local_dir = UPath(folder)
        remote_dir = UPath(s3_url)
        pattern = '%s.*' % prefix
        for path in local_dir.glob(pattern):
            target = remote_dir.joinpath(path.name).as_posix()
            aws.s3.upload(
                local_file=path.as_posix(),
                path=target,
                boto3_session=SESSION
            )
            logger.info(f"Uploading {path.name} to {remote_dir} [finished]")

## Cell 6: Data Type Mapping Functions

In [None]:
#--- Check if PCDS and AWS data types are compatible ---#
def map_pcds_aws(row):
    aws_dtype = row.data_type_aws
    match (pcds_dtype := row.data_type_pcds):
        case 'NUMBER':
            ok_1 = aws_dtype == 'double'
            return ok_1
        case _ if pcds_dtype.startswith('NUMBER'):
            y1 = re.match(r'NUMBER\(\d*,(\d+)\)', pcds_dtype).group(1)
            match = re.match(r'decimal\(\d*,(\d+)\)', aws_dtype)
            return bool(match and match.group(1) == y1)
        case _ if pcds_dtype.startswith('VARCHAR2'):
            return pcds_dtype.replace('VARCHAR2', 'varchar') == aws_dtype
        case _ if pcds_dtype.startswith('CHAR'):
            n = re.match(r'CHAR\((\d+)\)', pcds_dtype).group(1)
            return not (aws_dtype.startswith('VARCHAR') and n != 1)
        case 'DATE':
            ok_1 = aws_dtype == 'date'
            ok_2 = aws_dtype.startswith('timestamp')
            return ok_1 | ok_2
        case _ if pcds_dtype.startswith('TIMESTAMP'):
            return aws_dtype.startswith('timestamp')
        case _:
            s = ">>> Mismatched type on {}\n\tPCDS ({}) ==> AWS ({})"
            logger.info(s.format(row.column_name_aws, pcds_dtype, aws_dtype))
            return False

## Cell 7: Metadata Comparison Functions

In [None]:
#--- Merge and compare PCDS and AWS column metadata ---#
def process_merge(pcds: pd.DataFrame, aws: pd.DataFrame, tokenised_cols: list) -> MetaMerge:
    """
    Check column mapping and variable typing differences.

    Args:
        pcds: PCDS column metadata with ``column_name``, ``data_type`` and
            ``aws_colname`` columns. NOTE: ``aws_colname`` is updated in
            place with the mappings discovered here.
        aws: AWS column metadata with ``column_name`` and ``data_type``.
        tokenised_cols: PCDS column names to exclude from the discovered
            (undocumented) mappings; compared case-insensitively.

    Returns:
        MetaMerge with the unmatched columns of each side, the matched
        columns (plus a boolean ``type_match``), the distinct type
        mismatches and the discovered mappings.
    """
    #>>> Find columns without documented mappings <<<#
    unmapped_pcds = (
        pcds.query('aws_colname != aws_colname')  # NaN != NaN selects missing mappings
        ['column_name'].str.lower().to_list()
    )
    unmapped_aws = (
        aws.query('~column_name.isin(@pcds.aws_colname)')
        ['column_name'].to_list()
    )

    #>>> Use prefix matching to find undocumented pairs <<<#
    # The candidate names are lower-cased above, so the exclusion list has to
    # be compared in lower case as well. A plain string keeps its substring
    # semantics; None is treated as "nothing excluded".
    if isinstance(tokenised_cols, str):
        tokenised = tokenised_cols.lower()
    else:
        tokenised = {str(c).lower() for c in (tokenised_cols or ())}
    map_uncaptured = {
        k.upper(): v
        for k, v in Misc.common(unmapped_pcds, unmapped_aws).items()
        if k not in tokenised
    }
    uncaptured = SEP.join('{}->{}'.format(k, v) for k, v in map_uncaptured.items())

    #>>> Update column mappings with discovered pairs <<<#
    pcds['aws_colname'] = (
        pcds['aws_colname']
        .combine_first(pcds['column_name'].map(map_uncaptured))
    )

    #>>> Merge PCDS and AWS metadata <<<#
    df_match = pd.merge(
        left=pcds, right=aws,
        left_on='aws_colname', right_on='column_name',
        suffixes=['_pcds', '_aws'],
        how='outer', indicator=True
    )

    #>>> Separate unique columns from each platform <<<#
    pcds_cols = ['column_name_pcds', 'data_type_pcds']
    pcds_unique = df_match.query('_merge == "left_only"')[pcds_cols]
    aws_cols = ['column_name_aws', 'data_type_aws']
    aws_unique = df_match.query('_merge == "right_only"')[aws_cols]

    #>>> Check data type compatibility for matched columns <<<#
    merged = (
        df_match.query('_merge == "both"')
        .drop(columns=['aws_colname', '_merge'])
    )
    # DataFrame.apply(axis=1) returns a DataFrame (not a Series) when there
    # are no rows, which cannot be assigned to a single column. Building the
    # flags row by row also works for tables without any matched column.
    flags = [map_pcds_aws(row) for _, row in merged.iterrows()]
    merged['type_match'] = pd.Series(flags, index=merged.index, dtype=bool)
    mismatch_d = (
        merged.loc[~merged['type_match'], ['data_type_pcds', 'data_type_aws']]
        .drop_duplicates()
    )
    mismatched = SEP.join(
        '{}->{}'.format(p, a)
        for p, a in zip(mismatch_d['data_type_pcds'], mismatch_d['data_type_aws'])
    )

    #>>> Normalise the case of the columns left unmatched after the merge <<<#
    unmapped_pcds = pcds_unique['column_name_pcds'].str.upper().to_list()
    unmapped_aws = aws_unique['column_name_aws'].str.lower().to_list()

    return MetaMerge(
        unique_pcds=unmapped_pcds,
        unique_aws=unmapped_aws,
        col_mapping=merged,
        mismatches=mismatched,
        uncaptured=uncaptured
    )

#--- Compare column mappings and total records between PCDS and AWS ---#
def process_meta(pcds_t: dict, aws_t: dict, tokenised_cols: list) -> dict:
    """Compare column metadata and total row counts of one table.

    ``pcds_t`` / ``aws_t`` each provide a ``'column'`` frame (column
    metadata) and a ``'row'`` frame whose first cell is the row count. When
    the PCDS frame has no usable ``aws_colname`` mapping, the lower-cased
    PCDS column name is used instead (written into the frame) and the result
    says so. Returns the report fields plus the matched-column frame under
    ``'col_mapping'``.
    """
    pcds_c = pcds_t['column']
    aws_c = aws_t['column']

    #>>> Fall back to lower-cased PCDS names when no mapping is supplied <<<#
    has_mapping = 'aws_colname' in pcds_c.columns and not pcds_c['aws_colname'].isna().all()
    if has_mapping:
        mapping_note = ""
    else:
        pcds_c['aws_colname'] = pcds_c['column_name'].str.lower()
        mapping_note = "Column Mapping Not Provided"

    profile = process_merge(pcds_c, aws_c, tokenised_cols)
    logger.info(">>> Finish Merging Type Data")

    #>>> Total row counts live in the first cell of each 'row' frame <<<#
    pcds_nrows = int(pcds_t['row'].iloc[0, 0])
    aws_nrows = int(aws_t['row'].iloc[0, 0])

    report = {}
    report['Row UnMatch'] = pcds_nrows != aws_nrows
    report['Row UnMatch Details'] = f"PCDS({pcds_nrows}) : AWS({aws_nrows})"
    report['Type UnMatch Details'] = profile.mismatches
    report['Column Type UnMatch'] = len(profile.mismatches) > 0
    report['PCDS Extra Columns'] = len(profile.unique_pcds) > 0
    report['PCDS Unique Columns'] = SEP.join(profile.unique_pcds)
    report['AWS Extra Columns'] = len(profile.unique_aws) > 0
    report['AWS Unique Columns'] = SEP.join(profile.unique_aws)
    report['Uncaptured Column Mappings'] = mapping_note or profile.uncaptured
    report['col_mapping'] = profile.col_mapping
    return report

#--- Identify specific dates with row count discrepancies ---#
def process_date(cnt_pcds: pd.DataFrame, cnt_aws: pd.DataFrame, pcds_dateraw: str, aws_dateraw: str):
    """Compare per-date row counts and report the dates that disagree.

    Args:
        cnt_pcds: Per-date counts from PCDS; date column named
            ``pcds_dateraw.upper()``, count column ``NROWS`` or ``nrows``.
        cnt_aws: Per-date counts from AWS; date column named
            ``aws_dateraw.lower()``, count column ``nrows``.
        pcds_dateraw: Name of the PCDS date column (any case).
        aws_dateraw: Name of the AWS date column (any case).

    Returns:
        dict with the report fields 'Time Span UnMatch', 'Time Span Variable'
        and 'Time UnMatch Details (PCDS : AWS)', plus 'time_excludes': SQL
        ``not in`` predicates for both platforms (empty dict when there is
        nothing to exclude).
    """
    pcds_dt = pcds_dateraw.upper()
    aws_dt = aws_dateraw.lower()

    # Rename count columns for consistency
    pcds_count_col = 'NROWS' if 'NROWS' in cnt_pcds.columns else 'nrows'
    cnt_pcds = cnt_pcds.rename(columns={pcds_count_col: 'nrows_pcds'})
    cnt_aws = cnt_aws.rename(columns={'nrows': 'nrows_aws'})

    #>>> Merge date-wise row counts <<<#
    df_all = pd.merge(
        left=cnt_pcds,
        right=cnt_aws,
        left_on=pcds_dt,
        right_on=aws_dt,
        suffixes=['_pcds', '_aws'],
        how='outer'
    )

    # A date missing on one side has a NaN count there and therefore also
    # shows up as a mismatch.
    time_mismatch = df_all.query('nrows_pcds != nrows_aws')
    logger.warning("Out of {} days to compare, issues are found on {} days".format(
        len(cnt_aws), len(time_mismatch)
    ))

    # Address the date columns by name instead of by position, so extra
    # columns in either input do not break the report. For each mismatching
    # row take the PCDS date and fall back to the AWS date when it is missing.
    dates = [
        a if pd.isna(p) else p
        for p, a in zip(time_mismatch[pcds_dt], time_mismatch[aws_dt])
    ]

    # Counts turn into floats once the outer merge introduces NaN; convert
    # both sides so the report shows "5 : 7" instead of "5.0 : 7".
    details = '; '.join(
        f"{d} ({Misc.convert2int(n_pcds)} : {Misc.convert2int(n_aws)})"
        for d, n_pcds, n_aws in zip(
            dates, time_mismatch['nrows_pcds'], time_mismatch['nrows_aws']
        )
    )

    #>>> Build SQL to exclude problematic dates <<<#
    time_excludes = {}
    exclude = ','.join("'%s'" % d for d in dates if d)
    if exclude:  # an empty "not in ()" would be invalid SQL
        time_excludes = {
            'pcds_exclude': f'{pcds_dt} not in ({exclude})',
            'aws_exclude': f'{aws_dt} not in ({exclude})',
        }

    return {
        'Time Span UnMatch': len(time_mismatch) > 0,
        'Time Span Variable': f'{pcds_dt} : {aws_dt}',
        'Time UnMatch Details (PCDS : AWS)': details,
        'time_excludes': time_excludes
    }

## Cell 8: Configuration and Setup

In [None]:
#--- Configuration paths (modify as needed) ---#
# Folder that main() reads the Step 1/2 summaries and metadata from, and
# writes the log, CSV and JSON outputs to.
OUTPUT_FOLDER = UPath('files/outputs/meta_analysis')
CSV_OUTPUT = OUTPUT_FOLDER / 'meta_analysis_comparison.csv'
JSON_OUTPUT = OUTPUT_FOLDER / 'meta_analysis_results.json'
# main() uploads to S3_OUTPUT_PATH only when this flag is True.
S3_UPLOAD_ENABLED = False
# Placeholder destination - replace with a real bucket before enabling uploads.
S3_OUTPUT_PATH = 's3://your-bucket/meta_analysis/'

# CSV column headers (passed as `fieldnames` to csv.DictWriter, so this list
# also fixes the column order of the report)
CSV_COLUMNS = [
    'Consumer Loans Data Product',
    'PCDS Table Details with DB Name',
    'Tables delivered in AWS with DB Name',
    'Status',
    'Row UnMatch',
    'Row UnMatch Details',
    'Time Span UnMatch',
    'Time Span Variable',
    'Time UnMatch Details (PCDS : AWS)',
    'Column Type UnMatch',
    'Type UnMatch Details',
    'PCDS Extra Columns',
    'PCDS Unique Columns',
    'AWS Extra Columns',
    'AWS Unique Columns',
    'Uncaptured Column Mappings',
]

# Column filtering for cleanup (applied via Misc.remove_items in main)
DROP_COLS = []  # Columns to remove from PCDS unique list
ADD_COLS = []   # Columns to remove from AWS unique list

## Cell 9: Main Comparison Logic

In [None]:
def _blank_row(table_name, pcds_info, pcds_meta):
    """Return the default CSV row (no findings recorded yet) for one table."""
    return {
        'Consumer Loans Data Product': pcds_meta.get('group', 'N/A'),
        'PCDS Table Details with DB Name': table_name,
        'Tables delivered in AWS with DB Name': pcds_meta.get('aws_tbl', 'N/A'),
        'Status': pcds_info.get('status', 'Unknown'),
        'Row UnMatch': False,
        'Row UnMatch Details': '',
        'Time Span UnMatch': False,
        'Time Span Variable': f"{pcds_meta.get('pcds_dt', 'N/A')} : {pcds_meta.get('aws_dt', 'N/A')}",
        'Time UnMatch Details (PCDS : AWS)': '',
        'Column Type UnMatch': False,
        'Type UnMatch Details': '',
        'PCDS Extra Columns': False,
        'PCDS Unique Columns': '',
        'AWS Extra Columns': False,
        'AWS Unique Columns': '',
        'Uncaptured Column Mappings': '',
    }


def _compare_table(table_name, pcds_info, aws_info, pcds_meta, aws_meta):
    """Compare one table; return ``(csv_row, json_entry)``.

    ``json_entry`` is None when no matched columns are available for the
    next pipeline step (missing files, failed comparison, empty mapping).
    """
    row_result = _blank_row(table_name, pcds_info, pcds_meta)
    json_entry = None

    #>>> Load parquet files <<<#
    pcds_col_file = UPath(pcds_info['column_file'])
    aws_col_file = UPath(aws_info['column_file'])

    if not pcds_col_file.exists() or not aws_col_file.exists():
        logger.warning(f"Missing column files for {table_name}")
        return row_result, None

    pcds_c = IO.read_dataframe(pcds_col_file)
    aws_c = IO.read_dataframe(aws_col_file)

    #>>> Perform metadata comparison <<<#
    try:
        tokenised_cols = pcds_meta.get('tokenised_cols', [])

        # Build dict from metadata
        pcds_t = {'column': pcds_c, 'row': pd.DataFrame([{'nrow': pcds_meta.get('pcds_nrows', 0)}])}
        aws_t = {'column': aws_c, 'row': pd.DataFrame([{'nrow': aws_meta.get('aws_nrows', 0)}])}

        meta_result = process_meta(pcds_t, aws_t, tokenised_cols)
        # The matched-column frame is not a CSV field: take it out BEFORE
        # merging into the row, otherwise csv.DictWriter rejects the row.
        col_mapping_df = meta_result.pop('col_mapping', pd.DataFrame())
        row_result.update(meta_result)

        #>>> Prepare data for next step <<<#
        if not col_mapping_df.empty:
            d = (
                col_mapping_df
                .drop(columns='type_match')
                .apply(lambda x: SEP.join(x.astype(str).tolist()), axis=0)
                .to_dict()
            )
            json_entry = {
                **pcds_meta,
                **aws_meta,
                'pcds_cols': d.get('column_name_pcds', ''),
                'pcds_types': d.get('data_type_pcds', ''),
                'aws_cols': d.get('column_name_aws', ''),
                'aws_types': d.get('data_type_aws', ''),
            }
    except Exception as e:
        # Best effort per table: report the failure and keep going.
        logger.error(f"Error processing metadata for {table_name}: {e}")
        return row_result, None

    #>>> Compare date-wise row counts if available <<<#
    pcds_date_path = pcds_info.get('date_file')
    aws_date_path = aws_info.get('date_file')
    if row_result['Row UnMatch'] and pcds_date_path and aws_date_path:
        pcds_date_file = UPath(pcds_date_path)
        aws_date_file = UPath(aws_date_path)

        if pcds_date_file.exists() and aws_date_file.exists():
            try:
                pcds_d = IO.read_dataframe(pcds_date_file)
                aws_d = IO.read_dataframe(aws_date_file)

                # Load SQL engine info
                pcds_engine = IO.read_json(UPath(pcds_info['meta_file']))['sql_engine']
                aws_engine = IO.read_json(UPath(aws_info['meta_file']))['sql_engine']

                date_result = process_date(pcds_d, aws_d, pcds_engine['dateraw'], aws_engine['dateraw'])
                # 'time_excludes' belongs to the JSON output only, not the CSV row.
                time_excludes = date_result.pop('time_excludes', {})
                row_result.update(date_result)

                if json_entry is not None:
                    json_entry.update(time_excludes)
            except Exception as e:
                logger.error(f"Error processing dates for {table_name}: {e}")

    #>>> Clean up column lists <<<#
    row_result['PCDS Unique Columns'] = Misc.remove_items(
        row_result['PCDS Unique Columns'], DROP_COLS
    )
    row_result['AWS Unique Columns'] = Misc.remove_items(
        row_result['AWS Unique Columns'], ADD_COLS
    )

    logger.info(f"Finished processing {table_name}")
    return row_result, json_entry


def _run_comparison():
    """Load the Step 1/2 results, compare every common table and write the reports."""
    start_run()
    logger.info('Starting Meta Analysis Step 3: Comparison and Aggregation')

    #>>> Locate the files produced by Steps 1 and 2 <<<#
    pcds_summary_path = OUTPUT_FOLDER / 'pcds_summary.json'
    aws_summary_path = OUTPUT_FOLDER / 'aws_summary.json'
    pcds_metadata_path = OUTPUT_FOLDER / 'pcds_metadata.json'
    aws_metadata_path = OUTPUT_FOLDER / 'aws_metadata.json'

    required = [
        (pcds_summary_path, 'PCDS summary', 'Step 1 (meta_analysis_1_pcds.ipynb)'),
        (aws_summary_path, 'AWS summary', 'Step 2 (meta_analysis_2_aws.ipynb)'),
        (pcds_metadata_path, 'PCDS metadata', 'Step 1 (meta_analysis_1_pcds.ipynb)'),
        (aws_metadata_path, 'AWS metadata', 'Step 2 (meta_analysis_2_aws.ipynb)'),
    ]
    for path, label, step in required:
        if not path.exists():
            logger.error(f"{label} not found: {path}")
            logger.error(f"Please run {step} first!")
            return

    pcds_summary = IO.read_json(pcds_summary_path)
    aws_summary = IO.read_json(aws_summary_path)

    logger.info(f"Loaded PCDS summary for {len(pcds_summary)} tables")
    logger.info(f"Loaded AWS summary for {len(aws_summary)} tables")

    #>>> Load metadata from both steps <<<#
    pcds_metadata = IO.read_json(pcds_metadata_path)
    aws_metadata = IO.read_json(aws_metadata_path)

    #>>> Initialize output structures <<<#
    csv_rows = []
    json_output = {}

    #>>> Process each table <<<#
    common_tables = set(pcds_summary.keys()) & set(aws_summary.keys())
    logger.info(f"Found {len(common_tables)} tables in both PCDS and AWS")

    for table_name in tqdm(sorted(common_tables), desc='Comparing tables'):
        logger.info(f"\n>>> Processing {table_name}")
        row_result, json_entry = _compare_table(
            table_name,
            pcds_summary[table_name],
            aws_summary[table_name],
            pcds_metadata.get(table_name, {}),
            aws_metadata.get(table_name, {}),
        )
        csv_rows.append(row_result)
        if json_entry is not None:
            json_output[table_name] = json_entry

    #>>> Write CSV report <<<#
    logger.info(f"\nWriting CSV report to {CSV_OUTPUT}")
    with open(CSV_OUTPUT, 'w', newline='') as fp:
        # extrasaction='ignore': a stray non-report key must not abort the report.
        writer = csv.DictWriter(fp, fieldnames=CSV_COLUMNS, extrasaction='ignore')
        writer.writeheader()
        writer.writerows(csv_rows)

    logger.info(f"CSV report written with {len(csv_rows)} rows")

    #>>> Write JSON output for next step <<<#
    logger.info(f"Writing JSON output to {JSON_OUTPUT}")
    IO.write_json(JSON_OUTPUT, json_output)
    logger.info(f"JSON output written with {len(json_output)} tables")

    #>>> Upload to S3 if enabled <<<#
    if S3_UPLOAD_ENABLED:
        logger.info("Uploading results to S3...")
        try:
            # upload_multiple globs '<prefix>.*'. The outputs are named
            # 'meta_analysis_<something>.<ext>', so the prefix needs the
            # wildcard; a bare 'meta_analysis' matches none of them.
            S3.upload_multiple(
                s3_url=S3_OUTPUT_PATH,
                folder=OUTPUT_FOLDER,
                prefix='meta_analysis_*'
            )
            logger.info("S3 upload completed")
        except Exception as e:
            logger.error(f"S3 upload failed: {e}")

    #>>> Summary statistics <<<#
    logger.info("\n" + "=" * WIDTH)
    logger.info("Summary Statistics:")
    logger.info(f"  Total tables compared: {len(csv_rows)}")

    row_mismatches = sum(1 for r in csv_rows if r['Row UnMatch'])
    type_mismatches = sum(1 for r in csv_rows if r['Column Type UnMatch'])
    time_mismatches = sum(1 for r in csv_rows if r['Time Span UnMatch'])

    logger.info(f"  Tables with row count mismatches: {row_mismatches}")
    logger.info(f"  Tables with type mismatches: {type_mismatches}")
    logger.info(f"  Tables with date range mismatches: {time_mismatches}")

    end_run()
    logger.info("Meta Analysis Step 3 completed successfully!")


def main():
    """Main execution function for comparison and aggregation.

    Creates the output folder, attaches a log file for the duration of the
    run and delegates to ``_run_comparison``. The file sink is removed again
    afterwards so that calling ``main()`` repeatedly (e.g. from a notebook
    cell) does not stack up duplicate log handlers.
    """
    #>>> Setup logging <<<#
    OUTPUT_FOLDER.mkdir(parents=True, exist_ok=True)
    log_file = OUTPUT_FOLDER / 'meta_analysis_3_compare.log'
    sink_id = logger.add(log_file, level='INFO', mode='w')
    try:
        _run_comparison()
    finally:
        logger.remove(sink_id)

# Entry point: run the comparison when this module is executed directly
# (nothing runs when it is imported).
if __name__ == '__main__':
    main()

## Run the Analysis

Uncomment the cell below to run the comparison:

In [None]:
# main()