# Meta Analysis - Table Comparison Between PCDS and AWS

This notebook performs meta-level analysis comparing tables between PCDS (Oracle) and AWS (Athena) databases.
It checks:
- Column mappings and data types
- Row counts and date ranges
- Schema differences

## Cell 1: Import Required Libraries

In [None]:
import re
import os
import csv
import json
import shutil
import pickle
import argparse
import warnings
import numpy as np
import pandas as pd
import pyathena as pa
import pandas.io.sql as psql
import awswrangler as aws
import boto3

from upath import UPath
from loguru import logger
from tqdm import tqdm
from datetime import datetime, timedelta, timezone
from dataclasses import dataclass, field, fields, is_dataclass
from configparser import ConfigParser
from confection import Config
from unittest import mock
from enum import Enum
from typing import Literal, Dict, List
from collections import defaultdict, abc
from dotenv import load_dotenv

warnings.filterwarnings("ignore", category=UserWarning, message='.*pandas only supports SQLAlchemy connectable.*')

## Cell 2: Constants and Configuration

In [None]:
# --- Global Constants ---
SEP = '; '                      # item separator; NOTE(review): not referenced in the visible code
PCDS_DT_FORMAT = 'YYYY-MM-DD'   # Oracle-style date mask; SQLengine's default format for PCDS
AWS_DT_FORMAT = '%Y-%m-%d'      # strftime-style date mask; SQLengine's default format for AWS
TODAY = datetime.now()          # naive local timestamp, captured once when this cell runs
ONEDAY = timedelta(days=1)
WIDTH = 80                      # length of the '=' rule logged by start_run/end_run
NO_DATE = 'no_date_provided'    # sentinel used when no date column is configured for a table
inWindows = os.name == 'nt'     # True when running on Windows
SESSION = None                  # passed as boto3_session to S3 uploads; None here

class PullStatus(Enum):
    """Outcome of a table access attempt; each value is a human-readable label."""
    # Table could not be found on the given platform
    NONEXIST_PCDS = 'Nonexisting PCDS Table'
    NONEXIST_AWS = 'Nonexisting AWS Table'
    # Date variable could not be found on the given platform
    NONDATE_PCDS = 'Nonexisting Date Variable in PCDS'
    NONDATE_AWS = 'Nonexisting Date Variable in AWS'
    # Table has no rows
    EMPTY_PCDS = 'Empty PCDS Table'
    EMPTY_AWS = 'Empty AWS Table'
    NO_MAPPING = 'Column Mapping Not Provided'
    SUCCESS = 'Successful Data Access'

# --- SQL Templates for PCDS (Oracle) ---
# Column names and data types of {table}, read from all_tab_cols.
# NUMBER columns get "(precision,scale)" appended when either is set;
# *CHAR* columns get "(length)" appended.
PCDS_SQL_META = """
select
    column_name,
    data_type || case
    when data_type = 'NUMBER' then 
        case when data_precision is NULL AND data_scale is NULL
            then NULL
        else
            '(' || TO_CHAR(data_precision) || ',' || TO_CHAR(data_scale) || ')'
        end
    when data_type LIKE '%CHAR%'
        then
            '(' || TO_CHAR(data_length) || ')'
        else NULL
    end AS data_type
from all_tab_cols
where table_name = UPPER('{table}')
order by column_id
"""

# Row count of {table} restricted by the {limit} condition.
PCDS_SQL_NROW = """
SELECT COUNT(*) AS nrow FROM {table}
where {limit}
"""

# Row count per value of the {date} expression, restricted by {limit}.
PCDS_SQL_DATE = """
SELECT {date}, count(*) AS NROWS
FROM {table} 
WHERE {limit}
GROUP BY {date}
"""

# --- SQL Templates for AWS (Athena) ---
# Column names and data types of {db}.{table}, read from information_schema.columns.
AWS_SQL_META = """
select column_name, data_type from information_schema.columns
where table_schema = LOWER('{db}') and table_name = LOWER('{table}')
"""

# Row count of {db}.{table} restricted by the {limit} condition.
AWS_SQL_NROW = """
SELECT COUNT(*) AS nrow FROM {db}.{table}
where {limit}
"""

# Row count per value of the {date} expression, restricted by {limit}.
AWS_SQL_DATE = """
SELECT {date}, count(*) AS nrows
FROM {db}.{table} 
WHERE {limit}
GROUP BY {date}
"""

## Cell 3: Exception Classes and Data Types

In [None]:
# --- Custom Exceptions ---
class NONEXIST_TABLE(Exception):
    """Signals that the requested database table/view could not be found."""

class NONEXIST_DATEVAR(Exception):
    """Signals that no usable date-like variable is available."""

# --- Helper Functions for Configuration Reading ---
def read_str_lst(lst_str, sep='\n'):
    """Split a stripped string on ``sep`` and return the non-empty pieces."""
    pieces = lst_str.strip().split(sep)
    # filter(None, ...) drops empty strings, the only falsy value split() yields
    return list(filter(None, pieces))

def read_dstr_lst(dct_str, sep='='):
    """Turn newline-separated ``key<sep>value`` lines into a dict with stripped keys and values."""
    # Split each line on the first separator only, so values may contain it
    raw_pairs = dict(entry.split(sep, 1) for entry in read_str_lst(dct_str))
    cleaned = {}
    for key, value in raw_pairs.items():
        cleaned[key.strip()] = value.strip()
    return cleaned

# --- Base Type Class ---
class BaseType:
    """Base class with logging and nested dataclass support"""
    def __post_init__(self):
        for _field in fields(self):
            if is_dataclass(_field.type):
                field_val = _field.type(**getattr(self, _field.name))
                setattr(self, _field.name, field_val)

    def tolog(self, indent=1, padding=''):
        """Convert dataclass to formatted string for logging"""
        import pprint as pp
        def get_val(x, pad):
            if isinstance(x, BaseType):
                return x.tolog(indent, pad)
            elif isinstance(x, Dict):
                return pp.pformat(x, indent)
            else:
                return repr(x)
        cls_name = self.__class__.__name__
        padding = padding + '\t' * indent
        fields_str = [f'{padding}{k}={get_val(v, padding)}' for k, v in vars(self).items()]
        return f'{cls_name}(\n' + ',\n'.join(fields_str) + '\n)'

# --- Configuration Dataclasses ---
@dataclass
class MetaRange:
    """Range configuration for row selection"""
    start_rows: int | None
    end_rows: int | None

    def __iter__(self):
        yield from [self.start_rows or 1, self.end_rows or float('inf')]

@dataclass
class MetaTable(BaseType):
    """Settings describing which Excel sheet, columns and rows to read."""
    file: UPath
    sheet: str
    skip_rows: int
    select_cols: dict
    select_rows: dict

    def __post_init__(self):
        # select_cols arrives as "key=value" lines and becomes a dict
        column_map = read_dstr_lst(self.select_cols)
        # select_rows is stringified first, then split into one entry per line
        row_filters = read_str_lst(str(self.select_rows))
        self.select_cols = column_map
        self.select_rows = row_filters

@dataclass
class MetaInput(BaseType):
    """Input section of the configuration.

    Inherits ``BaseType.__post_init__``, which builds the dataclass-typed
    fields (``range``, ``table``) from the raw values supplied.
    """
    name: str                   # analysis name; parse_config overrides it from --name
    step: str                   # parse_config names the copied config file '<step>.cfg'
    env: str
    range: MetaRange            # start/end row bounds
    category: Literal['loan', 'dpst']
    clear_cache: bool = True
    table: MetaTable = None     # Excel table settings (the type read_excel_input accepts)

@dataclass
class MetaCSV:
    """CSV output settings: target file and its column names."""
    file: UPath
    columns: str

    def __post_init__(self):
        # columns is supplied as one name per line and stored as a list
        column_names = read_str_lst(self.columns)
        self.columns = column_names

@dataclass
class S3Config:
    """S3 path configuration holding two locations, ``run`` and ``data``."""
    run: UPath
    data: UPath

@dataclass
class LogConfig:
    """Logging settings that can be converted to sink keyword arguments."""
    level: Literal['info', 'warning', 'debug', 'error']
    format: str
    file: str
    overwrite: bool

    def todict(self):
        """Return ``level``/``format``/``sink``/``mode`` keyword arguments for a log sink."""
        # 'w' truncates the log file, 'a' appends to it
        write_mode = 'w' if self.overwrite else 'a'
        return dict(
            level=self.level.upper(),
            format=self.format,
            sink=self.file,
            mode=write_mode,
        )

@dataclass
class NextConfig:
    """Settings for the follow-up step: target file and a field mapping."""
    file: UPath
    fields: str

    def __post_init__(self):
        # fields is supplied as "key=value" lines and stored as a dict
        field_map = read_dstr_lst(self.fields)
        self.fields = field_map

@dataclass
class CacheConfig:
    """Cache configuration (not used in this notebook)"""
    enable: bool
    directory: UPath
    expire_hours: int = None        # None when no expiry is configured
    force_restart: bool = False
    verbose: bool = False

@dataclass
class MetaOutput(BaseType):
    """Output section of the configuration.

    Inherits ``BaseType.__post_init__``, which builds the dataclass-typed
    fields (``csv``, ``to_s3``, ``log``, ``next``, ``cache``) from raw values.
    """
    folder: UPath       # output folder; parse_config creates it and copies the config there
    to_pkl: UPath
    csv: MetaCSV
    to_s3: S3Config
    log: LogConfig
    next: NextConfig
    cache: CacheConfig

@dataclass
class MetaMatch:
    """Column matching settings, normalized to lists after construction."""
    candidates: str
    drop_cols: dict
    add_cols: dict

    def __post_init__(self):
        # candidates is supplied as one entry per line
        self.candidates = read_str_lst(self.candidates)
        # Iterating the supplied containers keeps their keys/items as lists
        self.drop_cols = [*self.drop_cols]
        self.add_cols = [*self.add_cols]

@dataclass
class ColumnMap(BaseType):
    """Column mapping configuration.

    After construction the ``*_col`` / ``*_view`` settings are lists of header
    names with internal whitespace replaced by ``_``. When the first
    ``aws_view`` entry contains ``_+_`` (i.e. was written as ``A + B``),
    ``aws_view`` instead becomes a format string such as ``'{a}.{b}'``.
    ``excludes`` is always a list.
    """
    output: UPath
    input: UPath
    na_str: str
    overwrite: bool
    excludes: list[str]
    pcds_col: str
    aws_col: str
    pcds_view: str
    aws_view: str

    def __post_init__(self):
        def transform(p):
            # Only raw strings are parsed; already-parsed values pass through
            if isinstance(p, str):
                return ['_'.join(c.split()) for c in read_str_lst(p)]
            return p

        self.pcds_col = transform(self.pcds_col)
        self.pcds_view = transform(self.pcds_view)
        self.aws_col = transform(self.aws_col)
        self.aws_view = transform(self.aws_view)

        # Guard against an empty aws_view before looking at its first entry
        if self.aws_view and '_+_' in self.aws_view[0]:
            self.aws_view = '.'.join('{%s}' % x.lower() for x in self.aws_view[0].split('_+_'))

        # A single string is one excluded name (the original referenced an
        # undefined variable here); falsy values mean "no excludes".
        if isinstance(self.excludes, str):
            self.excludes = [self.excludes]
        else:
            self.excludes = list(self.excludes) if self.excludes else []

@dataclass
class MetaConfig(BaseType):
    """Main configuration class.

    All four fields are dataclass-typed, so ``BaseType.__post_init__`` builds
    each of them from the corresponding raw configuration section.
    """
    input: MetaInput
    output: MetaOutput
    match: MetaMatch
    column_maps: ColumnMap

@dataclass
class MetaRecord:
    """Record tracking during processing"""
    next_d: dict = field(default_factory=dict)      # process_pcds_meta may store 'last_modified' here
    col_maps: dict = field(default_factory=dict)
    pull_status: PullStatus = None                  # None until a status is assigned

@dataclass
class MetaMerge:
    """Results from merging PCDS and AWS column metadata"""
    unique_pcds: list
    unique_aws: list
    col_mapping: pd.DataFrame
    mismatches: str
    uncaptured: str
    uncaptured: str

## Cell 4: Configuration Reading Functions

In [None]:
#--- Patch confection library to preserve case sensitivity ---#
def patch_confection():
    """Patch ``confection.get_configparser`` so option names keep their case.

    The patch is installed at most once per process. The original guard,
    ``hasattr(mock_obj, 'is_local')``, inspected a freshly created patcher,
    which only gains that attribute once started, so every call stacked
    another patch. The active patcher is now remembered on the function.
    """
    if getattr(patch_confection, '_patcher', None) is not None:
        return

    def get_configparser(interpolate: bool = True):
        from confection import CustomInterpolation
        config = ConfigParser(
            interpolation=CustomInterpolation() if interpolate else None,
            allow_no_value=True,
        )
        # Returning option names unchanged disables ConfigParser's lower-casing
        config.optionxform = str
        return config

    patcher = mock.patch('confection.get_configparser', wraps=get_configparser)
    patcher.start()
    patch_confection._patcher = patcher

#--- Read configuration file and create config object ---#
def read_config(config_class: BaseType, config_path: None | UPath | str = None, overrides=None):
    """Load a configuration and build ``config_class`` from it.

    Args:
        config_class: class called with the configuration sections as keyword
            arguments; entries of an optional ``root`` section are passed at
            the top level too.
        config_path: path of an existing config file, or the configuration
            text itself.
        overrides: optional mapping of dotted keys to override values.

    Raises:
        TypeError: if ``config_path`` is None.
    """
    if config_path is None:
        raise TypeError('config_path must be a file path or configuration text, not None')
    # A fresh dict per call avoids sharing a mutable default between calls
    overrides = {} if overrides is None else overrides

    patch_confection()
    try:
        from_file = UPath(config_path).is_file()
    except (OSError, ValueError):
        # Inline configuration text may not be a usable file name
        from_file = False

    if from_file:
        config = Config().from_disk(config_path, overrides=overrides)
    else:
        config = Config().from_str(config_path, overrides=overrides)
    return config_class(**{**config.pop('root', {}), **config})

## Cell 5: Utility Classes and Functions

In [None]:
#--- Start logging session with separator ---#
def start_run():
    """Log two blank lines followed by a rule of WIDTH '=' characters."""
    banner = '=' * WIDTH
    logger.info(f'\n\n{banner}')

#--- End logging session with separator ---#
def end_run():
    """Log two blank lines followed by a rule of WIDTH '=' characters."""
    banner = '=' * WIDTH
    logger.info(f'\n\n{banner}')

#--- Load environment variables from file ---#
def load_env(file):
    """Load environment variables from ``file`` via dotenv, on Windows only."""
    if inWindows:
        load_dotenv(file)

#--- Renew AWS credentials from temporary session ---#
def aws_creds_renew(seconds=0, delta=0, force=False, msg='AWS Credential Has Been Updated !'):
    """Placeholder for AWS credential renewal; currently does nothing.

    All parameters are accepted but ignored by this stub. It is called by
    ``SQLengine.query_AWS`` before connecting.
    """
    # Placeholder - implement based on your credential management
    # This would typically fetch temporary credentials from AWS STS
    pass

class IO:
    """Static helpers for JSON / pickle persistence and file removal."""

    @staticmethod
    def write_json(file, data, cls=None):
        """Serialize ``data`` as 2-space-indented JSON into ``file`` (overwriting it)."""
        with open(file, 'w') as sink:
            json.dump(data, sink, indent=2, cls=cls)

    @staticmethod
    def write_pickle(file, data):
        """Pickle ``data`` into ``file`` (overwriting it)."""
        with open(file, 'wb') as sink:
            pickle.dump(data, sink)

    @staticmethod
    def read_json(file):
        """Return the object decoded from the JSON document in ``file``."""
        with open(file, 'r') as source:
            return json.load(source)

    @staticmethod
    def read_pickle(file):
        """Return the object unpickled from ``file``."""
        with open(file, 'rb') as source:
            return pickle.load(source)

    @staticmethod
    def delete_file(file):
        """Remove ``file`` when it exists; otherwise do nothing."""
        target = UPath(file)
        if target.exists():
            target.unlink()

class UDict(dict):
    """Dictionary whose string keys are matched case-insensitively.

    Lookups, membership tests and assignments first try the exact key and then
    fall back to a case-insensitive comparison. Assigning to a key that already
    exists in another case overwrites that entry instead of adding a duplicate.
    Non-string keys are compared as-is. Keys passed to the constructor are
    stored unchanged.
    """

    @staticmethod
    def _fold(key):
        """Return the comparison form of ``key`` (lower-cased for strings)."""
        return key.lower() if isinstance(key, str) else key

    def __getitem__(self, key):
        return super().__getitem__(self._match(key))

    def __setitem__(self, key, value):
        # Reuse the stored spelling of an existing key so the entry is replaced
        try:
            key = self._match(key)
        except KeyError:
            pass
        super().__setitem__(key, value)

    def __contains__(self, key):
        try:
            self._match(key)
            return True
        except KeyError:
            return False

    def _match(self, key):
        """Return the stored key equal to ``key`` ignoring case.

        Raises:
            KeyError: if no stored key matches.
        """
        if super().__contains__(key):
            return key
        wanted = self._fold(key)
        for k in self:
            if self._fold(k) == wanted:
                return k
        raise KeyError(key)

    def update(self, other=None, **kwargs):
        """Update from a mapping or an iterable of pairs, then from ``kwargs``.

        New keys are inserted and existing keys (in any case) are overwritten.
        The original rejected new keyword keys with KeyError via an ``assert``.
        """
        if other is not None:
            pairs = other.items() if isinstance(other, abc.Mapping) else other
            for k, v in pairs:
                self[k] = v
        for k, v in kwargs.items():
            self[k] = v

    def get(self, key, default_value=None):
        try:
            return self[key]
        except KeyError:
            return default_value

class Misc:
    """Miscellaneous utility functions"""

    @staticmethod
    def remove_items(input_str, delete_lst):
        """Remove whole-word items from a ``'; '``-separated string.

        Each item is removed together with a directly following ``;`` and one
        whitespace character; trailing ``';'`` / spaces are stripped from the
        result. Items are escaped, so regex metacharacters in an item (e.g.
        ``.``) match literally rather than as patterns.
        """
        items = list(delete_lst)
        if not items:
            return input_str.rstrip('; ')
        pattern = '|'.join(r'\b%s\b;?\s?' % re.escape(x) for x in items)
        return re.sub(pattern, '', input_str).rstrip('; ')

    @staticmethod
    def prefix(a, b):
        """Return True if either string is a prefix of the other."""
        return a.startswith(b) or b.startswith(a)

    @staticmethod
    def common(a, b, use_prefix=False):
        """Map items of ``a`` to matching items of ``b``.

        Exact matches are recorded first. Items of ``a`` are then paired with
        not-yet-used items of ``b`` that share a prefix relation with them.
        With ``use_prefix=False`` items that already have an exact match are
        skipped in that second pass.

        Returns:
            dict mapping items of ``a`` to the chosen item of ``b``.
        """
        result, visited = {}, set()
        prefix_d = defaultdict(list)

        #>>> Build prefix matching dictionary <<<#
        for x in a:
            for y in b:
                if Misc.prefix(x, y):
                    prefix_d[x].append(y)

        #>>> Prioritize exact matches <<<#
        for x in a:
            if x in b and x not in visited:
                result[x] = x
                visited.add(x)

        #>>> Handle prefix matches for remaining items <<<#
        # NOTE(review): there is no break below, so an item takes the LAST
        # unused candidate and marks all its unused candidates as visited;
        # confirm this is intended before changing it.
        for x in a:
            if x in result and (not use_prefix):
                continue
            for y in prefix_d[x]:
                if y not in visited:
                    result[x] = y
                    visited.add(y)
        return result

    @staticmethod
    def convert2int(a):
        """Return ``int(a)``, or None when the value cannot be converted."""
        try:
            return int(a)
        except (TypeError, ValueError):
            return None

    @staticmethod
    def convert2datestr(a):
        """Format ``datetime`` values as ``YYYY-MM-DD``; return anything else unchanged."""
        if isinstance(a, datetime):
            return a.strftime('%Y-%m-%d')
        return a

## Cell 6: Database Connection and SQL Engine

In [None]:
#--- Connect to PCDS Oracle database ---#
def pcds_connect(service_name, ldap_service='X'):
    """Establish connection to PCDS Oracle database.

    Placeholder: after importing ``oracledb`` it always raises
    NotImplementedError. Callers in this file use the return value as a
    context manager (``with pcds_connect(...) as CONN``).

    Raises:
        NotImplementedError: always, until the connection logic is implemented.
    """
    import oracledb
    # Map service names to connection strings
    # NOTE(review): svc2server is built but not used by this placeholder
    svc2server = {
        'A': 'PBCS21P',
        'B': 'PBCS30P',
        'C': 'PCDS',
        'D': 'PBCS23P',
    }
    # Implement connection logic based on your environment
    # return oracledb.connect(user=usr, password=pwd, dsn=dns_tns)
    raise NotImplementedError("Please implement PCDS connection logic")

class SQLengine:
    """SQL query helper for the PCDS (Oracle) and AWS (Athena) platforms.

    The instance keeps the state of the most recent request: the WHERE clause
    (``_where``), the date expression and its underlying column (``_date`` /
    ``_dateraw``), the date column type (``_type``) and the table found in the
    last cleaned query (``_table``).
    """

    def __init__(self, platform: Literal['PCDS', 'AWS']):
        self._platform = platform
        self.reset()

    def reset(self):
        """Reset internal state"""
        self._where = None
        self._type = None
        self._date = None
        self._dateraw = None
        self._table = None
        self._format = AWS_DT_FORMAT if self._platform == 'AWS' else PCDS_DT_FORMAT

    def extract_var(self, stmt):
        """Return ``(stmt, column)`` where ``column`` is the variable wrapped by ``stmt``.

        Recognizes ``F_G(F_G(col, '...'), '...')`` and ``F_G(col, '...')``
        (function names containing an underscore); otherwise the statement
        itself is the column. The column is upper-cased for PCDS and
        lower-cased for AWS.
        """
        def _extract_var():
            word, time, tagt = r'\w+_\w+', r"'[^']*'", r'[^,]+'
            pattern1 = fr"{word}\({word}\(({tagt}),\s*{time}\),\s*{time}\)"
            pattern2 = fr"{word}\(({tagt}),\s*{time}\)"
            if (m := re.match(pattern1, stmt)):
                return stmt, m.group(1)
            elif (m := re.match(pattern2, stmt)):
                return stmt, m.group(1)
            return stmt, stmt

        date_var, date_raw = _extract_var()
        if self._platform == 'PCDS':
            return date_var, date_raw.upper()
        else:
            return date_var, date_raw.lower()

    def query(self, query, connection, **query_kwargs):
        """Clean ``query``, run it on ``connection`` and return the result DataFrame.

        Column names are upper-cased for PCDS and lower-cased for AWS.
        """
        query = self.clean_query(query)
        df = psql.read_sql_query(query, connection, **query_kwargs)

        #>>> Normalize column names based on platform <<<#
        if self._platform == 'PCDS':
            df.columns = [x.upper() for x in df.columns]
        else:
            df.columns = [x.lower() for x in df.columns]
        return df

    def clean_query(self, query: str):
        """Prepare a SQL statement for execution.

        Records the table of the last ``FROM`` clause in ``_table`` (None when
        the query has no FROM), aliases the date expression in the select list
        as the raw column name unless it already carries an ``AS`` alias, and
        removes lines consisting only of ``where``.
        """
        #>>> Extract table name from query <<<#
        # Searching the reversed text makes the LAST "FROM <table>" win
        table_pattern = r'([\w.]+)\s+MORF\b'
        table_match = re.search(table_pattern, query[::-1], flags=re.I)
        self._table = table_match.group(1)[::-1] if table_match else None

        #>>> Add alias to date column if needed <<<#
        # The original lookahead used '\\s' in a raw string, i.e. a literal
        # backslash, so it never detected an existing alias.
        not_aliased = r'(?!\s+AS\s+\w+)'
        select_part = re.split(r'\b(?:FROM|WHERE)\b', query, flags=re.I)[0]
        if self._date and (match := re.search(
            re.escape(self._date) + not_aliased, select_part, flags=re.I
        )):
            # select_part is a prefix of query, so the span applies to both
            st, ed = match.span()
            query = query[:st] + f'{self._date} as {self._dateraw}' + query[ed:]

        #>>> Remove empty WHERE clauses <<<#
        where_pattern = r'^\s*where\s*$'
        return re.sub(where_pattern, '', query, flags=re.I | re.M)

    def get_where_sql(self, date_var: str, date_type: str, start_dt=None, end_dt=None, where_cstr='') -> str:
        """Build the WHERE condition, store it in ``_where`` and return it.

        Args:
            date_var: date column, optionally followed by ``(format)``.
            date_type: data type of the date column.
            start_dt / end_dt: optional inclusive bounds; missing values are skipped.
            where_cstr: optional extra condition. A parenthesized ``select``
                inside it is executed and replaced by its quoted result.

        Returns:
            The conditions joined with ``AND``; missing and blank conditions
            are dropped, so the result never starts with a dangling ``AND``.
        """
        self._type = date_type

        #>>> Handle subquery in where constraint <<<#
        if not pd.isna(where_cstr) and (m := re.search(r'(?<=\()select.*(?=\))', where_cstr)):
            # NOTE(review): query_PCDS requires a service_name argument that is
            # not supplied here, so this path looks usable for AWS only.
            call_func = getattr(self, 'query_%s' % self._platform)
            rhs = call_func(m.group()).iloc[0, 0]
            if isinstance(rhs, str):
                where_cstr = "%s '%s'" % (where_cstr[:m.start() - 1], rhs)
            else:
                where_cstr = "%s '%s'" % (where_cstr[:m.start() - 1], rhs.strftime('%Y-%m-%d'))

        where_sql = [where_cstr]
        self.get_date_sql(date_var, date_type)

        #>>> Add date range filters <<<#
        if not pd.isna(start_dt):
            start_dt = Misc.convert2datestr(start_dt)
            where_sql.append(f"{self._date} >= '{start_dt}'")
        if not pd.isna(end_dt):
            end_dt = Misc.convert2datestr(end_dt)
            where_sql.append(f"{self._date} <= '{end_dt}'")

        #>>> Convert TO_CHAR comparisons to TO_DATE for PCDS <<<#
        for i, sql_stmt in enumerate(where_sql):
            if self._platform == 'AWS' or pd.isna(sql_stmt):
                continue
            if (match := re.match(r"^TO_CHAR\(([^,]+),\s*'(.*)'\)\s*([><=!]+)\s*'([^']+)'", sql_stmt)):
                a, b, c, d = match.groups()
                where_sql[i] = f"{a} {c} TO_DATE('{d}', '{b}')"

        # Skip missing and blank conditions (the default where_cstr is '')
        self._where = ' AND '.join(
            x for x in where_sql if not pd.isna(x) and str(x).strip()
        )
        return self._where

    @staticmethod
    def get_date_format(date_var):
        """Split ``'col (format)'`` into ``(col, format)``; format is None when absent."""
        pattern = r'^(.+?)(?:\s*\(([^)]+)\))?$'
        date_var, date_format = re.match(pattern, date_var).groups()
        return date_var, date_format

    def get_date_sql(self, date_var: str, date_type: str):
        """Set ``_date`` / ``_dateraw`` to a ``YYYY-MM-DD`` string expression for the column.

        Non-date columns with a format are parsed first (TO_DATE / DATE_PARSE);
        date-like expressions are then rendered with TO_CHAR / DATE_FORMAT.
        Columns that are neither are used as-is.
        """
        date_var, date_format = self.get_date_format(date_var)
        is_date = re.search(r'time|date', date_type, re.IGNORECASE)

        #>>> Parse string dates if format provided <<<#
        if date_format and (not is_date):
            if self._platform == 'PCDS':
                date_var = f"TO_DATE({date_var}, '{date_format}')"
            else:
                date_var = f"DATE_PARSE({date_var}, '{date_format}')"
            is_date = True

        #>>> Convert to standard string format <<<#
        if is_date:
            if self._platform == 'PCDS':
                date_var = f"TO_CHAR({date_var}, 'YYYY-MM-DD')"
            else:
                date_var = f"DATE_FORMAT({date_var}, '%Y-%m-%d')"

        self._date, self._dateraw = self.extract_var(date_var)

    def __repr__(self):
        return f'SQL({self._platform})\n' \
               f'   table: {self._table}\n' \
               f'   where: {self._where}\n' \
               f'   date : {self._date} ({self._dateraw})'

    def query_PCDS(self, query_stmt: str, service_name: str, **query_kwargs):
        """Run ``query_stmt`` on the PCDS service ``service_name`` and return a DataFrame."""
        with pcds_connect(service_name=service_name) as CONN:
            return self.query(query_stmt, CONN, **query_kwargs)

    def query_AWS(self, query_stmt: str, **query_kwargs):
        """Renew AWS credentials, then run ``query_stmt`` on Athena and return a DataFrame."""
        aws_creds_renew()
        CONN = pa.connect(
            s3_staging_dir="s3://355538383407-us-east-1-athena-output/uscb-analytics/",
            region_name="us-east-1",
        )
        return self.query(query_stmt, CONN, **query_kwargs)

## Cell 7: Excel Input Processing and Column Mapping

In [None]:
#--- Read and process Excel input file with table configurations ---#
def read_excel_input(config: MetaTable) -> pd.DataFrame:
    """Read the table list from the configured Excel sheet and normalize it.

    Reads ``config.sheet`` of ``config.file`` (skipping ``config.skip_rows``
    rows, keeping the columns named by the keys of ``config.select_cols``),
    renames the columns to the mapped names, trims strings, applies the
    ``config.select_rows`` filters and merges service and table names.

    Raises:
        Exception: any failure is logged and re-raised unchanged.
    """
    def trim_me(x):
        """Trim whitespace from strings"""
        return x.strip() if isinstance(x, str) else x

    def extract_name(name):
        """Remove parenthetical notes from names"""
        if pd.isna(name):
            return pd.NA
        if not isinstance(name, str):
            return name
        # Greedy match: everything from the first '(' to the last ')' is dropped
        remove_extra = r'\(.*\)'
        return re.sub(remove_extra, '', name).strip()

    def merge_pcds_svc_tbl(df):
        """Combine service and table names into qualified names (modifies ``df`` in place)"""
        with warnings.catch_warnings():
            # df may be a filtered view of the sheet; silence the pandas warning
            warnings.simplefilter('ignore', pd.errors.SettingWithCopyWarning)
            # Date and where columns keep their parentheses; all others are cleaned
            cols = [x for x in df.columns if x not in (
                'group', 'pcds_dt', 'aws_dt', 'pcds_where', 'aws_where'
            )]
            df[cols] = df[cols].map(extract_name)
            tbl = df.pop('pcds_tbl')
            # A missing column-map name defaults to the PCDS table name
            df['col_map'] = df['col_map'].fillna(tbl).copy()
            svc = df.pop('pcds_svc').fillna('no_server_provided')
            # pcds_tbl becomes '<service>.<lower-cased table>'
            df.loc[:, ['pcds_tbl']] = svc + '.' + tbl.str.lower()
            df['pcds_dt'] = df['pcds_dt'].copy().fillna(NO_DATE)
            df['aws_dt'] = df['aws_dt'].copy().fillna(NO_DATE)
            df[['pcds_where', 'aws_where']] = df[['pcds_where', 'aws_where']].replace(np.nan, None)

    file_path = config.file
    try:
        df = pd.read_excel(
            file_path, sheet_name=config.sheet,
            skiprows=config.skip_rows, usecols=list(config.select_cols)
        )
        df = df.rename(columns=config.select_cols).map(trim_me)
        # Row filters are pandas query expressions combined with '&'
        if len(config.select_rows) > 0:
            df = df.query(' & '.join(config.select_rows))
        merge_pcds_svc_tbl(df)
        logger.info(f"Read {len(df)} rows from {file_path}")
        return df
    except Exception as e:
        logger.error(f"Failed to read Excel file {file_path}: {e}")
        raise

class ColmapUtils:
    """Utility class for processing column mapping files"""

    def __init__(self, category: Literal['loan', 'dpst']):
        """Select the sheet reader matching ``category``.

        Raises:
            ValueError: if ``category`` is neither ``'loan'`` nor ``'dpst'``.
        """
        readers = {'dpst': self.process_dpst, 'loan': self.process_loan}
        if category not in readers:
            # Fail here rather than with an AttributeError inside process()
            raise ValueError(f"Unsupported category {category!r}; expected 'loan' or 'dpst'")
        self.category = category
        self._obtain_table = readers[category]

    def is_column_tokenized(self, row):
        """Return True when the row is flagged ``pii_encryption == 'Y'`` or its note mentions 'tokenise'."""
        if row.get('pii_encryption', None) == 'Y':
            return True
        try:
            return 'tokenise' in row.get('note', "").lower()
        except (TypeError, AttributeError):
            # Non-string notes (e.g. missing values) count as not tokenized
            return False

    def process_loan(self, config: ColumnMap):
        """Yield ``(sheet_name, DataFrame)`` for every sheet of the loan mapping file."""
        all_sheets = pd.read_excel(config.input, sheet_name=None)
        yield from all_sheets.items()

    def process_dpst(self, config: ColumnMap):
        """Return the 'Column_Details' sheet grouped by the first ``pcds_view`` column."""
        all_sheets = pd.read_excel(config.input, sheet_name='Column_Details')
        return all_sheets.groupby(config.pcds_view[0])

    def process(self, config: ColumnMap) -> UDict:
        """Build the per-table column mappings, write them as JSON and return them.

        When ``config.output`` exists and ``config.overwrite`` is false, the
        stored JSON is returned instead. Both paths return a ``UDict``.
        """
        if os.path.exists(config.output) and (not config.overwrite):
            # Wrap the cached mapping so both paths return the same type
            return UDict(IO.read_json(config.output))

        table_excludes = config.excludes or []
        mappings = {}

        #>>> Process each table's column mappings <<<#
        for pcds_name, df in self._obtain_table(config):
            if pcds_name in table_excludes:
                continue

            # Loan sheets with a 'Source' column carry their header in the first data row
            if self.category == 'loan' and 'Source' in df.columns:
                df = pd.DataFrame(df.iloc[1:].values, columns=df.iloc[0])

            df = df.rename(columns=self.colfunc)
            pcds2aws, aws2pcds = {}, {}
            # Reset per table so a table without rows cannot reuse (or lack) a value
            aws_name = None
            s = {
                'aws_unique': [],
                'pcds_unique': [],
                'duplicated_pcds': set(),
                'duplicated_aws': set(),
                'pii_cols': set()
            }

            #>>> Build bidirectional column mappings <<<#
            for i, row in enumerate(df.itertuples()):
                row = UDict(**row._asdict())
                # Table names are taken from the first row only
                if i == 0:
                    if self.category == 'loan':
                        pcds_name = self.fetchcol(row, config.pcds_view, config.na_str)
                        aws_name = self.fetchcol(row, config.aws_view, config.na_str)
                    elif self.category == 'dpst':
                        aws_name = config.aws_view.format(**row)

                pcds_col = self.fetchcol(row, config.pcds_col, config.na_str)
                aws_col = self.fetchcol(row, config.aws_col, config.na_str)
                pcds_na, aws_na = pd.isna(pcds_col), pd.isna(aws_col)

                if pcds_na and aws_na:
                    continue
                elif pcds_na:
                    s['aws_unique'].append(aws_col.lower())
                elif aws_na:
                    s['pcds_unique'].append(pcds_col.upper())
                else:
                    pcds_col, aws_col = pcds_col.upper(), aws_col.lower()

                    #>>> Check for duplicate AWS columns <<<#
                    has_dupl, aws_dup = self.get_duplicates(aws_col, pcds_col, aws2pcds, s['aws_unique'])
                    if has_dupl:
                        logger.warning(f'Table {pcds_name} has duplicated AWS column {aws_col}')
                        s['duplicated_aws'] |= aws_dup
                    else:
                        aws2pcds[aws_col] = pcds_col

                    #>>> Check for duplicate PCDS columns <<<#
                    has_dupl, pcds_dup = self.get_duplicates(pcds_col, aws_col, pcds2aws, s['pcds_unique'])
                    if has_dupl:
                        logger.warning(f'Table {pcds_name} has duplicated PCDS column {pcds_col}')
                        s['duplicated_pcds'] |= pcds_dup
                    else:
                        pcds2aws[pcds_col] = aws_col

                if self.is_column_tokenized(row):
                    s['pii_cols'].add(pcds_col)

            if len(pcds2aws) == 0:
                logger.info(f"No match key is found in {pcds_name}")

            # Sets are not JSON serializable; missing PII column names are dropped
            s['duplicated_pcds'] = list(s['duplicated_pcds'])
            s['duplicated_aws'] = list(s['duplicated_aws'])
            s['pii_cols'] = [x for x in s['pii_cols'] if pd.notna(x)]
            mappings[pcds_name] = {
                'pcds_table': pcds_name,
                'aws_table': aws_name,
                'pcds2aws': pcds2aws,
                **s
            }

        IO.write_json(config.output, mappings)
        return UDict(mappings)

    @staticmethod
    def get_duplicates(col_a, col_b, a2b: dict, unique_a: list):
        """Report whether ``col_a`` was already seen with a different partner.

        Returns:
            ``(True, {'a:old', 'a:new'})`` for a conflict, else ``(False, None)``.
        """
        exist_key = set(list(a2b) + unique_a)
        exist_val = a2b.get(col_a, pd.NA)
        if col_a in exist_key and col_b != exist_val:
            return True, {f'{col_a}:{exist_val}', f'{col_a}:{col_b}'}
        return False, None

    @staticmethod
    def colfunc(col):
        """Normalize a header: last line only, lower-cased, whitespace replaced by ``_``.

        Missing headers become ``'comment'``.
        """
        if pd.isna(col):
            return 'comment'
        col = col.split('\n')[-1]
        return '_'.join(x.lower() for x in col.split())

    @staticmethod
    def fetchcol(row, names, na_str):
        """Return the stripped value of the first name in ``names`` that is present,
        not missing and not equal to ``na_str``; ``pd.NA`` when none qualifies."""
        for name in names:
            name = name.lower()
            if (name in row) and (not pd.isna(row[name])) and (row[name] != na_str):
                return row[name].strip()
        return pd.NA

## Cell 8: S3 Utilities

In [None]:
class S3:
    """Helpers for copying local files to S3."""

    @staticmethod
    def upload_multiple(s3_url, folder, prefix=''):
        """Upload every file in ``folder`` matching ``'<prefix>.*'`` below ``s3_url``."""
        source_dir = UPath(folder)
        target_root = UPath(s3_url)
        pattern = '%s.*' % prefix
        for local_path in source_dir.glob(pattern):
            destination = target_root.joinpath(local_path.name)
            aws.s3.upload(
                local_file=local_path.as_posix(),
                path=destination.as_posix(),
                boto3_session=SESSION
            )
            logger.info(f"Uploading {local_path.name} to {target_root} [finished]")

## Cell 9: Main Processing Functions

In [None]:
# Initialize global objects
# One engine per platform; the process_* functions below share their state
# (e.g. process_pcds_date reads the _where/_date set by process_pcds_meta).
proc_pcds = SQLengine('PCDS')
proc_aws = SQLengine('AWS')
# Shared record updated while tables are processed
record = MetaRecord()

#--- Parse command line arguments and load configuration ---#
def parse_config():
    """Parse command-line options, load the matching configuration and return it.

    The config file is chosen by ``--category``; ``--name`` and ``--query``
    override ``input.name`` and ``input.table.select_rows``. The output folder
    is created (including missing parents) and the config file is copied into
    it as ``<step>.cfg``.

    Unrecognized arguments are ignored so the function also works where the
    host adds its own argv entries (e.g. a notebook kernel).
    """
    parser = argparse.ArgumentParser(description='Conduct Meta Info Analysis')
    parser.add_argument(
        '--category',
        choices=['loan', 'dpst'],
        default='dpst',
        help='which meta template to use',
    )
    parser.add_argument(
        '--name', type=str,
        default='test_0827',
        help='how to name this analysis (override)'
    )
    parser.add_argument(
        '--query', type=str,
        default='group == "test_0827"',
        help='pandas query expression selecting the input rows (override)'
    )
    args, ignored = parser.parse_known_args()
    if ignored:
        logger.debug(f"Ignoring unrecognized arguments: {ignored}")

    # argparse restricts --category to these two keys
    config_paths = {
        'dpst': r'files/inputs/config_meta_dpst.cfg',
        'loan': r'files/inputs/config_meta_loan.cfg',
    }
    config_path = config_paths[args.category]

    config = read_config(
        MetaConfig,
        config_path=config_path,
        overrides={
            'input.table.select_rows': args.query,
            'input.name': args.name
        }
    )
    (out_folder := UPath(config.output.folder)).mkdir(parents=True, exist_ok=True)
    shutil.copy(config_path, out_folder.joinpath(f'{config.input.step}.cfg'))
    return config

#--- Normalize date format specifications ---#
def parse_date_format(row):
    """Return a copy of the named-tuple ``row`` with ``pcds_dt`` upper-cased and ``aws_dt`` lower-cased."""
    normalized = {
        'pcds_dt': row.pcds_dt.upper(),
        'aws_dt': row.aws_dt.lower(),
    }
    return row._replace(**normalized)

#--- Process PCDS table metadata (columns and row count) ---#
def process_pcds_meta(row, rename_columns=None):
    """Collect column metadata and the row count of a PCDS table.

    Args:
        row: record with ``pcds_tbl`` (``'<service>.<table>'``), ``pcds_dt``,
            ``pcds_where``, ``start_dt`` and ``end_dt`` attributes.
        rename_columns: optional mapping from PCDS column name to AWS column name.

    Returns:
        ``({'column': <metadata DataFrame>, 'row': <count DataFrame>}, has_mapping)``
        where ``has_mapping`` tells whether ``rename_columns`` was non-empty.

    Raises:
        NONEXIST_TABLE: when a query fails or the date column is not found in
            the table metadata.
    """
    # A fresh dict per call avoids sharing a mutable default between calls
    rename_columns = {} if rename_columns is None else rename_columns
    service, table = (info_str := row.pcds_tbl).split('.', maxsplit=1)
    logger.info(f"\tStart processing {info_str}")

    #>>> Query column metadata and row counts <<<#
    try:
        with pcds_connect(service) as CONN:
            df_type = proc_pcds.query(PCDS_SQL_META.format(table=table), CONN)
            if hasattr(df_type, 'last_modified'):
                record.next_d.update(last_modified=df_type.last_modified)

            #>>> Extract date variable and build WHERE clause <<<#
            date_var = re.match(r'(\w+)(?=\s*\()?', row.pcds_dt).group(1)
            # parse_date_format upper-cases pcds_dt, so compare ignoring case
            if date_var.lower() == NO_DATE:
                proc_pcds._where = row.pcds_where
            else:
                proc_pcds.get_where_sql(
                    date_var=row.pcds_dt,
                    date_type=df_type.query(f"COLUMN_NAME == '{date_var.upper()}'")['DATA_TYPE'].item(),
                    start_dt=row.start_dt,
                    end_dt=row.end_dt,
                    where_cstr=row.pcds_where
                )

            nrow_sql = PCDS_SQL_NROW.format(table=table, limit=proc_pcds._where)
            df_nrow = proc_pcds.query(nrow_sql, CONN)
    except (pd.errors.DatabaseError, ValueError) as err:
        # ValueError comes from .item() when the metadata has no matching row
        logger.warning(f"Couldn't find {table.upper()} in {service.upper()}")
        raise NONEXIST_TABLE("PCDS View Not Existing") from err

    df_type.columns = [x.lower() for x in df_type.columns]
    df_type['aws_colname'] = df_type['column_name'].map(rename_columns)
    return {'column': df_type, 'row': df_nrow}, len(rename_columns) > 0

#--- Query PCDS table for date-wise row counts ---#
def process_pcds_date(row):
    """Run the date-wise row-count query for one PCDS table.

    The SQL is ``PCDS_SQL_DATE`` filled with the table name plus the WHERE
    clause and date expression currently stored on ``proc_pcds``.

    Returns:
        The result of ``proc_pcds.query`` for that SQL.

    Raises:
        NONEXIST_DATEVAR: when the query fails with ``pd.errors.DatabaseError``.
    """
    qualified_name = row.pcds_tbl
    service, table = qualified_name.split('.', maxsplit=1)

    try:
        with pcds_connect(service) as CONN:
            df_meta = proc_pcds.query(
                PCDS_SQL_DATE.format(
                    table=table, limit=proc_pcds._where, date=proc_pcds._date
                ),
                CONN,
            )
        logger.info(f"\tFinish Processing {qualified_name}")
    except pd.errors.DatabaseError:
        # Only name the column in the log when a raw date variable is set.
        raw_date = proc_pcds._dateraw
        if raw_date:
            logger.warning(f"Column {raw_date.upper()} not found in {table.upper()}")
        raise NONEXIST_DATEVAR("Date-like Variable Not In PCDS")
    return df_meta

#--- Process AWS table metadata (columns and row count) ---#
def process_aws_meta(row):
    """Pull column metadata and the total row count for one AWS table.

    Reads ``aws_tbl`` (``"<database>.<table>"``), ``aws_dt``, ``aws_where``,
    ``start_dt`` and ``end_dt`` from ``row`` and stores the resulting WHERE
    state on the module-level ``proc_aws`` engine.

    Returns:
        ``{'column': df_type, 'row': df_nrow}`` with lower-cased column labels
        on ``df_type``.

    Raises:
        NONEXIST_TABLE: when a query fails with ``pd.errors.DatabaseError``.

    NOTE(review): unlike the PCDS counterpart, ``ValueError`` (e.g. from
    ``.item()`` when the date column lookup is not exactly one row) is not
    caught here and propagates to the caller - confirm this is intended.
    """
    qualified_name = row.aws_tbl
    database, table = qualified_name.split('.', maxsplit=1)
    CONN = pa.connect(
        s3_staging_dir="s3://355538383407-us-east-1-athena-output/uscb-analytics/",
        region_name="us-east-1",
    )
    logger.info(f"\tStart processing {qualified_name}")

    #>>> Query column metadata and row counts <<<#
    try:
        meta_sql = AWS_SQL_META.format(table=table, db=database)
        df_type = proc_aws.query(meta_sql, CONN)

        # Leading identifier of the date spec, e.g. "col" from "col (fmt)".
        date_var = re.match(r'(\w+)(?=\s*\()?', row.aws_dt).group(1)
        if date_var != NO_DATE:
            date_rows = df_type.query(f"column_name == '{date_var.lower()}'")
            proc_aws.get_where_sql(
                date_var=row.aws_dt,
                date_type=date_rows['data_type'].item(),
                start_dt=row.start_dt,
                end_dt=row.end_dt,
                where_cstr=row.aws_where
            )
        else:
            # No date variable supplied: use the raw WHERE constraint as-is.
            proc_aws._where = row.aws_where

        df_nrow = proc_aws.query(
            AWS_SQL_NROW.format(table=table, db=database, limit=proc_aws._where),
            CONN,
        )
    except pd.errors.DatabaseError:
        logger.warning(f"Couldn't find {table.lower()} in {database.lower()}")
        raise NONEXIST_TABLE("AWS View Not Existing")

    df_type.columns = [label.lower() for label in df_type.columns]
    return {'column': df_type, 'row': df_nrow}

#--- Query AWS table for date-wise row counts ---#
def process_aws_date(row):
    """Run the date-wise row-count query for one AWS table.

    The SQL is ``AWS_SQL_DATE`` filled with the database/table names plus the
    WHERE clause and date expression currently stored on ``proc_aws``.

    Returns:
        The query result with its column labels lower-cased.

    Raises:
        NONEXIST_DATEVAR: when the query fails with ``pd.errors.DatabaseError``.
    """
    qualified_name = row.aws_tbl
    database, table = qualified_name.split('.', maxsplit=1)
    CONN = pa.connect(
        s3_staging_dir="s3://355538383407-us-east-1-athena-output/uscb-analytics/",
        region_name="us-east-1",
    )
    try:
        df_meta = proc_aws.query(
            AWS_SQL_DATE.format(
                table=table, limit=proc_aws._where, date=proc_aws._date, db=database
            ),
            CONN,
        )
        logger.info(f"\tFinish Processing {qualified_name}")
    except pd.errors.DatabaseError:
        # Only name the column in the log when a raw date variable is set.
        raw_date = proc_aws._dateraw
        if raw_date:
            logger.warning(f"Column {raw_date.upper()} not found in {table.upper()}")
        raise NONEXIST_DATEVAR("Date-like Variable Not In AWS")

    df_meta.columns = [label.lower() for label in df_meta.columns]
    return df_meta

#--- Initialize output folders and logging ---#
def start_setup(start_row, C_out):
    """Prepare the output location and attach the log sink.

    When the run starts from the first row (``start_row <= 1``) the previous
    CSV result file is deleted so results are not appended to stale output.
    For any other ``start_row`` (or one that cannot be compared with an int,
    such as ``None``) the existing file is kept.

    Args:
        start_row: 1-based index of the first table to process.
        C_out: Output configuration; reads ``csv.file``, ``folder`` and
            ``log.todict()``.
    """
    # An explicit check replaces the former `assert`, which Python strips
    # under -O and which would then delete the CSV even on a resumed run.
    try:
        fresh_run = bool(start_row <= 1)
    except TypeError:
        fresh_run = False

    if fresh_run:
        try:
            os.remove(C_out.csv.file)
        except (TypeError, FileNotFoundError):
            # Nothing to clean up (no usable path or no previous file).
            pass

    os.makedirs(C_out.folder, exist_ok=True)
    logger.add(**C_out.log.todict())

## Cell 10: Data Type Mapping and Column Comparison

In [None]:
#--- Check if PCDS and AWS data types are compatible ---#
def map_pcds_aws(row):
    """Return True when a PCDS (Oracle) type is compatible with its AWS type.

    ``row`` must expose ``data_type_pcds`` and ``data_type_aws`` (and
    ``column_name_aws`` for the log message on unknown types).

    Rules, checked in this order:
      * ``NUMBER``            -> ``double``
      * ``NUMBER(p,s)``       -> ``decimal(p',s)`` with the same scale ``s``;
        a type string that cannot be parsed counts as a mismatch
      * ``VARCHAR2(n)``       -> ``varchar(n)``
      * ``CHAR(n)``           -> anything except ``varchar`` when ``n != 1``
      * ``DATE``              -> ``date`` or ``timestamp...``
      * ``TIMESTAMP...``      -> ``timestamp...``
      * anything else is logged and reported as a mismatch.
    """
    aws_dtype = row.data_type_aws
    pcds_dtype = row.data_type_pcds

    if pcds_dtype == 'NUMBER':
        return aws_dtype == 'double'

    if pcds_dtype.startswith('NUMBER'):
        pcds_scale = re.match(r'NUMBER\(\d*,(\d+)\)', pcds_dtype)
        aws_scale = re.match(r'decimal\(\d*,(\d+)\)', aws_dtype)
        # Unparsable type strings are reported as mismatches, not a crash.
        return bool(
            pcds_scale and aws_scale
            and pcds_scale.group(1) == aws_scale.group(1)
        )

    if pcds_dtype.startswith('VARCHAR2'):
        return pcds_dtype.replace('VARCHAR2', 'varchar') == aws_dtype

    if pcds_dtype.startswith('CHAR'):
        length = re.match(r'CHAR\((\d+)\)', pcds_dtype)
        # The length is compared as an int (the old str-vs-int test was always
        # True) and the AWS type case-insensitively (AWS types are lowercase,
        # so the old 'VARCHAR' prefix test never matched).
        is_single_char = length is not None and int(length.group(1)) == 1
        is_varchar = aws_dtype.lower().startswith('varchar')
        return not (is_varchar and not is_single_char)

    if pcds_dtype == 'DATE':
        return aws_dtype == 'date' or aws_dtype.startswith('timestamp')

    if pcds_dtype.startswith('TIMESTAMP'):
        return aws_dtype.startswith('timestamp')

    s = ">>> Mismatched type on {}\n\tPCDS ({}) ==> AWS ({})"
    logger.info(s.format(row.column_name_aws, pcds_dtype, aws_dtype))
    return False

#--- Merge and compare PCDS and AWS column metadata ---#
def process_merge(pcds: pd.DataFrame, aws: pd.DataFrame) -> MetaMerge:
    """
    Compare PCDS and AWS column metadata.

    Steps:
      1. Collect PCDS columns with no ``aws_colname`` and AWS columns that no
         PCDS column maps to, then pair them up via ``Misc.common``.
      2. Fill the missing ``aws_colname`` entries of ``pcds`` with those pairs
         (``pcds`` is modified in place).
      3. Outer-merge both frames and split the result into PCDS-only,
         AWS-only and matched columns.
      4. Unless ``record.pull_status`` is ``NO_MAPPING``, run ``map_pcds_aws``
         on matched columns and collect the distinct mismatched type pairs.

    Returns a ``MetaMerge`` with the unique columns of each side, the matched
    column table (``None`` for ``NO_MAPPING``), the mismatch summary and the
    summary of discovered-but-undocumented mappings.
    """
    #>>> Find columns without documented mappings <<<#
    # `x != x` is only true for NaN, i.e. PCDS columns lacking a mapping.
    pcds_without_map = pcds.query('aws_colname != aws_colname')
    pcds_without_map = pcds_without_map['column_name'].str.lower().to_list()
    aws_without_map = aws.query('~column_name.isin(@pcds.aws_colname)')
    aws_without_map = aws_without_map['column_name'].to_list()

    #>>> Use substring matching to find undocumented pairs <<<#
    map_uncaptured = {}
    for pcds_name, aws_name in Misc.common(pcds_without_map, aws_without_map).items():
        if pcds_name not in record.next_d['tokenised_cols']:
            map_uncaptured[pcds_name.upper()] = aws_name
    uncaptured = SEP.join(
        ['{}->{}'.format(*pair) for pair in map_uncaptured.items()]
    )

    #>>> Update column mappings with discovered pairs <<<#
    discovered = pcds['column_name'].map(map_uncaptured)
    pcds['aws_colname'] = pcds['aws_colname'].combine_first(discovered)

    #>>> Merge PCDS and AWS metadata <<<#
    df_match = pd.merge(
        pcds, aws,
        left_on='aws_colname', right_on='column_name',
        suffixes=['_pcds', '_aws'],
        how='outer', indicator=True
    )

    #>>> Separate unique columns from each platform <<<#
    pcds_unique = df_match.query('_merge == "left_only"')
    pcds_unique = pcds_unique[['column_name_pcds', 'data_type_pcds']]
    aws_unique = df_match.query('_merge == "right_only"')
    aws_unique = aws_unique[['column_name_aws', 'data_type_aws']]

    #>>> Check data type compatibility for matched columns <<<#
    if record.pull_status == PullStatus.NO_MAPPING:
        merged = None
        mismatched = ''
    else:
        merged = df_match.query('_merge == "both"')
        merged = merged.drop(columns=['aws_colname', '_merge'])
        merged['type_match'] = merged.apply(map_pcds_aws, axis=1)

        type_pairs = merged.query('~type_match')
        type_pairs = type_pairs[['data_type_pcds', 'data_type_aws']].drop_duplicates()
        mismatched = SEP.join(
            '{}->{}'.format(*pair) for pair in type_pairs.itertuples(index=False)
        )

    #>>> Filter out previously known unique columns <<<#
    only_pcds = pcds_unique['column_name_pcds'].str.upper().to_list()
    only_aws = aws_unique['column_name_aws'].str.lower().to_list()
    col_map = record.col_maps.get(proc_pcds._table)
    if col_map:
        only_aws = [c for c in only_aws if c not in col_map['aws_unique']]
        only_pcds = [c for c in only_pcds if c not in col_map['pcds_unique']]

    return MetaMerge(
        unique_pcds=only_pcds,
        unique_aws=only_aws,
        col_mapping=merged,
        mismatches=mismatched,
        uncaptured=uncaptured
    )

#--- Compare column mappings and total records between PCDS and AWS ---#
def process_meta(pcds_t: dict, aws_t: dict) -> dict:
    """Compare column mappings and total row counts of one table pair.

    Args:
        pcds_t: ``{'column': DataFrame, 'row': DataFrame}`` for PCDS.
        aws_t: ``{'column': DataFrame, 'row': DataFrame}`` for AWS.

    Returns:
        A dict of report fields (row-count match, type mismatches, unique
        columns per side, uncaptured mappings).

    Side effects:
        Stores matched columns/types, row counts, WHERE clauses and date types
        in ``record.next_d``; may fill ``aws_colname`` on the PCDS column
        frame in place.
    """
    next_d = record.next_d
    pcds_c = pcds_t['column']
    aws_c = aws_t['column']

    #>>> Handle missing column mapping <<<#
    uncaptured = ""
    if pcds_c['aws_colname'].isna().all():
        # No mapping at all: fall back to the lower-cased PCDS column name.
        pcds_c['aws_colname'] = pcds_c['column_name'].str.lower()
        uncaptured = "Column Mapping Not Provided"

    profile = process_merge(pcds_c, aws_c)
    logger.info(">>> Finish Merging Type Data")

    #>>> Prepare data for next processing step <<<#
    # Collapse every column of the matched table into one SEP-joined string.
    matched = profile.col_mapping.drop(columns='type_match')
    joined = matched.apply(lambda col: SEP.join(col.tolist()), axis=0).to_dict()
    next_d.update(
        pcds_cols=joined['column_name_pcds'],
        pcds_types=joined['data_type_pcds'],
        pcds_nrows=int(pcds_t['row'].iloc[0, 0]),
        pcds_where=proc_pcds._where,
        aws_cols=joined['column_name_aws'],
        aws_types=joined['data_type_aws'],
        aws_nrows=int(aws_t['row'].iloc[0, 0]),
        aws_where=proc_aws._where,
        pcds_dt_type=proc_pcds._type,
        aws_dt_type=proc_aws._type
    )

    n_pcds = next_d['pcds_nrows']
    n_aws = next_d['aws_nrows']
    report = {
        'Row UnMatch': n_pcds != n_aws,
        'Row UnMatch Details': f"PCDS({n_pcds}) : AWS({n_aws})",
        'Type UnMatch Details': profile.mismatches,
        'Column Type UnMatch': len(profile.mismatches) > 0,
        'PCDS Extra Columns': len(profile.unique_pcds) > 0,
        'PCDS Unique Columns': SEP.join(profile.unique_pcds),
        'AWS Extra Columns': len(profile.unique_aws) > 0,
        'AWS Unique Columns': SEP.join(profile.unique_aws),
        'Uncaptured Column Mappings': uncaptured or profile.uncaptured,
    }
    return report

#--- Identify specific dates with row count discrepancies ---#
def process_date(cnt_pcds: pd.DataFrame, cnt_aws: pd.DataFrame):
    """Identify the dates on which PCDS and AWS row counts disagree.

    Both frames are outer-merged on the raw date variables held by
    ``proc_pcds`` / ``proc_aws``; rows where ``NROWS`` (PCDS) differs from
    ``nrows`` (AWS) are treated as mismatches.

    Returns:
        A dict with the mismatch flag, the date variables used and a
        ``"<date> (<pcds rows> : <aws rows>)"`` summary per mismatching date.

    Side effects:
        Stores ``pcds_exclude`` / ``aws_exclude`` SQL fragments (excluding the
        mismatching dates) in ``record.next_d`` and logs a warning with the
        number of affected days.
    """
    pcds_dt, aws_dt = proc_pcds._dateraw, proc_aws._dateraw

    #>>> Merge date-wise row counts <<<#
    df_all = pd.merge(
        left=cnt_pcds,
        right=cnt_aws,
        left_on=pcds_dt,
        right_on=aws_dt,
        suffixes=['_pcds', '_aws'],
        how='outer'
    )

    time_mismatch = df_all.query('NROWS != nrows')
    logger.warning("Out of {} days to compare, issues are found on {} days".format(
        len(cnt_aws), len(time_mismatch)
    ))

    #>>> Build SQL to exclude problematic dates <<<#
    # Prefer the PCDS date; fall back to the AWS date where PCDS has none.
    bad_dates = time_mismatch[pcds_dt].fillna(time_mismatch[aws_dt])
    exclude = ','.join(["'%s'" % day for day in bad_dates if day])
    record.next_d.update(
        pcds_exclude=f'{proc_pcds._date} not in ({exclude})',
        aws_exclude=f'{proc_aws._date} not in ({exclude})',
    )

    #>>> Format mismatch details <<<#
    # Expects exactly four columns: date/count for PCDS, then date/count for AWS.
    first_date, _, second_date, _ = time_mismatch.columns
    details = []
    for _, rec in time_mismatch.iterrows():
        # Show the first non-null date of the pair.
        shown = rec[second_date] if pd.isna(rec[first_date]) else rec[first_date]
        details.append(
            f"{shown} ({rec['NROWS']} : {Misc.convert2int(rec['nrows'])})"
        )

    return {
        'Time Span UnMatch': len(time_mismatch) > 0,
        'Time Span Variable': f'{pcds_dt} : {aws_dt}',
        'Time UnMatch Details (PCDS : AWS)': '; '.join(details)
    }

## Cell 11: Main Execution Logic

In [None]:
def main():
    """Main execution function for meta analysis.

    For every table in the configured row range this:
      1. pulls column metadata / row counts and date-wise counts from PCDS
         and AWS, tracking the outcome in ``pull_status``;
      2. compares row counts, column mappings and types (full comparison via
         ``process_meta`` or a name-based comparison when no column mapping
         was provided);
      3. drills into date-wise counts when the totals differ;
      4. appends one row to the CSV report and rewrites the JSON file that
         feeds the next processing step.

    Finally the output folder is uploaded to S3.

    Tables whose PCDS or AWS metadata could not be pulled are reported with
    their status only; no comparison is attempted for them.
    """
    config = parse_config()
    df_dict, df_next = {}, {}
    C_out, C_in = config.output, config.input
    start_row, end_row = C_in.range
    start_setup(start_row, C_out)
    logger.info('Configuration:\n' + config.tolog())

    #>>> Load environment and AWS credentials <<<#
    load_env(C_in.env)
    start_run()
    aws_creds_renew(15 * 60)

    # start_setup() keeps the CSV when resuming (start_row > 1); in that case
    # the file already carries a header and it must not be written again.
    HAS_HEADER = (
        os.path.isfile(C_out.csv.file)
        and os.path.getsize(C_out.csv.file) > 0
    )

    #>>> Load input table list and column mappings <<<#
    tbl_list = (
        read_excel_input(C_in.table)
        .groupby('pcds_tbl')
        .first()
        .reset_index()
    )
    record.col_maps = (
        ColmapUtils(C_in.category)
        .process(config.column_maps)
    )

    #>>> Process each table <<<#
    total = len(tbl_list)
    for i, row in enumerate(tqdm(
        tbl_list.itertuples(), desc='Processing ...', total=total
    ), start=1):
        # Rows outside the requested 1-based range are skipped.
        if (i < start_row or i > end_row):
            continue

        pcds_m, pcds_d, aws_m, aws_d = {}, None, {}, None
        record.next_d = UDict(C_out.next.fields)
        name: str = row.pcds_tbl.split('.')[1].lower()

        logger.info(f">>> Start {name}")

        #>>> Initialize record for this table <<<#
        record.next_d.update(
            pcds_tbl=row.pcds_tbl,
            aws_tbl=row.aws_tbl,
            pcds_dt=row.pcds_dt,
            aws_dt=row.aws_dt,
            partition=row.partition,
            last_modified=datetime.now().strftime('%Y-%m-%d'),
            tokenised_cols=[]
        )

        os.environ['SKIP_SAVE'] = 'N'
        if os.environ.get('SKIP_SNAPSHOT') and row.partition == 'snapshot':
            os.environ['SKIP_SAVE'] = 'Y'

        row_result_d = {
            'Consumer Loans Data Product': row.group,
            'PCDS Table Details with DB Name': name,
            'Tables delivered in AWS with DB Name': row.aws_tbl
        }
        pull_status = PullStatus.SUCCESS

        #>>> Try to pull PCDS table metadata <<<#
        # Defined up front so it exists even when the PCDS pull fails.
        exist_mapping = False
        try:
            rename_columns = {}
            if (c := row.col_map) and (c in record.col_maps):
                meta_info = record.col_maps[c]
                # Tokenised (PII) columns are excluded from the mapping.
                rename_columns = {
                    k: v for k, v in meta_info['pcds2aws'].items()
                    if k not in meta_info['pii_cols']
                }
                record.next_d.update(tokenised_cols=meta_info['pii_cols'].copy())
            pcds_m, exist_mapping = process_pcds_meta(row, rename_columns)
        except NONEXIST_TABLE:
            pull_status = PullStatus.NONEXIST_PCDS

        #>>> Check if column mapping was provided <<<#
        if pull_status == PullStatus.SUCCESS and (not exist_mapping):
            pull_status = PullStatus.NO_MAPPING

        #>>> Try to get date-wise counts from PCDS <<<#
        try:
            pcds_d = process_pcds_date(row)
            if len(pcds_m) == 0:
                pull_status = PullStatus.EMPTY_PCDS
        except NONEXIST_DATEVAR:
            if pull_status == PullStatus.SUCCESS:
                pull_status = PullStatus.NONDATE_PCDS

        #>>> Try to pull AWS table metadata <<<#
        try:
            aws_m = process_aws_meta(row)
        except NONEXIST_TABLE:
            # A missing AWS table outranks a missing column mapping, the same
            # way a missing PCDS table does above.
            if pull_status in (PullStatus.SUCCESS, PullStatus.NO_MAPPING):
                pull_status = PullStatus.NONEXIST_AWS

        #>>> Try to get date-wise counts from AWS <<<#
        try:
            aws_d = process_aws_date(row)
            if len(aws_m) == 0:
                pull_status = PullStatus.EMPTY_AWS
        except NONEXIST_DATEVAR:
            if pull_status == PullStatus.SUCCESS:
                pull_status = PullStatus.NONDATE_AWS

        #>>> Initialize result dictionary <<<#
        ROW_D = {
            'Status': pull_status.value,
            'Row UnMatch': False,
            'Row UnMatch Details': '',
            'Time Span UnMatch': False,
            'Time Span Variable': f'{row.pcds_dt} : {row.aws_dt}',
            'Time UnMatch Details (PCDS : AWS)': '',
            'Column Type UnMatch': False,
            'Type UnMatch Details': '',
            'PCDS Extra Columns': False,
            'PCDS Unique Columns': '',
            'AWS Extra Columns': False,
            'AWS Unique Columns': '',
            'Uncaptured Column Mappings': '',
        }

        #>>> Handle different status scenarios <<<#
        # Both comparison branches below index into pcds_m and aws_m; when
        # either pull failed the dict is empty, so only the status is reported.
        metadata_missing = (not pcds_m) or (not aws_m)
        if metadata_missing or \
                pull_status in (PullStatus.NONEXIST_PCDS, PullStatus.NONDATE_AWS):
            pass
        elif pull_status == PullStatus.NO_MAPPING:
            #>>> Basic comparison without column mapping <<<#
            nrow_pcds = int(pcds_m['row'].iloc[0].item())
            nrow_aws = int(aws_m['row'].iloc[0].item())

            unmapped_pcds = [x.lower() for x in pcds_m['column'].column_name]
            unmapped_aws = aws_m['column'].column_name.to_list()

            pcds2aws = {k.upper(): v for k, v in Misc.common(unmapped_pcds, unmapped_aws).items()}
            aws2pcds = {v: k for k, v in pcds2aws.items()}

            pcds_unique = [x.upper() for x in unmapped_pcds if x.upper() not in pcds2aws]
            aws_unique = [x.lower() for x in unmapped_aws if x.lower() not in aws2pcds]

            pcds2type = pcds_m['column'].set_index('column_name').to_dict()['data_type']
            aws2type = aws_m['column'].set_index('column_name').to_dict()['data_type']

            ROW_D.update(**{
                'Row UnMatch': nrow_pcds != nrow_aws,
                'Row UnMatch Details': f"PCDS({nrow_pcds}) : AWS({nrow_aws})",
                'PCDS Extra Columns': len(pcds_unique) > 0,
                'PCDS Unique Columns': SEP.join(pcds_unique),
                'AWS Extra Columns': len(aws_unique) > 0,
                'AWS Unique Columns': SEP.join(aws_unique),
                'Uncaptured Column Mappings': SEP.join('{}->{}'.format(k, v) for k, v in pcds2aws.items()),
            })

            record.next_d.update(
                pcds_cols=SEP.join(pcds2aws),
                pcds_types=SEP.join([pcds2type[x] for x in pcds2aws]),
                pcds_nrows=nrow_pcds,
                pcds_dt_type=proc_pcds._type,
                pcds_where=proc_pcds._where,
                aws_cols=SEP.join(aws2pcds),
                aws_types=SEP.join([aws2type[x] for x in aws2pcds]),
                aws_nrows=nrow_aws,
                aws_dt_type=proc_aws._type,
                aws_where=proc_aws._where,
            )
        else:
            #>>> Full comparison with column mapping <<<#
            meta_result_d = process_meta(pcds_m, aws_m)
            ROW_D.update(**meta_result_d)

        #>>> Check for date-wise discrepancies if row counts don't match <<<#
        if (ROW_D['Row UnMatch']) and \
                pull_status not in (PullStatus.NONDATE_AWS, PullStatus.NONDATE_PCDS):
            try:
                date_result_d = process_date(pcds_d, aws_d)
                ROW_D.update(**date_result_d)
            except TypeError as e:
                logger.error(e)
                # The table is skipped in the report, but the engines must
                # still be reset so their WHERE/date state does not leak into
                # the next table.
                proc_aws.reset()
                proc_pcds.reset()
                continue

        #>>> Clean up column lists <<<#
        ROW_D.update(**{
            'PCDS Unique Columns': Misc.remove_items(
                ROW_D['PCDS Unique Columns'], config.match.drop_cols
            ),
            'AWS Unique Columns': Misc.remove_items(
                ROW_D['AWS Unique Columns'], config.match.add_cols
            )
        })

        #>>> Write results to CSV <<<#
        with open(C_out.csv.file, 'a+', newline='') as fp:
            writer = csv.DictWriter(fp, fieldnames=C_out.csv.columns)
            if not HAS_HEADER:
                writer.writeheader()
                HAS_HEADER = True
            writer.writerow({
                **row_result_d,
                **ROW_D,
            })

        df_dict[name] = {
            'pcds_meta': pcds_m,
            'pcds_date': pcds_d,
            'aws_meta': aws_m,
            'aws_date': aws_d,
        }
        df_next[name] = record.next_d.copy()
        # Rewritten after every table so progress survives an interrupted run.
        IO.write_json(C_out.next.file, df_next)

        #>>> Reset engines for next iteration <<<#
        proc_aws.reset()
        proc_pcds.reset()

    #>>> Upload results to S3 <<<#
    S3.upload_multiple(
        s3_url=UPath(C_out.to_s3.run, C_in.name),
        folder=C_out.folder,
        prefix=C_in.step
    )
    end_run()

# Entry point: run the full analysis when this module is executed directly.
# NOTE(review): a Jupyter/IPython kernel also sets __name__ to '__main__', so
# executing this cell in the notebook presumably starts main() as well - confirm
# this is intended given the separate "Run the Analysis" cell.
if __name__ == '__main__':
    main()

## Run the Analysis

Uncomment the cell below to run the full analysis:

In [None]:
# main()