# Meta Analysis Step 1: PCDS Platform Processing

This notebook processes PCDS (Oracle) database tables and extracts metadata.

**Purpose:**
- Connect to PCDS Oracle database
- Extract table metadata (columns, data types)
- Process row counts and date ranges
- Save results as pickle/JSON for Step 3

**Outputs:**
- `pcds_meta_results.pkl` - Complete metadata results
- `pcds_metadata.json` - Structured metadata for next steps

**Note:** This runs on Windows with PCDS access. No AWS credential renewal needed.

## Cell 1: Import Required Libraries

In [None]:
import re
import os
import csv
import json
import shutil
import pickle
import argparse
import warnings
import numpy as np
import pandas as pd
import pandas.io.sql as psql

from upath import UPath
from loguru import logger
from tqdm import tqdm
from datetime import datetime, timedelta, timezone
from dataclasses import dataclass, field, fields, is_dataclass
from configparser import ConfigParser
from confection import Config
from unittest import mock
from enum import Enum
from typing import Literal, Dict, List
from collections import defaultdict, abc
from dotenv import load_dotenv

warnings.filterwarnings("ignore", category=UserWarning, message='.*pandas only supports SQLAlchemy connectable.*')

# Note: This notebook uses Parquet format for cross-platform compatibility
# Install pyarrow if needed: pip install pyarrow or conda install -c conda-forge pyarrow
import pyarrow

## Cell 2: Constants and Configuration

In [None]:
# --- Global Constants ---
# Generic separator string (not referenced in the visible part of this file).
SEP = '; '
# Oracle date format mask; SQLengine.reset() stores it as the engine's default format.
PCDS_DT_FORMAT = 'YYYY-MM-DD'
# Naive local timestamp captured once, when this cell is executed.
TODAY = datetime.now()
ONEDAY = timedelta(days=1)
# Width of the '=' banner written by start_run() / end_run().
WIDTH = 80
# Placeholder that read_excel_input() fills into missing pcds_dt / aws_dt cells.
NO_DATE = 'no_date_provided'
# True on Windows; load_env() only loads the dotenv file when this is set.
inWindows = os.name == 'nt'

class PullStatus(Enum):
    """Outcome of a data pull for one table.

    Each member's value is the human-readable message for that outcome.
    """
    # The PCDS table/view could not be found.
    NONEXIST_PCDS = 'Nonexisting PCDS Table'
    # The configured date column is missing from the PCDS table.
    NONDATE_PCDS = 'Nonexisting Date Variable in PCDS'
    # The PCDS table exists but holds no rows.
    EMPTY_PCDS = 'Empty PCDS Table'
    # No PCDS-to-AWS column mapping is available for the table.
    NO_MAPPING = 'Column Mapping Not Provided'
    # Default status; main() starts every table with this value.
    SUCCESS = 'Successful Data Access'

# --- SQL Templates for PCDS (Oracle) ---
# All templates are filled with str.format(); the placeholders are substituted
# verbatim, so the values must come from trusted configuration.

# Column metadata for {table}: one row per column, ordered by column_id.
# The data_type output is the base type plus "(precision,scale)" for NUMBER
# columns that declare either one, or "(length)" for *CHAR* types.
# NOTE(review): the filter is on table_name only (no owner), so identically
# named tables in several schemas would all be returned -- confirm acceptable.
PCDS_SQL_META = """
select
    column_name,
    data_type || case
    when data_type = 'NUMBER' then 
        case when data_precision is NULL AND data_scale is NULL
            then NULL
        else
            '(' || TO_CHAR(data_precision) || ',' || TO_CHAR(data_scale) || ')'
        end
    when data_type LIKE '%CHAR%'
        then
            '(' || TO_CHAR(data_length) || ')'
        else NULL
    end AS data_type
from all_tab_cols
where table_name = UPPER('{table}')
order by column_id
"""

# Total row count of {table} restricted by {limit}. When {limit} is empty the
# dangling "where" line is stripped by SQLengine.clean_query().
PCDS_SQL_NROW = """
SELECT COUNT(*) AS nrow FROM {table}
where {limit}
"""

# Row count per distinct value of the {date} expression, restricted by {limit}.
PCDS_SQL_DATE = """
SELECT {date}, count(*) AS NROWS
FROM {table} 
WHERE {limit}
GROUP BY {date}
"""

## Cell 3: Exception Classes and Data Types

In [None]:
# --- Custom Exceptions ---
class NONEXIST_TABLE(Exception):
    """Signals that the requested database table/view could not be found."""

class NONEXIST_DATEVAR(Exception):
    """Signals that the table has no usable date-like column."""

# --- Helper Functions for Configuration Reading ---
def read_str_lst(lst_str, sep='\n'):
    """Split a delimited string into its non-empty items.

    The whole string is stripped first, then split on ``sep``; empty pieces
    are dropped. Individual items are not stripped.
    """
    pieces = lst_str.strip().split(sep)
    return list(filter(None, pieces))

def read_dstr_lst(dct_str, sep='='):
    """Turn newline-separated ``key<sep>value`` lines into a dictionary.

    Each line is split on the first ``sep`` only; keys and values are stripped
    of surrounding whitespace. A line without ``sep`` raises ValueError.
    """
    raw_pairs = {}
    for line in read_str_lst(dct_str):
        key, value = line.split(sep, 1)
        raw_pairs[key] = value

    cleaned = {}
    for key, value in raw_pairs.items():
        cleaned[key.strip()] = value.strip()
    return cleaned

# --- Base Type Class ---
class BaseType:
    """Mixin for config dataclasses: nested-dataclass building and log formatting."""

    def __post_init__(self):
        """Build nested dataclass fields from raw mappings.

        For every field whose declared type is itself a dataclass, a mapping
        value is expanded into that dataclass. Values that are not mappings
        (``None`` defaults, already-built instances) are left untouched
        instead of failing on ``**value``.
        """
        for _field in fields(self):
            if not is_dataclass(_field.type):
                continue
            raw = getattr(self, _field.name)
            if isinstance(raw, abc.Mapping):
                setattr(self, _field.name, _field.type(**raw))

    def tolog(self, indent=1, padding=''):
        """Return a multi-line ``ClassName(field=value, ...)`` string for logging.

        Args:
            indent: number of tab characters added per nesting level; also
                passed to ``pprint.pformat`` for dictionary values.
            padding: prefix inherited from the enclosing object.
        """
        import pprint as pp

        def get_val(x, pad):
            # Nested config objects recurse; dicts are pretty-printed.
            if isinstance(x, BaseType):
                return x.tolog(indent, pad)
            elif isinstance(x, dict):
                return pp.pformat(x, indent)
            else:
                return repr(x)

        cls_name = self.__class__.__name__
        padding = padding + '\t' * indent
        fields_str = [f'{padding}{k}={get_val(v, padding)}' for k, v in vars(self).items()]
        return f'{cls_name}(\n' + ',\n'.join(fields_str) + '\n)'

# --- Configuration Dataclasses ---
@dataclass
class MetaRange:
    """Range configuration for row selection"""
    start_rows: int | None
    end_rows: int | None

    def __iter__(self):
        yield from [self.start_rows or 1, self.end_rows or float('inf')]

@dataclass
class MetaTable(BaseType):
    """Settings describing the Excel sheet that lists the tables to process."""
    file: UPath
    sheet: str
    skip_rows: int
    select_cols: dict
    select_rows: dict

    def __post_init__(self):
        # Raw config text is parsed in place: "excel header = internal name"
        # lines become a dict, and the row filters become a list of strings.
        rows_spec = str(self.select_rows)
        self.select_cols = read_dstr_lst(self.select_cols)
        self.select_rows = read_str_lst(rows_spec)

@dataclass
class MetaInput(BaseType):
    """Input section of the configuration.

    ``range`` and ``table`` are declared with dataclass types, so
    BaseType.__post_init__ builds them from the raw config values.
    """
    name: str  # analysis name; parse_config() overrides it with --name
    step: str  # step label; parse_config() uses it in the copied "<step>_pcds.cfg" file name
    env: str  # path handed to load_env() by main()
    range: MetaRange  # first/last 1-based table positions that main() processes
    category: Literal['loan', 'dpst']  # selects the ColmapUtils mapping-file reader
    clear_cache: bool = True  # not referenced in the visible part of this file
    table: MetaTable = None  # Excel sheet settings consumed by read_excel_input()

@dataclass
class MetaCSV:
    """CSV output settings: target file plus the column names to write."""
    file: UPath
    columns: str

    def __post_init__(self):
        # The config supplies one column name per line; keep them as a list.
        column_names = read_str_lst(self.columns)
        self.columns = column_names

@dataclass
class S3Config:
    """S3 path configuration (neither path is referenced in the visible part of this file)."""
    run: UPath  # presumably the S3 location for run artifacts -- TODO confirm
    data: UPath  # presumably the S3 location for data files -- TODO confirm

@dataclass
class LogConfig:
    """Logging settings, convertible into keyword arguments for ``logger.add``."""
    level: Literal['info', 'warning', 'debug', 'error']
    format: str
    file: str
    overwrite: bool

    def todict(self):
        """Return the settings as a dict with keys level, format, sink and mode.

        The level is upper-cased; the mode is 'w' when ``overwrite`` is set
        and 'a' otherwise.
        """
        write_mode = 'a'
        if self.overwrite:
            write_mode = 'w'
        return dict(
            level=self.level.upper(),
            format=self.format,
            sink=self.file,
            mode=write_mode,
        )

@dataclass
class NextConfig:
    """Settings for the hand-off record passed to the next step."""
    file: UPath
    fields: str

    def __post_init__(self):
        # "key = value" lines from the config become the template dict that
        # main() copies into a fresh record for every table.
        parsed_fields = read_dstr_lst(self.fields)
        self.fields = parsed_fields

@dataclass
class CacheConfig:
    """Cache configuration (not used in this notebook).

    The fields exist so that the shared config file still parses; none of
    them is read by the visible code.
    """
    enable: bool
    directory: UPath
    expire_hours: int = None  # None presumably means "never expires" -- TODO confirm
    force_restart: bool = False
    verbose: bool = False

@dataclass
class MetaOutput(BaseType):
    """Output section of the configuration.

    ``csv``, ``to_s3``, ``log``, ``next`` and ``cache`` are declared with
    dataclass types, so BaseType.__post_init__ builds them from the raw
    config values.
    """
    folder: UPath  # output directory; created by parse_config() and start_setup()
    to_pkl: UPath  # presumably the pickle results path -- not referenced in the visible code
    csv: MetaCSV  # CSV output; start_setup() deletes csv.file when the run starts at row 1
    to_s3: S3Config  # S3 locations (unused in the visible code)
    log: LogConfig  # converted with todict() and passed to logger.add() in start_setup()
    next: NextConfig  # next.fields seeds the per-table record in main()
    cache: CacheConfig  # cache settings (unused in this notebook)

@dataclass
class MetaMatch:
    """Column matching settings, normalised to plain lists after construction."""
    candidates: str
    drop_cols: dict
    add_cols: dict

    def __post_init__(self):
        # candidates arrives as newline-separated text; for the two column
        # collections only the iterated items (keys, for a dict) are kept.
        self.candidates = read_str_lst(self.candidates)
        self.drop_cols = [*self.drop_cols]
        self.add_cols = [*self.add_cols]

@dataclass
class ColumnMap(BaseType):
    """Settings for reading the PCDS-to-AWS column mapping workbook.

    After construction the four ``*_col`` / ``*_view`` entries hold lists of
    normalised header names (inner whitespace replaced by underscores), except
    that an ``aws_view`` whose first entry contains ``_+_`` becomes a format
    template such as ``"{schema}.{table}"``. ``excludes`` is always a list.
    """
    output: UPath
    input: UPath
    na_str: str
    overwrite: bool
    excludes: list[str]
    pcds_col: str
    aws_col: str
    pcds_view: str
    aws_view: str

    def __post_init__(self):
        def transform(p):
            # Newline-separated header names -> list with whitespace collapsed
            # to underscores; values that are not strings pass through.
            if isinstance(p, str):
                return ['_'.join(c.split()) for c in read_str_lst(p)]
            return p

        self.pcds_col = transform(self.pcds_col)
        self.pcds_view = transform(self.pcds_view)
        self.aws_col = transform(self.aws_col)
        self.aws_view = transform(self.aws_view)

        # "a_+_b" means the AWS name is assembled from several columns; turn
        # it into a "{a}.{b}" template. An empty aws_view is left alone
        # instead of raising IndexError.
        if self.aws_view and '_+_' in self.aws_view[0]:
            self.aws_view = '.'.join('{%s}' % x.lower() for x in self.aws_view[0].split('_+_'))

        # A single string is wrapped in a list. The previous code referenced
        # an undefined name here and raised NameError for string input.
        if isinstance(self.excludes, str):
            self.excludes = [self.excludes]
        elif self.excludes:
            self.excludes = list(self.excludes)
        else:
            self.excludes = []

@dataclass
class MetaConfig(BaseType):
    """Root configuration object returned by read_config().

    All four sections are declared with dataclass types, so
    BaseType.__post_init__ builds each one from its raw config section.
    """
    input: MetaInput  # what to read: Excel table list, row range, category
    output: MetaOutput  # where results, logs and hand-off files go
    match: MetaMatch  # column matching settings (unused in the visible code)
    column_maps: ColumnMap  # mapping workbook settings consumed by ColmapUtils.process()

@dataclass
class MetaRecord:
    """Mutable state shared across the processing functions via the global ``record``."""
    # Hand-off record for the table currently being processed; main() replaces
    # it with a fresh UDict for every table.
    next_d: dict = field(default_factory=dict)
    # Column mappings keyed by table name, as returned by ColmapUtils.process().
    col_maps: dict = field(default_factory=dict)
    # Status slot; not assigned anywhere in the visible part of this file.
    pull_status: PullStatus = None

## Cell 4: Configuration Reading Functions

In [None]:
#--- Patch confection library to preserve case sensitivity ---#
def patch_confection():
    """Make confection build case-preserving ConfigParser instances.

    ``confection.get_configparser`` is replaced by a mock that wraps a local
    factory whose parser keeps option names verbatim (``optionxform = str``).
    The patch is applied once per function object and never stopped; later
    calls return immediately. The previous guard inspected a freshly created
    patcher, which never has ``is_local`` before it is started, so every call
    stacked another patch.
    """
    if getattr(patch_confection, '_patcher', None) is not None:
        return

    def get_configparser(interpolate: bool = True):
        # Imported lazily so confection is only required once patching happens.
        from confection import CustomInterpolation
        config = ConfigParser(
            interpolation=CustomInterpolation() if interpolate else None,
            allow_no_value=True,
        )
        config.optionxform = str  # keep keys case-sensitive
        return config

    patcher = mock.patch('confection.get_configparser', wraps=get_configparser)
    patcher.start()
    # Remember the active patcher so repeated read_config() calls do not re-patch.
    patch_confection._patcher = patcher

#--- Read configuration file and create config object ---#
def read_config(config_class: BaseType, config_path: None | UPath | str = None, overrides=None):
    """Load a confection config and build ``config_class`` from it.

    Args:
        config_class: dataclass type to instantiate (e.g. MetaConfig).
        config_path: path of a config file, or the config text itself when it
            does not name an existing file.
        overrides: optional mapping of dotted keys to override values; passed
            through to confection.

    Returns:
        ``config_class`` built from the sections of the config, with the
        entries of an optional ``root`` section merged in at top level
        (regular sections win on a name clash).

    Raises:
        TypeError: if ``config_path`` is None.
    """
    if config_path is None:
        raise TypeError('config_path must be a file path or config text, not None')
    # A fresh dict per call instead of a shared mutable default.
    overrides = {} if overrides is None else overrides

    patch_confection()
    if UPath(config_path).is_file():
        config = Config().from_disk(config_path, overrides=overrides)
    else:
        config = Config().from_str(config_path, overrides=overrides)
    return config_class(**{**config.pop('root', {}), **config})

## Cell 5: Utility Classes and Functions

In [None]:
#--- Start logging session with separator ---#
def start_run():
    """Log two blank lines and a WIDTH-wide '=' rule to mark the start of a run."""
    banner = '=' * WIDTH
    logger.info('\n\n' + banner)

#--- End logging session with separator ---#
def end_run():
    """Log two blank lines and a WIDTH-wide '=' rule to mark the end of a run."""
    banner = '=' * WIDTH
    logger.info('\n\n' + banner)

#--- Load environment variables from file ---#
def load_env(file):
    """Load environment variables from ``file`` via dotenv, on Windows only."""
    if inWindows:
        load_dotenv(file)

class IO:
    """File I/O helpers - Parquet for DataFrames, JSON for metadata, pickle as legacy."""

    @staticmethod
    def write_dataframe(file, df):
        """Write ``df`` to ``file`` as snappy-compressed Parquet, keeping the index."""
        file = UPath(file)
        df.to_parquet(file, index=True, engine='pyarrow', compression='snappy')

    @staticmethod
    def read_dataframe(file):
        """Load a DataFrame from the Parquet file ``file``."""
        file = UPath(file)
        return pd.read_parquet(file, engine='pyarrow')

    @staticmethod
    def write_json(file, data, cls=None):
        """Serialise ``data`` to ``file`` as indented JSON.

        Values the json module cannot encode are converted as follows: NumPy
        integer/float/bool scalars to Python scalars, arrays to lists, missing
        scalars (``pd.NA``, ``NaT``) to null, dates and datetimes to ISO
        strings, and sets to lists.

        Args:
            file: target path.
            data: object to serialise.
            cls: optional ``json.JSONEncoder`` subclass.

        Raises:
            TypeError: for any other unsupported type.
        """
        import numpy as np
        import pandas as pd
        import datetime as dt

        def convert(obj):
            # np.bool_ is not an np.integer, so it has to be listed explicitly.
            if isinstance(obj, (np.integer, np.floating, np.bool_)):
                return obj.item()
            if isinstance(obj, np.ndarray):
                return obj.tolist()
            # pd.isna returns an array for array-likes; only a scalar verdict
            # may be used as a condition. This check must stay ahead of the
            # datetime test because NaT is itself a datetime instance.
            missing = pd.isna(obj)
            if isinstance(missing, (bool, np.bool_)) and missing:
                return None
            if isinstance(obj, (dt.datetime, dt.date)):
                return obj.isoformat()
            if isinstance(obj, set):
                return list(obj)
            raise TypeError(f"Object of type {type(obj)} is not JSON serializable")

        with open(file, 'w') as f:
            json.dump(data, f, indent=2, default=convert, cls=cls)

    @staticmethod
    def read_json(file):
        """Read the JSON file ``file`` and return the decoded object."""
        with open(file, 'r') as fp:
            return json.load(fp)

    @staticmethod
    def write_pickle(file, data):
        """Deprecated: Use write_dataframe or write_json instead"""
        with open(file, 'wb') as f:
            pickle.dump(data, f)

    @staticmethod
    def read_pickle(file):
        """Deprecated: Use read_dataframe or read_json instead"""
        # NOTE: unpickling executes arbitrary code; only load files this
        # pipeline wrote itself.
        with open(file, 'rb') as fp:
            return pickle.load(fp)

    @staticmethod
    def delete_file(file):
        """Delete ``file`` if it exists; do nothing otherwise."""
        if (filepath := UPath(file)).exists():
            filepath.unlink()

class UDict(dict):
    """Dictionary whose lookups ignore the case of string keys.

    Keys keep the spelling they were first stored with. ``d[key]``,
    ``key in d`` and ``d.get(key)`` resolve string keys case-insensitively;
    keys that are not strings are matched exactly.
    """

    def __getitem__(self, key):
        return super().__getitem__(self._match(key))

    def __contains__(self, key):
        try:
            self._match(key)
            return True
        except KeyError:
            return False

    def _match(self, key):
        """Return the stored key equal to ``key`` ignoring case, else raise KeyError."""
        if isinstance(key, str):
            wanted = key.lower()
            for k in self:
                if isinstance(k, str) and k.lower() == wanted:
                    return k
        elif super().__contains__(key):
            return key
        raise KeyError(key)

    def update(self, other=None, **kwargs):
        """Update the dictionary.

        Items from ``other`` (a mapping or an iterable of pairs) are stored
        under their keys verbatim. Keyword items may only update existing
        entries: each name is resolved case-insensitively and the value is
        written to the stored key, so no differently-cased duplicate is
        created.

        Raises:
            KeyError: if a keyword name matches no existing key.
        """
        if other is not None:
            pairs = other.items() if isinstance(other, abc.Mapping) else other
            for k, v in pairs:
                self[k] = v
        for k, v in kwargs.items():
            # Write through the stored spelling. Storing under ``k`` itself
            # left the original entry (and every lookup) on the old value.
            self[self._match(k)] = v

    def get(self, key, default_value=None):
        try:
            return self[key]
        except KeyError:
            return default_value

class Misc:
    """Miscellaneous conversion helpers."""

    @staticmethod
    def convert2int(a):
        """Return ``int(a)``, or None when ``a`` cannot be converted.

        Covers wrong types, unparsable strings, NaN and infinities (the
        latter raise OverflowError, which was previously not caught).
        """
        try:
            return int(a)
        except (TypeError, ValueError, OverflowError):
            return None

    @staticmethod
    def convert2datestr(a):
        """Format a ``datetime`` as 'YYYY-MM-DD'; return any other value unchanged."""
        if isinstance(a, datetime):
            return a.strftime('%Y-%m-%d')
        return a

## Cell 6: PCDS Database Connection and SQL Engine

In [None]:
#--- Connect to PCDS Oracle database ---#
def pcds_connect(service_name, ldap_service='X'):
    """Establish connection to PCDS Oracle database.

    Placeholder: the connection logic is environment specific and has to be
    filled in before the notebook can run. Callers use the return value as a
    context manager (``with pcds_connect(...) as CONN``).

    Args:
        service_name: service identifier; callers pass the part of
            ``pcds_tbl`` before the first dot.
        ldap_service: unused by the placeholder.

    Raises:
        NotImplementedError: always, until the function is implemented.
    """
    # Imported here so the driver is only required when a connection is made.
    import oracledb
    # Map service names to connection strings
    # (not consulted yet -- intended for the real implementation).
    svc2server = {
        'A': 'PBCS21P',
        'B': 'PBCS30P',
        'C': 'PCDS',
        'D': 'PBCS23P',
    }
    # Implement connection logic based on your environment
    # Example: return oracledb.connect(user=usr, password=pwd, dsn=dns_tns)
    raise NotImplementedError("Please implement PCDS connection logic")

class SQLengine:
    """Builds and runs SQL against PCDS, remembering the current table's context.

    State kept between calls:
        _where   -- WHERE-clause text built by get_where_sql()
        _type    -- data type of the date column
        _date    -- SQL expression yielding the date as a 'YYYY-MM-DD' string
        _dateraw -- upper-cased name of the underlying date column
        _table   -- table name found in the most recently cleaned query
    """

    def __init__(self, platform: Literal['PCDS']):
        self._platform = platform
        self.reset()

    def reset(self):
        """Reset internal state"""
        self._where = None
        self._type = None
        self._date = None
        self._dateraw = None
        self._table = None
        self._format = PCDS_DT_FORMAT

    def extract_var(self, stmt):
        """Return ``(stmt, column)`` for a date expression.

        ``column`` is the upper-cased first argument of a ``FUNC(x, 'fmt')``
        or ``FUNC(FUNC(x, 'fmt'), 'fmt')`` call, or the upper-cased statement
        itself when it has neither shape.
        """
        word, time, tagt = r'\w+_\w+', r"'[^']*'", r'[^,]+'
        nested = fr"{word}\({word}\(({tagt}),\s*{time}\),\s*{time}\)"
        single = fr"{word}\(({tagt}),\s*{time}\)"
        m = re.match(nested, stmt) or re.match(single, stmt)
        date_raw = m.group(1) if m else stmt
        return stmt, date_raw.upper()

    def query(self, query, connection, **query_kwargs):
        """Clean ``query``, run it on ``connection``, return a DataFrame.

        Column names of the result are upper-cased.
        """
        query = self.clean_query(query)
        df = psql.read_sql_query(query, connection, **query_kwargs)

        #>>> Normalize column names to uppercase <<<#
        df.columns = [x.upper() for x in df.columns]
        return df

    def clean_query(self, query: str):
        """Prepare ``query`` for execution.

        * records the table after the last FROM in ``_table`` (None when the
          query has no FROM clause);
        * aliases the date expression in the select list as ``_dateraw``,
          unless it is already followed by ``AS <alias>``;
        * removes lines that consist of a bare WHERE.
        """
        #>>> Extract table name from query <<<#
        # The query is searched reversed so that the LAST "FROM <table>" wins.
        table_pattern = r'([\w.]+)\s+MORF\b'
        found = re.search(table_pattern, query[::-1], flags=re.I)
        self._table = found.group(1)[::-1] if found else None

        #>>> Add alias to date column if needed <<<#
        if self._date:
            # Only the select list (text before the first FROM/WHERE) is
            # searched. The lookarounds keep the expression from matching
            # inside a longer identifier and skip an occurrence that already
            # carries an alias. (The old lookahead contained a literal
            # backslash by mistake and therefore never detected an alias.)
            select_part = re.split(r'\b(?:FROM|WHERE)\b', query, flags=re.I)[0]
            date_pattern = r'(?<!\w)' + re.escape(self._date) + r'(?!\w)(?!\s+AS\s+\w+)'
            match = re.search(date_pattern, select_part, flags=re.I)
            if match:
                st, ed = match.span()
                query = query[:st] + f'{self._date} as {self._dateraw}' + query[ed:]

        #>>> Remove empty WHERE clauses <<<#
        where_pattern = r'^\s*where\s*$'
        return re.sub(where_pattern, '', query, flags=re.I | re.M)

    def get_where_sql(self, date_var: str, date_type: str, start_dt=None, end_dt=None, where_cstr='') -> None:
        """Build the WHERE clause for a table and store it in ``_where``.

        Args:
            date_var: date column, optionally followed by ``(format)``.
            date_type: data type of that column.
            start_dt, end_dt: optional inclusive bounds; missing values are skipped.
            where_cstr: extra constraint; a ``(select ...)`` subquery in it is
                executed and replaced by its first result value.

        Also sets ``_type``, ``_date`` and ``_dateraw``. Returns nothing.
        """
        self._type = date_type

        #>>> Handle subquery in where constraint <<<#
        if not pd.isna(where_cstr) and (m := re.search(r'(?<=\()select.*(?=\))', where_cstr)):
            rhs = self.query_PCDS(m.group()).iloc[0, 0]
            if not isinstance(rhs, str):
                rhs = rhs.strftime('%Y-%m-%d')
            # Keep the text left of the opening parenthesis, append the literal.
            where_cstr = "%s '%s'" % (where_cstr[:m.start() - 1], rhs)

        where_sql = [where_cstr]
        self.get_date_sql(date_var, date_type)

        #>>> Add date range filters <<<#
        if not pd.isna(start_dt):
            start_dt = Misc.convert2datestr(start_dt)
            where_sql.append(f"{self._date} >= '{start_dt}'")
        if not pd.isna(end_dt):
            end_dt = Misc.convert2datestr(end_dt)
            where_sql.append(f"{self._date} <= '{end_dt}'")

        #>>> Convert TO_CHAR comparisons to TO_DATE for PCDS <<<#
        # "TO_CHAR(col, 'fmt') <op> 'value'" becomes "col <op> TO_DATE('value', 'fmt')"
        # so that the comparison is done on the column itself.
        for i, sql_stmt in enumerate(where_sql):
            if pd.isna(sql_stmt):
                continue
            if (match := re.match(r"^TO_CHAR\(([^,]+),\s*'(.*)'\)\s*([><=!]+)\s*'([^']+)'", sql_stmt)):
                a, b, c, d = match.groups()
                where_sql[i] = f"{a} {c} TO_DATE('{d}', '{b}')"

        # Missing and blank fragments are dropped; a blank constraint used to
        # produce a clause starting with " AND ".
        self._where = ' AND '.join(
            x for x in where_sql if not pd.isna(x) and str(x).strip()
        )

    @staticmethod
    def get_date_format(date_var):
        """Split ``"COL (FORMAT)"`` into ``(COL, FORMAT)``; FORMAT is None when absent."""
        pattern = r'^(.+?)(?:\s*\(([^)]+)\))?$'
        date_var, date_format = re.match(pattern, date_var).groups()
        return date_var, date_format

    def get_date_sql(self, date_var: str, date_type: str):
        """Derive ``_date`` / ``_dateraw`` for the given date column.

        String columns with a format are parsed with TO_DATE; anything
        date-like is then rendered with ``TO_CHAR(..., 'YYYY-MM-DD')``.
        """
        date_var, date_format = self.get_date_format(date_var)
        is_date = re.search(r'time|date', date_type, re.IGNORECASE)

        #>>> Parse string dates if format provided <<<#
        if date_format and (not is_date):
            date_var = f"TO_DATE({date_var}, '{date_format}')"
            is_date = True

        #>>> Convert to standard string format <<<#
        if is_date:
            date_var = f"TO_CHAR({date_var}, 'YYYY-MM-DD')"

        self._date, self._dateraw = self.extract_var(date_var)

    def __repr__(self):
        return f'SQL({self._platform})\n' \
               f'   table: {self._table}\n' \
               f'   where: {self._where}\n' \
               f'   date : {self._date} ({self._dateraw})'

    def query_PCDS(self, query_stmt: str, service_name: str = None, **query_kwargs):
        """Open a PCDS connection for ``service_name`` and run ``query_stmt`` on it."""
        with pcds_connect(service_name=service_name) as CONN:
            return self.query(query_stmt, CONN, **query_kwargs)

## Cell 7: Excel Input Processing and Column Mapping

In [None]:
#--- Read and process Excel input file with table configurations ---#
def read_excel_input(config: MetaTable) -> pd.DataFrame:
    """Read the Excel sheet that lists the tables to process.

    The configured columns are read and renamed via ``config.select_cols``,
    string cells are trimmed, the ``config.select_rows`` expressions are
    applied as one AND-ed ``DataFrame.query`` filter, and the service/table
    columns are merged into a qualified ``pcds_tbl`` column.

    Any failure is logged and re-raised unchanged.
    """
    def trim_me(x):
        """Trim whitespace from strings"""
        return x.strip() if isinstance(x, str) else x

    def extract_name(name):
        """Remove parenthetical notes from names"""
        if pd.isna(name):
            return pd.NA
        if not isinstance(name, str):
            return name
        remove_extra = r'\(.*\)'
        return re.sub(remove_extra, '', name).strip()

    def merge_pcds_svc_tbl(df):
        """Combine service and table names into qualified names (modifies ``df`` in place)"""
        with warnings.catch_warnings():
            # df may be the result of a query() filter; silence the copy warning.
            warnings.simplefilter('ignore', pd.errors.SettingWithCopyWarning)
            # Date and where columns may legitimately contain parentheses, so
            # they are excluded from the parenthetical-note removal.
            cols = [x for x in df.columns if x not in (
                'group', 'pcds_dt', 'aws_dt', 'pcds_where', 'aws_where'
            )]
            df[cols] = df[cols].map(extract_name)
            tbl = df.pop('pcds_tbl')
            # The column-map key defaults to the bare table name.
            df['col_map'] = df['col_map'].fillna(tbl).copy()
            svc = df.pop('pcds_svc').fillna('no_server_provided')
            # Qualified name is "<service>.<lower-cased table>".
            df.loc[:, ['pcds_tbl']] = svc + '.' + tbl.str.lower()
            df['pcds_dt'] = df['pcds_dt'].copy().fillna(NO_DATE)
            df['aws_dt'] = df['aws_dt'].copy().fillna(NO_DATE)
            # Missing where-constraints become None rather than NaN.
            df[['pcds_where', 'aws_where']] = df[['pcds_where', 'aws_where']].replace(np.nan, None)

    file_path = config.file
    try:
        df = pd.read_excel(
            file_path, sheet_name=config.sheet,
            skiprows=config.skip_rows, usecols=list(config.select_cols)
        )
        df = df.rename(columns=config.select_cols).map(trim_me)
        if len(config.select_rows) > 0:
            df = df.query(' & '.join(config.select_rows))
        merge_pcds_svc_tbl(df)
        logger.info(f"Read {len(df)} rows from {file_path}")
        return df
    except Exception as e:
        logger.error(f"Failed to read Excel file {file_path}: {e}")
        raise

class ColmapUtils:
    """Builds PCDS-to-AWS column mappings from the mapping workbook."""

    def __init__(self, category: Literal['loan', 'dpst']):
        """Select the workbook reader that matches ``category``."""
        self.category = category
        if category == 'dpst':
            self._obtain_table = self.process_dpst
        elif category == 'loan':
            self._obtain_table = self.process_loan

    def is_column_tokenized(self, row):
        """Return True when the row marks its column as PII / tokenised.

        A column counts as tokenised when ``pii_encryption`` is 'Y' or the
        ``note`` text contains 'tokenise'.
        """
        if row.get('pii_encryption', None) == 'Y':
            return True
        try:
            return 'tokenise' in row.get('note', "").lower()
        except (TypeError, AttributeError):
            # note is not a string (e.g. a missing cell)
            return False

    def process_loan(self, config: "ColumnMap"):
        """Yield ``(sheet_name, DataFrame)`` for every sheet of the loan workbook."""
        all_sheets = pd.read_excel(config.input, sheet_name=None)
        yield from all_sheets.items()

    def process_dpst(self, config: "ColumnMap"):
        """Group the 'Column_Details' sheet by the first ``pcds_view`` column."""
        all_sheets = pd.read_excel(config.input, sheet_name='Column_Details')
        return all_sheets.groupby(config.pcds_view[0])

    def process(self, config: "ColumnMap") -> "UDict":
        """Build (or load) the column mappings for all tables.

        When ``config.output`` exists and ``config.overwrite`` is false, the
        stored JSON is returned. Otherwise the workbook is parsed, the result
        is written to ``config.output`` and returned.

        Returns:
            UDict keyed by PCDS table name. Each entry holds ``pcds_table``,
            ``aws_table``, ``pcds2aws``, ``aws_unique``, ``pcds_unique``,
            ``duplicated_pcds``, ``duplicated_aws`` and ``pii_cols``.
        """
        if os.path.exists(config.output) and (not config.overwrite):
            # Wrapped so that cached and freshly built results behave alike
            # (case-insensitive table lookup in both cases).
            return UDict(IO.read_json(config.output))

        table_excludes = config.excludes or []
        mappings = {}

        #>>> Process each table's column mappings <<<#
        for pcds_name, df in self._obtain_table(config):
            if pcds_name in table_excludes:
                continue

            # Loan sheets with a 'Source' column carry the real header in
            # their first data row.
            if self.category == 'loan' and 'Source' in df.columns:
                df = pd.DataFrame(df.iloc[1:].values, columns=df.iloc[0])

            df = df.rename(columns=self.colfunc)
            pcds2aws, aws2pcds = {}, {}
            # Reset per table so that an empty sheet can neither reuse the
            # previous table's AWS name nor hit an unbound variable.
            aws_name = pd.NA
            s = {
                'aws_unique': [],
                'pcds_unique': [],
                'duplicated_pcds': set(),
                'duplicated_aws': set(),
                'pii_cols': set()
            }

            #>>> Build bidirectional column mappings <<<#
            for i, row in enumerate(df.itertuples()):
                row = UDict(**row._asdict())
                if i == 0:
                    # Table names are taken from the first row of each table.
                    if self.category == 'loan':
                        pcds_name = self.fetchcol(row, config.pcds_view, config.na_str)
                        aws_name = self.fetchcol(row, config.aws_view, config.na_str)
                    elif self.category == 'dpst':
                        aws_name = config.aws_view.format(**row)

                pcds_col = self.fetchcol(row, config.pcds_col, config.na_str)
                aws_col = self.fetchcol(row, config.aws_col, config.na_str)
                pcds_na, aws_na = pd.isna(pcds_col), pd.isna(aws_col)

                if pcds_na and aws_na:
                    continue
                elif pcds_na:
                    s['aws_unique'].append(aws_col.lower())
                elif aws_na:
                    s['pcds_unique'].append(pcds_col.upper())
                else:
                    pcds_col, aws_col = pcds_col.upper(), aws_col.lower()

                    #>>> Check for duplicate AWS columns <<<#
                    has_dupl, aws_dup = self.get_duplicates(aws_col, pcds_col, aws2pcds, s['aws_unique'])
                    if has_dupl:
                        logger.warning(f'Table {pcds_name} has duplicated AWS column {aws_col}')
                        s['duplicated_aws'] |= aws_dup
                    else:
                        aws2pcds[aws_col] = pcds_col

                    #>>> Check for duplicate PCDS columns <<<#
                    has_dupl, pcds_dup = self.get_duplicates(pcds_col, aws_col, pcds2aws, s['pcds_unique'])
                    if has_dupl:
                        logger.warning(f'Table {pcds_name} has duplicated PCDS column {pcds_col}')
                        s['duplicated_pcds'] |= pcds_dup
                    else:
                        pcds2aws[pcds_col] = aws_col

                if self.is_column_tokenized(row):
                    s['pii_cols'].add(pcds_col)

            if len(pcds2aws) == 0:
                logger.info(f"No match key is found in {pcds_name}")

            # Sets are turned into lists for JSON output; missing names dropped.
            s['duplicated_pcds'] = list(s['duplicated_pcds'])
            s['duplicated_aws'] = list(s['duplicated_aws'])
            s['pii_cols'] = [x for x in s['pii_cols'] if pd.notna(x)]
            mappings[pcds_name] = {
                'pcds_table': pcds_name,
                'aws_table': aws_name,
                'pcds2aws': pcds2aws,
                **s
            }

        IO.write_json(config.output, mappings)
        return UDict(mappings)

    @staticmethod
    def get_duplicates(col_a, col_b, a2b: dict, unique_a: list):
        """Check whether mapping ``col_a -> col_b`` conflicts with earlier rows.

        Returns:
            ``(True, {"a:old", "a:new"})`` when ``col_a`` is already mapped to
            a different column, or was already seen without a partner (the
            old partner is then rendered as ``<NA>``); ``(False, None)``
            otherwise.
        """
        if col_a in a2b:
            exist_val = a2b[col_a]
            if col_b != exist_val:
                return True, {f'{col_a}:{exist_val}', f'{col_a}:{col_b}'}
            return False, None
        if col_a in unique_a:
            # Comparing against pd.NA yields NA, which cannot be used as a
            # condition, so this case is decided without a comparison.
            return True, {f'{col_a}:{pd.NA}', f'{col_a}:{col_b}'}
        return False, None

    @staticmethod
    def colfunc(col):
        """Normalise a header: last line only, lower case, whitespace to underscores.

        A missing header becomes 'comment'; headers that are not strings are
        converted with ``str`` first.
        """
        if pd.isna(col):
            return 'comment'
        col = str(col).split('\n')[-1]
        return '_'.join(x.lower() for x in col.split())

    @staticmethod
    def fetchcol(row, names, na_str):
        """Return the first usable value among the columns ``names`` of ``row``.

        A value is usable when the column exists and the value is neither
        missing nor equal to ``na_str``; it is returned stripped. ``pd.NA`` is
        returned when no column qualifies.
        """
        for name in names:
            name = name.lower()
            if (name in row) and (not pd.isna(row[name])) and (row[name] != na_str):
                return row[name].strip()
        return pd.NA

## Cell 8: PCDS Processing Functions

In [None]:
# Initialize global objects
# Shared SQL engine: process_pcds_meta() stores the WHERE clause and date
# expression on it, and process_pcds_date() reads them back afterwards.
proc_pcds = SQLengine('PCDS')
# Shared record holding the column mappings and the per-table hand-off data.
record = MetaRecord()

#--- Process PCDS table metadata (columns and row count) ---#
def process_pcds_meta(row, rename_columns=None):
    """Fetch column metadata and the row count for one PCDS table.

    Args:
        row: table record with ``pcds_tbl`` ("service.table"), ``pcds_dt``,
            ``pcds_where``, ``start_dt`` and ``end_dt`` attributes.
        rename_columns: optional mapping from PCDS column name to AWS column
            name, used to fill the ``aws_colname`` column.

    Returns:
        ``({'column': <metadata frame>, 'row': <row-count frame>}, has_mapping)``
        where ``has_mapping`` tells whether any column mapping was supplied.

    Raises:
        NONEXIST_TABLE: when a query fails with a database error or a
            ValueError.

    Side effect: leaves the WHERE clause and date expression on the global
    ``proc_pcds`` engine.
    """
    # A fresh dict per call instead of a shared mutable default.
    rename_columns = {} if rename_columns is None else rename_columns
    service, table = (info_str := row.pcds_tbl).split('.', maxsplit=1)
    logger.info(f"\tStart processing {info_str}")

    #>>> Query column metadata and row counts <<<#
    try:
        with pcds_connect(service) as CONN:
            df_type = proc_pcds.query(PCDS_SQL_META.format(table=table), CONN)
            if hasattr(df_type, 'last_modified'):
                record.next_d.update(last_modified=df_type.last_modified)

            #>>> Extract date variable and build WHERE clause <<<#
            # Leading word of pcds_dt, i.e. the column name without "(format)".
            date_var = re.match(r'(\w+)(?=\s*\()?', row.pcds_dt).group(1)
            if date_var == NO_DATE:
                proc_pcds._where = row.pcds_where
            else:
                # NOTE(review): .item() raises ValueError when the date column
                # is absent from the metadata; that is reported below as a
                # missing table -- confirm this is intended.
                proc_pcds.get_where_sql(
                    date_var=row.pcds_dt,
                    date_type=df_type.query(f"COLUMN_NAME == '{date_var.upper()}'")['DATA_TYPE'].item(),
                    start_dt=row.start_dt,
                    end_dt=row.end_dt,
                    where_cstr=row.pcds_where
                )

            nrow_sql = PCDS_SQL_NROW.format(table=table, limit=proc_pcds._where)
            df_nrow = proc_pcds.query(nrow_sql, CONN)
    except (pd.errors.DatabaseError, ValueError):
        logger.warning(f"Couldn't find {table.upper()} in {service.upper()}")
        raise NONEXIST_TABLE("PCDS View Not Existing")

    df_type.columns = [x.lower() for x in df_type.columns]
    df_type['aws_colname'] = df_type['column_name'].map(rename_columns)
    return {'column': df_type, 'row': df_nrow}, len(rename_columns) > 0

#--- Query PCDS table for date-wise row counts ---#
def process_pcds_date(row):
    """Return row counts per date value for one PCDS table.

    Uses the WHERE clause and date expression currently stored on the global
    ``proc_pcds`` engine.

    Raises:
        NONEXIST_DATEVAR: when the query fails with a database error.
    """
    info_str = row.pcds_tbl
    service, table = info_str.split('.', maxsplit=1)

    try:
        with pcds_connect(service) as CONN:
            date_sql = PCDS_SQL_DATE.format(
                date=proc_pcds._date,
                table=table,
                limit=proc_pcds._where,
            )
            df_meta = proc_pcds.query(date_sql, CONN)
        logger.info(f"\tFinish Processing {info_str}")
    except pd.errors.DatabaseError:
        raw_column = proc_pcds._dateraw
        if raw_column:
            logger.warning(f"Column {raw_column.upper()} not found in {table.upper()}")
        raise NONEXIST_DATEVAR("Date-like Variable Not In PCDS")
    return df_meta

#--- Initialize output folders and logging ---#
def start_setup(start_row, C_out):
    """Prepare the output folder and logging for a run.

    When the run starts at the first row (``start_row <= 1``) the CSV output
    of an earlier run is deleted; a missing file is not an error. The output
    folder is then created and the configured log sink is registered.

    Args:
        start_row: first table position to process; a value that cannot be
            compared with 1 leaves the CSV untouched.
        C_out: output configuration providing ``csv.file``, ``folder`` and ``log``.
    """
    # Explicit test instead of ``assert``: assertions are stripped under
    # ``python -O``, which would delete the CSV on every run.
    try:
        fresh_run = start_row <= 1
    except TypeError:
        fresh_run = False

    if fresh_run:
        try:
            os.remove(C_out.csv.file)
        except (TypeError, FileNotFoundError):
            pass

    os.makedirs(C_out.folder, exist_ok=True)
    logger.add(**C_out.log.todict())

## Cell 9: Configuration Parsing

In [None]:
#--- Parse command line arguments and load configuration ---#
def parse_config():
    """Parse command-line options and load the matching configuration.

    Options:
        --category  'loan' or 'dpst'; selects ``files/inputs/config_meta_<category>.cfg``
        --name      overrides ``input.name``
        --query     overrides ``input.table.select_rows``

    Unknown arguments are ignored so that the function also works inside a
    notebook kernel, whose command line carries its own options.

    Returns:
        The loaded MetaConfig. As a side effect the output folder is created
        and the config file is copied into it as ``<step>_pcds.cfg``.
    """
    parser = argparse.ArgumentParser(description='Conduct Meta Info Analysis - PCDS Step')
    parser.add_argument(
        '--category',
        choices=['loan', 'dpst'],
        default='dpst',
        help='which meta template to use',
    )
    parser.add_argument(
        '--name', type=str,
        default='test_0827',
        help='how to name this analysis (override)'
    )
    parser.add_argument(
        '--query', type=str,
        default='group == "test_0827"',
        help='row filter applied to the input table (override)'
    )
    # parse_args() would abort on the kernel's own arguments (e.g. "-f <file>").
    args, _unknown = parser.parse_known_args()

    # --category is restricted to the two choices above.
    config_path = f'files/inputs/config_meta_{args.category}.cfg'

    config = read_config(
        MetaConfig,
        config_path=config_path,
        overrides={
            'input.table.select_rows': args.query,
            'input.name': args.name
        }
    )
    # parents=True: a nested output folder must not fail on a first run.
    (out_folder := UPath(config.output.folder)).mkdir(parents=True, exist_ok=True)
    shutil.copy(config_path, out_folder.joinpath(f'{config.input.step}_pcds.cfg'))
    return config

## Cell 10: Main Execution - PCDS Processing

In [None]:
def _save_pcds_outputs(output_folder, df_dict, df_next):
    """Write per-table files, the run summary and the next-step metadata.

    Args:
        output_folder: Path-like folder that receives all files.
        df_dict: Per-table results keyed by table name, each holding
            ``pcds_meta``, ``pcds_date``, ``status`` and ``sql_engine``.
        df_next: Per-table records written to ``pcds_metadata.json``.
    """
    for table_name, table_data in df_dict.items():
        pcds_meta = table_data['pcds_meta']

        # Column info as parquet, only when the metadata pull returned it
        if pcds_meta and 'column' in pcds_meta:
            col_file = output_folder / f'pcds_column_info_{table_name}.parquet'
            IO.write_dataframe(col_file, pcds_meta['column'])

        # Date counts as parquet; pcds_date stays None when the date query failed
        if table_data['pcds_date'] is not None:
            date_file = output_folder / f'pcds_date_counts_{table_name}.parquet'
            IO.write_dataframe(date_file, table_data['pcds_date'])

        # Remaining metadata as JSON
        meta_file = output_folder / f'pcds_metadata_{table_name}.json'
        IO.write_json(meta_file, {
            'status': table_data['status'],
            'sql_engine': table_data['sql_engine'],
            'nrows': pcds_meta.get('row', pd.DataFrame()).to_dict() if pcds_meta else {}
        })

    # NOTE: the summary lists the column/date parquet paths for every table,
    # including tables for which those files were not written above.
    summary_data = {
        name: {
            'column_file': str(output_folder / f'pcds_column_info_{name}.parquet'),
            'date_file': str(output_folder / f'pcds_date_counts_{name}.parquet'),
            'meta_file': str(output_folder / f'pcds_metadata_{name}.json'),
            'status': data['status']
        } for name, data in df_dict.items()
    }
    IO.write_json(output_folder / 'pcds_summary.json', summary_data)

    # Metadata consumed by the next step
    IO.write_json(output_folder / 'pcds_metadata.json', df_next)


def main():
    """Run the PCDS meta analysis end to end.

    Steps, in order:
      1. Parse the configuration, prepare the output folder and logging.
      2. Load environment variables, the input table list (first entry per
         ``pcds_tbl``) and the column mappings.
      3. For every table whose 1-based position lies within
         ``config.input.range``, pull column metadata and date-wise counts
         and derive a ``PullStatus``.
      4. Write per-table parquet/JSON files, ``pcds_summary.json`` and
         ``pcds_metadata.json`` into the output folder.

    Tables that raise ``NONEXIST_TABLE`` are logged and left out of every
    output file.
    """
    config = parse_config()
    df_dict, df_next = {}, {}
    C_out, C_in = config.output, config.input
    start_row, end_row = C_in.range
    start_setup(start_row, C_out)
    logger.info('Configuration:\n' + config.tolog())

    #>>> Load environment variables <<<#
    load_env(C_in.env)
    start_run()

    #>>> Load input table list and column mappings <<<#
    tbl_list = (
        read_excel_input(C_in.table)
        .groupby('pcds_tbl')
        .first()
        .reset_index()
    )
    record.col_maps = (
        ColmapUtils(C_in.category)
        .process(config.column_maps)
    )

    #>>> Process each table <<<#
    total = len(tbl_list)
    for i, row in enumerate(tqdm(
        tbl_list.itertuples(), desc='Processing PCDS ...', total=total
    ), start=1):
        # Positions are 1-based; both ends of the range are inclusive
        if i < start_row or i > end_row:
            continue

        pcds_m, pcds_d = {}, None
        record.next_d = UDict(C_out.next.fields)
        # Second dot-separated part of the table identifier, lower-cased
        name: str = row.pcds_tbl.split('.')[1].lower()

        logger.info(f">>> Start {name}")

        #>>> Initialize record for this table <<<#
        record.next_d.update(
            pcds_tbl=row.pcds_tbl,
            aws_tbl=row.aws_tbl,
            pcds_dt=row.pcds_dt,
            aws_dt=row.aws_dt,
            partition=row.partition,
            last_modified=datetime.now().strftime('%Y-%m-%d'),
            tokenised_cols=[]
        )

        #>>> Try to pull PCDS table metadata <<<#
        try:
            rename_columns = {}
            if (c := row.col_map) and (c in record.col_maps):
                meta_info = record.col_maps[c]
                # PII columns are excluded from the rename map and recorded
                # separately as tokenised columns
                rename_columns = {
                    k: v for k, v in meta_info['pcds2aws'].items()
                    if k not in meta_info['pii_cols']
                }
                record.next_d.update(tokenised_cols=meta_info['pii_cols'].copy())
            pcds_m, exist_mapping = process_pcds_meta(row, rename_columns)
        except NONEXIST_TABLE:
            logger.error(f"PCDS table {name} does not exist")
            # NOTE(review): the table is skipped entirely, so
            # PullStatus.NONEXIST_PCDS is never recorded in the outputs --
            # confirm this is intended.
            # Reset here too: ``continue`` would otherwise bypass the
            # end-of-iteration reset and leave this table's engine state
            # in place for the next one.
            proc_pcds.reset()
            continue

        #>>> Check if column mapping was provided <<<#
        pull_status = PullStatus.SUCCESS if exist_mapping else PullStatus.NO_MAPPING

        #>>> Try to get date-wise counts from PCDS <<<#
        try:
            pcds_d = process_pcds_date(row)
            if len(pcds_m) == 0:
                pull_status = PullStatus.EMPTY_PCDS
        except NONEXIST_DATEVAR:
            # An earlier, more specific status takes precedence
            if pull_status == PullStatus.SUCCESS:
                pull_status = PullStatus.NONDATE_PCDS

        #>>> Store results <<<#
        df_dict[name] = {
            'pcds_meta': pcds_m,
            'pcds_date': pcds_d,
            'status': pull_status.value,
            'sql_engine': {
                'where': proc_pcds._where,
                'date': proc_pcds._date,
                'dateraw': proc_pcds._dateraw,
                'type': proc_pcds._type
            }
        }

        #>>> Save metadata for next step <<<#
        if pcds_m and 'column' in pcds_m:
            record.next_d.update(
                pcds_cols=SEP.join(pcds_m['column']['column_name'].tolist()),
                pcds_types=SEP.join(pcds_m['column']['data_type'].tolist()),
                pcds_nrows=int(pcds_m['row'].iloc[0, 0]) if 'row' in pcds_m else 0,
                pcds_where=proc_pcds._where,
                pcds_dt_type=proc_pcds._type
            )

        df_next[name] = record.next_d.copy()

        #>>> Reset engine for next iteration <<<#
        proc_pcds.reset()

    #>>> Save results using Parquet/JSON format <<<#
    output_folder = UPath(C_out.folder)
    _save_pcds_outputs(output_folder, df_dict, df_next)

    logger.info(f"Saved PCDS results to {output_folder}")
    logger.info(f"  - Individual parquet/JSON files: {len(df_dict)} tables")
    logger.info("  - pcds_summary.json: summary of all files")
    logger.info("  - pcds_metadata.json: metadata for next steps")

    end_run()

# Entry point: run the PCDS meta analysis when this file is executed as __main__
if __name__ == '__main__':
    main()

## Run the Analysis

Uncomment the cell below to run the PCDS processing:

In [None]:
# main()