# Testing BaseTable Implementation and Enhanced Validation

This notebook demonstrates the new BaseTable class implementation with comprehensive validation features for pyCLIF.

In [None]:
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# the `src.pyclif` imports below resolve without installing the package.
# '..' is resolved against the kernel's working directory when this cell runs,
# so the notebook presumably must be started from its own folder — TODO confirm.
sys.path.insert(0, os.path.abspath('..'))

from src.pyclif.clif import CLIF
from src.pyclif.tables.patient import patient
from src.pyclif.tables.vitals import vitals
from src.pyclif.tables.labs import labs

## 1. Initialize CLIF with Output Directory

The CLIF class now accepts an `output_dir` argument: the directory where validation reports are written.

In [None]:
# Locations used throughout the notebook. Point data_dir at your own CLIF
# tables; validation outputs go under output_dir.
data_dir = '../demo_data'  # Update this to your actual data directory
output_dir = '../validation_output'

# Ensure the output directory is present (a no-op when it already exists).
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Gather the constructor arguments, then build the CLIF object with the
# output directory attached. Use filetype='csv' if your data is stored as CSV.
clif_options = dict(
    data_dir=data_dir,
    filetype='parquet',
    timezone='UTC',
    output_dir=output_dir,
)
clif = CLIF(**clif_options)

## 2. Load Tables with New BaseTable Implementation

All tables now inherit from BaseTable and have enhanced validation capabilities.

In [None]:
# Load the patient table through the CLIF object (sample_size=1000), then
# report how many rows came back and whether validation passed.
patient_table = clif.load_patient_data(sample_size=1000)

patient_row_count = len(patient_table.df)
print(f"Patient table loaded: {patient_row_count} rows")

if patient_table.isvalid():
    patient_status = 'Valid'
else:
    patient_status = 'Invalid'
print(f"Validation status: {patient_status}")

In [None]:
# Load the vitals table through the CLIF object (sample_size=5000), then
# report how many rows came back and whether validation passed.
vitals_table = clif.load_vitals_data(sample_size=5000)

vitals_row_count = len(vitals_table.df)
print(f"Vitals table loaded: {vitals_row_count} rows")

if vitals_table.isvalid():
    vitals_status = 'Valid'
else:
    vitals_status = 'Invalid'
print(f"Validation status: {vitals_status}")

In [None]:
# Bulk load: a single initialize() call requests the hospitalization, ADT and
# labs tables, passing sample_size=1000.
requested_tables = ['hospitalization', 'adt', 'labs']
clif.initialize(tables=requested_tables, sample_size=1000)

## 3. Run Comprehensive Validation

The new validation includes:
- Schema validation
- Missing data analysis
- Duplicate checking on composite keys
- Statistical analysis
- Unit validation (for labs and vitals)
- Range validation (for vitals)

In [None]:
# Run validation on all tables currently loaded on the CLIF object and keep
# the returned value in `validation_summary` for later inspection.
# NOTE(review): the structure of the returned summary is not shown in this
# notebook — confirm against CLIF.validate_all before relying on its fields.
validation_summary = clif.validate_all()

In [None]:
# Show a short excerpt of the vitals validation errors. Nothing is printed
# when no vitals table is loaded or when the table validated cleanly.
if clif.vitals:
    if not clif.vitals.isvalid():
        print("Vitals validation errors:")
        # Cap the listing at the first five entries to keep the output short.
        leading_errors = clif.vitals.errors[:5]
        for message in leading_errors:
            print(f"  - {message}")

## 4. Direct Table Creation with BaseTable

You can also create tables directly using the new BaseTable signature.

In [None]:
# Bypass the CLIF object and build the patient table straight from file,
# passing the data location, file format, timezone, output directory and
# sample size explicitly.
direct_load_args = dict(
    data_directory=data_dir,
    filetype='parquet',
    timezone='UTC',
    output_directory=output_dir,
    sample_size=500,
)
patient_direct = patient.from_file(**direct_load_args)

# Report the row count plus the table name and output directory recorded on
# the resulting object.
direct_row_count = len(patient_direct.df)
print(f"Direct patient table loaded: {direct_row_count} rows")
print(f"Table name: {patient_direct.table_name}")
print(f"Output directory: {patient_direct.output_directory}")

## 5. Access Enhanced Validation Reports

Validation reports are automatically saved to the output directory.

In [None]:
# Summarise the patient table, if one is loaded on the CLIF object: print the
# headline figures from get_summary() and then save the summary to file.
patient_tbl = clif.patient
if patient_tbl:
    summary = patient_tbl.get_summary()

    print("Patient Table Summary:")
    print(f"  Rows: {summary['num_rows']}")
    print(f"  Columns: {summary['num_columns']}")
    # Memory usage is shown with two decimal places.
    print(f"  Memory Usage: {summary['memory_usage_mb']:.2f} MB")
    print(f"  Validation Errors: {summary['validation_errors']}")
    print(f"  Is Valid: {summary['is_valid']}")

    # Write the summary out via the table's own save method.
    patient_tbl.save_summary()

In [None]:
# List the report files found in the output directory.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the listing stable across runs and platforms.
report_suffixes = ('.csv', '.log', '.json')  # file types treated as validation output

print("Validation output files:")
for filename in sorted(os.listdir(output_dir)):
    if filename.endswith(report_suffixes):
        print(f"  - {filename}")

## 6. Using Enhanced Validator Functions

The enhanced validator module provides comprehensive validation functions.

In [None]:
from src.pyclif.utils import validator

# Example: missing-data statistics for the vitals table, requested in 'long'
# format. Skipped when no vitals table is loaded.
# NOTE(review): .head() prints the first five rows exactly as returned; confirm
# that calculate_missing_stats orders its output so the "Top 5" heading holds.
vitals_tbl = clif.vitals
if vitals_tbl:
    missing_stats = validator.calculate_missing_stats(vitals_tbl.df, format='long')
    print("\nMissing Data Statistics (Top 5 columns):")
    stats_preview = missing_stats.head()
    print(stats_preview)

In [None]:
# Example: look for duplicate vitals rows, where a row is identified by the
# combination of hospitalization, timestamp and vital category.
# Skipped when no vitals table is loaded.
vitals_tbl = clif.vitals
if vitals_tbl:
    vitals_key_columns = ['hospitalization_id', 'recorded_dttm', 'vital_category']
    duplicate_check = validator.check_for_duplicates(
        vitals_tbl.df,
        composite_keys=vitals_key_columns,
    )

    # The result is read as a mapping with the three keys printed below.
    print("\nDuplicate Check Results:")
    print(f"  Total rows: {duplicate_check['total_rows']}")
    print(f"  Duplicate rows: {duplicate_check['duplicate_rows']}")
    print(f"  Has duplicates: {duplicate_check['has_duplicates']}")

In [None]:
# Example: cohort sizes for the hospitalization table, computed over the
# patient and hospitalization identifier columns.
# Skipped when no hospitalization table is loaded.
hosp_tbl = clif.hospitalization
if hosp_tbl:
    id_cols = ['patient_id', 'hospitalization_id']
    cohort_sizes = validator.calculate_cohort_sizes(hosp_tbl.df, id_columns=id_cols)

    # Print every entry of the returned mapping, one per line.
    print("\nCohort Sizes:")
    for label, count in cohort_sizes.items():
        print(f"  {label}: {count}")

## 7. Vitals-Specific Range Validation

The vitals table has special range validation for vital signs.

In [None]:
# Inspect the vitals table's range information. Skipped when no vitals table
# is loaded.
vitals_tbl = clif.vitals
if vitals_tbl:
    # Ranges exposed on the table object, one line per vital.
    print("Vital Ranges from Schema:")
    for vital_name, limits in vitals_tbl.vital_ranges.items():
        print(f"  {vital_name}: {limits}")

    # The range-error list is only read when the attribute exists on the table.
    if hasattr(vitals_tbl, 'range_validation_errors'):
        range_errors = vitals_tbl.range_validation_errors
        print(f"\nRange validation errors: {len(range_errors)}")
        # Show at most the first three entries.
        for message in range_errors[:3]:
            print(f"  - {message}")

## 8. Labs-Specific Unit Validation

The labs table validates reference units against expected values.

In [None]:
# Preview the reference units attached to the labs table. Skipped when no
# labs table is loaded.
labs_tbl = clif.lab
if labs_tbl:
    print("Lab Reference Units (first 5):")
    # Take the first five (lab, units) pairs in the mapping's own order.
    leading_units = list(labs_tbl.lab_reference_units.items())[:5]
    for lab_name, unit_values in leading_units:
        print(f"  {lab_name}: {unit_values}")

## 9. Backward Compatibility

The refactored classes maintain backward compatibility with the old signature.

In [None]:
# Hand-built three-patient frame used to exercise the legacy constructor.
# birth_date is parsed to datetimes; death_dttm is missing (NaT) for everyone.
patient_columns = {
    'patient_id': ['P001', 'P002', 'P003'],
    'birth_date': pd.to_datetime(['1980-01-01', '1975-05-15', '1990-12-20']),
    'death_dttm': [pd.NaT] * 3,
    'race_name': ['White', 'Black', 'Asian'],
    'race_category': ['White', 'Black or African American', 'Asian'],
    'ethnicity_name': ['Non-Hispanic', 'Non-Hispanic', 'Hispanic'],
    'ethnicity_category': ['Non-Hispanic', 'Non-Hispanic', 'Hispanic'],
    'sex_name': ['Male', 'Female', 'Male'],
    'sex_category': ['Male', 'Female', 'Male'],
    'language_name': ['English', 'English', 'Spanish'],
    'language_category': ['English', 'English', 'Spanish'],
}
sample_data = pd.DataFrame(patient_columns)

# Legacy call style: hand the DataFrame straight to the table class via
# `data=` instead of loading from a directory.
patient_old_style = patient(data=sample_data)

old_style_shape = patient_old_style.df.shape
print(f"Old style initialization works: {old_style_shape}")

if patient_old_style.isvalid():
    old_style_status = 'Valid'
else:
    old_style_status = 'Invalid'
print(f"Validation status: {old_style_status}")

## 10. Summary

The new BaseTable implementation provides:

1. **Unified Interface**: All tables inherit from BaseTable with consistent methods
2. **Enhanced Validation**: Comprehensive data quality checks including:
   - Schema validation
   - Missing data analysis
   - Duplicate detection
   - Statistical summaries
   - Domain-specific validations (ranges, units)
3. **Structured Logging**: All validation activities are logged
4. **Output Management**: Validation reports saved to specified directory
5. **Backward Compatibility**: Old code continues to work
6. **YAML Schemas**: More readable and maintainable than JSON
7. **Composite Keys**: Defined in schemas for duplicate detection

All validation outputs are saved in the specified output directory for post-processing.