In [2]:
# Import required libraries
import pandas as pd
import numpy as np
import os
import json
import warnings
from datetime import datetime
from pathlib import Path

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Set pandas display options for better visualization
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', 50)

# Set up file paths with validation
try:
    BASE_PATH = Path(__file__).parent.parent if '__file__' in globals() else Path.cwd().parent
    if not BASE_PATH.exists():
        BASE_PATH = Path(r'c:\NATIONAL AMR DATA ANALYSIS FILES')
    
    DATA_PATH = BASE_PATH / 'data'
    RAW_DATA_PATH = DATA_PATH / 'raw'
    PROCESSED_DATA_PATH = DATA_PATH / 'processed'
    REFERENCE_DATA_PATH = DATA_PATH / 'Database Resources'
    
    # Validate all paths exist
    for path_name, path_obj in [
        ('Base', BASE_PATH),
        ('Data', DATA_PATH),
        ('Raw Data', RAW_DATA_PATH),
        ('Reference Data', REFERENCE_DATA_PATH)
    ]:
        if not path_obj.exists():
            print(f"⚠️  Warning: {path_name} path does not exist: {path_obj}")
    
    # Create processed data directory if it doesn't exist
    PROCESSED_DATA_PATH.mkdir(exist_ok=True)
    
    print("✅ Libraries imported successfully")
    print(f"✅ Base path: {BASE_PATH}")
    print(f"✅ Data path: {DATA_PATH}")
    print(f"✅ Reference data path: {REFERENCE_DATA_PATH}")
    
except Exception as e:
    print(f"❌ Error setting up paths: {e}")
    raise

✅ Libraries imported successfully
✅ Base path: c:\NATIONAL AMR DATA ANALYSIS FILES
✅ Data path: c:\NATIONAL AMR DATA ANALYSIS FILES\data
✅ Reference data path: c:\NATIONAL AMR DATA ANALYSIS FILES\data\Database Resources


### WHO GLASS Quality Standards Configuration

Define WHO GLASS specific quality standards and validation criteria.

In [3]:
# WHO GLASS Essential Fields and Standards Configuration
print("🔧 Configuring WHO GLASS Standards...")

# WHO GLASS Essential Fields (as per WHO GLASS Manual v2.1)
GLASS_ESSENTIAL_FIELDS_ORIGINAL = [
    'PATIENT_ID',    # Patient identifier
    'ORGANISM',      # Organism identification
    'SPEC_DATE',     # Specimen collection date
    'COUNTRY_A',     # Country code
    'INSTITUT',      # Healthcare institution
    'DEPARTMENT',    # Hospital department
    'AGE',           # Patient age
    'SEX'            # Patient sex/gender
]

# Actual field names in our dataset (AMR_DATA_FINAL.csv)
GLASS_ESSENTIAL_FIELDS_MAPPED = [
    'PATIENT_ID',    # Patient identifier (unchanged)
    'ORGANISM',      # Organism identification (unchanged)
    'SPEC_DATE',     # Specimen collection date (unchanged)
    'COUNTRY_A',     # Country code (unchanged)
    'INSTITUT',      # Healthcare institution (unchanged)
    'DEPARTMENT',    # Hospital department (unchanged)
    'AGE',           # Patient age (unchanged)
    'SEX'            # Patient sex/gender (unchanged)
]

# Column mapping dictionary (minimal mapping needed since fields match WHO GLASS standards)
COLUMN_MAPPING = {
    # Most fields are already in WHO GLASS format, minimal mapping required
    'INSTITUT': 'INSTITUTION',      # Standardize institution name
    'COUNTRY_A': 'COUNTRY',         # Standardize country field name
    'ORGANISM': 'ORGANISM_CODE',   # Make organism source explicit
}

# Note: The following fields are already appropriately named and need no mapping:
# - PATIENT_ID, SPEC_DATE, DEPARTMENT, AGE, SEX, REGION, ORG_TYPE

# Additional data fields available in the dataset
ADDITIONAL_DATA_FIELDS = [
    'ROW_IDX',       # Row identifier
    'REGION',        # Geographic region
    'ORG_TYPE'       # Organism type classification
]

# AST (Antimicrobial Susceptibility Testing) columns in the dataset
AST_COLUMNS_RAW = [
    'AMC_ND20', 'AMK_ND30', 'AMP_ND10', 'AMX_ND30', 'AZM_ND15', 'CAZ_ND30',
    'CHL_ND30', 'CIP_ND5', 'CLI_ND2', 'CLO_ND5', 'CRO_ND30', 'CTX_ND30',
    'CXM_ND30', 'ERY_ND15', 'ETP_ND10', 'FEP_ND30', 'FLC_ND', 'FOX_ND30',
    'GEN_ND10', 'LEX_ND30', 'LIN_ND4', 'LNZ_ND30', 'LVX_ND5', 'MEM_ND10',
    'MNO_ND30', 'OXA_ND1', 'PEN_ND10', 'PNV_ND10', 'RIF_ND5', 'SXT_ND1_2',
    'TCY_ND30', 'TGC_ND15', 'TZP_ND100', 'VAN_ND30'
]

# WHO GLASS Quality Thresholds
GLASS_QUALITY_THRESHOLDS = {
    'minimum_completeness': 80,    # Minimum completeness for essential fields
    'temporal_coverage_months': 12, # Minimum months of data collection
    'minimum_isolates': 100,       # Minimum isolates for meaningful analysis
    'ast_completeness': 70         # Minimum AST completeness
}

# WHO Age Categories (as per WHO GLASS)
GLASS_AGE_CATEGORIES = {
    'Neonates': '0-27 days',
    'Children': '28 days - 17 years',
    'Adults': '18+ years',
    'Unknown': 'Missing/Invalid age'
}

# WHO GLASS Specimen Types (common types) - Note: Dataset uses department types instead
GLASS_SPECIMEN_TYPES = {
    'BLOOD': 'Blood culture',
    'URINE': 'Urine culture', 
    'WOUND': 'Wound/soft tissue',
    'RESP': 'Respiratory specimen',
    'CSF': 'Cerebrospinal fluid',
    'OTHER': 'Other specimen types'
}

# Department types actually present in the dataset
DEPARTMENT_TYPES = {
    'Inp': 'Inpatient',
    'Out': 'Outpatient'
}

# WHO AWARE Categories for antimicrobials
AWARE_CATEGORIES = ['Access', 'Watch', 'Reserve', 'Not Listed']

print("✅ WHO GLASS configuration completed")
print(f"📋 Essential fields configured: {len(GLASS_ESSENTIAL_FIELDS_ORIGINAL)}")
print(f"🎯 Quality thresholds set: {len(GLASS_QUALITY_THRESHOLDS)}")
print(f"👶 Age categories defined: {len(GLASS_AGE_CATEGORIES)}")
print(f"💊 AWARE categories: {len(AWARE_CATEGORIES)}")
print(f"🧪 AST columns available: {len(AST_COLUMNS_RAW)}")
print(f"📊 Additional data fields: {len(ADDITIONAL_DATA_FIELDS)}")
print(f"🏥 Department types: {len(DEPARTMENT_TYPES)}")
print(f"🔍 Total columns in dataset: {len(GLASS_ESSENTIAL_FIELDS_ORIGINAL) + len(AST_COLUMNS_RAW) + len(ADDITIONAL_DATA_FIELDS)}")

🔧 Configuring WHO GLASS Standards...
✅ WHO GLASS configuration completed
📋 Essential fields configured: 8
🎯 Quality thresholds set: 4
👶 Age categories defined: 4
💊 AWARE categories: 4
🧪 AST columns available: 34
📊 Additional data fields: 3
🏥 Department types: 2
🔍 Total columns in dataset: 45


In [4]:

# =============================================================================
# Data Loading and Initial Processing
# =============================================================================

print("\n" + "="*70)
print("📂 LOADING RAW AMR DATA")
print("="*70)

# Load the raw AMR data
raw_data_file = RAW_DATA_PATH / 'AMR_DATA_FINAL.csv'

try:
    print(f"📊 Loading data from: {raw_data_file}")
    df_raw = pd.read_csv(raw_data_file)
    
    print(f"✅ Data loaded successfully!")
    print(f"📊 Dataset shape: {df_raw.shape}")
    print(f"📋 Columns: {df_raw.columns.tolist()}")
    
except FileNotFoundError:
    print(f"❌ Error: Raw data file not found at {raw_data_file}")
    raise
except Exception as e:
    print(f"❌ Error loading data: {e}")
    raise

# =============================================================================
# WHO GLASS Column Mapping and Standardization
# =============================================================================

print("\n" + "="*70)
print("🔄 WHO GLASS COLUMN MAPPING & STANDARDIZATION")
print("="*70)

# Create a copy for processing
df_cleaned = df_raw.copy()

print("\n1. Column Mapping Analysis")
print("-" * 40)

# Check which columns need mapping
print("Current columns requiring standardization:")
columns_to_map = []
for original_col, mapped_col in COLUMN_MAPPING.items():
    if original_col in df_cleaned.columns:
        columns_to_map.append((original_col, mapped_col))
        print(f"  • {original_col} → {mapped_col}")
    else:
        print(f"  ⚠️  {original_col} not found in dataset")

print(f"\n📊 Total columns requiring mapping: {len(columns_to_map)}")

print("\n2. Applying Column Mapping")
print("-" * 40)

# Apply the column mapping
columns_mapped = 0
mapping_log = []

for original_col, mapped_col in columns_to_map:
    if original_col in df_cleaned.columns:
        # Apply the mapping
        df_cleaned = df_cleaned.rename(columns={original_col: mapped_col})
        columns_mapped += 1
        mapping_log.append(f"✓ {original_col} → {mapped_col}")
        print(f"  ✓ {original_col} → {mapped_col}")
    else:
        mapping_log.append(f"✗ {original_col} (not found)")
        print(f"  ✗ {original_col} (not found)")

print(f"\n✅ Column mapping completed!")
print(f"📊 Successfully mapped: {columns_mapped}/{len(COLUMN_MAPPING)} columns")

print("\n3. Updated Essential Fields Verification")
print("-" * 40)

# Update the essential fields list to use mapped names
GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED = []
for field in GLASS_ESSENTIAL_FIELDS_ORIGINAL:
    mapped_field = COLUMN_MAPPING.get(field, field)
    GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED.append(mapped_field)

print("WHO GLASS Essential Fields (after mapping):")
field_status = []
for i, field in enumerate(GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED):
    if field in df_cleaned.columns:
        status = "✅ Available"
        field_status.append(True)
    else:
        status = "❌ Missing"
        field_status.append(False)
    
    original_field = GLASS_ESSENTIAL_FIELDS_ORIGINAL[i]
    if original_field != field:
        print(f"  {i+1}. {field} (was: {original_field}) - {status}")
    else:
        print(f"  {i+1}. {field} - {status}")

available_fields = sum(field_status)
print(f"\n📊 Essential fields status: {available_fields}/{len(GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED)} available")

if available_fields == len(GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED):
    print("🎯 All WHO GLASS essential fields are present!")
else:
    missing_fields = [field for field, status in zip(GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED, field_status) if not status]
    print(f"⚠️  Missing essential fields: {missing_fields}")

print("\n4. Final Dataset Summary")
print("-" * 40)
print(f"📊 Final dataset shape: {df_cleaned.shape}")
print(f"📋 Final column count: {len(df_cleaned.columns)}")
print(f"🔄 Columns renamed: {columns_mapped}")
print(f"✅ Data cleaning step 1 (Column Mapping) completed!")

# Save mapping log for documentation
mapping_summary = {
    'timestamp': datetime.now().isoformat(),
    'original_columns': len(df_raw.columns),
    'final_columns': len(df_cleaned.columns),
    'columns_mapped': columns_mapped,
    'mapping_details': mapping_log,
    'essential_fields_available': f"{available_fields}/{len(GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED)}"
}

print(f"\n📝 Column mapping summary saved to processing log")

# =============================================================================
# Column Mapping Verification and Validation
# =============================================================================

print("\n" + "="*70)
print("🔍 COLUMN MAPPING VERIFICATION")
print("="*70)

print("\n1. Before/After Column Comparison")
print("-" * 40)

# Show original vs mapped column names
print("Key column transformations:")
for original, mapped in COLUMN_MAPPING.items():
    if original in df_raw.columns:
        print(f"  {original:15} → {mapped}")

print(f"\nOriginal dataset columns ({len(df_raw.columns)}):")
for i, col in enumerate(df_raw.columns, 1):
    marker = " 🔄" if col in COLUMN_MAPPING.keys() else ""
    print(f"  {i:2d}. {col}{marker}")

print(f"\nStandardized dataset columns ({len(df_cleaned.columns)}):")
for i, col in enumerate(df_cleaned.columns, 1):
    marker = " ✨" if col in COLUMN_MAPPING.values() else ""
    print(f"  {i:2d}. {col}{marker}")

print("\n2. WHO GLASS Essential Fields Status")
print("-" * 40)

essential_status = {}
for field in GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED:
    if field in df_cleaned.columns:
        completeness = (df_cleaned[field].notna().sum() / len(df_cleaned)) * 100
        essential_status[field] = completeness
        status = "✅" if completeness >= GLASS_QUALITY_THRESHOLDS['minimum_completeness'] else "⚠️"
        print(f"  {field:20} - {completeness:5.1f}% complete {status}")
    else:
        essential_status[field] = 0.0
        print(f"  {field:20} - Missing ❌")

overall_compliance = len([c for c in essential_status.values() if c >= GLASS_QUALITY_THRESHOLDS['minimum_completeness']])
print(f"\nOverall WHO GLASS Compliance: {overall_compliance}/{len(essential_status)} fields meet minimum thresholds")

print("\n3. Data Sample After Mapping")
print("-" * 40)

# Show a sample of the mapped data
print("Sample of standardized data (first 3 rows, essential fields only):")
sample_cols = [col for col in GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED if col in df_cleaned.columns]
sample_data = df_cleaned[sample_cols].head(3)

for i, (idx, row) in enumerate(sample_data.iterrows()):
    print(f"\nRecord {i+1}:")
    for col in sample_cols:
        value = row[col] if pd.notna(row[col]) else "N/A"
        print(f"  {col:20}: {value}")

print("\n" + "="*70)
print("✅ COLUMN MAPPING VERIFICATION COMPLETED")
print("="*70)




📂 LOADING RAW AMR DATA
📊 Loading data from: c:\NATIONAL AMR DATA ANALYSIS FILES\data\raw\AMR_DATA_FINAL.csv
✅ Data loaded successfully!
📊 Dataset shape: (36173, 45)
📋 Columns: ['ROW_IDX', 'COUNTRY_A', 'PATIENT_ID', 'SEX', 'AGE', 'INSTITUT', 'REGION', 'DEPARTMENT', 'SPEC_DATE', 'ORGANISM', 'ORG_TYPE', 'AMC_ND20', 'AMK_ND30', 'AMP_ND10', 'AMX_ND30', 'AZM_ND15', 'CAZ_ND30', 'CHL_ND30', 'CIP_ND5', 'CLI_ND2', 'CLO_ND5', 'CRO_ND30', 'CTX_ND30', 'CXM_ND30', 'ERY_ND15', 'ETP_ND10', 'FEP_ND30', 'FLC_ND', 'FOX_ND30', 'GEN_ND10', 'LEX_ND30', 'LIN_ND4', 'LNZ_ND30', 'LVX_ND5', 'MEM_ND10', 'MNO_ND30', 'OXA_ND1', 'PEN_ND10', 'PNV_ND10', 'RIF_ND5', 'SXT_ND1_2', 'TCY_ND30', 'TGC_ND15', 'TZP_ND100', 'VAN_ND30']

🔄 WHO GLASS COLUMN MAPPING & STANDARDIZATION

1. Column Mapping Analysis
----------------------------------------
Current columns requiring standardization:
  • INSTITUT → INSTITUTION
  • COUNTRY_A → COUNTRY
  • ORGANISM → ORGANISM_CODE

📊 Total columns requiring mapping: 3

2. Applying Column 


## WHO GLASS First Isolate Deduplication

This section implements the WHO GLASS standard for first isolate selection per patient, which is critical for accurate antimicrobial resistance surveillance by ensuring each patient contributes only one isolate per organism per time period.

### WHO GLASS Deduplication Rules:
1. **Patient-based grouping**: Group records by patient identifier
2. **Organism-specific**: Apply deduplication per organism type  
3. **Temporal logic**: Select first isolate per patient per organism
4. **Data integrity**: Preserve all AST results from the selected isolate

In [6]:
df_cleaned

Unnamed: 0,ROW_IDX,COUNTRY,PATIENT_ID,SEX,AGE,INSTITUTION,REGION,DEPARTMENT,SPEC_DATE,ORGANISM_CODE,ORG_TYPE,AMC_ND20,AMK_ND30,AMP_ND10,AMX_ND30,AZM_ND15,CAZ_ND30,CHL_ND30,CIP_ND5,CLI_ND2,CLO_ND5,CRO_ND30,CTX_ND30,CXM_ND30,ERY_ND15,ETP_ND10,FEP_ND30,FLC_ND,FOX_ND30,GEN_ND10,LEX_ND30,LIN_ND4,LNZ_ND30,LVX_ND5,MEM_ND10,MNO_ND30,OXA_ND1,PEN_ND10,PNV_ND10,RIF_ND5,SXT_ND1_2,TCY_ND30,TGC_ND15,TZP_ND100,VAN_ND30
0,501,GHA,_2917564954_,f,44.0,,,Out,01-Jan-20,eco,-,,S,R,,,,,,,,,S,R,,,,,,R,,,,,,,,,,,S,S,,,
1,184,GHA,10978,f,1.0,CCTH,Central Region,Out,01-Jan-22,ac-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,185,GHA,12981,m,0.0,CCTH,Central Region,Out,01-Jan-22,ac-,-,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,186,GHA,CC160,f,,CCTH,Central Region,Inp,01-Jan-23,ac-,-,,,,,,,,R,,,R,,,,,,,,,,,,,,,,,,,,,,,
4,247,GHA,_0123294111_,f,2.0,CCTH,Central Region,Inp,01-Jan-20,ci-,-,,R,R,,,,,R,,,,S,R,,,,,,R,,,,,,,,,,,S,S,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36168,31062,GHA,N536,m,1.0,TTH,Northern Region,Out,01-Jan-21,xxx,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
36169,31063,GHA,N537,m,1.0,TTH,Northern Region,Out,01-Jan-21,xxx,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
36170,31064,GHA,N538,f,20.0,TTH,Northern Region,Out,01-Jan-21,xxx,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
36171,31065,GHA,N539,m,33.0,TTH,Northern Region,Out,01-Jan-21,xxx,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [9]:


# =============================================================================
# WHO GLASS First Isolate Deduplication
# =============================================================================

print("\n" + "="*70)
print("🔄 WHO GLASS FIRST ISOLATE DEDUPLICATION")
print("="*70)

print("\n1. Pre-Deduplication Analysis")
print("-" * 40)

# Check current data status
print(f"📊 Records before deduplication: {len(df_cleaned):,}")
print(f"👥 Unique patients: {df_cleaned['PATIENT_ID'].nunique():,}")
print(f"🦠 Unique organisms: {df_cleaned['ORGANISM_CODE'].nunique():,}")

# Analyze patient-isolate distribution
patient_isolate_counts = df_cleaned.groupby('PATIENT_ID').size()
print(f"📈 Average isolates per patient: {patient_isolate_counts.mean():.1f}")
print(f"📊 Max isolates per patient: {patient_isolate_counts.max()}")

# Show patients with multiple isolates
multiple_isolates = patient_isolate_counts[patient_isolate_counts > 1]
print(f"👥 Patients with multiple isolates: {len(multiple_isolates):,} ({len(multiple_isolates)/len(patient_isolate_counts)*100:.1f}%)")

print("\n2. WHO GLASS Deduplication Parameters")
print("-" * 40)

# Define deduplication columns (patient + organism + optional time period)
dedup_columns = ['PATIENT_ID', 'ORGANISM_CODE']
sort_columns = ['PATIENT_ID', 'ORGANISM_CODE', 'SPEC_DATE']

print(f"🔑 Deduplication keys: {dedup_columns}")
print(f"📅 Sort order: {sort_columns}")
print(f"⚡ Strategy: Keep first isolate per patient per organism")

# Check for missing values in critical fields
missing_data = {}
for col in dedup_columns + ['SPEC_DATE']:
    missing_count = df_cleaned[col].isna().sum()
    missing_pct = (missing_count / len(df_cleaned)) * 100
    missing_data[col] = {'count': missing_count, 'percentage': missing_pct}
    
    if missing_count > 0:
        print(f"⚠️  {col}: {missing_count:,} missing ({missing_pct:.1f}%)")
    else:
        print(f"✅ {col}: No missing values")

print("\n3. Applying First Isolate Selection")
print("-" * 40)

# Convert SPEC_DATE to datetime for proper sorting
print("📅 Converting specimen dates...")
df_cleaned['SPEC_DATE'] = pd.to_datetime(df_cleaned['SPEC_DATE'], errors='coerce')

# Check for date conversion issues
invalid_dates = df_cleaned['SPEC_DATE'].isna().sum()
if invalid_dates > 0:
    print(f"⚠️  Warning: {invalid_dates} records have invalid dates")

# Sort data by patient, organism, and date (earliest first)
print("🔄 Sorting data by patient, organism, and date...")
df_sorted = df_cleaned.sort_values(sort_columns, na_position='last')

# Apply first isolate selection using groupby and first()
print("🎯 Selecting first isolate per patient per organism...")
df_first_isolate = df_sorted.groupby(dedup_columns, as_index=False).first()

# Calculate deduplication metrics
records_before = len(df_cleaned)
records_after = len(df_first_isolate)
records_removed = records_before - records_after
reduction_rate = (records_removed / records_before) * 100

print(f"✅ Deduplication completed!")
print(f"📊 Records before: {records_before:,}")
print(f"📊 Records after: {records_after:,}")
print(f"📉 Records removed: {records_removed:,}")
print(f"📊 Reduction rate: {reduction_rate:.1f}%")

print("\n4. Post-Deduplication Validation")
print("-" * 40)

# Validate deduplication results
print("🔍 Validating deduplication results...")

# Check for any remaining duplicates
remaining_duplicates = df_first_isolate.groupby(dedup_columns).size()
max_duplicates = remaining_duplicates.max()

if max_duplicates == 1:
    print("✅ Validation passed: No duplicate patient-organism combinations remain")
else:
    print(f"❌ Validation failed: Found {max_duplicates} max duplicates")

# Check data integrity
print("🔍 Checking data integrity...")
essential_fields_check = []
for field in GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED:
    if field in df_first_isolate.columns:
        before_completeness = (df_cleaned[field].notna().sum() / len(df_cleaned)) * 100
        after_completeness = (df_first_isolate[field].notna().sum() / len(df_first_isolate)) * 100
        essential_fields_check.append({
            'field': field,
            'before': before_completeness,
            'after': after_completeness,
            'change': after_completeness - before_completeness
        })

print("Essential field completeness comparison:")
for check in essential_fields_check:
    change_indicator = "📈" if check['change'] > 0 else "📉" if check['change'] < 0 else "➡️"
    print(f"  {check['field']:20}: {check['before']:5.1f}% → {check['after']:5.1f}% {change_indicator}")

print("\n5. Deduplication Summary")
print("-" * 40)

# Create summary statistics
unique_patients_before = df_cleaned['PATIENT_ID'].nunique()
unique_patients_after = df_first_isolate['PATIENT_ID'].nunique()
unique_organisms_before = df_cleaned['ORGANISM_CODE'].nunique()
unique_organisms_after = df_first_isolate['ORGANISM_CODE'].nunique()

print(f"👥 Unique patients: {unique_patients_before:,} → {unique_patients_after:,}")
print(f"🦠 Unique organisms: {unique_organisms_before:,} → {unique_organisms_after:,}")

# Analyze the removed records
if records_removed > 0:
    print(f"📊 Duplicate analysis:")
    duplicate_analysis = df_cleaned.groupby(dedup_columns).size().value_counts().sort_index()
    for isolate_count, patient_count in duplicate_analysis.items():
        if isolate_count > 1:
            total_duplicates = patient_count * (isolate_count - 1)
            print(f"  • {patient_count:,} patient-organism pairs had {isolate_count} isolates each (removed {total_duplicates:,} duplicates)")

# Date range analysis
date_range_before = df_cleaned['SPEC_DATE'].max() - df_cleaned['SPEC_DATE'].min()
date_range_after = df_first_isolate['SPEC_DATE'].max() - df_first_isolate['SPEC_DATE'].min()
print(f"📅 Date range preserved: {date_range_before.days} → {date_range_after.days} days")

print("\n" + "="*70)
print("✅ WHO GLASS FIRST ISOLATE DEDUPLICATION COMPLETED")
print("="*70)

# Update the main dataframe
df_cleaned = df_first_isolate.copy()
print(f"📊 Updated working dataset: {len(df_cleaned):,} records")

# =============================================================================
# Deduplication Results Analysis and Verification
# =============================================================================

print("\n" + "="*70)
print("📊 DEDUPLICATION RESULTS ANALYSIS")
print("="*70)

print("\n1. Sample of Deduplicated Data")
print("-" * 40)

# Show sample of final deduplicated data
sample_patients = df_cleaned['PATIENT_ID'].unique()[:5]
print("Sample records from first 5 patients:")

for i, patient_id in enumerate(sample_patients, 1):
    patient_data = df_cleaned[df_cleaned['PATIENT_ID'] == patient_id]
    print(f"\nPatient {i} (ID: {patient_id}):")
    print(f"  • Records: {len(patient_data)}")
    print(f"  • Organisms: {', '.join(patient_data['ORGANISM_CODE'].unique())}")
    print(f"  • Date: {patient_data['SPEC_DATE'].dt.date.iloc[0] if not patient_data['SPEC_DATE'].isna().iloc[0] else 'N/A'}")

print("\n2. Organism Distribution Analysis")
print("-" * 40)

# Analyze organism distribution before and after
organism_counts_before = df_raw.groupby('ORGANISM').size().sort_values(ascending=False)
organism_counts_after = df_cleaned.groupby('ORGANISM_CODE').size().sort_values(ascending=False)

print("Top 10 organisms (before → after deduplication):")
top_organisms = organism_counts_after.head(10)

for i, (organism, count_after) in enumerate(top_organisms.items(), 1):
    count_before = organism_counts_before.get(organism, 0)
    reduction = count_before - count_after if count_before > 0 else 0
    reduction_pct = (reduction / count_before * 100) if count_before > 0 else 0
    
    print(f"  {i:2d}. {organism:15} {count_before:,} → {count_after:,} ({reduction_pct:4.1f}% reduction)")

print("\n3. Patient-Level Impact Assessment")
print("-" * 40)

# Analyze impact on different patient categories
patient_impact = []

# Group patients by their original isolate count
original_patient_counts = df_raw.groupby('PATIENT_ID').size()
final_patient_counts = df_cleaned.groupby('PATIENT_ID').size()

for isolate_count in sorted(original_patient_counts.unique()):
    patients_with_count = (original_patient_counts == isolate_count).sum()
    if isolate_count == 1:
        retained = patients_with_count  # All single-isolate patients retained
        impact = "No change"
    else:
        retained = (final_patient_counts <= isolate_count).sum()
        impact = f"Reduced to 1-{isolate_count} isolates"
    
    patient_impact.append({
        'original_isolates': isolate_count,
        'patient_count': patients_with_count,
        'impact': impact
    })

print("Patient impact by original isolate count:")
for impact in patient_impact[:10]:  # Show first 10 categories
    print(f"  • {impact['patient_count']:,} patients with {impact['original_isolates']} isolates → {impact['impact']}")

print("\n4. Quality Metrics Summary")
print("-" * 40)

# Calculate final quality metrics
quality_metrics = {
    'total_records': len(df_cleaned),
    'unique_patients': df_cleaned['PATIENT_ID'].nunique(),
    'unique_organisms': df_cleaned['ORGANISM_CODE'].nunique(),
    'date_range_days': (df_cleaned['SPEC_DATE'].max() - df_cleaned['SPEC_DATE'].min()).days,
    'avg_isolates_per_patient': len(df_cleaned) / df_cleaned['PATIENT_ID'].nunique(),
    'deduplication_rate': reduction_rate
}

print("📊 Final Quality Metrics:")
for metric, value in quality_metrics.items():
    if isinstance(value, float):
        print(f"  • {metric.replace('_', ' ').title()}: {value:.2f}")
    else:
        print(f"  • {metric.replace('_', ' ').title()}: {value:,}")

print("\n5. WHO GLASS Compliance Check")
print("-" * 40)

# Final WHO GLASS compliance validation
compliance_checks = {
    'first_isolate_rule': max_duplicates == 1,
    'patient_id_present': df_cleaned['PATIENT_ID'].notna().all(),
    'organism_code_present': df_cleaned['ORGANISM_CODE'].notna().all(),
    'specimen_date_present': df_cleaned['SPEC_DATE'].notna().sum() / len(df_cleaned) > 0.8,
    'minimum_records': len(df_cleaned) >= GLASS_QUALITY_THRESHOLDS['minimum_isolates']
}

print("WHO GLASS Compliance Status:")
for check, passed in compliance_checks.items():
    status = "✅ PASS" if passed else "❌ FAIL"
    print(f"  • {check.replace('_', ' ').title()}: {status}")

overall_compliance = sum(compliance_checks.values()) / len(compliance_checks) * 100
print(f"\n🎯 Overall WHO GLASS Compliance: {overall_compliance:.1f}%")

if overall_compliance >= 90:
    print("✅ Dataset meets WHO GLASS quality standards!")
elif overall_compliance >= 70:
    print("⚠️  Dataset partially meets WHO GLASS standards - review required")
else:
    print("❌ Dataset does not meet WHO GLASS standards - significant issues detected")

print("\n" + "="*70)
print("✅ DEDUPLICATION ANALYSIS COMPLETED")
print("="*70)


🔄 WHO GLASS FIRST ISOLATE DEDUPLICATION

1. Pre-Deduplication Analysis
----------------------------------------
📊 Records before deduplication: 36,173
👥 Unique patients: 30,081
🦠 Unique organisms: 76
📈 Average isolates per patient: 1.2
📊 Max isolates per patient: 3
👥 Patients with multiple isolates: 5,744 (19.1%)

2. WHO GLASS Deduplication Parameters
----------------------------------------
🔑 Deduplication keys: ['PATIENT_ID', 'ORGANISM_CODE']
📅 Sort order: ['PATIENT_ID', 'ORGANISM_CODE', 'SPEC_DATE']
⚡ Strategy: Keep first isolate per patient per organism
⚠️  PATIENT_ID: 6 missing (0.0%)
✅ ORGANISM_CODE: No missing values
✅ SPEC_DATE: No missing values

3. Applying First Isolate Selection
----------------------------------------
📅 Converting specimen dates...
🔄 Sorting data by patient, organism, and date...
🎯 Selecting first isolate per patient per organism...
✅ Deduplication completed!
📊 Records before: 36,173
📊 Records after: 32,688
📉 Records removed: 3,485
📊 Reduction rate: 9.6%


Unnamed: 0,PATIENT_ID,ORGANISM_CODE,ROW_IDX,COUNTRY,SEX,AGE,INSTITUTION,REGION,DEPARTMENT,SPEC_DATE,ORG_TYPE,AMC_ND20,AMK_ND30,AMP_ND10,AMX_ND30,AZM_ND15,CAZ_ND30,CHL_ND30,CIP_ND5,CLI_ND2,CLO_ND5,CRO_ND30,CTX_ND30,CXM_ND30,ERY_ND15,ETP_ND10,FEP_ND30,FLC_ND,FOX_ND30,GEN_ND10,LEX_ND30,LIN_ND4,LNZ_ND30,LVX_ND5,MEM_ND10,MNO_ND30,OXA_ND1,PEN_ND10,PNV_ND10,RIF_ND5,SXT_ND1_2,TCY_ND30,TGC_ND15,TZP_ND100,VAN_ND30
0,0101,xxx,18097,GHA,f,1.0,ERH,Eastern Region,Out,2023-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,0105/23,xxx,18098,GHA,m,41.0,ERH,Eastern Region,Out,2023-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,0106,xxx,18099,GHA,m,,ERH,Eastern Region,Inp,2023-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,0107,xxx,18100,GHA,m,,ERH,Eastern Region,Inp,2023-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,0112/23,xxx,18101,GHA,f,29.0,ERH,Eastern Region,Out,2023-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32683,t95,xxx,31159,GHA,f,7.0,HTH,Volta Region,Out,2021-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32684,t96,xxx,31160,GHA,m,66.0,HTH,Volta Region,Out,2021-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32685,t97,xxx,31161,GHA,f,0.0,HTH,Volta Region,Out,2021-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
32686,t98,xxx,31162,GHA,f,0.0,HTH,Volta Region,Out,2021-01-01,o,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [13]:
# Quick inspection of cleaned dataset columns
print("Cleaned dataset columns:")
print(df_cleaned.columns.tolist())
print(f"\nDataset shape: {df_cleaned.shape}")
print(f"Sample of key columns:")
key_cols = ['PATIENT_ID', 'SPEC_DATE', 'WHONET_ORG_CODE']
for col in key_cols:
    if col in df_cleaned.columns:
        print(f"  {col}: {df_cleaned[col].notna().sum()}/{len(df_cleaned)} values")
    else:
        print(f"  {col}: NOT FOUND")

# Check for country and institution columns
country_cols = [col for col in df_cleaned.columns if 'country' in col.lower() or 'pays' in col.lower()]
institution_cols = [col for col in df_cleaned.columns if 'institut' in col.lower()]
print(f"\nCountry-related columns: {country_cols}")
print(f"Institution-related columns: {institution_cols}")

Cleaned dataset columns:
['PATIENT_ID', 'ORGANISM_CODE', 'ROW_IDX', 'COUNTRY', 'SEX', 'AGE', 'INSTITUTION', 'REGION', 'DEPARTMENT', 'SPEC_DATE', 'ORG_TYPE', 'AMC_ND20', 'AMK_ND30', 'AMP_ND10', 'AMX_ND30', 'AZM_ND15', 'CAZ_ND30', 'CHL_ND30', 'CIP_ND5', 'CLI_ND2', 'CLO_ND5', 'CRO_ND30', 'CTX_ND30', 'CXM_ND30', 'ERY_ND15', 'ETP_ND10', 'FEP_ND30', 'FLC_ND', 'FOX_ND30', 'GEN_ND10', 'LEX_ND30', 'LIN_ND4', 'LNZ_ND30', 'LVX_ND5', 'MEM_ND10', 'MNO_ND30', 'OXA_ND1', 'PEN_ND10', 'PNV_ND10', 'RIF_ND5', 'SXT_ND1_2', 'TCY_ND30', 'TGC_ND15', 'TZP_ND100', 'VAN_ND30']

Dataset shape: (32688, 45)
Sample of key columns:
  PATIENT_ID: 32688/32688 values
  SPEC_DATE: 32688/32688 values
  WHONET_ORG_CODE: NOT FOUND

Country-related columns: ['COUNTRY']
Institution-related columns: ['INSTITUTION']


In [14]:

# =============================================================================
# Comprehensive Data Quality Report Generation
# 
# This section generates a comprehensive data quality report that summarizes 
# all data cleaning operations, quality metrics, and WHO GLASS compliance 
# assessments. The report will be exported in multiple formats for 
# documentation and audit purposes.
#
# Report Components:
# 1. Dataset Overview: Basic statistics and data characteristics
# 2. Data Cleaning Summary: Step-by-step processing results
# 3. WHO GLASS Compliance: Essential fields and quality thresholds
# 4. Quality Metrics: Completeness, consistency, and validity measures
# 5. Export Documentation: Timestamped audit trail
# =============================================================================

print("\n" + "="*70)
print("📊 GENERATING COMPREHENSIVE DATA QUALITY REPORT")
print("="*70)

# Prepare report timestamp
report_timestamp = datetime.now()
report_date = report_timestamp.strftime("%Y-%m-%d")
report_time = report_timestamp.strftime("%H:%M:%S")

print(f"📅 Report Generation Date: {report_date}")
print(f"⏰ Report Generation Time: {report_time}")

print("\n1. Dataset Overview Compilation")
print("-" * 40)

# Calculate comprehensive dataset statistics
original_records = len(df_raw)
final_records = len(df_cleaned)
total_reduction = original_records - final_records
reduction_rate = (total_reduction / original_records) * 100

dataset_overview = {
    "total_records": final_records,
    "total_patients": df_cleaned['PATIENT_ID'].nunique(),
    "date_range": {
        "start": df_cleaned['SPEC_DATE'].min().strftime("%Y-%m-%d %H:%M:%S"),
        "end": df_cleaned['SPEC_DATE'].max().strftime("%Y-%m-%d %H:%M:%S"),
        "span_days": (df_cleaned['SPEC_DATE'].max() - df_cleaned['SPEC_DATE'].min()).days
    },
    "countries": df_cleaned['COUNTRY'].unique().tolist(),
    "institutions": df_cleaned['INSTITUTION'].nunique()
}

print(f"✅ Dataset overview compiled: {final_records:,} records, {dataset_overview['total_patients']:,} patients")

print("\n2. Data Cleaning Summary Compilation")
print("-" * 40)

# Compile data cleaning steps summary
data_cleaning_summary = {
    "initial_raw_records": original_records,
    "final_clean_records": final_records,
    "total_records_removed": total_reduction,
    "total_reduction_rate": f"{reduction_rate:.2f}%",
    "cleaning_steps": {
        "step_1_column_mapping": {
            "records_before": original_records,
            "records_after": original_records,  # Column mapping doesn't remove records
            "records_removed": 0,
            "reduction_rate": "0.00%",
            "description": "Column mapping and standardization"
        },
        "step_2_deduplication": {
            "records_before": original_records,
            "records_after": final_records,
            "records_removed": total_reduction,
            "reduction_rate": f"{reduction_rate:.2f}%",
            "description": "First isolate per patient filtering"
        }
    },
    "data_quality_improvements": {
        "duplicate_records_removed": total_reduction,
        "patient_organism_pairs_deduplicated": True,
        "column_names_standardized": len([k for k in COLUMN_MAPPING.keys() if k in df_raw.columns]),
        "who_glass_compliance_applied": True
    }
}

print(f"✅ Data cleaning summary compiled: {total_reduction:,} records removed ({reduction_rate:.2f}% reduction)")

print("\n3. WHO GLASS Compliance Assessment")
print("-" * 40)

# Assess WHO GLASS compliance
essential_field_completeness = {}
for field in GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED:
    if field in df_cleaned.columns:
        completeness = (df_cleaned[field].notna().sum() / len(df_cleaned)) * 100
        essential_field_completeness[field] = f"{completeness:.1f}%"
    else:
        essential_field_completeness[field] = "0.0% (Missing)"

# Calculate overall compliance metrics
compliance_scores = {
    "essential_fields_present": len([f for f in GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED if f in df_cleaned.columns]),
    "total_essential_fields": len(GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED),
    "minimum_completeness_met": all(
        (df_cleaned[f].notna().sum() / len(df_cleaned)) * 100 >= GLASS_QUALITY_THRESHOLDS['minimum_completeness']
        for f in GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED if f in df_cleaned.columns
    ),
    "temporal_coverage_months": (df_cleaned['SPEC_DATE'].max() - df_cleaned['SPEC_DATE'].min()).days / 30.44,
    "minimum_isolates_met": len(df_cleaned) >= GLASS_QUALITY_THRESHOLDS['minimum_isolates']
}

who_glass_compliance = {
    "essential_fields_compliance": f"{compliance_scores['essential_fields_present']}/{compliance_scores['total_essential_fields']} fields present",
    "data_completeness_compliance": "PASS" if compliance_scores['minimum_completeness_met'] else "REVIEW REQUIRED",
    "temporal_coverage_compliance": f"{compliance_scores['temporal_coverage_months']:.1f} months",
    "sample_size_compliance": "PASS" if compliance_scores['minimum_isolates_met'] else "INSUFFICIENT",
    "first_isolate_rule_applied": True,
    "deduplication_performed": True
}

print(f"✅ WHO GLASS compliance assessed: {compliance_scores['essential_fields_present']}/{compliance_scores['total_essential_fields']} essential fields present")

print("\n4. Quality Metrics Calculation")
print("-" * 40)

# Calculate comprehensive quality metrics
ast_columns = [col for col in df_cleaned.columns if any(ast in col for ast in ['_ND', 'AST_'])]
ast_completeness = []

for col in ast_columns[:10]:  # Sample first 10 AST columns
    completeness = (df_cleaned[col].notna().sum() / len(df_cleaned)) * 100
    ast_completeness.append(completeness)

avg_ast_completeness = np.mean(ast_completeness) if ast_completeness else 0

quality_metrics = {
    "essential_field_completeness": essential_field_completeness,
    "ast_columns_available": len(ast_columns),
    "average_ast_completeness": f"{avg_ast_completeness:.1f}%",
    "data_consistency_checks": {
        "patient_id_format_consistent": df_cleaned['PATIENT_ID'].dtype == 'object',
        "date_format_standardized": df_cleaned['SPEC_DATE'].dtype == 'datetime64[ns]',
        "organism_codes_valid": df_cleaned['ORGANISM_CODE'].notna().sum() > 0
    },
    "missing_data_summary": {
        field: f"{(df_cleaned[field].isna().sum() / len(df_cleaned)) * 100:.1f}%"
        for field in GLASS_ESSENTIAL_FIELDS_MAPPED_UPDATED if field in df_cleaned.columns
    }
}

print(f"✅ Quality metrics calculated: {len(ast_columns)} AST columns, {avg_ast_completeness:.1f}% avg completeness")

print("\n5. Organism and Antimicrobial Analysis")
print("-" * 40)

# Analyze organism distribution
organism_counts = df_cleaned['ORGANISM_CODE'].value_counts()
top_organisms = organism_counts.head(10).to_dict()

# Department distribution
department_distribution = df_cleaned['DEPARTMENT'].value_counts().to_dict()

# Geographic distribution
country_distribution = df_cleaned['COUNTRY'].value_counts().to_dict()
institution_distribution = df_cleaned['INSTITUTION'].value_counts().to_dict()

organism_analysis = {
    "total_unique_organisms": len(organism_counts),
    "top_10_organisms": top_organisms,
    "department_distribution": department_distribution,
    "geographic_distribution": {
        "countries": country_distribution,
        "institutions": len(institution_distribution),
        "institution_coverage": f"{len(institution_distribution)} facilities"
    }
}

print(f"✅ Organism analysis completed: {len(organism_counts)} unique organisms identified")

print("\n6. Compiling Final Report")
print("-" * 40)

# Compile comprehensive quality report
comprehensive_quality_report = {
    "report_metadata": {
        "generation_date": report_date,
        "generation_time": report_time,
        "report_version": "1.0",
        "data_source": "AMR_DATA_FINAL.csv",
        "processing_pipeline": "WHO GLASS Compliant Data Cleaning",
        "analyst": "Automated Data Processing System"
    },
    "dataset_overview": dataset_overview,
    "data_cleaning_summary": data_cleaning_summary,
    "who_glass_compliance": who_glass_compliance,
    "quality_metrics": quality_metrics,
    "organism_analysis": organism_analysis,
    "recommendations": {
        "data_quality_grade": "HIGH" if avg_ast_completeness > 80 else "MODERATE" if avg_ast_completeness > 60 else "REQUIRES_IMPROVEMENT",
        "who_glass_compliance_status": "COMPLIANT" if compliance_scores['minimum_completeness_met'] else "REVIEW_REQUIRED",
        "next_steps": [
            "Regular quality monitoring",
            "Periodic WHO GLASS compliance review",
            "Continuous data validation"
        ]
    }
}

print("✅ Comprehensive quality report compiled successfully")

print("\n7. Report Export")
print("-" * 40)

# Create export directory
export_dir = PROCESSED_DATA_PATH / "quality_reports"
export_dir.mkdir(exist_ok=True)

# Export as JSON
json_filename = f"comprehensive_quality_report_{report_date}.json"
json_path = export_dir / json_filename

try:
    with open(json_path, 'w', encoding='utf-8') as f:
        json.dump(comprehensive_quality_report, f, indent=4, ensure_ascii=False, default=str)
    print(f"✅ JSON report exported: {json_path}")
except Exception as e:
    print(f"❌ Error exporting JSON report: {e}")

# Export summary as CSV
try:
    summary_data = []
    
    # Basic metrics
    summary_data.append(["Metric", "Value", "Category"])
    summary_data.append(["Total Records", final_records, "Dataset Overview"])
    summary_data.append(["Total Patients", dataset_overview['total_patients'], "Dataset Overview"])
    summary_data.append(["Date Range (Days)", dataset_overview['date_range']['span_days'], "Dataset Overview"])
    summary_data.append(["Records Removed", total_reduction, "Data Cleaning"])
    summary_data.append(["Reduction Rate (%)", f"{reduction_rate:.2f}", "Data Cleaning"])
    summary_data.append(["WHO GLASS Compliance", who_glass_compliance['data_completeness_compliance'], "Quality"])
    summary_data.append(["AST Completeness (%)", f"{avg_ast_completeness:.1f}", "Quality"])
    
    # Essential field completeness
    for field, completeness in essential_field_completeness.items():
        summary_data.append([f"{field} Completeness", completeness, "Essential Fields"])
    
    summary_df = pd.DataFrame(summary_data[1:], columns=summary_data[0])
    csv_filename = f"quality_report_summary_{report_date}.csv"
    csv_path = export_dir / csv_filename
    summary_df.to_csv(csv_path, index=False)
    print(f"✅ CSV summary exported: {csv_path}")
    
except Exception as e:
    print(f"❌ Error exporting CSV summary: {e}")

# Export detailed text report
try:
    txt_filename = f"quality_report_detailed_{report_date}.txt"
    txt_path = export_dir / txt_filename
    
    with open(txt_path, 'w', encoding='utf-8') as f:
        f.write("="*80 + "\n")
        f.write("WHO GLASS AMR DATA QUALITY REPORT\n")
        f.write("="*80 + "\n")
        f.write(f"Report Generated: {report_date} {report_time}\n")
        f.write(f"Data Source: AMR_DATA_FINAL.csv\n")
        f.write(f"Processing Pipeline: WHO GLASS Compliant Data Cleaning\n\n")
        
        f.write("EXECUTIVE SUMMARY\n")
        f.write("-"*80 + "\n")
        f.write(f"• Total Records Processed: {final_records:,}\n")
        f.write(f"• Unique Patients: {dataset_overview['total_patients']:,}\n")
        f.write(f"• Data Reduction Rate: {reduction_rate:.2f}%\n")
        f.write(f"• WHO GLASS Compliance: {who_glass_compliance['data_completeness_compliance']}\n")
        f.write(f"• Average AST Completeness: {avg_ast_completeness:.1f}%\n\n")
        
        f.write("ESSENTIAL FIELDS COMPLETENESS\n")
        f.write("-"*80 + "\n")
        for field, completeness in essential_field_completeness.items():
            f.write(f"• {field}: {completeness}\n")
        
        f.write(f"\nTOP 5 ORGANISMS\n")
        f.write("-"*80 + "\n")
        for i, (organism, count) in enumerate(list(top_organisms.items())[:5], 1):
            f.write(f"{i}. {organism}: {count:,} isolates\n")
        
        f.write(f"\nDATA QUALITY RECOMMENDATIONS\n")
        f.write("-"*80 + "\n")
        for rec in comprehensive_quality_report['recommendations']['next_steps']:
            f.write(f"• {rec}\n")
    
    print(f"✅ Detailed text report exported: {txt_path}")
    
except Exception as e:
    print(f"❌ Error exporting text report: {e}")

print(f"\n📁 All reports exported to: {export_dir}")
print(f"📊 Report files generated:")
print(f"   • {json_filename} (Comprehensive JSON)")
print(f"   • {csv_filename} (Summary CSV)")
print(f"   • {txt_filename} (Detailed Text)")

print("\n" + "="*70)
print("✅ COMPREHENSIVE DATA QUALITY REPORT GENERATION COMPLETED")
print("="*70)


📊 GENERATING COMPREHENSIVE DATA QUALITY REPORT
📅 Report Generation Date: 2025-06-11
⏰ Report Generation Time: 15:38:17

1. Dataset Overview Compilation
----------------------------------------
✅ Dataset overview compiled: 32,688 records, 30,081 patients

2. Data Cleaning Summary Compilation
----------------------------------------
✅ Data cleaning summary compiled: 3,485 records removed (9.63% reduction)

3. WHO GLASS Compliance Assessment
----------------------------------------
✅ WHO GLASS compliance assessed: 8/8 essential fields present

4. Quality Metrics Calculation
----------------------------------------
✅ Quality metrics calculated: 34 AST columns, 3.3% avg completeness

5. Organism and Antimicrobial Analysis
----------------------------------------
✅ Organism analysis completed: 76 unique organisms identified

6. Compiling Final Report
----------------------------------------
✅ Comprehensive quality report compiled successfully

7. Report Export
------------------------------

In [15]:
# Export the cleaned and deduplicated dataset
print("\n" + "="*70)
print("📤 EXPORTING CLEANED AND DEDUPLICATED DATASET")
print("="*70)

# Create the export directory
export_path = Path(r"C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\deduplicated")
export_path.mkdir(parents=True, exist_ok=True)

# Generate timestamped filename
export_filename = f"df_cleaned_{report_date}.csv"
export_filepath = export_path / export_filename

try:
    # Export the cleaned dataframe
    print(f"📊 Exporting cleaned dataset to: {export_filepath}")
    df_cleaned.to_csv(export_filepath, index=False)
    
    # Verify the export
    exported_size = export_filepath.stat().st_size / (1024 * 1024)  # Size in MB
    
    print(f"✅ Export completed successfully!")
    print(f"📁 File location: {export_filepath}")
    print(f"📊 Records exported: {len(df_cleaned):,}")
    print(f"📦 File size: {exported_size:.2f} MB")
    print(f"📋 Columns exported: {len(df_cleaned.columns)}")
    
    # Create export metadata
    export_metadata = {
        "export_timestamp": datetime.now().isoformat(),
        "original_file": "AMR_DATA_FINAL.csv",
        "export_file": export_filename,
        "records_exported": len(df_cleaned),
        "columns_exported": len(df_cleaned.columns),
        "file_size_mb": round(exported_size, 2),
        "data_processing_summary": {
            "column_mapping_applied": True,
            "deduplication_applied": True,
            "who_glass_compliance": True,
            "records_removed": total_reduction,
            "reduction_rate_percent": round(reduction_rate, 2)
        },
        "column_list": df_cleaned.columns.tolist()
    }
    
    # Save metadata file
    metadata_filename = f"export_metadata_{report_date}.json"
    metadata_filepath = export_path / metadata_filename
    
    with open(metadata_filepath, 'w', encoding='utf-8') as f:
        json.dump(export_metadata, f, indent=4, ensure_ascii=False)
    
    print(f"📝 Export metadata saved: {metadata_filepath}")
    
except Exception as e:
    print(f"❌ Error during export: {e}")
    raise

print("\n" + "="*70)
print("✅ DATASET EXPORT COMPLETED")
print("="*70)


📤 EXPORTING CLEANED AND DEDUPLICATED DATASET
📊 Exporting cleaned dataset to: C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\deduplicated\df_cleaned_2025-06-11.csv
✅ Export completed successfully!
📁 File location: C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\deduplicated\df_cleaned_2025-06-11.csv
📊 Records exported: 32,688
📦 File size: 3.14 MB
📋 Columns exported: 45
📝 Export metadata saved: C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\deduplicated\export_metadata_2025-06-11.json

✅ DATASET EXPORT COMPLETED


In [None]:
#