# SRTR Data Linkage

This notebook links SRTR actual donor data to CLIF potential donors for a specified site.
It identifies which CLIF-defined potential donors became actual organ donors.

## 1. Setup and Imports

In [None]:
import json
import logging
import sys
from pathlib import Path
from datetime import datetime
import polars as pl
import pandas as pd

# Add parent directory to path for imports.
# NOTE(review): assumes the notebook runs from a direct subdirectory of the
# project root, so that `utils` resolves from the parent — TODO confirm.
sys.path.append(str(Path.cwd().parent))
from utils.io import read_data

# Configure logging: INFO level; each record carries timestamp, logger name,
# level and message.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)


# Site-specific settings come from the project's config object; the keys read
# below must all be present or a KeyError is raised here.
from utils.config import config
site_name = config['site_name']
tables_path = config['tables_path']
file_type = config['file_type']
project_root = config['project_root']
SRTR_data_path = config["SRTR_data_path"]
# Put the configured project root first on sys.path so it takes precedence
# over the parent-directory entry appended above.
sys.path.insert(0, project_root)
print(f"Site Name: {site_name}")
print(f"Tables Path: {tables_path}")
print(f"File Type: {file_type}")
# Path is already imported at the top of this cell; this re-import is harmless.
from pathlib import Path
# Directory layout derived from the project root. Later cells read from and
# write to OUTPUT_INTERMEDIATE_DIR.
PROJECT_ROOT = Path(config['project_root'])
UTILS_DIR = PROJECT_ROOT / "utils"
OUTPUT_DIR = PROJECT_ROOT / "output"
OUTPUT_FINAL_DIR = OUTPUT_DIR / "final"
OUTPUT_INTERMEDIATE_DIR = OUTPUT_DIR / "intermediate"


# Data

In [None]:
import pandas as pd

# Load the donor lookup table; later cells read its DONOR_ID and
# DON_HOSP_PROVIDER_NUM columns. Despite the variable name, the file is a CSV.
# NOTE(review): the path is relative to the working directory, not to
# PROJECT_ROOT — confirm the notebook is always run from the same folder.
donor_xlsx_path = "../utils/unos_donors_clif.csv"
donor_df = pd.read_csv(donor_xlsx_path)

def decode_bytes_in_object(df):
    """
    Turn byte-string cells in object-dtype columns into text.

    Every object column that holds at least one ``bytes`` value has those
    values decoded as UTF-8 (undecodable bytes become the replacement
    character); non-bytes cells are left as they are. The frame is modified
    in place and also returned. A failure in one column is printed and does
    not stop the remaining columns from being processed.
    """
    def _is_bytes(value):
        return isinstance(value, bytes)

    def _as_text(value):
        if _is_bytes(value):
            return value.decode('utf-8', errors='replace')
        return value

    object_columns = df.select_dtypes(include=['object']).columns
    for col in object_columns:
        try:
            # A single bytes cell is enough to trigger decoding of the column
            if not df[col].apply(_is_bytes).any():
                continue
            df[col] = df[col].apply(_as_text)
        except Exception as exc:
            print(f"Decoding error in column {col}: {exc}")
    return df

# Read the SRTR SAS tables from the configured SRTR_data_path.
# ISO-8859-1 (latin1) is used because it is more permissive than utf-8 and can
# handle many common SAS7BDAT byte values. decode_bytes_in_object is kept as a
# safety net for any cells that still arrive as raw bytes.
donor_deceased_filepath = SRTR_data_path + "/" +  "donor_deceased.sas7bdat"
donor_deceased = pd.read_sas(
    donor_deceased_filepath,
    format='sas7bdat',
    encoding='latin1'
)
donor_deceased = decode_bytes_in_object(donor_deceased)

# Build the institution path from the config as well. Previously the computed
# path was discarded and a user-specific absolute path was read instead, which
# fails on every other machine or site.
institution_filepath = SRTR_data_path + "/" + "institution.sas7bdat"
institution = pd.read_sas(
    institution_filepath,
    format='sas7bdat',
    encoding='latin1'
)
institution = decode_bytes_in_object(institution)

In [None]:
# Work out which SRTR donors belong to the current site, using the hospital
# provider mapping kept in the config folder.
import json

hospital_mapping_path = "../config/hospital_provider_mapping.json"
with open(hospital_mapping_path, "r") as f:
    hospital_mapping = json.load(f)

# site_name comes from the config (e.g., "ucmc", "upenn", etc.); a site that is
# missing from the mapping raises KeyError here.
site_providers = hospital_mapping[site_name]["provider_numbers"]
# Compare provider numbers as text so numeric and string encodings both match.
# NOTE(review): if the CSV column was parsed as a number, any leading zeros in
# provider numbers are already lost at this point — verify against the mapping.
site_providers_str = {str(p) for p in site_providers}

# Rows of the donor lookup whose hospital provider number belongs to this site
donor_provider_as_text = donor_df["DON_HOSP_PROVIDER_NUM"].astype(str)
matching_donor_rows = donor_df[donor_provider_as_text.isin(site_providers_str)]
# DONOR_IDs for the site, cast to int32 so they compare equal to the SAS ids
donor_ids_for_site = set(matching_donor_rows["DONOR_ID"].astype("int32"))

print(f"Site '{site_name}' provider numbers: {site_providers}")
print(f"Found {len(donor_ids_for_site)} donor IDs for site '{site_name}'")

# Keep only the deceased-donor records whose DONOR_ID is in the site's set
site_donor_mask = donor_deceased["DONOR_ID"].astype("int32").isin(donor_ids_for_site)
donor_deceased_site = donor_deceased[site_donor_mask].copy()

print(f"Filtered donor_deceased to {len(donor_deceased_site)} records for site '{site_name}'")

In [None]:
# Keep only the SRTR columns that the descriptive comparison and the linkage
# steps below rely on (identifiers, demographics, dates, cause of death, labs).
donor_deceased_site_filtered = donor_deceased_site[
    [
        "DONOR_ID",
        "DON_OPO_CTR_ID",
        "PERS_ID",
        "DON_AGE",
        "DON_GENDER",
        "DON_RACE",
        "DON_RACE_SRTR",
        "DON_ETHNICITY_SRTR",
        "DON_HGT_CM",
        "DON_WGT_KG",
        "DON_RECOV_DT",
        "DON_DCD_SUPPORT_WITHDRAW_DT",
        "DON_DCD_AGONAL_BEGIN_DT",
        "DON_CAD_DON_COD",
        "DON_DEATH_MECH",
        "DON_DEATH_CIRCUM",
        "DON_CREAT",
        "DON_BUN",
        "DON_TOT_BILI"
    ]
].copy()

# Save the filtered DataFrame to OUTPUT_INTERMEDIATE_DIR as a CSV.
# This snapshot is written before the recovery-date filter applied later in
# the notebook, so it contains all years for the site.
import os

# to_csv does not create missing parent directories, so make sure the output
# folder exists (a fresh checkout may not have it yet).
os.makedirs(OUTPUT_INTERMEDIATE_DIR, exist_ok=True)
output_path = os.path.join(OUTPUT_INTERMEDIATE_DIR, f"donor_deceased_site_filtered_{site_name}.csv")
donor_deceased_site_filtered.to_csv(output_path, index=False)
print(f"Saved donor_deceased_site_filtered to {output_path}")

In [None]:
import matplotlib.pyplot as plt

# Make sure DON_RECOV_DT is a datetime column; values that cannot be parsed
# become NaT. The converted column is stored back because later cells compare
# it against date bounds.
donor_deceased_site_filtered['DON_RECOV_DT'] = pd.to_datetime(
    donor_deceased_site_filtered['DON_RECOV_DT'],
    errors='coerce',
)
recovery_dates = donor_deceased_site_filtered['DON_RECOV_DT'].dropna()

# Distribution of recovery dates for the site (missing dates are left out)
plt.figure(figsize=(10,6))
plt.hist(recovery_dates, bins=30, color='skyblue', edgecolor='black')
plt.xlabel('DON_RECOV_DT')
plt.ylabel('Frequency')
plt.title('Histogram of DON_RECOV_DT')
plt.tight_layout()
plt.show()

In [None]:
# Restrict the SRTR donors to the study window by recovery date.
# The lower bound is inclusive and the upper bound exclusive, so recoveries
# from 2018-01-01 through 2023-12-31 are kept; rows with a missing date drop out.
# NOTE(review): the printed message says "between 2018 and 2024" — confirm
# whether 2024 recoveries are meant to be excluded.
recov_dt = donor_deceased_site_filtered['DON_RECOV_DT']
in_study_window = (recov_dt >= '2018-01-01') & (recov_dt < '2024-01-01')
donor_deceased_site_filtered = donor_deceased_site_filtered[in_study_window].copy()
print(f"Filtered donor_deceased_site_filtered to dates between 2018 and 2024. New shape: {donor_deceased_site_filtered.shape}")

# Report how many distinct donors remain after the date filter
num_unique_donors = donor_deceased_site_filtered['DONOR_ID'].nunique()
print(f"Number of unique donors in filtered data: {num_unique_donors}")


# Link with CLIF data for potential donors

In [None]:
import pandas as pd
import os
# Read the Parquet file 'final_cohort_df.parquet' from OUTPUT_INTERMEDIATE_DIR.
# This is the CLIF-side cohort that the SRTR donors are linked against below.
# NOTE(review): presumably written by an earlier notebook/script in the
# pipeline — confirm it has been run for this site before this cell.
final_cohort_path = os.path.join(OUTPUT_INTERMEDIATE_DIR, "final_cohort_df.parquet")
final_cohort_df = pd.read_parquet(final_cohort_path)
print(f"Read final_cohort_df from {final_cohort_path} with shape {final_cohort_df.shape}")


In [None]:
# Display the CLIF cohort's column names; the cells below reference several of
# them by name (e.g. patient_id, final_death_dttm, age_at_death).
final_cohort_df.columns

In [None]:
import pandas as pd
import numpy as np

# Shared reporting helper so every variable is summarised in the same layout
def print_var_summary(df, col, name, is_numeric=True, fmt="{:.1f}", extra=None):
    """
    Print a short profile of one column.

    Always prints the label, dtype and missing-value count/percentage.
    For numeric columns with at least one non-missing value it adds the range
    and mean (SD), formatted with ``fmt``, plus quartiles when
    ``extra == "quantiles"``. For non-numeric columns it prints the unique
    values and their counts instead. A blank line closes the summary.

    Parameters
    ----------
    df : DataFrame
        Frame holding the column.
    col : str
        Column to summarise.
    name : str
        Label printed as the heading of the summary.
    is_numeric : bool
        Selects the numeric or the categorical report.
    fmt : str
        ``str.format`` template applied to each numeric statistic.
    extra : str, optional
        Pass "quantiles" to also print the 25th/50th/75th percentiles.
    """
    print(f"{name}:")
    series = df[col]
    print(f"  Data type: {series.dtype}")
    na_flags = series.isna()
    print(f"  Missing values: {na_flags.sum()} ({na_flags.mean() * 100:.1f}%)")

    if not is_numeric:
        print(f"  Unique values: {series.unique()}")
        print("  Value counts:")
        print(series.value_counts())
    elif series.notna().any():
        # Compute every statistic before printing so a failure prints nothing partial
        lowest, highest = series.min(), series.max()
        average, spread = series.mean(), series.std()
        print(f"  Range: {fmt.format(lowest)} to {fmt.format(highest)}")
        print(f"  Mean (SD): {fmt.format(average)} ({fmt.format(spread)})")
        if extra == "quantiles":
            quartiles = series.quantile([0.25, 0.5, 0.75]).values
            print(f"  Quartiles: {np.round(quartiles, 2)}")
    print()

def analyze_matching_variables(clif_df, srtr_df):
    """
    Print a side-by-side profile of the candidate linkage variables.

    Covers record counts, dates, age, sex, race, ethnicity, height, weight,
    creatinine and bilirubin, always reporting the CLIF column first and the
    corresponding SRTR column second. Nothing is returned; the function only
    prints.

    Parameters
    ----------
    clif_df : DataFrame
        CLIF cohort with the columns referenced below (final_death_dttm,
        age_at_death, sex_category, race_category, ethnicity_category,
        last_height_cm, last_weight_kg, creatinine_value,
        bilirubin_total_value).
    srtr_df : DataFrame
        SRTR donors with the matching DON_* columns.
    """
    rule = "=" * 80

    def section(heading):
        # Section banner: two rules followed by the heading text
        print(rule)
        print(rule + f"\n{heading}")

    def summarize_pair(clif_col, srtr_col, **summary_options):
        # Same summary for the CLIF column and its SRTR counterpart
        print_var_summary(clif_df, clif_col, f"CLIF - {clif_col}", **summary_options)
        print_var_summary(srtr_df, srtr_col, f"SRTR - {srtr_col}", **summary_options)

    def category_counts(source, frame, column, top=None):
        # Categorical report; `top` limits the listing and adds the total count
        print(f"{source} - {column}:")
        values = frame[column]
        print(f"  Data type: {values.dtype}")
        if top is None:
            print("  Unique values:")
            print(values.value_counts())
        else:
            print(f"  Unique values ({values.nunique()} total):")
            print(values.value_counts().head(top))
        print()

    print(rule)
    print("Dataframe overview".upper())
    print(f"CLIF Records: {len(clif_df):,}")
    print(f"SRTR Records: {len(srtr_df):,}\n")

    section("1. DATE VARIABLES")
    summarize_pair('final_death_dttm', 'DON_RECOV_DT', is_numeric=False)
    # The withdrawal date gets its own short report when the column is present
    dcd_col = 'DON_DCD_SUPPORT_WITHDRAW_DT'
    if dcd_col in srtr_df.columns:
        print("SRTR - DON_DCD_SUPPORT_WITHDRAW_DT (DCD only):")
        withdraw_dates = srtr_df[dcd_col]
        print(f"  Non-null values: {withdraw_dates.notna().sum()}")
        print(f"  Date range: {withdraw_dates.min()} to {withdraw_dates.max()}\n")

    section("2. AGE")
    summarize_pair('age_at_death', 'DON_AGE', is_numeric=True, fmt="{:.1f}", extra="quantiles")

    section("3. SEX/GENDER")
    summarize_pair('sex_category', 'DON_GENDER', is_numeric=False)

    section("4. RACE")
    category_counts("CLIF", clif_df, 'race_category', top=10)
    category_counts("SRTR", srtr_df, 'DON_RACE_SRTR', top=10)

    section("5. ETHNICITY")
    category_counts("CLIF", clif_df, 'ethnicity_category')
    category_counts("SRTR", srtr_df, 'DON_ETHNICITY_SRTR')

    # Remaining numeric sections share one layout and differ only in precision
    numeric_sections = (
        ("6. HEIGHT (cm)", 'last_height_cm', 'DON_HGT_CM', "{:.1f}"),
        ("7. WEIGHT (kg)", 'last_weight_kg', 'DON_WGT_KG', "{:.1f}"),
        ("8. CREATININE", 'creatinine_value', 'DON_CREAT', "{:.2f}"),
        ("9. BILIRUBIN", 'bilirubin_total_value', 'DON_TOT_BILI', "{:.2f}"),
    )
    for heading, clif_col, srtr_col, number_format in numeric_sections:
        section(heading)
        summarize_pair(clif_col, srtr_col, is_numeric=True, fmt=number_format)

# Run the analysis on the CLIF cohort and the site- and date-filtered SRTR
# donors. The function only prints its report; it returns nothing.
analyze_matching_variables(final_cohort_df, donor_deceased_site_filtered)

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

def standardize_dataframes_for_matching(clif_df, srtr_df):
    """
    Bring CLIF and SRTR records onto a shared set of matching columns.

    Both inputs are copied, so the caller's frames are not modified. Dates are
    reduced to calendar dates, sex/race/ethnicity are recoded to shared labels,
    age and height are rounded to nullable integers, and weight, creatinine and
    bilirubin are rounded to fixed precision. Progress is printed along the way.

    Parameters
    ----------
    clif_df : DataFrame
        Needs patient_id, final_death_dttm, sex_category, race_category,
        ethnicity_category, age_at_death, last_height_cm, last_weight_kg,
        creatinine_value and bilirubin_total_value.
    srtr_df : DataFrame
        Needs DONOR_ID, DON_RECOV_DT, DON_GENDER, DON_RACE_SRTR,
        DON_ETHNICITY_SRTR, DON_AGE, DON_HGT_CM, DON_WGT_KG, DON_CREAT,
        DON_TOT_BILI, DON_DEATH_MECH and DON_CAD_DON_COD;
        DON_DCD_SUPPORT_WITHDRAW_DT is optional.

    Returns
    -------
    tuple of DataFrame
        (CLIF frame, SRTR frame) restricted to the standardized columns. The
        SRTR frame also carries DON_DEATH_MECH and DON_CAD_DON_COD unchanged.
    """
    rule = "=" * 80

    # Work on copies so the inputs stay untouched
    clif_out, srtr_out = clif_df.copy(), srtr_df.copy()
    labelled_frames = (("CLIF", clif_out), ("SRTR", srtr_out))

    def to_calendar_date(column):
        # Parse to datetime, then drop the time-of-day component
        return pd.to_datetime(column).dt.date

    def to_whole_number(column):
        # Nullable integer keeps missing values after rounding
        return column.round().astype('Int64')

    print("Standardizing dataframes for matching...")
    print(rule)

    # ---- 1. Dates ----------------------------------------------------------
    print("1. Standardizing dates...")
    clif_out['death_date'] = to_calendar_date(clif_out['final_death_dttm'])
    srtr_out['recovery_date'] = to_calendar_date(srtr_out['DON_RECOV_DT'])

    # Prefer the support-withdrawal date when it exists, otherwise fall back
    # to the recovery date
    if 'DON_DCD_SUPPORT_WITHDRAW_DT' not in srtr_out.columns:
        srtr_out['match_date'] = srtr_out['recovery_date']
    else:
        srtr_out['dcd_withdraw_date'] = to_calendar_date(srtr_out['DON_DCD_SUPPORT_WITHDRAW_DT'])
        srtr_out['match_date'] = srtr_out['dcd_withdraw_date'].fillna(srtr_out['recovery_date'])

    print(f"  CLIF death dates: {clif_out['death_date'].min()} to {clif_out['death_date'].max()}")
    print(f"  SRTR match dates: {srtr_out['match_date'].min()} to {srtr_out['match_date'].max()}")

    # ---- 2. Sex ------------------------------------------------------------
    print("\n2. Standardizing sex/gender...")
    # CLIF spells the category out; DON_GENDER is copied through unchanged
    clif_out['sex_std'] = clif_out['sex_category'].map({'Male': 'M', 'Female': 'F'})
    srtr_out['sex_std'] = srtr_out['DON_GENDER']
    for source, frame in labelled_frames:
        print(f"  {source} sex values: {frame['sex_std'].value_counts().to_dict()}")

    # ---- 3. Race -----------------------------------------------------------
    print("\n3. Standardizing race...")
    clif_race_labels = {
        'Black or African American': 'BLACK',
        'White': 'WHITE',
        'Asian': 'ASIAN',
        'American Indian or Alaska Native': 'NATIVE',
        'Native Hawaiian or Other Pacific Islander': 'PACIFIC',
        'Other': 'OTHER',
        'Unknown': 'UNKNOWN'
    }
    # Unmapped or missing values on either side collapse to UNKNOWN
    clif_out['race_std'] = clif_out['race_category'].map(clif_race_labels).fillna('UNKNOWN')
    srtr_out['race_std'] = srtr_out['DON_RACE_SRTR'].fillna('UNKNOWN')
    for source, frame in labelled_frames:
        print(f"  {source} race categories:")
        print(frame['race_std'].value_counts().head())

    # ---- 4. Ethnicity ------------------------------------------------------
    print("\n4. Standardizing ethnicity...")
    clif_ethnicity_labels = {
        'Hispanic': 'HISPANIC',
        'Non-Hispanic': 'NON-HISPANIC',
        'Unknown': 'UNKNOWN'
    }
    srtr_ethnicity_labels = {
        'LATINO': 'HISPANIC',
        'NLATIN': 'NON-HISPANIC'
    }
    clif_out['ethnicity_std'] = clif_out['ethnicity_category'].map(clif_ethnicity_labels).fillna('UNKNOWN')
    srtr_out['ethnicity_std'] = srtr_out['DON_ETHNICITY_SRTR'].map(srtr_ethnicity_labels).fillna('UNKNOWN')
    for source, frame in labelled_frames:
        print(f"  {source} ethnicity: {frame['ethnicity_std'].value_counts().to_dict()}")

    # ---- 5. Age ------------------------------------------------------------
    print("\n5. Standardizing age...")
    clif_out['age_std'] = to_whole_number(clif_out['age_at_death'])
    srtr_out['age_std'] = to_whole_number(srtr_out['DON_AGE'])
    for source, frame in labelled_frames:
        ages = frame['age_std']
        print(f"  {source} age: mean={ages.mean():.1f}, range={ages.min()}-{ages.max()}")

    # ---- 6. Clinical values ------------------------------------------------
    print("\n6. Standardizing clinical values...")
    # (standardized column, CLIF source, SRTR source, rounding rule)
    clinical_specs = (
        ('height_std', 'last_height_cm', 'DON_HGT_CM', to_whole_number),
        ('weight_std', 'last_weight_kg', 'DON_WGT_KG', lambda column: column.round(1)),
        ('creatinine_std', 'creatinine_value', 'DON_CREAT', lambda column: column.round(2)),
        ('bilirubin_std', 'bilirubin_total_value', 'DON_TOT_BILI', lambda column: column.round(2)),
    )
    for std_col, clif_col, srtr_col, rounding in clinical_specs:
        clif_out[std_col] = rounding(clif_out[clif_col])
        srtr_out[std_col] = rounding(srtr_out[srtr_col])

    for label, std_col in (
        ('Height', 'height_std'),
        ('Weight', 'weight_std'),
        ('Creatinine', 'creatinine_std'),
        ('Bilirubin', 'bilirubin_std'),
    ):
        print(f"  {label} missing - CLIF: {clif_out[std_col].isna().sum()}, SRTR: {srtr_out[std_col].isna().sum()}")

    # ---- 7. Identifiers ----------------------------------------------------
    print("\n7. Adding standardized identifiers...")
    clif_out['clif_id'] = clif_out['patient_id']
    srtr_out['srtr_donor_id'] = srtr_out['DONOR_ID']

    print(rule)
    print("Standardization complete!")

    # Only the standardized columns (plus two SRTR cause-of-death fields) leave
    clif_columns = [
        'clif_id', 'death_date', 'age_std', 'sex_std', 'race_std',
        'ethnicity_std', 'height_std', 'weight_std', 'creatinine_std', 'bilirubin_std'
    ]
    srtr_columns = [
        'srtr_donor_id', 'match_date', 'age_std', 'sex_std', 'race_std',
        'ethnicity_std', 'height_std', 'weight_std', 'creatinine_std',
        'bilirubin_std', 'DON_DEATH_MECH', 'DON_CAD_DON_COD'
    ]
    return clif_out[clif_columns], srtr_out[srtr_columns]

# Build the standardized CLIF and SRTR frames used by the matching steps
clif_standardized, srtr_standardized = standardize_dataframes_for_matching(
    clif_df=final_cohort_df,
    srtr_df=donor_deceased_site_filtered,
)

# Preview both outputs: first rows, then the shape
for source_label, standardized_frame in (
    ("CLIF", clif_standardized),
    ("SRTR", srtr_standardized),
):
    print(f"\n{source_label} Standardized (first 5 rows):")
    print(standardized_frame.head())
    print(f"\nShape: {standardized_frame.shape}")

# Now, let's create a matching function that accounts for the small number of SRTR records and the date range differences:

def _score_candidate(clif_row, srtr_row):
    """Score one CLIF/SRTR pair; return (score, names of the agreeing fields)."""
    score = 0
    agreed = []

    # Exact demographic agreement carries the most weight
    for label, field, points in (
        ('race', 'race_std', 10),
        ('ethnicity', 'ethnicity_std', 5),
    ):
        if clif_row[field] == srtr_row[field]:
            score += points
            agreed.append(label)

    # Clinical values only count when both sides are present and close enough
    for label, field, tolerance, points in (
        ('height', 'height_std', 5, 3),
        ('weight', 'weight_std', 5, 3),
        ('creatinine', 'creatinine_std', 0.5, 2),
    ):
        clif_value, srtr_value = clif_row[field], srtr_row[field]
        if pd.notna(clif_value) and pd.notna(srtr_value):
            if abs(clif_value - srtr_value) <= tolerance:
                score += points
                agreed.append(label)

    return score, agreed


def perform_matching(clif_std, srtr_std, date_window_days=7, age_tolerance=2):
    """
    Perform matching between CLIF and SRTR standardized dataframes

    A CLIF record is a candidate for an SRTR donor when sex is equal, age is
    within ``age_tolerance`` and the CLIF death date is within
    ``date_window_days`` of the SRTR match date. Candidates are scored on
    race, ethnicity, height, weight and creatinine agreement, and the best
    candidate is kept for each CLIF patient. Ties on score are resolved in
    favour of the smallest absolute date difference, then the smallest age
    difference, so the result does not depend on row order.

    NOTE: de-duplication is per CLIF patient only; one SRTR donor can still
    be linked to several CLIF patients.

    Parameters
    ----------
    clif_std : DataFrame
        Standardized CLIF data
    srtr_std : DataFrame
        Standardized SRTR data
    date_window_days : int
        Number of days before/after death to consider a match
    age_tolerance : int
        Age difference tolerance in years

    Returns
    -------
    DataFrame
        One row per matched CLIF patient, best score first. The columns
        (clif_id, srtr_donor_id, date_diff_days, age_diff, match_score,
        match_details, clif_death_date, srtr_match_date) are present even
        when no match is found.
    """
    result_columns = [
        'clif_id', 'srtr_donor_id', 'date_diff_days', 'age_diff',
        'match_score', 'match_details', 'clif_death_date', 'srtr_match_date',
    ]

    print(f"Matching {len(clif_std)} CLIF records against {len(srtr_std)} SRTR records...")
    print(f"Date window: ±{date_window_days} days")
    print(f"Age tolerance: ±{age_tolerance} years")
    print("=" * 80)

    # Parse the CLIF death dates once; missing or unparseable dates become NaT
    # and therefore never fall inside the date window.
    clif_death = pd.to_datetime(clif_std['death_date'], errors='coerce')

    matches = []
    for _, srtr_row in srtr_std.iterrows():
        srtr_date = pd.to_datetime(srtr_row['match_date'], errors='coerce')
        # A donor without a usable date or age cannot satisfy the filters
        if pd.isna(srtr_date) or pd.isna(srtr_row['age_std']):
            continue

        date_diff = (clif_death - srtr_date).dt.days
        age_gap = (clif_std['age_std'] - srtr_row['age_std']).abs()
        # Missing values on the CLIF side count as "not eligible"
        eligible = (
            (clif_std['sex_std'] == srtr_row['sex_std']).fillna(False).astype(bool)
            & (age_gap <= age_tolerance).fillna(False).astype(bool)
            & (date_diff.abs() <= date_window_days).fillna(False).astype(bool)
        )

        # Positional access keeps this correct even if the index has duplicates
        for pos in eligible.to_numpy().nonzero()[0]:
            clif_row = clif_std.iloc[pos]
            match_score, match_details = _score_candidate(clif_row, srtr_row)
            matches.append({
                'clif_id': clif_row['clif_id'],
                'srtr_donor_id': srtr_row['srtr_donor_id'],
                'date_diff_days': int(date_diff.iloc[pos]),
                'age_diff': abs(clif_row['age_std'] - srtr_row['age_std']),
                'match_score': match_score,
                'match_details': ', '.join(match_details),
                'clif_death_date': clif_row['death_date'],
                'srtr_match_date': srtr_row['match_date'],
            })

    # Passing the column list keeps the schema stable when nothing matched
    matches_df = pd.DataFrame(matches, columns=result_columns)

    if matches_df.empty:
        print("No matches found!")
        return matches_df

    # Best score first; equal scores are ordered by closeness in date, then age
    matches_df = (
        matches_df
        .assign(_abs_date_diff=matches_df['date_diff_days'].abs())
        .sort_values(
            ['match_score', '_abs_date_diff', 'age_diff'],
            ascending=[False, True, True],
        )
        # Remove duplicates (keep best match for each CLIF patient)
        .drop_duplicates(subset=['clif_id'], keep='first')
        .drop(columns='_abs_date_diff')
    )

    print(f"\nFound {len(matches_df)} potential matches")
    print("Match score distribution:")
    print(matches_df['match_score'].value_counts().sort_index(ascending=False))

    return matches_df

# Run the score-based linkage with a 7-day window and 2-year age tolerance.
# The cell that follows assigns `matches` again using the exact-criteria approach.
matches = perform_matching(
    clif_standardized,
    srtr_standardized,
    date_window_days=7,
    age_tolerance=2,
)

if len(matches) > 0:
    preview_columns = ['clif_id', 'srtr_donor_id', 'date_diff_days', 'match_score', 'match_details']
    print("\nTop 10 matches by score:")
    print(matches[preview_columns].head(10))

In [None]:
import numpy as np
import pandas as pd

date_window_days = 7
age_tolerance = 1

print(f"Matching {len(clif_standardized)} CLIF records against {len(srtr_standardized)} SRTR records...")
print(f"Date window: ±{date_window_days} days")
print(f"Age tolerance: ±{age_tolerance} years")
print("=" * 80)

# Pair every CLIF record with every SRTR record by joining on a constant key.
# Columns present on both sides get the _clif / _srtr suffixes.
clif_std = clif_standardized.copy()
srtr_std = srtr_standardized.copy()
clif_std['_tmpkey'] = 1
srtr_std['_tmpkey'] = 1
merged = (
    clif_std
    .merge(srtr_std, on='_tmpkey', suffixes=('_clif', '_srtr'))
    .drop(columns='_tmpkey')
)

# Both date columns must be datetimes before they can be subtracted
for date_col in ('death_date', 'match_date'):
    merged[date_col] = pd.to_datetime(merged[date_col], errors='coerce')

# Signed difference in days (CLIF death date minus SRTR match date)
merged['date_diff_days'] = (merged['death_date'] - merged['match_date']).dt.days

# A pair is kept when all of the following hold:
# - the dates are known and within the window
# - the ages are within tolerance
# - race is identical
# - ethnicity is identical
# NOTE(review): sex is not part of this filter, and pairs are not
# de-duplicated, so one CLIF patient can appear with several SRTR donors.
age_gap = (merged['age_std_clif'] - merged['age_std_srtr']).abs()
within_date_window = (
    merged['date_diff_days'].notnull()
    & (merged['date_diff_days'].abs() <= date_window_days)
)
mask = (
    within_date_window
    & (age_gap <= age_tolerance)
    & (merged['race_std_clif'] == merged['race_std_srtr'])
    & (merged['ethnicity_std_clif'] == merged['ethnicity_std_srtr'])
)

matches = merged[mask].copy()

if len(matches) == 0:
    print("No matches found with exact match criteria (death date window, age, race, ethnicity)")
else:
    matches['age_diff'] = (matches['age_std_clif'] - matches['age_std_srtr']).abs()
    # Trim to the reporting columns and give them source-neutral names
    reporting_columns = [
        'clif_id', 'srtr_donor_id', 'date_diff_days', 'age_diff',
        'death_date', 'match_date', 'race_std_clif', 'ethnicity_std_clif'
    ]
    matches = matches[reporting_columns].rename(columns={
        'death_date': 'clif_death_date',
        'match_date': 'srtr_match_date',
        'race_std_clif': 'race',
        'ethnicity_std_clif': 'ethnicity'
    })
    print(f"\nFound {len(matches)} exact matches.")
    print("\nTop 10 matches:")
    print(matches.head(10)[[
        'clif_id', 'srtr_donor_id', 'date_diff_days', 'age_diff', 'race', 'ethnicity'
    ]])

In [None]:
# ============================================
# 1. APPEND MATCH RESULTS TO FINAL_COHORT_DF
# ============================================

# Copy the final cohort dataframe for enhancement
final_cohort_df_enhanced = final_cohort_df.copy()

# Generate a set of matched patient IDs
matched_patient_ids = set(matches['clif_id'].unique())

# Add a boolean flag indicating actual donors
final_cohort_df_enhanced['actual_donor'] = final_cohort_df_enhanced['patient_id'].isin(matched_patient_ids)

# Prepare the matches dataframe for merging.
# A patient can be paired with more than one SRTR donor; merging those pairs
# as-is would duplicate the patient's cohort rows and inflate every count
# printed below. Keep one donor per patient: the pair with the smallest
# absolute date difference (stable sort, so earlier rows win exact ties).
matches_for_merge = (
    matches[['clif_id', 'srtr_donor_id', 'date_diff_days']]
    .assign(_abs_date_diff=lambda pairs: pairs['date_diff_days'].abs())
    .sort_values('_abs_date_diff', kind='mergesort')
    .drop_duplicates(subset='clif_id', keep='first')
    .drop(columns='_abs_date_diff')
    .rename(columns={'clif_id': 'patient_id'})
)

# Merge SRTR donor information into final_cohort_df_enhanced.
# validate guards the invariant established above: at most one match row per
# patient_id, so the left merge cannot add rows to the cohort.
final_cohort_df_enhanced = final_cohort_df_enhanced.merge(
    matches_for_merge,
    on='patient_id',
    how='left',
    validate='many_to_one'
)

# Print summary statistics in a standardized format
print("=" * 80)
print("ENHANCED COHORT SUMMARY")
print("=" * 80)
print(f"Total patients: {len(final_cohort_df_enhanced):,}")
print(f"Actual donors (matched to SRTR): {final_cohort_df_enhanced['actual_donor'].sum():,}")
print(f"CALC eligible: {final_cohort_df_enhanced['calc_flag'].sum():,}")
print(f"CLIF eligible: {final_cohort_df_enhanced['clif_eligible_donors'].sum():,}")
print(
    f"Either CALC or CLIF eligible: "
    f"{((final_cohort_df_enhanced['calc_flag']) | (final_cohort_df_enhanced['clif_eligible_donors'])).sum():,}"
)


In [None]:
# ============================================
# CONCENTRIC CIRCLES WITH ACTUAL DONORS
# ============================================

import matplotlib.pyplot as plt
from matplotlib.patches import Circle
import numpy as np

def create_concentric_circles_with_actual_donors(final_cohort_df_enhanced, output_path=None):
    """
    Create side-by-side nested-circle diagrams showing potential → actual donors.

    Panel (A) walks through the CALC funnel and panel (B) through the CLIF
    funnel. Every stage is drawn as a circle whose area is proportional to the
    number of patients at that stage relative to the whole cohort. The last
    two stages of each panel (potential and actual donors) are filled discs;
    earlier stages are outlines only.

    Parameters
    ----------
    final_cohort_df_enhanced : pandas.DataFrame
        Cohort dataframe. The columns read are 'calc_flag',
        'clif_eligible_donors', 'actual_donor', 'age_75_less',
        'icd10_ischemic', 'icd10_cerebro', 'icd10_external',
        'imv_48hr_expire', 'no_positive_culture_48hrs' and
        'icd10_contraindication'. They are combined with ``&``, ``|`` and
        ``~`` and used as row masks, so they are expected to be boolean.
    output_path : str, optional
        If given, the figure is also saved to this path at 300 dpi.

    Returns
    -------
    matplotlib.figure.Figure
        The generated matplotlib Figure object.
    """
    cohort = final_cohort_df_enhanced
    fig, axes = plt.subplots(1, 2, figsize=(16, 8))

    # Headline numbers for both definitions
    calc_potential = cohort['calc_flag'].sum()
    calc_actual = cohort.loc[cohort['calc_flag'], 'actual_donor'].sum()
    clif_potential = cohort['clif_eligible_donors'].sum()
    clif_actual = cohort.loc[cohort['clif_eligible_donors'], 'actual_donor'].sum()

    # Intermediate funnel masks shared by the two panels
    total_n = len(cohort)
    age_ok = cohort['age_75_less']
    age_n = age_ok.sum()
    calc_cause_ok = age_ok & (
        cohort['icd10_ischemic'] | cohort['icd10_cerebro'] | cohort['icd10_external']
    )
    clif_imv_ok = age_ok & cohort['imv_48hr_expire']
    clif_clean = (
        clif_imv_ok
        & cohort['no_positive_culture_48hrs']
        & ~cohort['icd10_contraindication']
    )

    # Per-panel description: ordered stages (largest → smallest), the
    # (edge, face) colour of each stage, and the first stage drawn filled.
    panels = [
        {
            "ax": axes[0],
            "definition": "CALC",
            "subplot_label": "(A)",
            "potential_n": calc_potential,
            "actual_n": calc_actual,
            "fill_from_stage": 4,
            "steps": [
                {'n': total_n, 'label': 'All inpatient deaths', 'stage': 1},
                {'n': age_n, 'label': 'Age ≤75', 'stage': 2},
                {'n': calc_cause_ok.sum(), 'label': 'Cause', 'stage': 3},
                {'n': calc_potential, 'label': 'No contraindications', 'stage': 4},
                {'n': calc_actual, 'label': 'Actual donors', 'stage': 5},
            ],
            "colors": {
                1: ('#D3D3D3', 'none'),
                2: ('#000000', 'none'),
                3: ('#2196F3', 'none'),
                4: ('#ADD8E6', '#ADD8E6'),
                5: ('#4CAF50', '#4CAF50'),
            },
        },
        {
            "ax": axes[1],
            "definition": "CLIF",
            "subplot_label": "(B)",
            "potential_n": clif_potential,
            "actual_n": clif_actual,
            "fill_from_stage": 5,
            "steps": [
                {'n': total_n, 'label': 'All inpatient deaths', 'stage': 1},
                {'n': age_n, 'label': 'Age ≤75', 'stage': 2},
                {'n': clif_imv_ok.sum(), 'label': 'IMV within 48h', 'stage': 3},
                {'n': clif_clean.sum(), 'label': 'No contraindications', 'stage': 4},
                {'n': clif_potential, 'label': 'Pass organ quality', 'stage': 5},
                {'n': clif_actual, 'label': 'Actual donors', 'stage': 6},
            ],
            "colors": {
                1: ('#D3D3D3', 'none'),
                2: ('#000000', 'none'),
                3: ('#9C27B0', 'none'),
                4: ('#F44336', 'none'),
                5: ('#ADD8E6', '#ADD8E6'),
                6: ('#4CAF50', '#4CAF50'),
            },
        },
    ]

    base_center_x = -0.3
    center_y = 0
    max_indent = 0.6  # horizontal shift applied to a circle of radius 0

    for panel in panels:
        ax = panel["ax"]
        steps = panel["steps"]
        colors_map = panel["colors"]
        potential_n = panel["potential_n"]
        actual_n = panel["actual_n"]

        ax.set_xlim(-1.5, 1.5)
        ax.set_ylim(-1.5, 1.5)
        ax.set_aspect('equal')
        ax.axis('off')

        initial_n = steps[0]['n']
        actual_idx = len(steps) - 1      # last step: actual donors
        potential_idx = actual_idx - 1   # second to last: potential donors

        for idx, step in reversed(list(enumerate(steps))):
            stage_num = step['stage']
            curr_n = step['n']

            # Radius scales with sqrt(share of cohort) so that area ∝ count
            radius = np.sqrt(curr_n / initial_n) if curr_n > 0 else 0
            edge_color, face_color = colors_map.get(stage_num, ('#808080', 'none'))

            # Smaller circles are shifted to the right inside the larger ones
            center_x = base_center_x + max_indent * (1 - radius)

            fill = stage_num >= panel["fill_from_stage"]

            # Filled discs get an explicit z-order that grows with the stage
            # index, so the smaller actual-donor disc is painted on top of the
            # semi-transparent potential-donor disc instead of underneath it.
            # Outlines sit above the fills; text (default z-order 3) stays on top.
            circle = Circle(
                (center_x, center_y), radius,
                facecolor=face_color if fill else 'none',
                edgecolor=edge_color,
                linewidth=2.5,
                alpha=0.7 if fill else 1.0,
                fill=fill,
                zorder=1 + 0.1 * idx if fill else 2,
            )
            ax.add_patch(circle)

            # Annotate numbers on the two innermost stages only
            if idx == actual_idx:
                if actual_n > 0:
                    percentage = (actual_n / potential_n * 100) if potential_n > 0 else 0
                    ax.text(center_x, center_y,
                        f"{actual_n}\n({percentage:.1f}%)",
                        ha='center', va='center', fontsize=10, fontweight='bold', color='white'
                    )
            elif idx == potential_idx:
                if actual_n > 0 and potential_n > 0:
                    # Place the label in the ring between the two discs
                    actual_radius = np.sqrt(actual_n / initial_n)
                    ring_radius = (radius + actual_radius) / 2
                    text_y = center_y + ring_radius * 0.7
                else:
                    text_y = center_y
                ax.text(center_x, text_y,
                    f"{potential_n}",
                    ha='center', va='center', fontsize=11, fontweight='bold', color='#333'
                )

        title = f"{panel['subplot_label']} {panel['definition']} Definition"
        ax.text(base_center_x, 1.35, title, ha='center', va='center',
                fontsize=12, fontweight='bold')

    # LEGEND
    def _disc_handle(color):
        # Round marker used for stages that are drawn as filled discs
        return plt.Line2D([0], [0], marker='o', color='w',
                          markerfacecolor=color, markersize=10, linestyle='None')

    def _line_handle(color):
        # Plain line used for stages that are drawn as outlines
        return plt.Line2D([0], [0], color=color, linewidth=3)

    legend_entries = [
        ('All inpatient hospital deaths', '#D3D3D3'),
        ('Patients aged ≤75 at death', '#000000'),
        ('Cause consistent with donation (CALC)', '#2196F3'),
        ('IMV within 48hrs (CLIF)', '#9C27B0'),
        ('No contraindications', '#F44336'),
        ('Pass organ quality assessment (CLIF)', '#ADD8E6'),
    ]

    legend_handles = [
        _disc_handle(color) if color == '#ADD8E6' else _line_handle(color)
        for _, color in legend_entries
    ]
    legend_labels = [label for label, _ in legend_entries]

    # Actual donors
    legend_handles.append(_disc_handle('#4CAF50'))
    legend_labels.append('Actual donors (matched to SRTR)')

    fig.legend(legend_handles, legend_labels, loc='lower center', ncol=4, frameon=True, fontsize=8, bbox_to_anchor=(0.5, -0.02))

    # Use the figure's own methods so the result does not depend on which
    # figure pyplot currently considers active.
    fig.suptitle('Potential vs Actual Deceased Organ Donors', fontsize=16, fontweight='bold', y=0.98)
    fig.tight_layout()

    if output_path:
        fig.savefig(output_path, dpi=300, bbox_inches='tight')
        print(f"✓ Circles with actual donors saved to: {output_path}")

    return fig

# Render the diagram and keep a PNG copy in the working directory
fig = create_concentric_circles_with_actual_donors(
    final_cohort_df_enhanced, output_path='circles_potential_vs_actual.png'
)
plt.show()

# Eligibility masks reused for every summary figure below
_calc_mask = final_cohort_df_enhanced['calc_flag']
_clif_mask = final_cohort_df_enhanced['clif_eligible_donors']
_either_mask = _calc_mask | _clif_mask

# CALC definition: potential donors, matched donors, conversion rate (%)
calc_potential = _calc_mask.sum()
calc_actual = final_cohort_df_enhanced.loc[_calc_mask, 'actual_donor'].sum()
if calc_potential > 0:
    calc_rate = calc_actual / calc_potential * 100
else:
    calc_rate = 0

# CLIF definition: same three quantities
clif_potential = _clif_mask.sum()
clif_actual = final_cohort_df_enhanced.loc[_clif_mask, 'actual_donor'].sum()
if clif_potential > 0:
    clif_rate = clif_actual / clif_potential * 100
else:
    clif_rate = 0

# Union of the two definitions
either_potential = _either_mask.sum()
either_actual = final_cohort_df_enhanced.loc[_either_mask, 'actual_donor'].sum()
if either_potential > 0:
    either_rate = either_actual / either_potential * 100
else:
    either_rate = 0

# Report the counts (rates are computed above but not displayed)
_banner = "=" * 80
print("\n" + _banner)
print("POTENTIAL VS ACTUAL DONORS SUMMARY")
print(_banner)

print("\nCALC Definition:")
print(f"  Potential donors: {calc_potential:,}")
print(f"  Actual donors: {calc_actual:,}")

print("\nCLIF Definition:")
print(f"  Potential donors: {clif_potential:,}")
print(f"  Actual donors: {clif_actual:,}")

print("\nEither CALC or CLIF:")
print(f"  Potential donors: {either_potential:,}")
print(f"  Actual donors: {either_actual:,}")

In [None]:
# ============================================
# INVESTIGATE NON-ELIGIBLE ACTUAL DONORS
# ============================================


def _pct(part, whole):
    """Return part/whole as a percentage, or 0.0 when whole is zero."""
    return part / whole * 100 if whole else 0.0


# Identify the actual donors who were NOT eligible by either criteria.
# Flags are compared with `== True` / `== False` (rather than negated) so the
# original element-wise comparison semantics are kept unchanged.
actual_donors = final_cohort_df_enhanced[final_cohort_df_enhanced['actual_donor'] == True]
non_eligible_donors = actual_donors[
    (actual_donors['calc_flag'] == False) &
    (actual_donors['clif_eligible_donors'] == False)
]
n_actual = len(actual_donors)
n_non_eligible = len(non_eligible_donors)

print("=" * 80)
print("ANALYSIS: ACTUAL DONORS NOT CAPTURED BY CALC OR CLIF CRITERIA")
print("=" * 80)
print(f"\nTotal matched actual donors: {n_actual}")
# _pct guards the case where no donor was matched at all
print(f"Not eligible by either criteria: {n_non_eligible} "
      f"({_pct(n_non_eligible, n_actual):.1f}%)")

# Analyze why they weren't eligible
print("\n" + "=" * 80)
print("WHY WERE ACTUAL DONORS EXCLUDED?")
print("=" * 80)

# Check age criteria
over_75 = non_eligible_donors['age_75_less'] == False
n_over_75 = over_75.sum()
# _pct guards the case where every actual donor was eligible
print(f"\n1. AGE > 75 years: {n_over_75} donors "
      f"({_pct(n_over_75, n_non_eligible):.1f}%)")
if n_over_75 > 0:
    ages = non_eligible_donors.loc[over_75, 'age_at_death']
    print(f"   Age range: {ages.min():.1f} - {ages.max():.1f}")
    print(f"   Mean age: {ages.mean():.1f} (SD: {ages.std():.1f})")

# For those ≤75, check other criteria
under_75_donors = non_eligible_donors[non_eligible_donors['age_75_less'] == True]
n_under_75 = len(under_75_donors)
print(f"\n2. For donors ≤75 years (n={n_under_75}):")

if n_under_75 > 0:
    # CALC criteria failures
    print("\n   CALC Criteria Failures:")

    # Check cause of death: any of the three ICD-10 cause flags qualifies
    has_qualifying_cause = (
        (under_75_donors['icd10_ischemic'] == True) |
        (under_75_donors['icd10_cerebro'] == True) |
        (under_75_donors['icd10_external'] == True)
    )
    no_qualifying_cause = ~has_qualifying_cause
    print(f"   - No qualifying cause of death: {no_qualifying_cause.sum()}")

    # Check contraindications
    has_contraindication = under_75_donors['icd10_contraindication'] == True
    print(f"   - Has contraindication: {has_contraindication.sum()}")

    # CLIF criteria failures
    print("\n   CLIF Criteria Failures:")

    # Check IMV
    no_imv = under_75_donors['imv_48hr_expire'] == False
    print(f"   - No IMV within 48hrs: {no_imv.sum()}")

    # For those with IMV, check the remaining CLIF criteria. The three counts
    # below are not mutually exclusive: one donor can fail several checks.
    with_imv = under_75_donors[under_75_donors['imv_48hr_expire'] == True]
    if len(with_imv) > 0:
        print(f"   - With IMV but failed other criteria: {len(with_imv)}")
        positive_culture = with_imv['no_positive_culture_48hrs'] == False
        print(f"     • Positive culture: {positive_culture.sum()}")
        has_contra = with_imv['icd10_contraindication'] == True
        print(f"     • Has contraindication: {has_contra.sum()}")
        organ_fail = with_imv['organ_check_pass'] == False
        print(f"     • Failed organ quality: {organ_fail.sum()}")

# Create detailed breakdown table
print("\n" + "=" * 80)
print("DETAILED BREAKDOWN OF ALL ACTUAL DONORS")
print("=" * 80)

breakdown_data = {
    'Category': [
        'CALC eligible only',
        'CLIF eligible only',
        'Both CALC & CLIF eligible',
        'Neither (Age > 75)',
        'Neither (Age ≤ 75, other reasons)',
        'Total actual donors'
    ],
    'N': [
        len(actual_donors[(actual_donors['calc_flag'] == True) &
                          (actual_donors['clif_eligible_donors'] == False)]),
        len(actual_donors[(actual_donors['calc_flag'] == False) &
                          (actual_donors['clif_eligible_donors'] == True)]),
        len(actual_donors[(actual_donors['calc_flag'] == True) &
                          (actual_donors['clif_eligible_donors'] == True)]),
        len(non_eligible_donors[non_eligible_donors['age_75_less'] == False]),
        len(non_eligible_donors[non_eligible_donors['age_75_less'] == True]),
        n_actual
    ]
}

breakdown_df = pd.DataFrame(breakdown_data)
# With zero matched donors every share is reported as 0.0 instead of NaN
breakdown_df['Percentage'] = (
    (breakdown_df['N'] / n_actual * 100).round(1) if n_actual else 0.0
)
print(breakdown_df.to_string(index=False))

