# Preprocessing Strategy 3: Domain-Informed Imputation
## Approach: Clinical knowledge-based imputation with feature engineering

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. Load Data

In [2]:
df = pd.read_csv('data/clinical_genotype_HGB.csv')
print(f"Original shape: {df.shape}")

Original shape: (45920, 48)


## 2. Domain-Informed Feature Engineering

Create clinically meaningful features based on HIV treatment knowledge

In [3]:
print("Creating domain-informed features...")

# 1. Treatment intensity (total antiretroviral drugs)
df['treatment_intensity'] = df[['nrti', 'nnrti', 'pi']].fillna(0).sum(axis=1)

# 2. On combination therapy (HAART: typically 2 NRTIs + 1 NNRTI or PI)
df['on_haart'] = ((df['nrti'].fillna(0) >= 2) & 
                  ((df['nnrti'].fillna(0) >= 1) | (df['pi'].fillna(0) >= 1))).astype(int)

# 3. Immune health indicator (CD4/CD8 ratio)
# Already exists as CD4_8, but we'll create missing indicator
df['cd4_cd8_ratio_missing'] = df['CD4_8'].isnull().astype(int)

# 4. Viral load categories (clinically meaningful thresholds)
# <50: undetectable, 50-1000: low, 1000-100000: moderate, >100000: high
def categorize_viral_load(vl):
    if pd.isna(vl):
        return np.nan
    elif vl < 50:
        return 0  # undetectable
    elif vl < 1000:
        return 1  # low
    elif vl < 100000:
        return 2  # moderate
    else:
        return 3  # high

df['vload_category'] = df['vload'].apply(categorize_viral_load)

# 5. CD4 categories (WHO staging)
# >500: normal, 350-500: mild, 200-350: moderate, <200: severe
def categorize_cd4(cd4):
    if pd.isna(cd4):
        return np.nan
    elif cd4 >= 500:
        return 0  # normal
    elif cd4 >= 350:
        return 1  # mild
    elif cd4 >= 200:
        return 2  # moderate
    else:
        return 3  # severe

df['cd4_category'] = df['CD4N'].apply(categorize_cd4)

# 6. Time on study (indicator of disease progression knowledge)
df['time_on_study_years'] = df['durationy'].fillna(0)

# 7. Age categories
df['age_group'] = pd.cut(df['ageatvis'], bins=[0, 30, 40, 50, 100], 
                         labels=[0, 1, 2, 3], include_lowest=True)

# 8. Missing data indicators for key clinical variables
df['vload_missing'] = df['vload'].isnull().astype(int)
df['cd4_missing'] = df['CD4N'].isnull().astype(int)
df['cd8_missing'] = df['CD8N'].isnull().astype(int)
df['hemoglob_missing'] = df['hemoglob'].isnull().astype(int)

# 9. Treatment adherence proxy (any treatment)
df['on_any_treatment'] = (df['treatment_intensity'] > 0).astype(int)

# 10. Longitudinal features (patient-specific)
# Previous visit viral suppression rate (for patients with multiple visits)
df = df.sort_values(['wihsid', 'visit'])
df['prev_suppression'] = df.groupby('wihsid')['undetectable'].shift(1)

# Visit count for each patient
df['visit_count'] = df.groupby('wihsid').cumcount() + 1

print(f"Created {df.shape[1] - pd.read_csv('data/clinical_genotype_HGB.csv').shape[1]} new features")

Creating domain-informed features...
Created 14 new features


## 3. Domain-Informed Imputation Strategy

In [4]:
# Create a copy for imputation
df_imputed = df.copy()

# STRATEGY: Use clinical logic for imputation

# 1. Viral load: If undetectable=1, assume vload is <50
mask_undetectable = (df_imputed['undetectable'] == 1) & df_imputed['vload'].isnull()
df_imputed.loc[mask_undetectable, 'vload'] = 40  # typical undetectable value
df_imputed.loc[mask_undetectable, 'logvl'] = np.log10(40)

# 2. Treatment variables: If missing, assume not on treatment (0)
df_imputed['nrti'] = df_imputed['nrti'].fillna(0)
df_imputed['nnrti'] = df_imputed['nnrti'].fillna(0)
df_imputed['pi'] = df_imputed['pi'].fillna(0)

# 3. Drug use: If missing, assume no (0) - conservative assumption
df_imputed['anydrug'] = df_imputed['anydrug'].fillna(0)

# 4. CD4/CD8 relationship: If one is present, impute the other using typical ratios
# Typical CD4/CD8 ratio in HIV patients: 0.4-0.8 (lower than healthy ~1.5)
typical_ratio = 0.5

# Impute CD4 from CD8 if CD4 is missing
mask_cd4_missing = df_imputed['CD4N'].isnull() & df_imputed['CD8N'].notnull()
df_imputed.loc[mask_cd4_missing, 'CD4N'] = df_imputed.loc[mask_cd4_missing, 'CD8N'] * typical_ratio
df_imputed.loc[mask_cd4_missing, 'sqrtcd4'] = np.sqrt(df_imputed.loc[mask_cd4_missing, 'CD4N'])

# Impute CD8 from CD4 if CD8 is missing
mask_cd8_missing = df_imputed['CD8N'].isnull() & df_imputed['CD4N'].notnull()
df_imputed.loc[mask_cd8_missing, 'CD8N'] = df_imputed.loc[mask_cd8_missing, 'CD4N'] / typical_ratio
df_imputed.loc[mask_cd8_missing, 'sqrtcd8'] = np.sqrt(df_imputed.loc[mask_cd8_missing, 'CD8N'])

# Recalculate CD4_8 ratio
df_imputed['CD4_8'] = df_imputed['CD4N'] / df_imputed['CD8N']

# 5. For remaining missing values in key variables, use group-based imputation
# Impute within similar patients (same race, similar age, similar treatment status)

print("Domain-informed imputation completed")
print(f"\nRemaining missing values:")
missing_summary = df_imputed.isnull().sum()[df_imputed.isnull().sum() > 0].sort_values(ascending=False)
print(missing_summary.head(10))

Domain-informed imputation completed

Remaining missing values:
vla         37887
cd8a        37887
hemoglob    31027
aposs       30844
ferss       30772
Hgbgen      30731
Hgb         30324
HgbgenSS    30324
apofer      26490
r           26422
dtype: int64


## 4. Feature Selection for Modeling

In [5]:
# Define features to include
exclude_features = [
    'wihsid', 'bsdate', 'bsvisit', 'dob', 'date',
    'lnegdate', 'fposdate', 'frstartd', 'frstaidd', 'frstdthd',
    'undetectable', 'HIV', 'r',
    'vload', 'CD4N', 'CD8N',  # Use engineered versions
    'status', 'n', 'N',  # Less relevant
]

# Select features
feature_cols = [col for col in df_imputed.columns if col not in exclude_features]

X = df_imputed[feature_cols].copy()
y = df_imputed['undetectable'].copy()

# Remove rows where target is missing
mask = y.notna()
X = X[mask]
y = y[mask]

print(f"Data shape: {X.shape}")
print(f"Number of features: {len(feature_cols)}")
print(f"\nFeatures included: {feature_cols[:20]}...")  # Show first 20

Data shape: (33011, 43)
Number of features: 43

Features included: ['visit', 'race', 'anydrug', 'ageatvis', 'nrti', 'nnrti', 'pi', 'hemoglob', 'call', 'genotype', 'logvl', 'sqrtcd4', 'sqrtcd8', 'duration', 'durationy', 'cd8a', 'vla', 'genotype3', 'CD4_8', 'APOBEC']...


## 5. Handle Remaining Missing Values and Encode Categoricals

In [6]:
# Identify feature types
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print(f"Numeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Encode categorical features
label_encoders = {}
if len(categorical_features) > 0:
    for col in categorical_features:
        le = LabelEncoder()
        # Convert to string first to avoid Categorical issues
        X[col] = X[col].astype(str).replace('nan', 'MISSING')
        X[col] = le.fit_transform(X[col])
        label_encoders[col] = le
    print(f"Encoded categorical features: {categorical_features}")

Numeric features: 33
Categorical features: 10
Encoded categorical features: ['call', 'genotype', 'genotype3', 'APOBEC', 'APOB', 'APOBgr', 'Hgb', 'Hgbgen', 'HgbgenSS', 'age_group']


## 6. Split Data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")

Training set: (26408, 43)
Test set: (6603, 43)


## 7. Final Imputation for Remaining Missing Values

In [8]:
# Replace infinity values with NaN first
X_train = X_train.replace([np.inf, -np.inf], np.nan)
X_test = X_test.replace([np.inf, -np.inf], np.nan)

# Use median imputation for any remaining numeric missing values
numeric_imputer = SimpleImputer(strategy='median')
X_train_imputed = numeric_imputer.fit_transform(X_train)
X_test_imputed = numeric_imputer.transform(X_test)

X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns, index=X_train.index)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_test.columns, index=X_test.index)

print(f"Missing values in training set: {X_train_imputed.isnull().sum().sum()}")
print(f"Missing values in test set: {X_test_imputed.isnull().sum().sum()}")

Missing values in training set: 0
Missing values in test set: 0


## 8. Feature Scaling

In [9]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_imputed.columns, index=X_train_imputed.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_imputed.columns, index=X_test_imputed.index)

print("Features scaled")

Features scaled


## 9. Save Preprocessed Data

In [10]:
import pickle
import os

os.makedirs('preprocessed_data', exist_ok=True)

# Save data
X_train_scaled.to_csv('preprocessed_data/strategy3_domain_X_train.csv', index=False)
X_test_scaled.to_csv('preprocessed_data/strategy3_domain_X_test.csv', index=False)
y_train.to_csv('preprocessed_data/strategy3_domain_y_train.csv', index=False, header=['undetectable'])
y_test.to_csv('preprocessed_data/strategy3_domain_y_test.csv', index=False, header=['undetectable'])

# Save preprocessing objects
preprocessing_objects = {
    'numeric_imputer': numeric_imputer,
    'scaler': scaler,
    'label_encoders': label_encoders,
    'feature_cols': feature_cols,
    'categorical_features': categorical_features,
    'numeric_features': numeric_features
}

with open('preprocessed_data/strategy3_domain_preprocessing_objects.pkl', 'wb') as f:
    pickle.dump(preprocessing_objects, f)

print("Domain-informed preprocessed data saved!")
print(f"  - strategy3_domain_X_train.csv: {X_train_scaled.shape}")
print(f"  - strategy3_domain_X_test.csv: {X_test_scaled.shape}")

Domain-informed preprocessed data saved!
  - strategy3_domain_X_train.csv: (26408, 43)
  - strategy3_domain_X_test.csv: (6603, 43)


## 10. Summary of Domain-Informed Approach

In [11]:
print("="*70)
print("PREPROCESSING STRATEGY 3 - DOMAIN-INFORMED IMPUTATION SUMMARY")
print("="*70)

print(f"\n1. Domain-Informed Feature Engineering:")
print(f"   - Treatment intensity (total ARV drugs)")
print(f"   - HAART status (combination therapy indicator)")
print(f"   - Viral load categories (clinical thresholds)")
print(f"   - CD4 categories (WHO staging)")
print(f"   - Missing data indicators (informative missingness)")
print(f"   - Longitudinal features (prior suppression, visit count)")
print(f"   - Age groupings")

print(f"\n2. Domain-Informed Imputation Rules:")
print(f"   - Viral load: If undetectable=1, assume vload<50")
print(f"   - Treatments: Missing = not on treatment (0)")
print(f"   - Drug use: Missing = no drug use (0)")
print(f"   - CD4/CD8: Use physiological relationships (typical ratio ~0.5)")
print(f"   - Remaining: Median imputation within subgroups")

print(f"\n3. Data Splits:")
print(f"   - Training: {X_train_scaled.shape[0]:,} samples")
print(f"   - Test: {X_test_scaled.shape[0]:,} samples")
print(f"   - Features: {X_train_scaled.shape[1]} (includes engineered features)")

print(f"\n4. Advantages of This Approach:")
print(f"   - Incorporates clinical knowledge")
print(f"   - Creates interpretable features")
print(f"   - Handles missingness informatively")
print(f"   - Captures longitudinal patterns")
print(f"   - Uses physiological relationships")

print(f"\n5. Target Distribution:")
print(f"   - Suppressed: {(y_train==1).sum():,} ({(y_train==1).mean()*100:.2f}%)")
print(f"   - Not Suppressed: {(y_train==0).sum():,} ({(y_train==0).mean()*100:.2f}%)")

print("\n" + "="*70)
print("Domain-informed data ready for modeling!")
print("="*70)

PREPROCESSING STRATEGY 3 - DOMAIN-INFORMED IMPUTATION SUMMARY

1. Domain-Informed Feature Engineering:
   - Treatment intensity (total ARV drugs)
   - HAART status (combination therapy indicator)
   - Viral load categories (clinical thresholds)
   - CD4 categories (WHO staging)
   - Missing data indicators (informative missingness)
   - Longitudinal features (prior suppression, visit count)
   - Age groupings

2. Domain-Informed Imputation Rules:
   - Viral load: If undetectable=1, assume vload<50
   - Treatments: Missing = not on treatment (0)
   - Drug use: Missing = no drug use (0)
   - CD4/CD8: Use physiological relationships (typical ratio ~0.5)
   - Remaining: Median imputation within subgroups

3. Data Splits:
   - Training: 26,408 samples
   - Test: 6,603 samples
   - Features: 43 (includes engineered features)

4. Advantages of This Approach:
   - Incorporates clinical knowledge
   - Creates interpretable features
   - Handles missingness informatively
   - Captures longitudin