# Preprocessing Strategy 1: Simple Imputation
## Approach: Mean/Median for numeric features, Mode for categorical features

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

## 1. Load Data

In [2]:
df = pd.read_csv('clinical_genotype_HGB.csv')
print(f"Original shape: {df.shape}")
print(f"Target variable distribution:\n{df['undetectable'].value_counts()}")

Original shape: (45920, 48)
Target variable distribution:
undetectable
0.0    21947
1.0    11064
Name: count, dtype: int64


## 2. Feature Selection and Engineering

In [3]:
# Define features to exclude (identifiers, dates, target, redundant)
exclude_features = [
    'wihsid', 'bsdate', 'bsvisit', 'dob', 'date',  # Identifiers and dates
    'lnegdate', 'fposdate', 'frstartd', 'frstaidd', 'frstdthd',  # Date features
    'undetectable',  # Target variable
    'HIV',  # Redundant with target
    'r',  # Reference variable
    'vload',  # Use logvl instead
    'CD4N',  # Use sqrtcd4 instead
    'CD8N',  # Use sqrtcd8 instead
]

# Select all features except excluded ones
feature_cols = [col for col in df.columns if col not in exclude_features]

print(f"Number of features selected: {len(feature_cols)}")
print(f"\nFeatures: {feature_cols}")

Number of features selected: 32

Features: ['status', 'visit', 'race', 'anydrug', 'ageatvis', 'nrti', 'nnrti', 'pi', 'hemoglob', 'n', 'call', 'genotype', 'logvl', 'sqrtcd4', 'sqrtcd8', 'duration', 'durationy', 'cd8a', 'vla', 'genotype3', 'CD4_8', 'APOBEC', 'APOB', 'APOBgr', 'Hgb', 'Hgbgen', 'HgbgenSS', 'apofer', 'ferss', 'aposs', 'APOBgr2', 'N']


In [4]:
# Separate features and target
X = df[feature_cols].copy()
y = df['undetectable'].copy()

# Remove rows where target is missing
mask = y.notna()
X = X[mask]
y = y[mask]

print(f"Data shape after removing missing targets: {X.shape}")
print(f"Target distribution:\n{y.value_counts()}")
print(f"Class balance: {y.value_counts(normalize=True)}")

Data shape after removing missing targets: (33011, 32)
Target distribution:
undetectable
0.0    21947
1.0    11064
Name: count, dtype: int64
Class balance: undetectable
0.0    0.664839
1.0    0.335161
Name: proportion, dtype: float64


## 3. Identify Feature Types

In [5]:
# Identify numeric and categorical features
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

print(f"Numeric features ({len(numeric_features)}): {numeric_features}")
print(f"\nCategorical features ({len(categorical_features)}): {categorical_features}")

Numeric features (23): ['status', 'visit', 'race', 'anydrug', 'ageatvis', 'nrti', 'nnrti', 'pi', 'hemoglob', 'n', 'logvl', 'sqrtcd4', 'sqrtcd8', 'duration', 'durationy', 'cd8a', 'vla', 'CD4_8', 'apofer', 'ferss', 'aposs', 'APOBgr2', 'N']

Categorical features (9): ['call', 'genotype', 'genotype3', 'APOBEC', 'APOB', 'APOBgr', 'Hgb', 'Hgbgen', 'HgbgenSS']


## 4. Split Data (Before Imputation to Prevent Data Leakage)

In [6]:
# Split into train and test sets (80-20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

print(f"Training set: {X_train.shape}")
print(f"Test set: {X_test.shape}")
print(f"\nTraining set class distribution:\n{y_train.value_counts(normalize=True)}")
print(f"\nTest set class distribution:\n{y_test.value_counts(normalize=True)}")

Training set: (26408, 32)
Test set: (6603, 32)

Training set class distribution:
undetectable
0.0    0.664836
1.0    0.335164
Name: proportion, dtype: float64

Test set class distribution:
undetectable
0.0    0.664849
1.0    0.335151
Name: proportion, dtype: float64


## 5. Simple Imputation Strategy

In [7]:
# Strategy: Median for numeric (robust to outliers), Most frequent for categorical

# Numeric imputation with median
numeric_imputer = SimpleImputer(strategy='median')
X_train_numeric = numeric_imputer.fit_transform(X_train[numeric_features])
X_test_numeric = numeric_imputer.transform(X_test[numeric_features])

# Convert back to DataFrame
X_train_numeric = pd.DataFrame(X_train_numeric, columns=numeric_features, index=X_train.index)
X_test_numeric = pd.DataFrame(X_test_numeric, columns=numeric_features, index=X_test.index)

print(f"Numeric features imputed with median")
print(f"Training set missing values after imputation: {X_train_numeric.isnull().sum().sum()}")
print(f"Test set missing values after imputation: {X_test_numeric.isnull().sum().sum()}")

Numeric features imputed with median
Training set missing values after imputation: 0
Test set missing values after imputation: 0


In [8]:
# Categorical imputation with most frequent value
if len(categorical_features) > 0:
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    X_train_categorical = categorical_imputer.fit_transform(X_train[categorical_features])
    X_test_categorical = categorical_imputer.transform(X_test[categorical_features])
    
    # Convert back to DataFrame
    X_train_categorical = pd.DataFrame(X_train_categorical, columns=categorical_features, index=X_train.index)
    X_test_categorical = pd.DataFrame(X_test_categorical, columns=categorical_features, index=X_test.index)
    
    print(f"Categorical features imputed with most frequent value")
    print(f"Training set missing values after imputation: {X_train_categorical.isnull().sum().sum()}")
    print(f"Test set missing values after imputation: {X_test_categorical.isnull().sum().sum()}")
else:
    print("No categorical features to impute")
    X_train_categorical = pd.DataFrame(index=X_train.index)
    X_test_categorical = pd.DataFrame(index=X_test.index)

Categorical features imputed with most frequent value
Training set missing values after imputation: 0
Test set missing values after imputation: 0


## 6. Encode Categorical Variables

In [9]:
# Label encode categorical features
if len(categorical_features) > 0:
    label_encoders = {}
    for col in categorical_features:
        le = LabelEncoder()
        X_train_categorical[col] = le.fit_transform(X_train_categorical[col].astype(str))
        X_test_categorical[col] = le.transform(X_test_categorical[col].astype(str))
        label_encoders[col] = le
    
    print(f"Categorical features encoded")
    print(f"Label encoders created for: {list(label_encoders.keys())}")

Categorical features encoded
Label encoders created for: ['call', 'genotype', 'genotype3', 'APOBEC', 'APOB', 'APOBgr', 'Hgb', 'Hgbgen', 'HgbgenSS']


## 7. Combine Features

In [10]:
# Combine numeric and categorical features
X_train_imputed = pd.concat([X_train_numeric, X_train_categorical], axis=1)
X_test_imputed = pd.concat([X_test_numeric, X_test_categorical], axis=1)

print(f"Training set shape after imputation: {X_train_imputed.shape}")
print(f"Test set shape after imputation: {X_test_imputed.shape}")
print(f"\nMissing values in training set: {X_train_imputed.isnull().sum().sum()}")
print(f"Missing values in test set: {X_test_imputed.isnull().sum().sum()}")

Training set shape after imputation: (26408, 32)
Test set shape after imputation: (6603, 32)

Missing values in training set: 0
Missing values in test set: 0


## 8. Feature Scaling

In [11]:
# Standardize features (important for logistic regression and neural networks)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_test_scaled = scaler.transform(X_test_imputed)

# Convert back to DataFrame for easier handling
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train_imputed.columns, index=X_train_imputed.index)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test_imputed.columns, index=X_test_imputed.index)

print(f"Features scaled using StandardScaler")
print(f"\nScaled training set statistics:")
print(X_train_scaled.describe())

Features scaled using StandardScaler

Scaled training set statistics:
        status         visit          race       anydrug      ageatvis  \
count  26408.0  2.640800e+04  2.640800e+04  2.640800e+04  2.640800e+04   
mean       0.0 -1.300922e-16 -1.115537e-15  6.322991e-18 -4.533719e-16   
std        0.0  1.000019e+00  1.000019e+00  1.000019e+00  1.000019e+00   
min        0.0 -1.657450e+00 -1.953785e-01 -5.838218e-01 -2.848083e+00   
25%        0.0 -8.870137e-01 -1.953785e-01 -5.838218e-01 -7.051993e-01   
50%        0.0 -3.097343e-02 -1.953785e-01 -5.838218e-01 -2.725891e-02   
75%        0.0  8.250668e-01 -1.953785e-01  1.712851e+00  6.736997e-01   
max        0.0  1.681107e+00  5.118272e+00  1.712851e+00  3.791020e+00   

               nrti         nnrti            pi      hemoglob             n  \
count  2.640800e+04  2.640800e+04  2.640800e+04  2.640800e+04  2.640800e+04   
mean   1.167735e-16  5.603246e-17  4.365554e-17 -1.379757e-15 -8.354420e-17   
std    1.000019e+00  1.000

## 9. Save Preprocessed Data

In [12]:
# Save preprocessed data and preprocessing objects
import pickle

# Create directory for preprocessed data
import os
os.makedirs('preprocessed_data', exist_ok=True)

# Save data
X_train_scaled.to_csv('preprocessed_data/strategy1_X_train.csv', index=False)
X_test_scaled.to_csv('preprocessed_data/strategy1_X_test.csv', index=False)
y_train.to_csv('preprocessed_data/strategy1_y_train.csv', index=False, header=['undetectable'])
y_test.to_csv('preprocessed_data/strategy1_y_test.csv', index=False, header=['undetectable'])

# Save preprocessing objects
preprocessing_objects = {
    'numeric_imputer': numeric_imputer,
    'categorical_imputer': categorical_imputer if len(categorical_features) > 0 else None,
    'label_encoders': label_encoders if len(categorical_features) > 0 else None,
    'scaler': scaler,
    'feature_cols': feature_cols,
    'numeric_features': numeric_features,
    'categorical_features': categorical_features
}

with open('preprocessed_data/strategy1_preprocessing_objects.pkl', 'wb') as f:
    pickle.dump(preprocessing_objects, f)

print("Preprocessed data saved to 'preprocessed_data/' directory")
print(f"  - strategy1_X_train.csv: {X_train_scaled.shape}")
print(f"  - strategy1_X_test.csv: {X_test_scaled.shape}")
print(f"  - strategy1_y_train.csv: {y_train.shape}")
print(f"  - strategy1_y_test.csv: {y_test.shape}")
print(f"  - strategy1_preprocessing_objects.pkl")

Preprocessed data saved to 'preprocessed_data/' directory
  - strategy1_X_train.csv: (26408, 32)
  - strategy1_X_test.csv: (6603, 32)
  - strategy1_y_train.csv: (26408,)
  - strategy1_y_test.csv: (6603,)
  - strategy1_preprocessing_objects.pkl


## 10. Summary Statistics

In [13]:
print("="*70)
print("PREPROCESSING STRATEGY 1 - SIMPLE IMPUTATION SUMMARY")
print("="*70)

print(f"\n1. Imputation Strategy:")
print(f"   - Numeric features: Median imputation")
print(f"   - Categorical features: Most frequent value imputation")

print(f"\n2. Data Splits:")
print(f"   - Training set: {X_train_scaled.shape[0]:,} samples ({X_train_scaled.shape[0]/len(X)*100:.1f}%)")
print(f"   - Test set: {X_test_scaled.shape[0]:,} samples ({X_test_scaled.shape[0]/len(X)*100:.1f}%)")

print(f"\n3. Features:")
print(f"   - Total features: {X_train_scaled.shape[1]}")
print(f"   - Numeric features: {len(numeric_features)}")
print(f"   - Categorical features: {len(categorical_features)}")

print(f"\n4. Target Distribution (Training):")
print(f"   - Suppressed (1): {(y_train==1).sum():,} ({(y_train==1).mean()*100:.2f}%)")
print(f"   - Not Suppressed (0): {(y_train==0).sum():,} ({(y_train==0).mean()*100:.2f}%)")
print(f"   - Class imbalance ratio: {(y_train==0).sum()/(y_train==1).sum():.2f}:1")

print(f"\n5. Scaling:")
print(f"   - Method: StandardScaler (mean=0, std=1)")

print(f"\n6. Data Quality:")
print(f"   - Missing values in training: {X_train_scaled.isnull().sum().sum()}")
print(f"   - Missing values in test: {X_test_scaled.isnull().sum().sum()}")
print(f"   - Infinite values in training: {np.isinf(X_train_scaled.values).sum()}")
print(f"   - Infinite values in test: {np.isinf(X_test_scaled.values).sum()}")

print("\n" + "="*70)
print("Data ready for modeling!")
print("="*70)

PREPROCESSING STRATEGY 1 - SIMPLE IMPUTATION SUMMARY

1. Imputation Strategy:
   - Numeric features: Median imputation
   - Categorical features: Most frequent value imputation

2. Data Splits:
   - Training set: 26,408 samples (80.0%)
   - Test set: 6,603 samples (20.0%)

3. Features:
   - Total features: 32
   - Numeric features: 23
   - Categorical features: 9

4. Target Distribution (Training):
   - Suppressed (1): 8,851 (33.52%)
   - Not Suppressed (0): 17,557 (66.48%)
   - Class imbalance ratio: 1.98:1

5. Scaling:
   - Method: StandardScaler (mean=0, std=1)

6. Data Quality:
   - Missing values in training: 0
   - Missing values in test: 0
   - Infinite values in training: 0
   - Infinite values in test: 0

Data ready for modeling!
