# Diabetes Hospital Readmission Data Preprocessing

Loading and preprocessing the UCI Diabetes 130-US Hospitals dataset for readmission prediction.

## 1. Load Libraries and Data

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Set display options
pd.set_option('display.max_columns', 50)
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (10, 6)

%matplotlib inline

In [None]:
# Load the dataset.
#
# Missing entries in this file are written as '?' and are handled explicitly
# later in the notebook (replace + dropna). The literal string 'None' in
# `A1Cresult` and `max_glu_serum` is NOT a missing value: it is a genuine
# category (the test was not taken). Recent pandas versions (2.0+) list 'None'
# among the default NA markers of read_csv, so a plain read would turn those
# cells into NaN and the later dropna() would throw away almost every row.
# Switch the default markers off and treat only truly empty fields as missing.
df = pd.read_csv(
    '../data/diabetic_data.csv',
    keep_default_na=False,
    na_values=[''],
)

print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
# Last expression of the cell: the notebook renders the first five rows.
df.head()

## 2. Basic Data Information

In [None]:
# Overview of the raw frame: its dimensions, then how many columns
# share each dtype.
divider = "=" * 50
n_rows, n_cols = df.shape

print(divider)
print("Dataset Shape:")
print(divider)
print(f"Rows: {n_rows:,}")
print(f"Columns: {n_cols}")

print(f"\n{divider}")
print("Data Types:")
print(divider)
dtype_tally = df.dtypes.value_counts()
print(dtype_tally)

In [None]:
# Detailed info: print the summary produced by DataFrame.info()
# (per-column non-null counts and dtypes, plus memory usage).
df.info()

In [None]:
# Missing-value audit. Unknown entries in this dataset are stored as a
# literal '?', so count the cells equal to '?' in every column.
print("Missing Values Analysis:")
print("=" * 50)

placeholder_counts = df.eq('?').sum()

# One row per column that contains at least one '?', worst offenders first.
missing_df = (
    pd.DataFrame({
        'Missing Count': placeholder_counts,
        'Percentage': (placeholder_counts / len(df)) * 100,
    })
    .loc[lambda frame: frame['Missing Count'] > 0]
    .sort_values('Percentage', ascending=False)
)
print(missing_df)

## 3. Target Variable Analysis: `readmitted`

In [None]:
# Class balance of the raw target column: absolute counts first, then the
# same breakdown expressed in percent.
target_col = df['readmitted']

print("Readmitted Value Counts:")
print("=" * 50)
print(target_col.value_counts())
print("\nPercentages:")
print(target_col.value_counts(normalize=True) * 100)

In [None]:
# Side-by-side bar charts of the readmission categories:
# left panel = absolute counts, right panel = share in percent.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

readmit_counts = df['readmitted'].value_counts()
readmit_pct = df['readmitted'].value_counts(normalize=True) * 100

# One entry per panel: (axes, data, y-axis label, title, bar-label format).
panels = [
    (axes[0], readmit_counts, 'Count',
     'Distribution of Readmission Status', '{:,}'),
    (axes[1], readmit_pct, 'Percentage (%)',
     'Distribution of Readmission Status (%)', '{:.1f}%'),
]

for ax, series, y_label, title, label_fmt in panels:
    ax.bar(series.index, series.values,
           color=['green', 'orange', 'red'], alpha=0.7)
    ax.set_xlabel('Readmitted Category')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(axis='y', alpha=0.3)

    # Annotate each bar with its value; with string categories on the x axis
    # the bars sit at positions 0, 1, 2, ... in the order of the series.
    for pos, (_, height) in enumerate(series.items()):
        ax.text(pos, height, label_fmt.format(height),
                ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 4. Feature Selection

In [None]:
# Features kept for modelling, grouped by theme. The flattened list keeps
# the groups (and the names inside each group) in the order written here.
feature_groups = {
    'demographics': ['race', 'gender', 'age'],
    'hospital_stay': ['time_in_hospital'],
    'procedure_counts': [
        'num_lab_procedures',
        'num_procedures',
        'num_medications',
    ],
    'prior_visits': [
        'number_outpatient',
        'number_inpatient',
        'number_emergency',
    ],
    'lab_results': ['A1Cresult', 'max_glu_serum'],
    'medication_changes': ['change', 'diabetesMed'],
}
selected_features = [
    name for group in feature_groups.values() for name in group
]

# Sanity check: every requested feature must be a column of the raw frame.
available = set(df.columns)
missing_features = [name for name in selected_features if name not in available]
if not missing_features:
    print("✓ All selected features are present in the dataset")
else:
    print(f"Warning: Missing features: {missing_features}")

print(f"\nSelected {len(selected_features)} features")
print(selected_features)

In [None]:
# Working frame: the chosen predictors followed by the target column.
# .copy() detaches it from `df`, so later edits never touch the raw data.
wanted_columns = [*selected_features, 'readmitted']
df_subset = df.loc[:, wanted_columns].copy()

print(f"Subset shape: {df_subset.shape}")
# Last expression of the cell: the notebook renders the first five rows.
df_subset.head()

## 5. Data Cleaning

In [None]:
# Turn the '?' placeholder into a real NaN so pandas' missing-value tools
# (isnull / dropna) can see it.
df_clean = df_subset.replace(to_replace='?', value=np.nan)

print("Replaced '?' with NaN")
print("\nMissing values per column:")

# Report only the columns that actually contain NaN, largest count first.
null_counts = df_clean.isnull().sum()
missing_info = null_counts.loc[null_counts > 0].sort_values(ascending=False)
print(missing_info)

In [None]:
# Discard every row that has a NaN in any of the selected columns.
# (Imputation would be the alternative; dropping keeps the notebook simple.)
rows_before = len(df_clean)
print(f"Rows before cleaning: {rows_before:,}")

df_clean = df_clean.dropna()
rows_after = len(df_clean)
print(f"Rows after cleaning: {rows_after:,}")

# The loss is reported relative to the untouched subset frame.
rows_dropped = len(df_subset) - rows_after
print(f"Rows dropped: {rows_dropped:,} ({rows_dropped/len(df_subset)*100:.1f}%)")

In [None]:
# Sanity check: total number of NaN cells left in the cleaned frame
# (summed per column, then across columns).
remaining_nulls = df_clean.isnull().sum().sum()
print("Remaining missing values:")
print(remaining_nulls)

## 6. Create Binary Target Variable

In [None]:
# Binary target: 1 when the `readmitted` value is '<30', 0 for every other
# value of the column.
y = df_clean['readmitted'].eq('<30').astype(int)

n_total = len(y)
n_positive = (y == 1).sum()
n_negative = (y == 0).sum()

print("Binary Target Distribution:")
print("=" * 50)
print(f"y = 1 (readmitted < 30 days): {n_positive:,} ({n_positive/n_total*100:.1f}%)")
print(f"y = 0 (not readmitted < 30): {n_negative:,} ({n_negative/n_total*100:.1f}%)")
# Negatives per positive, i.e. how skewed the classes are.
print(f"\nClass imbalance ratio: {n_negative / n_positive:.2f}:1")

In [None]:
# Bar chart of the two target classes, each bar annotated with its count
# and its share of all samples.
plt.figure(figsize=(8, 5))

y_counts = y.value_counts()
class_labels = ['Not Readmitted <30', 'Readmitted <30']
# Look the sizes up by class label (0, 1) so the bar order is fixed.
class_sizes = [y_counts[0], y_counts[1]]

plt.bar(class_labels, class_sizes, color=['green', 'red'], alpha=0.7)
plt.ylabel('Count')
plt.title('Binary Classification Target Distribution')
plt.grid(axis='y', alpha=0.3)

for pos, size in enumerate(class_sizes):
    plt.text(pos, size, f'{size:,}\n({size/len(y)*100:.1f}%)',
             ha='center', va='bottom')

plt.tight_layout()
plt.show()

## 7. Feature Encoding and Preprocessing

In [None]:
# Separate features from target
X = df_clean.drop(columns='readmitted')

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")

# Split the columns into two groups that together cover every column:
# numeric columns get scaled, and everything that is not numeric is treated
# as categorical. Selecting only dtype 'object' for the categorical group
# would leave columns of any other non-numeric dtype (e.g. `category`,
# `bool`, or a dedicated string dtype) in neither list; they would then be
# passed through the ColumnTransformer unencoded and would be missing from
# the hand-built feature-name list.
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_cols = X.select_dtypes(exclude=[np.number]).columns.tolist()

print(f"\nCategorical features ({len(categorical_cols)}): {categorical_cols}")
print(f"\nNumerical features ({len(numerical_cols)}): {numerical_cols}")

In [None]:
# Build the column-wise preprocessing step (not fitted yet).
from sklearn.preprocessing import StandardScaler

# Numeric columns: standardise with StandardScaler.
numeric_step = ('num', StandardScaler(), numerical_cols)

# Categorical columns: one-hot encode into a dense array, dropping the first
# level of each feature; handle_unknown='ignore' means categories not seen
# during fit do not raise at transform time.
categorical_step = (
    'cat',
    OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False),
    categorical_cols,
)

# Any column listed in neither step is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[numeric_step, categorical_step],
    remainder='passthrough',
)

print("Preprocessing pipeline created:")
print(preprocessor)

In [None]:
# Learn the scaling/encoding parameters from X and apply them in one pass.
X_transformed = preprocessor.fit_transform(X)

n_cols_before = X.shape[1]
n_cols_after = X_transformed.shape[1]

print(f"Original feature matrix shape: {X.shape}")
print(f"Transformed feature matrix shape: {X_transformed.shape}")
# Net change in column count caused by the encoding step.
print(f"\nFeatures expanded due to one-hot encoding: {n_cols_after - n_cols_before}")

In [None]:
# Rebuild the column names of the transformed matrix, in the same order the
# ColumnTransformer lists its steps: scaled numeric columns first, then the
# one-hot columns.
feature_names = list(numerical_cols)

# The encoder names each one-hot column after its source column and level.
cat_encoder = preprocessor.named_transformers_['cat']
if hasattr(cat_encoder, 'get_feature_names_out'):
    encoded_names = cat_encoder.get_feature_names_out(categorical_cols)
    feature_names.extend(encoded_names)

print(f"Total features after encoding: {len(feature_names)}")
print(f"\nFirst 20 feature names: {feature_names[:20]}")

In [None]:
# Wrap the transformed array in a DataFrame: named columns, and the same
# row index as X so the rows still line up with the target `y`.
X_final = pd.DataFrame(data=X_transformed, index=X.index, columns=feature_names)

print("Final feature matrix:")
print(X_final.head())
print(f"\nShape: {X_final.shape}")

## 8. Final Dataset Summary

In [None]:
# Recap of the preprocessed data: matrix size, class balance of the target,
# and feature counts before and after encoding.
banner = "=" * 70
n_samples, n_features = X_final.shape
n_total = len(y)
n_negative = (y == 0).sum()
n_positive = (y == 1).sum()

print(banner)
print("FINAL PREPROCESSED DATASET SUMMARY")
print(banner)
print(f"\nSamples: {n_samples:,}")
print(f"Features: {n_features:,}")
print("\nTarget distribution:")
print(f"  Class 0 (not readmitted <30): {n_negative:,} ({n_negative/n_total*100:.1f}%)")
print(f"  Class 1 (readmitted <30):     {n_positive:,} ({n_positive/n_total*100:.1f}%)")
print("\nFeature types:")
print(f"  Original numerical: {len(numerical_cols)}")
print(f"  Original categorical: {len(categorical_cols)}")
print(f"  After one-hot encoding: {n_features}")
print(f"\n{banner}")

In [None]:
# Descriptive statistics for the first five columns of the final matrix
# (the scaled numeric features are placed before the one-hot columns).
print("Sample statistics (first 5 numerical features):")
leading_columns = X_final.iloc[:, :5]
# Last expression of the cell: the notebook renders the describe() table.
leading_columns.describe()

## 9. Save Preprocessed Data (Optional)

In [None]:
# Optional: persist the features, the target and the fitted preprocessor.
import pickle
from pathlib import Path

# Destination folder; created (with parents) if it does not exist yet.
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

features_path = output_dir / 'X_features.csv'
target_path = output_dir / 'y_target.csv'
preprocessor_path = output_dir / 'preprocessor.pkl'

# Both CSVs are written without the index; rows stay aligned by position.
X_final.to_csv(features_path, index=False)
y.to_csv(target_path, index=False, header=['target'])

# Pickle the fitted transformer so the same scaling/encoding can be reused.
with preprocessor_path.open('wb') as f:
    pickle.dump(preprocessor, f)

print("✓ Preprocessed data saved to data/processed/")
print(f"  - X_features.csv ({X_final.shape})")
print(f"  - y_target.csv ({y.shape})")
print("  - preprocessor.pkl")

## Ready for Modeling!

You now have:
- **`X_final`**: Clean feature matrix (numerical features scaled, categorical one-hot encoded)
- **`y`**: Binary target (1 = readmitted within 30 days, 0 = otherwise)
- **`preprocessor`**: Fitted sklearn `ColumnTransformer` (scaler + one-hot encoder) for applying the same transformations to new data

Next steps:
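1. Train/test split (note: `preprocessor` above was fitted on the full dataset; for an unbiased evaluation, re-fit it on the training split only and use it to transform the test split)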
2. Model training (e.g., Logistic Regression, Random Forest, XGBoost)
3. Evaluation (ROC-AUC, Precision-Recall, etc.)
4. Feature importance analysis