In [9]:
#imports
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
import warnings
warnings.filterwarnings('ignore')

In [10]:
# Set random seeds for reproducibility
SEED = 42

# keras.utils.set_random_seed seeds Python's `random`, NumPy and the
# TensorFlow backend in a single call. The previous np/tf-only seeding left
# Python's `random` module unseeded.
keras.utils.set_random_seed(SEED)

In [12]:
# 1. DATA PREPARATION
# Loads the CSV, drops rows without a target, imputes missing values,
# integer-encodes non-numeric columns and exposes the result as the NumPy
# arrays `X` (features) and `y` (target) used by the cells below.
DATA_PATH = 'support2.csv'
TARGET_COL = 'death'

print("Loading dataset...")
df = pd.read_csv(DATA_PATH)

# Remove rows where target variable (death) is missing
print(f"Original dataset shape: {df.shape}")
df = df.dropna(subset=[TARGET_COL])
print(f"After removing missing death values: {df.shape}")

# NOTE(review): every remaining column is used as a feature. If this file is
# the public SUPPORT2 table, some columns (e.g. 'hospdead') look like recorded
# outcomes rather than predictors -- TODO confirm and drop them before modelling.

# Separate features and target
X = df.drop(columns=TARGET_COL)
y = df[TARGET_COL]

# Identify numerical and categorical columns.
# 'number' covers every numeric dtype (not only int64/float64) and everything
# else is treated as categorical, so no column can slip through both lists
# and reach the model un-imputed or un-encoded.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(exclude='number').columns.tolist()

print(f"\nNumerical columns: {len(numerical_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")

# NOTE(review): the imputers below are fitted on all rows, i.e. before any
# train/validation split. Fill values therefore already reflect rows that are
# later held out; fit them per fold for an unbiased validation score.

# Handle missing values for numerical features (median)
if numerical_cols:
    numerical_imputer = SimpleImputer(strategy='median')
    X[numerical_cols] = numerical_imputer.fit_transform(X[numerical_cols])

# Handle missing values for categorical features (most frequent)
if categorical_cols:
    categorical_imputer = SimpleImputer(strategy='most_frequent')
    X[categorical_cols] = categorical_imputer.fit_transform(X[categorical_cols])

# Encode categorical variables as integer codes, keeping each fitted encoder
# so the mapping can be inspected or inverted later.
# NOTE(review): integer codes impose an arbitrary ordering on the categories;
# consider one-hot encoding if these columns are nominal.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

print(f"\nFinal feature matrix shape: {X.shape}")
print(f"Target variable shape: {y.shape}")

# Fail loudly here rather than letting NaNs turn the training loss into NaN.
assert not X.isna().to_numpy().any(), "feature matrix still contains missing values"

# Convert to numpy arrays
X = X.to_numpy()
y = y.to_numpy()


Loading dataset...
Original dataset shape: (9105, 48)
After removing missing death values: (9105, 48)

Numerical columns: 39
Categorical columns: 8

Final feature matrix shape: (9105, 47)
Target variable shape: (9105,)


In [13]:
# 2. MODEL BUILDING FUNCTION
def create_regularized_model(input_dim, l1_reg=1e-3, l2_reg=1e-2,
                             dropout_rate=0.3, learning_rate=0.001):
    """
    Build and compile a small regularized binary classifier.

    Architecture: Dense(64, relu) -> BatchNorm -> Dropout ->
    Dense(32, relu) -> BatchNorm -> Dropout -> Dense(1, sigmoid).
    Both hidden Dense layers carry a combined L1/L2 kernel penalty.

    Args:
        input_dim: Number of input features per sample.
        l1_reg: L1 penalty factor applied to the hidden-layer kernels.
        l2_reg: L2 penalty factor applied to the hidden-layer kernels.
        dropout_rate: Dropout rate used after each hidden block.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled Sequential model using binary cross-entropy loss and
        reporting accuracy.
    """
    model = Sequential([
        # Declare the input shape explicitly instead of passing the legacy
        # `input_dim` argument to the first Dense layer.
        keras.Input(shape=(input_dim,)),

        # First hidden block with L1 and L2 regularization
        Dense(64,
              activation='relu',
              kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)),
        BatchNormalization(),
        Dropout(dropout_rate),

        # Second hidden block with L1 and L2 regularization
        Dense(32,
              activation='relu',
              kernel_regularizer=regularizers.l1_l2(l1=l1_reg, l2=l2_reg)),
        BatchNormalization(),
        Dropout(dropout_rate),

        # Output layer: a single sigmoid unit
        Dense(1, activation='sigmoid')
    ])

    # Compile the model
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

In [14]:
# 3. K-FOLD CROSS-VALIDATION
# Trains a fresh model on each fold's training split and records the accuracy
# on that fold's held-out split in `fold_accuracies`.
n_folds = 5
EPOCHS = 30
BATCH_SIZE = 32
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=42)

fold_accuracies = []

print("\n" + "="*50)
# Derive the banner from n_folds so it stays correct if the fold count changes.
print(f"Starting {n_folds}-Fold Cross-Validation")
print("="*50)

for fold, (train_idx, val_idx) in enumerate(kfold.split(X), 1):
    print(f"\n--- Fold {fold} ---")

    # Split data
    X_train_fold, X_val_fold = X[train_idx], X[val_idx]
    y_train_fold, y_val_fold = y[train_idx], y[val_idx]

    # Normalize features using StandardScaler.
    # The scaler is fitted on the training split only and then applied to the
    # validation split, so validation statistics never influence the scaling.
    scaler = StandardScaler()
    X_train_fold = scaler.fit_transform(X_train_fold)
    X_val_fold = scaler.transform(X_val_fold)

    # Create a new, untrained model for this fold
    model = create_regularized_model(
        input_dim=X_train_fold.shape[1],
        l1_reg=1e-3,
        l2_reg=1e-2
    )

    # Train the model; validation_data only adds per-epoch validation metrics
    # to `history`.
    history = model.fit(
        X_train_fold, y_train_fold,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=0,  # Set to 1 to see training progress
        validation_data=(X_val_fold, y_val_fold)
    )

    # Evaluate on validation set
    val_loss, val_accuracy = model.evaluate(X_val_fold, y_val_fold, verbose=0)
    fold_accuracies.append(val_accuracy)

    print(f"Fold {fold} Accuracy: {val_accuracy:.4f}")


Starting 5-Fold Cross-Validation

--- Fold 1 ---
Fold 1 Accuracy: 0.9072

--- Fold 2 ---
Fold 2 Accuracy: 0.9072

--- Fold 3 ---
Fold 3 Accuracy: 0.9099

--- Fold 4 ---
Fold 4 Accuracy: 0.9121

--- Fold 5 ---
Fold 5 Accuracy: 0.9132


In [15]:
# Calculate and print average accuracy
# Guard against an empty list: np.mean([]) would silently report `nan`.
if not fold_accuracies:
    raise ValueError("fold_accuracies is empty - run the cross-validation cell first")

# Take the fold count from the results themselves so the message always
# matches the numbers being averaged.
n_completed_folds = len(fold_accuracies)
avg_accuracy = np.mean(fold_accuracies)
std_accuracy = np.std(fold_accuracies)

print("\n" + "="*50)
print(f"Average {n_completed_folds}-Fold Accuracy (With Regularization): {avg_accuracy:.4f}")
print(f"Std. Dev. Across Folds: {std_accuracy:.4f}")
print("="*50)

# Print individual fold accuracies for reference
print("\nIndividual Fold Accuracies:")
for i, acc in enumerate(fold_accuracies, 1):
    print(f"Fold {i}: {acc:.4f}")


Average 5-Fold Accuracy (With Regularization): 0.9099

Individual Fold Accuracies:
Fold 1: 0.9072
Fold 2: 0.9072
Fold 3: 0.9099
Fold 4: 0.9121
Fold 5: 0.9132
