# Assignment 2 - [Sadman_sharif]_[A1944825]



# EDA (Exploratory Data Analysis)

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings('ignore')

# Fix the PyTorch and NumPy random seeds so repeated runs produce the same numbers
torch.manual_seed(42)
np.random.seed(42)

# Pick the compute device: a CUDA GPU when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Using device: {device}')

In [None]:
# Load the diabetes dataset
# Using the Pima Indians Diabetes dataset as an example
# You should replace this with your actual dataset path
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import LabelBinarizer

# Alternative: Load from CSV if provided
# train_data = pd.read_csv('train.csv')
# test_data = pd.read_csv('test.csv')

# For demonstration, using a public diabetes dataset
# You can also use: https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv"
# The CSV is read with explicit names (no header row is taken from the file)
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
                'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

try:
    data = pd.read_csv(url, names=column_names)
    print("Dataset loaded successfully!")
except Exception as load_error:
    # Catch Exception rather than using a bare `except:` so that interrupting
    # the kernel (KeyboardInterrupt) is not swallowed, and report the cause so
    # the switch to synthetic data is never silent.
    print(f"Could not load dataset from {url}: {load_error!r}")
    # Create synthetic data for demonstration if URL fails.
    # These columns are independent random draws: they only keep the rest of
    # the notebook runnable and carry no real signal about diabetes.
    np.random.seed(42)
    n_samples = 768
    data = pd.DataFrame({
        'Pregnancies': np.random.randint(0, 15, n_samples),
        'Glucose': np.random.randint(50, 200, n_samples),
        'BloodPressure': np.random.randint(40, 120, n_samples),
        'SkinThickness': np.random.randint(10, 60, n_samples),
        'Insulin': np.random.randint(0, 300, n_samples),
        'BMI': np.random.uniform(18, 50, n_samples),
        'DiabetesPedigreeFunction': np.random.uniform(0.1, 2.5, n_samples),
        'Age': np.random.randint(20, 80, n_samples),
        'Outcome': np.random.binomial(1, 0.35, n_samples)
    })
    print("Using synthetic data for demonstration")

print(f"Dataset shape: {data.shape}")
print(f"\nFirst 5 rows:")
data.head()

In [None]:
# Data exploration and visualization
print("Dataset Information:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
data.info()
print("\nDataset Statistics:")
print(data.describe())

# Check for missing values (NaN only; zero placeholders are not counted here)
print("\nMissing values:")
print(data.isnull().sum())

# Check class distribution
print("\nClass distribution:")
print(data['Outcome'].value_counts())
# 'Outcome' is summed and divided by the row count, i.e. the share of rows equal to 1
print(f"Percentage of diabetic patients: {(data['Outcome'].sum()/len(data))*100:.2f}%")

In [None]:
# Create visualizations: one box plot per feature, split by the 'Outcome' label
fig, axes = plt.subplots(2, 4, figsize=(16, 8))

# Every column except the label; selecting by name does not rely on 'Outcome'
# being the last column.
features = data.columns.drop('Outcome')
# axes.flat walks the 2x4 grid row by row; zip stops at the shorter of the two,
# so a different feature count does not index outside the grid.
for ax, feature in zip(axes.flat, features):
    data.boxplot(column=feature, by='Outcome', ax=ax)
    ax.set_title(feature)
    ax.set_xlabel('Diabetes (0=No, 1=Yes)')

# pandas' boxplot(by=...) resets the figure suptitle to "Boxplot grouped by
# Outcome" on every call, so the real title has to be set after the loop.
fig.suptitle('Feature Distributions by Diabetes Outcome', fontsize=16)
plt.tight_layout()
plt.show()

# Correlation matrix of all columns (features and label)
plt.figure(figsize=(10, 8))
correlation_matrix = data.corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f')
plt.title('Feature Correlation Matrix')
plt.show()

# Preprocessing

In [None]:
# Handle missing/zero values in certain columns where zeros are biologically impossible
# Glucose, BloodPressure, SkinThickness, Insulin, BMI cannot be zero
zero_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# NOTE(review): the medians are computed over the full dataset, before the
# train/validation/test split done in a later cell, so held-out rows influence
# the imputed values. Consider imputing with training-set medians instead.
for column in zero_columns:
    if column in data.columns:
        # Treat zeros as missing so they do not pull the median down
        column_with_nan = data[column].replace(0, np.nan)
        median_value = column_with_nan.median()
        # Assign the filled column back to the frame. The previous
        # `data[column].fillna(..., inplace=True)` was chained in-place
        # assignment on a column selection, which is deprecated and does not
        # update `data` under pandas Copy-on-Write.
        data[column] = column_with_nan.fillna(median_value)
        print(f"Replaced zeros in {column} with median: {median_value:.2f}")

In [None]:
# The label is the 'Outcome' column; every other column is a feature
y = data['Outcome']
X = data.drop(columns='Outcome')

# Two stratified splits give 60% train / 20% validation / 20% test:
# first hold out 20% for testing, then take 25% of the remaining 80% for validation
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=42, stratify=y_temp
)

print(f"Training set size: {len(X_train)} samples")
print(f"Validation set size: {len(X_val)} samples")
print(f"Test set size: {len(X_test)} samples")

# Standardise features; the scaler statistics come from the training split only
# and are then applied unchanged to the validation and test splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Build float32 tensors; labels become column vectors of shape (n_samples, 1)
X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.to_numpy().reshape(-1, 1), dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.to_numpy().reshape(-1, 1), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.to_numpy().reshape(-1, 1), dtype=torch.float32)

print("\nData preprocessing completed!")

# Model Implementation

In [None]:
class MLPDiabetes(nn.Module):
    """
    Fully connected network for binary diabetes prediction.

    Layout:
    - Input: `input_dim` features
    - Hidden part: one Linear -> ReLU -> Dropout group per entry of `hidden_layers`
    - Output: a single Linear unit followed by a sigmoid, i.e. a value in (0, 1)
    """

    def __init__(self, input_dim, hidden_layers, dropout_rate=0.2):
        """
        Args:
            input_dim: Number of input features
            hidden_layers: Sizes of the hidden layers, in order
            dropout_rate: Dropout probability applied after every hidden activation
        """
        super().__init__()

        # Consecutive entries of `widths` are the (in, out) sizes of each
        # hidden Linear layer; the last entry feeds the output layer.
        widths = [input_dim, *hidden_layers]

        stack = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout_rate)]

        # Output head: one unit squashed to (0, 1)
        stack += [nn.Linear(widths[-1], 1), nn.Sigmoid()]

        self.layers = nn.ModuleList(stack)

    def forward(self, x):
        """
        Apply every stored layer to `x` in order and return the result
        """
        out = x
        for module in self.layers:
            out = module(out)
        return out

    def count_parameters(self):
        """
        Return the number of scalar parameters that require gradients
        """
        total = 0
        for param in self.parameters():
            if param.requires_grad:
                total += param.numel()
        return total

# Smoke-test the class: build a small two-hidden-layer network, show its
# structure and report how many trainable parameters it has
test_model = MLPDiabetes(8, [64, 32])
print("Model architecture:", test_model, sep="\n")
print("\nTotal trainable parameters: {:,}".format(test_model.count_parameters()))

In [None]:
def train_model(model, X_train, y_train, X_val, y_val, epochs, learning_rate, batch_size):
    """
    Train a binary classifier with Adam and binary cross-entropy.

    nn.BCELoss is applied directly to the model outputs, so the model must
    return probabilities in [0, 1] with the same shape as the targets.
    The model and all tensors are moved to the notebook-level `device`.

    Args:
        model: PyTorch model
        X_train, y_train: Training features and targets (tensors)
        X_val, y_val: Validation features and targets (tensors)
        epochs: Number of training epochs
        learning_rate: Learning rate for the Adam optimizer
        batch_size: Mini-batch size for training

    Returns:
        Dictionary with keys 'train_loss', 'val_loss', 'train_acc', 'val_acc'.
        Each value is a list with one entry per epoch. Losses are the mean
        binary cross-entropy per sample; accuracies are percentages (0-100).

    Raises:
        ValueError: If the training or validation set contains no samples.
    """
    # Fail early with a clear message instead of a ZeroDivisionError mid-epoch
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError("Training and validation sets must each contain at least one sample")

    # Move model and data to the target device
    model = model.to(device)
    X_train = X_train.to(device)
    y_train = y_train.to(device)
    X_val = X_val.to(device)
    y_val = y_val.to(device)

    # Create DataLoader for batching; batches are reshuffled every epoch
    train_dataset = TensorDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    # Loss function and optimizer
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Training history
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_acc': [],
        'val_acc': []
    }

    # Training loop
    for epoch in range(epochs):
        model.train()
        train_loss_sum = 0.0
        correct = 0
        total = 0

        for batch_X, batch_y in train_loader:
            # Forward pass
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # criterion returns the mean over the batch; weight it by the batch
            # size so a smaller final batch does not count as much as a full one.
            n_batch = batch_y.size(0)
            train_loss_sum += loss.item() * n_batch
            predicted = (outputs > 0.5).float()
            total += n_batch
            correct += (predicted == batch_y).sum().item()

        # Training metrics are running values: they use the outputs computed
        # in training mode before each optimizer step.
        avg_train_loss = train_loss_sum / total
        train_acc = 100 * correct / total

        # Validation on the full validation set in a single pass, in eval mode
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_val)
            val_loss = criterion(val_outputs, y_val).item()
            val_predicted = (val_outputs > 0.5).float()
            val_acc = 100 * (val_predicted == y_val).sum().item() / y_val.size(0)

        # Store history
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(val_loss)
        history['train_acc'].append(train_acc)
        history['val_acc'].append(val_acc)

        # Print progress every 10 epochs
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{epochs}], '
                  f'Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.2f}%, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.2f}%')

    return history

In [None]:
def evaluate_model(model, X_test, y_test):
    """
    Evaluate a binary classifier on held-out data.

    Model outputs above 0.5 are counted as class 1, everything else as class 0.

    Args:
        model: Trained PyTorch model
        X_test, y_test: Test features and 0/1 targets (tensors)

    Returns:
        Dictionary with keys 'accuracy', 'precision', 'recall', 'f1' and
        'confusion_matrix'. The confusion matrix is always 2x2, with rows as
        true labels and columns as predicted labels, both in the order [0, 1].
    """
    model.eval()

    # Run on whatever device the model already lives on, so evaluation also
    # works for a model that was never moved by train_model. A model without
    # parameters falls back to the notebook-level `device`.
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else device
    X_test = X_test.to(eval_device)

    with torch.no_grad():
        outputs = model(X_test)
        predicted = outputs > 0.5

    # Convert to flat integer numpy arrays for the sklearn metrics
    y_true = y_test.cpu().numpy().ravel().astype(int)
    y_pred = predicted.cpu().numpy().ravel().astype(int)

    # Calculate metrics; zero_division=0 returns 0 instead of warning when a
    # metric's denominator is empty (e.g. no positive predictions)
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    # Fix the label set so the matrix stays 2x2 even when only one class
    # appears in both the targets and the predictions
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': cm
    }

# Experiments

In [None]:
# Experiment 1: Shallow Network (1 hidden layer)
print(
    "=" * 60,
    "EXPERIMENT 1: Shallow Network",
    "Architecture: Input(8) -> Hidden(32) -> Output(1)",
    "=" * 60,
    sep="\n",
)

# One hidden layer of 32 units, dropout 0.2
model1 = MLPDiabetes(8, [32], dropout_rate=0.2)
print("Total parameters: {:,}\n".format(model1.count_parameters()))

# 100 epochs, learning rate 0.001, mini-batches of 32
history1 = train_model(
    model1,
    X_train_tensor, y_train_tensor,
    X_val_tensor, y_val_tensor,
    epochs=100, learning_rate=0.001, batch_size=32,
)

# Score the trained network on the test split
metrics1 = evaluate_model(model1, X_test_tensor, y_test_tensor)
print(
    "\nTest Results:",
    f"Accuracy: {metrics1['accuracy']:.4f}",
    f"Precision: {metrics1['precision']:.4f}",
    f"Recall: {metrics1['recall']:.4f}",
    f"F1-Score: {metrics1['f1']:.4f}",
    sep="\n",
)

In [None]:
# Experiment 2: Deep Network (3 hidden layers)
print(
    "=" * 60,
    "EXPERIMENT 2: Deep Network",
    "Architecture: Input(8) -> Hidden(64) -> Hidden(32) -> Hidden(16) -> Output(1)",
    "=" * 60,
    sep="\n",
)

# Three hidden layers narrowing 64 -> 32 -> 16, dropout 0.3
model2 = MLPDiabetes(8, [64, 32, 16], dropout_rate=0.3)
print("Total parameters: {:,}\n".format(model2.count_parameters()))

# 100 epochs, learning rate 0.001, mini-batches of 32
history2 = train_model(
    model2,
    X_train_tensor, y_train_tensor,
    X_val_tensor, y_val_tensor,
    epochs=100, learning_rate=0.001, batch_size=32,
)

# Score the trained network on the test split
metrics2 = evaluate_model(model2, X_test_tensor, y_test_tensor)
print(
    "\nTest Results:",
    f"Accuracy: {metrics2['accuracy']:.4f}",
    f"Precision: {metrics2['precision']:.4f}",
    f"Recall: {metrics2['recall']:.4f}",
    f"F1-Score: {metrics2['f1']:.4f}",
    sep="\n",
)

In [None]:
# Experiment 3: Wide Network (2 hidden layers with more neurons)
print(
    "=" * 60,
    "EXPERIMENT 3: Wide Network",
    "Architecture: Input(8) -> Hidden(128) -> Hidden(64) -> Output(1)",
    "=" * 60,
    sep="\n",
)

# Two wide hidden layers (128 then 64 units), dropout 0.25
model3 = MLPDiabetes(8, [128, 64], dropout_rate=0.25)
print("Total parameters: {:,}\n".format(model3.count_parameters()))

# 100 epochs with mini-batches of 64; the learning rate is lowered to 0.0005
# for this larger model
history3 = train_model(
    model3,
    X_train_tensor, y_train_tensor,
    X_val_tensor, y_val_tensor,
    epochs=100, learning_rate=0.0005, batch_size=64,
)

# Score the trained network on the test split
metrics3 = evaluate_model(model3, X_test_tensor, y_test_tensor)
print(
    "\nTest Results:",
    f"Accuracy: {metrics3['accuracy']:.4f}",
    f"Precision: {metrics3['precision']:.4f}",
    f"Recall: {metrics3['recall']:.4f}",
    f"F1-Score: {metrics3['f1']:.4f}",
    sep="\n",
)

In [None]:
# Compare the three experiments: loss curves on the left panel, accuracy
# curves on the right. Training curves are solid, validation curves dashed.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

for panel, train_key, val_key, y_label, heading in (
    (axes[0], 'train_loss', 'val_loss', 'Loss', 'Training and Validation Loss'),
    (axes[1], 'train_acc', 'val_acc', 'Accuracy (%)', 'Training and Validation Accuracy'),
):
    # Curves are drawn experiment by experiment, train before validation,
    # so the colour cycle assigns the same colours on both panels
    for run_name, run_history in (
        ('Exp1: Shallow', history1),
        ('Exp2: Deep', history2),
        ('Exp3: Wide', history3),
    ):
        panel.plot(run_history[train_key], label=f'{run_name} (Train)', linestyle='-')
        panel.plot(run_history[val_key], label=f'{run_name} (Val)', linestyle='--')

    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(heading)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Draw the test-set confusion matrix of each experiment side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

experiments = [
    ('Experiment 1: Shallow', metrics1['confusion_matrix']),
    ('Experiment 2: Deep', metrics2['confusion_matrix']),
    ('Experiment 3: Wide', metrics3['confusion_matrix'])
]

# The same class names label both the predicted (x) and true (y) axes
class_names = ['No Diabetes', 'Diabetes']

for ax, (title, cm) in zip(axes, experiments):
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        ax=ax,
        xticklabels=class_names,
        yticklabels=class_names,
    )
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')

plt.tight_layout()
plt.show()

In [None]:
# Summary table of results
results_df = pd.DataFrame({
    'Experiment': ['Shallow (1 layer)', 'Deep (3 layers)', 'Wide (2 layers)'],
    'Architecture': ['[32]', '[64, 32, 16]', '[128, 64]'],
    'Parameters': [model1.count_parameters(), model2.count_parameters(), model3.count_parameters()],
    # Final-epoch validation accuracy; train_model records it in percent, so
    # divide by 100 to match the 0-1 scale of the test metrics below
    'Val Accuracy': [history1['val_acc'][-1] / 100,
                     history2['val_acc'][-1] / 100,
                     history3['val_acc'][-1] / 100],
    'Test Accuracy': [metrics1['accuracy'], metrics2['accuracy'], metrics3['accuracy']],
    'Precision': [metrics1['precision'], metrics2['precision'], metrics3['precision']],
    'Recall': [metrics1['recall'], metrics2['recall'], metrics3['recall']],
    'F1-Score': [metrics1['f1'], metrics2['f1'], metrics3['f1']]
})

print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)
print(results_df.to_string(index=False))

# Identify best model.
# Selection uses validation accuracy: choosing the winner by test accuracy
# would make the reported test score optimistically biased. idxmax() returns
# an index label, so the row is fetched with .loc rather than positional .iloc.
best_idx = results_df['Val Accuracy'].idxmax()
best_row = results_df.loc[best_idx]
print(f"\nBest performing model (selected on validation accuracy): {best_row['Experiment']}")
print(f"Validation Accuracy: {best_row['Val Accuracy']:.4f}")
print(f"Test Accuracy: {best_row['Test Accuracy']:.4f}")