In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import time
from tabulate import tabulate

# --- Data preparation ---
# Read the dataset from disk. The 'outcome' column is the label; every
# other column is used as a feature.
pima_df = pd.read_csv("pima_synthetic.csv")
y = pima_df['outcome']
X = pima_df.drop(columns='outcome')

# Hold out 20% of the rows for validation (seeded so the split is repeatable).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the scaler on the training portion only, then apply the same
# transformation to both the training and validation features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Function to create datasets of different sizes
def create_dataset(X, y, size, random_state=None, noise_std=0.01):
    """Return a dataset with ``size`` samples built from ``X`` and ``y``.

    When ``size`` does not exceed ``len(X)`` the first ``size`` rows of
    ``X`` and ``y`` are returned unchanged (plain slicing, so the input
    types are preserved). Otherwise ``X`` and ``y`` are repeated with
    ``np.tile`` until at least ``size`` rows exist, Gaussian noise is added
    to every feature value, and the result is truncated to ``size`` rows;
    in that case both return values are NumPy arrays.

    Args:
        X: 2-D feature matrix (anything ``np.tile`` and slicing accept).
        y: Labels aligned with the rows of ``X``.
        size: Number of samples requested; must be non-negative.
        random_state: ``None`` draws the noise from NumPy's global random
            state (the historical behaviour). An int seeds a private
            ``np.random.RandomState`` so the augmentation is reproducible.
            An object exposing ``normal`` (e.g. a ``RandomState`` or
            ``Generator``) is used directly.
        noise_std: Standard deviation of the augmentation noise.

    Returns:
        Tuple ``(X_out, y_out)`` with ``size`` rows each.

    Raises:
        ValueError: If ``size`` is negative, or if augmentation is needed
            but ``X`` is empty.
    """
    if size < 0:
        # A negative size would silently slice from the end of the data.
        raise ValueError(f"size must be non-negative, got {size}")

    n_samples = len(X)
    if size <= n_samples:
        return X[:size], y[:size]

    if n_samples == 0:
        # Nothing to replicate; fail clearly instead of dividing by zero.
        raise ValueError("cannot build a non-empty dataset from empty input")

    if random_state is None:
        rng = np.random
    elif hasattr(random_state, "normal"):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Repeat the data enough times to cover the requested size.
    factor = int(np.ceil(size / n_samples))
    X_aug = np.tile(X, (factor, 1))
    y_aug = np.tile(y, factor)

    # Perturb every replicated row slightly; noise is drawn for the full
    # tiled array before truncating, matching the original draw order.
    noise = rng.normal(0, noise_std, X_aug.shape)
    X_aug = X_aug + noise
    return X_aug[:size], y_aug[:size]

# Configurations to test: each network architecture is paired with each
# training-set size, grouped by architecture. Every entry is
# (data size, hidden layer sizes, human-readable label).
_sizes = (1000, 10000, 100000)
_architectures = (
    ((4,), "1 hidden layer 4 nodes"),
    ((4, 4), "2 hidden layers of 4 nodes each"),
)
configs = [
    (n_rows, layers, label)
    for layers, label in _architectures
    for n_rows in _sizes
]

results = []

# Training subsets keyed by requested size. Building each size once means
# both architectures at a given size are trained on exactly the same rows
# (including the same augmentation noise), so their errors and timings are
# directly comparable, and the dataset is not regenerated needlessly.
datasets_by_size = {}

for data_size, hidden_layers, config_name in configs:
    # Create (or reuse) the training dataset of the specified size
    if data_size not in datasets_by_size:
        datasets_by_size[data_size] = create_dataset(X_train_scaled, y_train, data_size)
    X_train_subset, y_train_subset = datasets_by_size[data_size]

    # Create and train model
    model = MLPClassifier(
        hidden_layer_sizes=hidden_layers,
        activation='relu',
        solver='adam',
        max_iter=100,         # Cap the number of epochs to keep runs short
        random_state=42,
        early_stopping=True,  # Stop when the internal validation score plateaus
        n_iter_no_change=5,
        validation_fraction=0.1  # Share of the training subset held out internally
    )

    # Measure only the fit() call. perf_counter() is a monotonic,
    # high-resolution clock intended for timing intervals, unlike
    # time.time(), which follows the (adjustable) system wall clock.
    start_time = time.perf_counter()
    model.fit(X_train_subset, y_train_subset)
    execution_time = time.perf_counter() - start_time

    # Error = 1 - accuracy, on the training subset and on the held-out
    # validation split.
    train_error = 1 - model.score(X_train_subset, y_train_subset)
    val_error = 1 - model.score(X_val_scaled, y_val)

    results.append([data_size, config_name, f"{train_error:.4f}", f"{val_error:.4f}", f"{execution_time:.3f}"])

# Display results
headers = ["Data size", "Configuration", "Training error", "Validation error", "Time of execution"]
print(tabulate(results, headers=headers, tablefmt="grid"))

+-------------+---------------------------------+------------------+--------------------+---------------------+
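|   Data size | Configuration                   |   Training error |   Validation error |   Time of execution |
+=============+=================================+==================+====================+=====================+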
|        1000 | 1 hidden layer 4 nodes          |           0.238  |             0.2528 |               0.041 |
+-------------+---------------------------------+------------------+--------------------+---------------------+
|       10000 | 1 hidden layer 4 nodes          |           0.0111 |             0.012  |               0.688 |
+-------------+---------------------------------+------------------+--------------------+---------------------+
|      100000 | 1 hidden layer 4 nodes          |           0.0006 |             0.0006 |               4.475 |
+-------------+---------------------------------+------------------+--------------------+---------------------+
|        1000 | 2 hidden layers of 4 nodes each |           0.226  |             0.238  |               