In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.datasets import mnist
# Note: `mnist.load_data` is provided by the TensorFlow/Keras library, not by scikit-learn.

In [2]:
# Load the MNIST handwritten-digit dataset through Keras. The call returns two
# (images, labels) pairs: the training split and the test split.
(x_train, y_train), (x_test, y_test) = mnist.load_data()
# About MNIST:
#   - A widely used collection of handwritten digits (0-9), commonly used for
#     training and testing machine learning models, especially in computer
#     vision and deep learning.
#   - It contains 70,000 grayscale images of 28x28 pixels each, split into
#     60,000 training images and 10,000 test images.
# Arrays produced by the line above:
#   - x_train: 60,000 training images, shape (60000, 28, 28)
#   - y_train: 60,000 training labels, shape (60000,)
#   - x_test:  10,000 test images,     shape (10000, 28, 28)
#   - y_test:  10,000 test labels,     shape (10000,)

In [None]:
# Inspect the raw training images before any preprocessing.
print("Original shape:", x_train.shape)
print("Original value range:", x_train.min(), "-", x_train.max())

# Normalize: convert to float32 and divide by 255 so that the printed value
# range becomes 0.0 - 1.0. The same scaling is applied to train and test images.
x_train_normalized = x_train.astype(np.float32) / 255
x_test_normalized = x_test.astype(np.float32) / 255
print("Normalized value range:", x_train_normalized.min(), "-", x_train_normalized.max())

# Reshape for a dense neural network.
#   - The images arrive as (60000, 28, 28): 60,000 images of 28x28 pixels.
#   - A dense (fully connected) layer expects one flat vector per sample, not a
#     2D image, so each 28x28 grid is laid out as a single row of
#     28 * 28 = 784 pixel values.
#   - The -1 is a placeholder that tells NumPy "work out this dimension so that
#     everything fits"; with 60,000 training images it resolves to 60,000.
x_train_reshaped = x_train_normalized.reshape((-1, 784))
x_test_reshaped = x_test_normalized.reshape((-1, 784))
print("Reshaped for dense layer:", x_train_reshaped.shape)

Original shape: (60000, 28, 28)
Original value range: 0 - 255
Normalized value range: 0.0 - 1.0
Reshaped for dense layer: (60000, 784)


In [4]:
# Split the normalized, flattened training images into training / validation /
# test subsets of 60% / 20% / 20%. Getting two 20% hold-out sets requires two
# successive calls to train_test_split.

from sklearn.model_selection import train_test_split

# This cell rebinds x_train / y_train (and x_test / y_test) to the new subsets.
# Running it a second time would therefore pair the full image array with an
# already-shrunken label array; fail early with an actionable message instead
# of an opaque error from scikit-learn.
if len(x_train_reshaped) != len(y_train):
    raise ValueError(
        f"x_train_reshaped has {len(x_train_reshaped)} samples but y_train has "
        f"{len(y_train)}; re-run the data-loading and normalization cells "
        "before repeating this split."
    )

# Denominator for the percentage report below (size of the data being split).
n_total = len(x_train_reshaped)

# First split: hold out 20% as the test set.
# stratify= keeps the proportion of each digit class the same in both parts.
# NOTE: x_test / y_test now refer to this hold-out taken from the training
# images; the official MNIST test split is no longer bound to these names.
x_temp, x_test, y_temp, y_test = train_test_split(
    x_train_reshaped, y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Second split: divide the remaining 80% into training (75%) and validation (25%).
#   75% of 80% = 60% of the original data
#   25% of 80% = 20% of the original data
x_train, x_val, y_train, y_val = train_test_split(
    x_temp, y_temp,
    test_size=0.25,
    random_state=42,
    stratify=y_temp,
)

# Verify the resulting sizes.
print(f"Training set: {len(x_train)} samples ({len(x_train)/n_total*100:.1f}%)")
print(f"Validation set: {len(x_val)} samples ({len(x_val)/n_total*100:.1f}%)")
print(f"Test set: {len(x_test)} samples ({len(x_test)/n_total*100:.1f}%)")

Training set: 36000 samples (60.0%)
Validation set: 12000 samples (20.0%)
Test set: 12000 samples (20.0%)


In [None]:

from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential

# Model selection: train one dense network per candidate architecture and
# compare them by their validation loss after the final epoch.
models = []      # trained models, in the same order as `architectures`
cv_errors = []   # final-epoch validation loss of each trained model

# Candidate architectures to try.
# Each inner list gives the number of units in each hidden layer, first to last.
architectures = [
    [32],                  # single hidden layer with 32 units
    [64, 32],              # two hidden layers: 64 units, then 32 units
    [128, 64, 32],         # three hidden layers: 128, 64, then 32 units
    [256, 128],            # two hidden layers: 256 units, then 128 units
    [512, 256, 128],       # three hidden layers: 512, 256, then 128 units
    [64, 64, 64],          # three hidden layers with 64 units each
    [256],                 # single hidden layer with 256 units
    [128, 64],             # two hidden layers: 128 units, then 64 units
    [512, 256, 128, 64],   # four hidden layers: 512, 256, 128, then 64 units
    [1024, 512],           # two hidden layers: 1024 units, then 512 units
]

for i, hidden_units in enumerate(architectures):
    print(f"\nTraining Model {i+1} with architecture: {hidden_units}")

    # Re-seed Python, NumPy and TensorFlow before building each candidate so
    # that every architecture starts from the same random state and the
    # comparison can be repeated.
    tf.keras.utils.set_random_seed(42)

    model = Sequential()

    # Declare the input explicitly. 784 is the number of input *features* per
    # sample (one flattened 28x28 image), not a number of units: the input is
    # not a Dense layer of its own. Using an Input object instead of passing
    # `input_shape=` to the first Dense layer avoids the Keras warning that the
    # latter form emits.
    model.add(tf.keras.Input(shape=(784,)))

    # Hidden layers. The first one is fully connected to all 784 input
    # features, so it learns directly from the raw pixels; each later layer
    # learns from the outputs of the layer before it.
    # Example: architecture [32] is input (784) -> hidden (32) -> output (10).
    for units in hidden_units:
        model.add(Dense(units, activation='relu'))

    # Output layer (always 10 units for MNIST, one per digit class).
    model.add(Dense(10, activation='softmax'))

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy')

    # Train the model.
    #   - validation_data works like a practice test: the model does not learn
    #     from it, it is only used to check performance after each epoch.
    #     Doing well on training data but poorly on validation data is a sign
    #     of overfitting (memorizing instead of learning).
    #   - verbose=1 shows a progress bar with the loss values for each epoch.
    history = model.fit(x_train, y_train,
                        validation_data=(x_val, y_val),
                        epochs=5,
                        verbose=1)

    # history.history is a dictionary of per-epoch metric lists;
    # history.history['val_loss'] holds one validation loss per epoch and the
    # -1 index takes the value from the last epoch.
    cv_error = history.history['val_loss'][-1]
    cv_errors.append(cv_error)
    models.append(model)

# Select and report the best model. The emptiness check comes first because
# np.argmin raises on an empty sequence.
if len(cv_errors) > 0:
    # np.argmin returns the index of the smallest value, i.e. the model with
    # the lowest validation loss.
    best_model_index = np.argmin(cv_errors)
    best_model = models[best_model_index]

    print("\nResults:")
    print(f"Best model was architecture: {architectures[best_model_index]}")
    print(f"Best validation error: {cv_errors[best_model_index]:.4f}")

    # Bar chart of the final validation loss of every candidate.
    plt.figure(figsize=(12, 6))
    plt.bar(range(len(cv_errors)), cv_errors)
    plt.xlabel('Model Architecture')
    plt.ylabel('Validation Error')
    plt.title('Model Selection: Validation Error vs Architecture')
    plt.xticks(range(len(cv_errors)),
               [f"Model {i+1}" for i in range(len(cv_errors))],
               rotation=45)
    plt.grid(True)
    plt.show()
else:
    print("cv_errors is empty!")


Training Model 1 with architecture: [32]


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - loss: 0.7043 - val_loss: 0.2599
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 6ms/step - loss: 0.2489 - val_loss: 0.2168
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - loss: 0.1870 - val_loss: 0.1907
Epoch 4/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 0.1654 - val_loss: 0.1753
Epoch 5/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5ms/step - loss: 0.1428 - val_loss: 0.1716

Training Model 2 with architecture: [64, 32]
Epoch 1/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 5ms/step - loss: 0.6321 - val_loss: 0.1987
Epoch 2/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.1765 - val_loss: 0.1630
Epoch 3/5
[1m1125/1125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - loss: 0.1250 - val

In [None]:
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l1

# Seed Python, NumPy and TensorFlow together. Seeding TensorFlow alone
# (tf.random.set_seed) does not cover the Python-level random source that
# Keras 3 draws default initializer seeds from, so the initial weights would
# still differ from run to run.
tf.keras.utils.set_random_seed(42)

# Simpler model that uses only L1 regularization on every Dense kernel.
model = Sequential([
    tf.keras.Input(shape=(784,)),
    # First hidden layer, L1 penalty only
    Dense(128, activation='relu',
          kernel_regularizer=l1(0.001)),
    # Second hidden layer
    Dense(64, activation='relu',
          kernel_regularizer=l1(0.001)),
    # Output layer: linear activation, i.e. the model emits raw logits.
    # The loss below is configured with from_logits=True to match.
    Dense(10, activation='linear',
          kernel_regularizer=l1(0.001))
])

# Compile with a lower learning rate (1e-4).
model.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
)

# Train for the full 35 epochs (no early stopping).
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=35,
    batch_size=32,
    verbose=1
)

In [None]:
# Plot training history: training vs. validation loss per epoch.
# A single axes is used so the curve fills the whole figure; reserving a
# 1x2 subplot grid for one panel would leave half of the figure blank.
fig, ax = plt.subplots(figsize=(12, 4))

ax.plot(history.history['loss'], label='Training Loss')
ax.plot(history.history['val_loss'], label='Validation Loss')
ax.set_title('Model Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()

fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Score every image in the held-out test set.
# Despite the variable name, these are only probabilities when the model's
# last layer is a softmax. The L1-regularized model defined earlier in this
# notebook ends in a linear layer, so its outputs are raw logits. Taking the
# argmax picks the same class in both cases.
y_pred_prob = model.predict(x_test)
y_pred = np.argmax(y_pred_prob, axis=1)  # highest-scoring class per sample

# Confusion matrix: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

# Plot using seaborn
plt.figure(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.show()

# Example predictions on the first few test images. The scores for these
# images were already computed above, so reuse them instead of running a
# second inference pass.
n_samples = 5
x_samples = x_test[:n_samples]
predictions = y_pred_prob[:n_samples]
predicted_digits = np.argmax(predictions, axis=1)

# Visualize the predictions: each flattened sample is reshaped back to 28x28
# for display.
plt.figure(figsize=(15, 3))
for i in range(n_samples):
    plt.subplot(1, n_samples, i+1)
    plt.imshow(x_samples[i].reshape(28, 28), cmap='gray')
    plt.title(f'Predicted: {predicted_digits[i]}')
    plt.axis('off')
plt.tight_layout()
plt.show()


