<a href="https://colab.research.google.com/github/AlphaKermit-999/AI-Topics-learnings/blob/main/Advanced_AI/Copy_of_Distrubuted_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import tensorflow as tf
import numpy as np

# Seed Python, NumPy and TensorFlow up front so that weight initialisation
# and dataset shuffling are repeatable under Restart & Run All.
# NOTE: some GPU kernels may still introduce small run-to-run differences.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Create a distribution strategy to enable multi-GPU training.
# MirroredStrategy will copy the model to each GPU and combine gradients.
distribute_strategy = tf.distribute.MirroredStrategy()

# Print the number of devices being used by the strategy
print(f"Number of devices used: {distribute_strategy.num_replicas_in_sync}")


Number of devices used: 1


In [2]:
# Function to create a simple neural network model
def create_model():
    """Build the digit classifier used throughout this notebook.

    Architecture: 28x28 input -> Flatten -> Dense(128, relu) -> Dense(10, softmax).

    The input shape is declared with an explicit ``tf.keras.Input`` as the
    first entry instead of ``input_shape=`` on the ``Flatten`` layer. Newer
    Keras versions emit a warning for the ``input_shape=`` form and recommend
    an ``Input`` object for Sequential models.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    return tf.keras.Sequential([
        tf.keras.Input(shape=(28, 28)),                  # Declare the 28x28 image input
        tf.keras.layers.Flatten(),                       # Flatten the 28x28 input images
        tf.keras.layers.Dense(128, activation='relu'),   # Fully connected layer with ReLU activation
        tf.keras.layers.Dense(10, activation='softmax')  # Output layer with softmax activation for 10 classes
    ])


In [3]:
# Load and preprocess the MNIST dataset
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()

# Normalize pixel values to the range [0, 1].
# Cast to float32 first: dividing the integer image arrays directly by a
# Python float promotes them to float64, which doubles the memory held by the
# arrays (and by the datasets built from them) without any benefit, since the
# model computes in float32 anyway.
x_train = x_train.astype("float32") / 255.0
x_test = x_test.astype("float32") / 255.0

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# Number of examples per batch. The datasets below are batched *before* being
# handed to the strategy, so this is the global batch size.
BATCH_SIZE = 64

# Training pipeline: slice the arrays into examples, shuffle with a
# 10000-element buffer, then group into batches.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((x_train, y_train))
    .shuffle(10000)
    .batch(BATCH_SIZE)
)

# Evaluation pipeline: same slicing and batching, no shuffling.
test_dataset = (
    tf.data.Dataset.from_tensor_slices((x_test, y_test))
    .batch(BATCH_SIZE)
)



In [5]:
# Wrap both pipelines with the strategy so each batch is distributed across
# the devices it manages (train first, then test).
train_dist_dataset, test_dist_dataset = (
    distribute_strategy.experimental_distribute_dataset(dataset)
    for dataset in (train_dataset, test_dataset)
)



In [6]:
# Build the loss, optimizer, model and metrics inside the strategy scope.
with distribute_strategy.scope():
    # Reduction is NONE so the loss object returns unreduced values; the
    # averaging is done explicitly in compute_loss to ensure proper scaling
    # across replicas.
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        reduction=tf.keras.losses.Reduction.NONE
    )

    def compute_loss(labels, predictions):
        """Return the loss for one batch, averaged over BATCH_SIZE.

        Args:
            labels: Ground-truth class labels for the batch.
            predictions: Model outputs for the same batch.

        Returns:
            The result of ``tf.nn.compute_average_loss`` applied to the
            unreduced losses with ``global_batch_size=BATCH_SIZE``.
        """
        return tf.nn.compute_average_loss(
            loss_object(labels, predictions),
            global_batch_size=BATCH_SIZE,
        )

    # Optimizer used by train_step to apply gradients
    optimizer = tf.keras.optimizers.Adam()

    # The model is created within the scope of the strategy
    model = create_model()

    # Accuracy trackers for the training and testing passes
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')
    test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')



  super().__init__(**kwargs)


In [7]:
# Training step for a batch
@tf.function
def train_step(inputs):
    """Run one optimisation step on a single batch.

    Args:
        inputs: An ``(images, labels)`` pair for the batch.

    Returns:
        The loss computed by ``compute_loss`` for this batch.
    """
    batch_images, batch_labels = inputs

    # Record the forward pass so the loss can be differentiated afterwards.
    with tf.GradientTape() as grad_tape:
        batch_predictions = model(batch_images, training=True)
        batch_loss = compute_loss(batch_labels, batch_predictions)

    # Backward pass: differentiate w.r.t. the trainable weights and let the
    # optimizer apply the update.
    trainable_vars = model.trainable_variables
    grads = grad_tape.gradient(batch_loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

    # Fold this batch into the running training accuracy.
    train_accuracy.update_state(batch_labels, batch_predictions)

    return batch_loss

# Testing step for a batch
@tf.function
def test_step(inputs):
    """Evaluate the model on a single batch and update ``test_accuracy``.

    Args:
        inputs: An ``(images, labels)`` pair for the batch.

    Returns:
        None. The only effect is the metric update.
    """
    batch_images, batch_labels = inputs

    # Inference-mode forward pass, fed straight into the accuracy metric.
    test_accuracy.update_state(batch_labels, model(batch_images, training=False))

# Main training loop: for each epoch, train on every training batch, evaluate
# on every test batch, report the metrics, then clear them.
EPOCHS = 5
for epoch in range(EPOCHS):
    print(f"Starting epoch {epoch + 1}")
    total_loss = 0.0
    num_batches = 0  # remains 0 if the training dataset yields no batches

    # Training pass: run train_step through the strategy on each batch and
    # sum the per-replica losses into the running total for the epoch.
    for num_batches, batch in enumerate(train_dist_dataset, start=1):
        per_replica_loss = distribute_strategy.run(train_step, args=(batch,))
        total_loss += distribute_strategy.reduce(
            tf.distribute.ReduceOp.SUM,
            per_replica_loss,
            axis=None,
        )

    # Evaluation pass: only the accuracy metric is updated.
    for batch in test_dist_dataset:
        distribute_strategy.run(test_step, args=(batch,))

    # Report the mean per-batch loss and both accuracies for this epoch.
    print(f"Epoch {epoch + 1}, Loss: {total_loss / num_batches}, "
          f"Train Accuracy: {train_accuracy.result() * 100:.2f}%, "
          f"Test Accuracy: {test_accuracy.result() * 100:.2f}%")

    # Start the next epoch with clean metric state.
    for metric in (train_accuracy, test_accuracy):
        metric.reset_state()

print("Training complete!")


Starting epoch 1
Epoch 1, Loss: 0.2952558398246765, Train Accuracy: 91.83%, Test Accuracy: 95.45%
Starting epoch 2
Epoch 2, Loss: 0.13493984937667847, Train Accuracy: 96.08%, Test Accuracy: 96.76%
Starting epoch 3
Epoch 3, Loss: 0.09515645354986191, Train Accuracy: 97.16%, Test Accuracy: 97.00%
Starting epoch 4
Epoch 4, Loss: 0.07163714617490768, Train Accuracy: 97.90%, Test Accuracy: 97.18%
Starting epoch 5
Epoch 5, Loss: 0.05744912475347519, Train Accuracy: 98.23%, Test Accuracy: 96.99%
Training complete!
