In [0]:
%load_ext autoreload
%autoreload 2
# Enables autoreload; learn more at https://docs.databricks.com/en/files/workspace-modules.html#autoreload-for-python-modules
# To disable autoreload, run %autoreload 0

import sys

# Workspace location that is put on sys.path before importing scaleml.
SCALEML_PATH = '/Workspace/Users/bjedelma@gmail.com/ScaleML/scaleml'

# Notebook cells get re-run often; only add the entry once so sys.path does
# not accumulate duplicates of the same directory.
if SCALEML_PATH not in sys.path:
    sys.path.append(SCALEML_PATH)
import scaleml

In [0]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist
import time
from datetime import datetime

from scaleml import dynamic_train, log_usage_once, adjust_workers, adjust_batch_size, log_usage_plot, scaleml_folders, dynamic_train, create_model_tf, sys_resources

# Pull the MNIST train/test splits through the Keras dataset helper.
(train_images, train_labels), (test_images, test_labels) = mnist.load_data()

# Reshape each split to (N, 28, 28, 1), cast to float32 and divide by 255.
train_images = train_images.reshape((-1, 28, 28, 1)).astype('float32') / 255.0
test_images = test_images.reshape((-1, 28, 28, 1)).astype('float32') / 255.0

# Training images and labels bundled in one dict, the form passed to dynamic_train.
train_dataset = dict(images=train_images, labels=train_labels)


In [0]:
# Set up the scaleml output folders used for saving and report where they live.
scaleml_folder = scaleml_folders()
print(f"Scaleml folder and subfolders are set up at: {scaleml_folder}")
print()

# Resource usage during training is written to a timestamped CSV under logs/,
# so each run of this cell points at its own file.
log_dir = f"{scaleml_folder}/logs/"
# log_dir already ends with "/", so the file name is appended directly; adding
# another separator here used to yield ".../logs//<timestamp>_...".
log_file = f"{log_dir}{datetime.now().strftime('%Y%m%d_%H%M%S')}_resource_usage_log.csv"
print(log_file)

In [0]:
# Run the packaged training routine, passing log_file explicitly as the
# resource-log path instead of relying on a hard-coded file name.
dynamic_train(
    train_dataset,
    epochs=20,
    base_batch_size=32,
    log_file=log_file,
    dynamic_adjustments=True,
)

In [0]:
# Compare a short training run with dynamic adjustments ON against one with
# them OFF. The previous version called test_dynamic_training_mirrored, which
# is neither defined nor imported in this notebook (NameError on a fresh
# kernel), and plotted "resource_log.csv", a file nothing here writes.
# dynamic_train is the imported entry point that takes the same switch.
COMPARISON_EPOCHS = 5

# One log file per run so the two plots are not mixed together.
# NOTE(review): assumes dynamic_train records its usage rows in log_file, as
# the keyword name suggests -- confirm against the scaleml implementation.
comparison_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
comparison_log_on = f"{log_dir}{comparison_stamp}_dynamic_on_resource_usage_log.csv"
comparison_log_off = f"{log_dir}{comparison_stamp}_dynamic_off_resource_usage_log.csv"

# Test with dynamic adjustments on
print("Testing with dynamic adjustments ON:")
dynamic_train(
    train_dataset,
    epochs=COMPARISON_EPOCHS,
    base_batch_size=32,
    log_file=comparison_log_on,
    dynamic_adjustments=True,
)

# Test with dynamic adjustments off (for comparison)
print("\nTesting with dynamic adjustments OFF:")
dynamic_train(
    train_dataset,
    epochs=COMPARISON_EPOCHS,
    base_batch_size=32,
    log_file=comparison_log_off,
    dynamic_adjustments=False,
)

# Visualize resource usage of both runs using log_usage_plot
log_usage_plot(comparison_log_on)
log_usage_plot(comparison_log_off)


In [0]:
# Manual version of the training loop: train one model for `epochs` epochs,
# re-evaluating worker count and batch size from system load before each epoch
# and logging resource usage after it.
base_batch_size = 32
epochs = 20

# Baseline row before any training (logged as epoch 0; default workers are 2 for logging).
log_usage_once(log_file, batch_size=base_batch_size, num_workers=2, num_epoch=0)

# Build the distribution strategy and the model ONCE, outside the epoch loop.
# Creating them inside the loop replaced the model with a new one every
# iteration, so no training carried over from one epoch to the next.
strategy = tf.distribute.MirroredStrategy()
train_images, train_labels = train_dataset['images'], train_dataset['labels']
input_shape = train_images[0].shape
with strategy.scope():
    model = create_model_tf(input_shape=input_shape)

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Adjust resources dynamically based on current system usage.
    cpu_percent, gpu_memory_usage, gpu_memory_total, gpu_percent = sys_resources()
    num_workers = adjust_workers(cpu_threshold=80, gpu_threshold=80)
    batch_size = adjust_batch_size(cpu_percent, gpu_percent, base_batch_size)

    # Train for exactly one epoch with the adjusted batch size. The loop used
    # to call fit twice per epoch, both times with the fixed base batch size,
    # so the adjusted value was logged but never applied.
    with strategy.scope():
        model.fit(train_images, train_labels, batch_size=batch_size, epochs=1)

    # Log resource usage for the epoch just completed. Epochs are numbered
    # from 1 here because the baseline row above already uses num_epoch=0.
    log_usage_once(log_file, batch_size, num_workers, num_epoch=epoch + 1)
