# Final Model - SAC

In [1]:
%load_ext autoreload
%autoreload 2

import tensorflow as tf
# Report which TensorFlow build this kernel picked up and how many GPUs it can see.
for label, value in (
    ("TensorFlow version:", tf.__version__),
    ("tf.keras available:", hasattr(tf, "keras")),
    ("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU'))),
):
    print(label, value)

TensorFlow version: 2.10.0
tf.keras available: True
Num GPUs Available:  1


In [None]:
from structure import DMC_structure
import numpy as np
import matplotlib.pyplot as plt
import os 

from DMC_Env import DMC_Env

import logging
from datetime import datetime

from sac import SoftActorCritic, Actor
from replay_buffer import ReplayBuffer

# Emit INFO-level (and above) records from the root logger.
logging.basicConfig(level=logging.INFO)

# Make Keras create layers and variables in single precision by default.
tf.keras.backend.set_floatx('float32')


The DMC chain is initialized below. Each entry has the form `[index, next, func, goal, input]`, where `input` is `(T, P, Keq)`.

In [3]:
# One row per DMC node, laid out as:
#   [index, next, func, goal, input (T, P, Keq)]
DMCarr = [
    [0, [0, 1], "DMC0", 400, [350, 5, 1]],
    [1, [2], "DMC1", 500, [350, 5, 1]],
    [2, [3, 5], "DMC2", 500, [350, 5, 1]],
    [3, [4], "DMC3", 500, [350, 5, 1]],
    [4, [5], "DMC4", 500, [350, 5, 1]],
    [5, [], "DMC5", 500, [350, 5, 1]],
    [6, [1, 6], "DMC6", 500, [350, 5, 1]],
    [7, [3, 7], "DMC7", 500, [350, 5, 1]],
    [8, [5, 8], "DMC8", 500, [350, 5, 1]],
    [9, [2, 3, 4, 9], "DMC9", 500, [350, 5, 1]],
]
# Disabled override kept from an earlier experiment:
# DMCarr[2] = [2, [], "Dummy", 0, [0, 0, 0]]

# Echo the table, then hand it to DMC_structure.
print("DMC array:", DMCarr)
struct = DMC_structure(DMCarr)

DMC array: [[0, [0, 1], 'DMC0', 400, [350, 5, 1]], [1, [2], 'DMC1', 500, [350, 5, 1]], [2, [3, 5], 'DMC2', 500, [350, 5, 1]], [3, [4], 'DMC3', 500, [350, 5, 1]], [4, [5], 'DMC4', 500, [350, 5, 1]], [5, [], 'DMC5', 500, [350, 5, 1]], [6, [1, 6], 'DMC6', 500, [350, 5, 1]], [7, [3, 7], 'DMC7', 500, [350, 5, 1]], [8, [5, 8], 'DMC8', 500, [350, 5, 1]], [9, [2, 3, 4, 9], 'DMC9', 500, [350, 5, 1]]]


## Environment Setup

In [4]:
# Run configuration read by the setup and training cells.
args = {
    'seed': 42,
    'render': False,          # call env.render() on every step when True
    'verbose': False,         # per-step / per-epoch logging when True
    'batch_size': 128,        # transitions fetched from the replay buffer per update
    'epochs': 50,             # gradient updates performed after each episode
    'start_steps': 0,         # global steps that use random actions before the policy is used
    'model_path': '../data/models/',
    # Colon-free timestamp taken from a single clock read: colons are not valid
    # in file names on every platform, and two separate utcnow() calls could
    # disagree across a midnight rollover. Same format as the per-episode
    # checkpoint names written by the training loop.
    'model_name': datetime.utcnow().strftime("%Y-%m-%d-%H-%M-%S"),
    'gamma': 0.99,
    'polyak': 0.995,
    'learning_rate': 0.001,
}



In [5]:
# Define DMC environment setup
env = DMC_Env(DMCarr)

state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]

replay = ReplayBuffer(state_space, action_space)

log_dir = args['model_path'] + '/logs/' + datetime.utcnow().strftime("%Y%m%d-%H%M%S")
writer = tf.summary.create_file_writer(log_dir)

sac = SoftActorCritic(action_space, writer,
                      learning_rate=args['learning_rate'],
                      gamma=args['gamma'],
                      polyak=args['polyak'])



In [6]:
n = 10  # number of episodes the training loop runs (used as range(n))


In [7]:
# Bookkeeping that persists across episodes.
episode_rewards = []            # total reward of every finished episode
global_step = 0                 # environment steps taken across all episodes
episode = 0                     # number of completed episodes
prev_avg_episode_reward = None  # running average after the previous episode

# Checkpoint interval in episodes. Defaults to 1 (save after every episode,
# which is what this loop has always done); set args['save_every'] to a larger
# value to save less often. Clamped to >= 1 so the modulo below cannot divide
# by zero.
save_every = max(1, int(args.get('save_every', 1)))

# Run for n episodes
for _ in range(n):
    current_state = env.reset()
    step = 1
    episode_reward = 0
    done = False

    while not done:
        if args['render']:
            env.render()

        # Choose action: use fully random action during exploration phase.
        if global_step < args['start_steps']:
            action = env.action_space.sample()
        else:
            action = sac.sample_action(current_state)
            # In case the action is still a scalar, force it into an array.
            if np.isscalar(action):
                action = np.array([action])

        # Step in the environment. The fourth return value is unused; it gets
        # its own name so it does not rebind the outer loop variable `_`.
        next_state, reward, done, _info = env.step(action)
        episode_reward += reward
        # `end` is 0 on the terminal transition and 1 otherwise.
        end = 0 if done else 1

        # Optional logging per step.
        if args.get('verbose', False):
            logging.info(f"Global step: {global_step}")
            logging.info(f"Current state: {current_state}")
            logging.info(f"Action: {action}")
            logging.info(f"Reward: {reward}")
            logging.info(f"Next state: {next_state}")
            logging.info(f"End flag: {end}")

        # Store the transition in replay buffer.
        replay.store(current_state, action, reward, next_state, end)

        current_state = next_state
        step += 1
        global_step += 1

    # Training: only if enough samples are available and the exploration phase is over.
    if replay.total_size > args['batch_size'] and global_step > args['start_steps']:
        for epoch in range(args['epochs']):
            current_states, actions, rewards, next_states, ends = replay.fetch_sample(num_samples=args['batch_size'])
            critic1_loss, critic2_loss, actor_loss, alpha_loss = sac.train(
                current_states, actions, rewards, next_states, ends
            )

            if args.get('verbose', False):
                print(f"Episode {episode}, Global step {global_step}, Epoch {epoch}:",
                      critic1_loss.numpy(), critic2_loss.numpy(),
                      actor_loss.numpy(), f"Episode Reward: {episode_reward}")

            # Increase the training epoch step and update target networks each epoch.
            sac.epoch_step += 1
            sac.update_weights()  # Now uses assign() inside the SAC module.

    # Save the policy weights every `save_every` episodes (every episode by default).
    if episode % save_every == 0:
        # Generate a safe timestamp without invalid characters (no colons)
        safe_timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        # Recorded in args so the most recent checkpoint name stays available
        # to later cells.
        args['model_name'] = safe_timestamp

        # Define the full directory where the model will be saved
        model_dir = args['model_path'] + args['model_name']

        # Create the directory if it doesn't exist (no separate exists() check needed).
        os.makedirs(model_dir, exist_ok=True)

        # Save model weights using the corrected file path
        sac.policy.save_weights(model_dir + '/model.weights.h5')

    # Update reward history and compute the average over the last 100 episodes.
    episode_rewards.append(episode_reward)
    episode += 1
    recent_rewards = episode_rewards[-100:]
    avg_episode_reward = sum(recent_rewards) / len(recent_rewards)

    # Print the reward and average.
    print(f"Episode {episode} reward: {episode_reward}")
    print(f"Episode {episode} Average episode reward: {avg_episode_reward}")

    # Calculate and print the change in average reward compared to the previous episode.
    if prev_avg_episode_reward is not None:
        change = avg_episode_reward - prev_avg_episode_reward
        print(f"Change in average reward: {change}")
    prev_avg_episode_reward = avg_episode_reward


Episode 1 reward: -597.085516909237
Episode 1 Average episode reward: -597.085516909237
Episode 2 reward: -393.5479818963556
Episode 2 Average episode reward: -495.3167494027963
Change in average reward: 101.76876750644067
Episode 3 reward: -448.81738717805666
Episode 3 Average episode reward: -479.81696199454973
Change in average reward: 15.499787408246561
Episode 4 reward: -369.0162820462625
Episode 4 Average episode reward: -452.1167920074779
Change in average reward: 27.70016998707183
Episode 5 reward: -339.9266594638846
Episode 5 Average episode reward: -429.6787654987593
Change in average reward: 22.43802650871862
Episode 6 reward: -432.84519436106046
Episode 6 Average episode reward: -430.2065036424761
Change in average reward: -0.5277381437168174
Episode 7 reward: -444.0187585979946
Episode 7 Average episode reward: -432.17968292183593
Change in average reward: -1.9731792793598402
Episode 8 reward: -428.936617018208
Episode 8 Average episode reward: -431.7742996838824
Change in

In [None]:
# after training: write the final policy weights next to the run's checkpoints.
save_dir = os.path.join(args['model_path'], args['model_name'])
os.makedirs(save_dir, exist_ok=True)

# The training loop saves `sac.policy`, while this cell originally used
# `sac.actor`. Prefer `actor` when the SAC object exposes it and otherwise fall
# back to `policy`, so the cell works with either attribute name.
# NOTE(review): confirm which attribute sac.py actually defines and drop the other.
final_policy = sac.actor if hasattr(sac, "actor") else sac.policy
final_policy.save_weights(os.path.join(save_dir, "model"))  # ← no extension


## To Do

- how do I actually extract the final correct policy?

Done:
- running on GPU, but push to ICE in worst case. Every loop now takes just 30 seconds.
- Reward function is updated, with minimal performance improvements. It's still absurdly negative - which indicates that the actions picked and initial conditions are terrible.
- testing script shows that the action space is correctly defined - then why is reward so bad? 

SAC3:
- now fixed the scaling in the action space tanh
- fixed the scaling in the reward function to not explode
- The reason we report both an average reward and an episode reward is that we keep a running average to track how we are doing over time. We do see an update, and we also see convergence to a stable solution within 10 episodes.
- We should figure out a way to print the final model and that's it - recommend a stateless bandit.

Discussion:
- well.. it does converge even if it's terrible. 
- stateless bandit as a "testing version"
- it's in the DMCs.. actually aligned with baseline