# A2C Model for RLPaint

## Imports

In [9]:
# Full imports
import gym
import cv2

# Partial imports 
from tqdm.notebook import tqdm, trange

# Aliased imports
import tensorflow as tf
import tensorflow.keras as keras
import tensorflow_probability as tfp
import numpy as np
from typing import Any, List, Sequence, Tuple

## Bootstrap

In [4]:
# On Linux, remember to run the following before starting the notebook:
#   export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$CONDA_PREFIX/lib/"
# To silence the NUMA errors printed in the terminal, run:
#   for a in /sys/bus/pci/devices/*; do echo 0 | sudo tee -a $a/numa_node; done

# Check if we have a GPU: lists the physical GPU devices visible to TensorFlow
# (an empty list means none was found)
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

### Constants

In [7]:
# float32 machine epsilon (gap between 1.0 and the next representable
# float32), converted to a plain Python float.
# Defined so that divisions don't run into inf.
EPS = np.finfo(np.float32).eps.item()

# Use Huber loss for the critic, as it's less sensitive to outliers than MSE.
# Reduction.SUM adds the per-element losses up instead of averaging them.
HUBER_LOSS = tf.keras.losses.Huber(reduction=tf.keras.losses.Reduction.SUM)

# Patch size, read by Trainer._aux_np_patch.
# Must be odd (presumably so the patch has a single centre pixel).
PATCH_SIZE = (31, 31)

## Environment Interaction

In [13]:
class Trainer:
    def __init__(self, env: gym.Env, model: keras.Model, max_steps=300) -> None:
        # Make internal copy of params
        self._env = env
        self._model = model
        self._max_steps = max_steps

    # Define aux methods for vectorization
    def _aux_np_step(self, action: np.ndarray) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
        state, reward, done, _ = self._env.step(int(action))

        return (
            state["loc"],
            state["canvas"].astype(np.float32),
            np.array(reward, np.float32),
            np.array(done, np.int32)
        )
    
    def _aux_np_patch(self, canvas: np.ndarray, loc: np.ndarray, size: np.ndarray) -> np.ndarray:
        # Define square patch
        xmin = np.clip(int(loc[0] - np.floor(PATCH_SIZE[0] / 2)), 0, canvas.shape[0])
        xmax = np.clip(int(loc[0] + np.floor(PATCH_SIZE[0] / 2)), 0, canvas.shape[0])
        ymin = np.clip(int(loc[1] - np.floor(PATCH_SIZE[0] / 2)), 0, canvas.shape[1])
        ymax = np.clip(int(loc[1] + np.floor(PATCH_SIZE[0] / 2)), 0, canvas.shape[1])

        # Get patch
        img = canvas[xmin:xmax, ymin:ymax]
        padding = [
            # Top, bottom
            (0, PATCH_SIZE[1] - img.shape[1]),
            # Right, left
            (0, PATCH_SIZE[0] - img.shape[0])
        ]

        # Fill with 1s
        return np.pad(img, padding, mode="constant", constant_values=1)
    
    def _tf_step(self, action: tf.Tensor) -> List[tf.Tensor]:
        """Run ``_aux_np_step`` as a TensorFlow op.

        Args:
            action: Tensor holding the action to apply.

        Returns:
            A list of four tensors ``[loc, canvas, reward, done]`` with
            dtypes ``int32``, ``float32``, ``float32`` and ``int32``.
        """
        return tf.numpy_function(
            self._aux_np_step,
            [action],
            # Tout must match what _aux_np_step returns; `done` is built as
            # np.int32 there, so declaring it float32 fails at run time.
            [tf.int32, tf.float32, tf.float32, tf.int32],
        )
    
    def _tf_patch(self, canvas: tf.Tensor, loc: tf.Tensor, size: tf.Tensor) -> List[tf.Tensor]:
        """Run ``_aux_np_patch`` as a TensorFlow op.

        Args:
            canvas: Tensor to cut the patch from.
            loc: Tensor with the patch location.
            size: Forwarded unchanged to ``_aux_np_patch``.

        Returns:
            A one-element list holding the patch tensor, with the same dtype
            as ``canvas``.
        """
        return tf.numpy_function(
            self._aux_np_patch,
            [canvas, loc, size],
            # Tout takes dtypes, not the tf.Tensor class. _aux_np_patch only
            # slices and pads, so its output dtype equals the canvas dtype.
            [tf.as_dtype(canvas.dtype)],
        )

    def run_episode():
        log_probs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True, clear_after_read=False)
        values = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True, clear_after_read=False)
        rewards = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True, clear_after_read=False)

        # Init agent and get state


