In [1]:
import tensorflow as tf
import tensorrt

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
print("GPU available:", tf.config.list_physical_devices('GPU'))

2023-08-02 16:49:11.930825: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Num GPUs Available:  4
GPU available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]


In [2]:
import numpy as np
import gym
from gym import spaces
import random

class ConsumptionSavingsEnv(gym.Env):
    def __init__(self):
        super(ConsumptionSavingsEnv, self).__init__()

        # Environment parameters
        self.T = 65 
        self.r = 0.05
        self.risk_aversion = 2
        self.alpha = 0.5
        
        # Define observation space (state space)
        self.observation_space = spaces.Tuple((
            spaces.Discrete(self.T+1),        # Time step t, integer from 0 to 65
            spaces.Box(low=-1, high=10, shape=(1,))  # Wealth level a_t, a continuous value between -1 and 10
        ))
        
        # Define action space
        self.action_space = spaces.Discrete(220)  # Discrete actions from 0 to 219
        
        # Initialize the state at time t=0 with wealth a_t=0
        self.state = (0,0)

    def step(self, a_t1_index):
        # Unpack the current state (time t, wealth a_t)
        t, a_t = self.state
        
        # Convert the action index (0 to 219) to an actual action value between -1 and 1
        a_t1 = (a_t1_index - 20) / 20
        
        # Initialize the reward at the beginning of each step
        reward = 0
        
        # Calculate the time-dependent y_t value using the alpha parameter
        y_t = 1 / (1 + np.exp(-self.alpha * t)) if t <= 65 else 0
        
        # Calculate the maximum feasible wealth at time t+1, given the current wealth a_t
        max_feasible_a_t1 = (1 + self.r) * a_t + y_t 
        
        # Check if the action leads to an infeasible wealth value, and penalize if it does
        if a_t1 > max_feasible_a_t1:
            a_t1 = max_feasible_a_t1
            reward = -1e3
        
        # Calculate the consumption at time t
        c_t = (1+self.r)*a_t + y_t - a_t1
        
        # Calculate the reward based on the consumption and risk aversion parameters
        if c_t <= 0:
            reward = -1e3
        elif self.risk_aversion == 1:
            reward = np.log(c_t)
        else:
            reward = (c_t**(1 - self.risk_aversion)-1) / (1 - self.risk_aversion)

        # Check if the episode is done (time step reaches the final T=65)
        done = t == self.T
        # If the episode is done, add the final wealth to the reward
        if done:
            reward += a_t1

        # Update the state for the next time step (t+1) and return the updated state, reward, done flag, and an empty dictionary
        self.state = (t+1, a_t1)
        return np.array(self.state), reward, done, {}

    def reset(self):
        # Reset the state to the initial state (t=0, a_t=0) at the beginning of each episode
        self.state = (0, 0)
        return np.array(self.state)

    def render(self, mode='human'):
        # Print the current time step and wealth for visualization
        print(f"Time: {self.state[0]}, Wealth: {self.state[1]}")

In [3]:
from collections import deque
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import clone_model
import numpy as np
import random

class DQNAgent:
    def __init__(self, state_size, action_size, memory_size=1000, tau=0.001):
        self.state_size = state_size
        self.action_size = action_size

        # Experience replay memory to store past experiences
        self.memory = deque(maxlen=memory_size)

        # Recent scores deque to track the last 100 episode scores
        self.recent_scores = deque(maxlen=100)

        # Discount factor (gamma) for the Bellman equation
        self.gamma = np.exp(-0.04)

        # Exploration-exploitation trade-off (epsilon-greedy) parameters
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.95

        # Learning rate for the optimizer
        self.learning_rate = 0.01

        # Soft update rate for the target model
        self.tau = tau

        # Create the main and target neural network models
        self.model = self._build_model()
        self.target_model = clone_model(self.model)
        self.target_model.set_weights(self.model.get_weights())

    def _build_model(self):
        # Build a simple neural network model using Keras
        model = Sequential()
        model.add(Dense(5, input_dim=self.state_size, activation='relu'))
        model.add(Dense(5, activation='relu'))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done, score):
        # Store the experience tuple (state, action, reward, next_state, done) in the replay memory
        self.memory.append((state, action, reward, next_state, done))

        # Keep track of the recent episode scores
        self.recent_scores.append(score)

    def adapt_epsilon(self):
        # Adapt the epsilon value based on the standard deviation of recent scores
        if len(self.recent_scores) == 100:
            std_dev = np.std(self.recent_scores)
            if std_dev > 0.05:
                self.epsilon = min(self.epsilon * 1.01, 1.0)
            else:
                self.epsilon = max(self.epsilon * 0.99, self.epsilon_min)

    def act(self, state):
        # Epsilon-greedy policy: Explore (random action) or Exploit (best predicted action)
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        act_values = self.model.predict(state, verbose=0)
        # Choose randomly among the actions with the maximum Q-value
        max_value = np.max(act_values[0])
        max_indices = [i for i, v in enumerate(act_values[0]) if v == max_value]
        return np.random.choice(max_indices)

    def replay(self, batch_size):
        # Experience replay to learn from past experiences

        # Sample a mini-batch from the replay memory
        minibatch = random.sample(self.memory, batch_size)

        for state, action, reward, next_state, done in minibatch:
            target = self.target_model.predict(state, verbose=0)

            if done:
                # If the episode is done, the target value is just the immediate reward
                target[0][action] = reward
            else:
                # If the episode is not done, update the target value using the Bellman equation
                Q_future = max(self.target_model.predict(next_state, verbose=0)[0])
                target[0][action] = reward + Q_future * self.gamma

            # Perform one step of gradient descent to minimize the loss (MSE)
            self.model.fit(state, target, epochs=1, verbose=0)

        # Perform a soft update of the target model's weights using a fraction (tau) of the main model's weights
        self.soft_update(self.model, self.target_model, self.tau)

    def soft_update(self, model, target_model, tau):
        # Update the target model's weights softly based on tau

        model_weights = model.get_weights()
        target_weights = target_model.get_weights()

        for i in range(len(model_weights)):
            target_weights[i] = tau * model_weights[i] + (1 - tau) * target_weights[i]

        target_model.set_weights(target_weights)

    def load(self, name):
        # Load the model's weights from a file
        self.model.load_weights(name)

    def save(self, name):
        # Save the model's weights to a file
        self.model.save_weights(name)

In [4]:
import matplotlib.pyplot as plt

# Define some parameters
EPISODES = 4000
WARMUP_EPISODES = 1000
BATCH_SIZE = 23

# Instantiate environment and agent
env = ConsumptionSavingsEnv()
state_size = 2
action_size = env.action_space.n
agent = DQNAgent(state_size, action_size)

# Lists to store the best rewards and corresponding consumption and savings paths
best_lifetime_rewards = []
best_consumption_paths = []
best_savings_paths = []
episode_rewards = []
rewards_time = []

# Loop over episodes
for e in range(1, EPISODES+1):
    # Reset the environment and get the initial state
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    done = False
    score = 0
    consumption_path = []
    savings_path = []
    
    # Execute the episode until it's done
    while not done:
        # Select an action using the agent's epsilon-greedy policy
        action = agent.act(state)
        
        # Take the selected action and observe the next state, reward, and done flag
        next_state, reward, done, _ = env.step(action)
        next_state = np.reshape(next_state, [1, state_size])
        
        # Calculate consumption and savings for visualization
        consumption = (1+env.r)*state[0][1] + 1/(1 + np.exp(-env.alpha * state[0][0])) - next_state[0][1]
        savings = next_state[0][1]
        consumption_path.append(consumption)
        savings_path.append(savings)
        
        # Update the total score for the episode
        score += reward
        
        # If we are beyond the warm-up episodes, store the experience and perform learning
        if e > WARMUP_EPISODES:
            agent.remember(state, action, reward, next_state, done, score)
            if len(agent.memory) > BATCH_SIZE:
                agent.replay(BATCH_SIZE)
        
        # Move to the next state
        state = next_state

    # Record the total score for the episode
    episode_rewards.append(score)
    
    # Adapt the epsilon value based on recent scores after the warm-up period
    agent.adapt_epsilon()
    
    # Save reward storing over time
    savings_path.append(max(episode_rewards))
    
    # Print episode information every 100 episodes after warm-up
    if (e > WARMUP_EPISODES):
        print("Episode: {}, Total Reward: {}".format(e, score), end='', flush=True)
        

    # Save the best rewards and corresponding consumption and savings paths every 300 episodes after warm-up
    if (e > WARMUP_EPISODES) & (e % 300 == 0):
        print(f"episode: {e}/{EPISODES}, score: {score}")
        best_lifetime_rewards.append(max(episode_rewards))
        best_consumption_paths.append(consumption_path)
        best_savings_paths.append(savings_path)
        # Plot the consumption and savings paths for visualization
        plt.figure(figsize=(10, 6))
        plt.plot(best_consumption_paths[-1], label="Best consumption path")
        plt.plot(best_savings_paths[-1], label="Best savings path")
        plt.title(f"Training episode {e} - Reward {best_lifetime_rewards[-1]}")
        plt.xlabel('Time')
        plt.ylabel('Amount')
        plt.legend()
        plt.savefig(f"training_episode_{e}.png")
        plt.show()

2023-08-02 16:49:33.993364: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 14587 MB memory:  -> device: 0, name: Tesla V100-PCIE-16GB, pci bus id: 0001:00:00.0, compute capability: 7.0
2023-08-02 16:49:33.999884: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 14587 MB memory:  -> device: 1, name: Tesla V100-PCIE-16GB, pci bus id: 0002:00:00.0, compute capability: 7.0
2023-08-02 16:49:34.001198: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 14587 MB memory:  -> device: 2, name: Tesla V100-PCIE-16GB, pci bus id: 0003:00:00.0, compute capability: 7.0
2023-08-02 16:49:34.002498: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1635] Created device /job:localhost/replica:0/task:0/device:GPU:3 with 14587 MB memory:  -> device: 3, name: Tesla V100-PCIE-16GB, pci bus id

InternalError: Graph execution error:

Detected at node 'StatefulPartitionedCall_4' defined at (most recent call last):
    File "/anaconda/envs/tfgpu/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/anaconda/envs/tfgpu/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/traitlets/config/application.py", line 1043, in launch_instance
      app.start()
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 736, in start
      self.io_loop.start()
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/anaconda/envs/tfgpu/lib/python3.10/asyncio/base_events.py", line 603, in run_forever
      self._run_once()
    File "/anaconda/envs/tfgpu/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once
      handle._run()
    File "/anaconda/envs/tfgpu/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 516, in dispatch_queue
      await self.process_one()
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 505, in process_one
      await dispatch(*args)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 412, in dispatch_shell
      await result
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 740, in execute_request
      reply_content = await reply_content
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 546, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_6167/1254737597.py", line 53, in <module>
      agent.replay(BATCH_SIZE)
    File "/tmp/ipykernel_6167/4160052766.py", line 92, in replay
      self.model.fit(state, target, epochs=1, verbose=0)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 65, in error_handler
      return fn(*args, **kwargs)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/engine/training.py", line 1685, in fit
      tmp_logs = self.train_function(iterator)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/engine/training.py", line 1284, in train_function
      return step_function(self, iterator)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/engine/training.py", line 1268, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/engine/training.py", line 1249, in run_step
      outputs = model.train_step(data)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/engine/training.py", line 1054, in train_step
      self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/optimizers/optimizer.py", line 543, in minimize
      self.apply_gradients(grads_and_vars)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/optimizers/optimizer.py", line 1174, in apply_gradients
      return super().apply_gradients(grads_and_vars, name=name)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/optimizers/optimizer.py", line 650, in apply_gradients
      iteration = self._internal_apply_gradients(grads_and_vars)
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/optimizers/optimizer.py", line 1200, in _internal_apply_gradients
      return tf.__internal__.distribute.interim.maybe_merge_call(
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/optimizers/optimizer.py", line 1250, in _distributed_apply_gradients_fn
      distribution.extended.update(
    File "/anaconda/envs/tfgpu/lib/python3.10/site-packages/keras/optimizers/optimizer.py", line 1245, in apply_grad_to_update_var
      return self._update_step_xla(grad, var, id(self._var_key(var)))
Node: 'StatefulPartitionedCall_4'
RET_CHECK failure (tensorflow/compiler/xla/service/gpu/gpu_compiler.cc:618) dnn != nullptr 
	 [[{{node StatefulPartitionedCall_4}}]] [Op:__inference_train_function_1141]

In [None]:
#import matplotlib.pyplot as plt

#print(f"episode: {e}/{EPISODES}, score: {score}")
#best_lifetime_rewards.append(max(episode_rewards))
#best_consumption_paths.append(consumption_path)
#best_savings_paths.append(savings_path)
## plot the paths
#plt.figure(figsize=(10, 6))
#plt.plot(best_consumption_paths[-1], label="Best consumption path")
#plt.plot(best_savings_paths[-1], label="Best savings path")
#plt.title(f"Training episode {e} - Reward {best_lifetime_rewards[-1]}")
#plt.xlabel('Time')
#plt.ylabel('Amount')
#plt.legend()
#plt.savefig(f"training_episode_{e}.png")
#plt.show()