# Miniproject 2 - DDPG

In [None]:
import gym
import numpy as np
import torch 

import torch.nn as nn
import torch.optim as optim

import sys

from helpers import *

## Heuristic Policy

In [None]:
class HeuristicPendulumAgent:
    """Hand-written pendulum policy that always applies a torque of fixed size.

    Only the sign of the torque changes: it follows the sign of the third
    state component when the first component is negative, and opposes it
    otherwise.
    """

    def __init__(self, env, torque):
        """Store the environment, its state/action sizes and the fixed torque."""
        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.env = env
        self.torque = torque

    def compute_action(self, state):
        """Return the signed torque for a three-component `state`.

        NOTE(review): presumably state is (cos(theta), sin(theta), angular
        velocity) as produced by Pendulum-v1 -- confirm against the env.
        """
        x, _, v = state
        spin = np.sign(v)

        if x < 0:
            # push in the direction the pendulum is already moving
            return spin*self.torque
        # otherwise push against the current motion
        return (-1)*spin*self.torque

### Training with Random Agent

In [None]:
# Set up: wrap the Pendulum environment and build the random baseline agent.
env = NormalizedEnv(gym.make("Pendulum-v1"))
# The agent is built from the environment created on the line above; no
# `norm_env` name is defined anywhere in this notebook.
rand_ag = RandomAgent(env)

In [None]:
# Roll out the random agent for `num_ep` episodes and report the cumulative
# reward of each episode and their mean.
arr_reward = []
num_ep = 10

# The episode count and the divisor of the average below both come from
# `num_ep`, so changing it keeps the reported mean correct.
for episode in range(num_ep):
    state, info = env.reset()
    tot_reward = 0
    truncated = False

    # The loop relies on the environment setting `truncated` to end an episode.
    while not truncated:
        action = rand_ag.compute_action(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        tot_reward += reward
        state = next_state

    arr_reward.append(tot_reward)

print("array of reward :", arr_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

### Training with Heuristic Agent

In [None]:
# Torque selection
# Draw one random action from the action space and pass it through the
# wrapper's `action` method; the printed value is the fixed torque handed to
# the heuristic agent in the next cell.
# NOTE(review): presumably `env.action` rescales a normalized action to the
# environment's torque range -- confirm against NormalizedEnv in helpers.
sample = env.action_space.sample()
torque = env.action(sample)
print("torque : ", torque)

In [None]:
# Roll out the heuristic agent for `num_ep` episodes and report the mean
# cumulative reward.
heur_ag = HeuristicPendulumAgent(env, torque)
arr_reward = []
num_ep = 10

# The episode count and the divisor of the average below both come from
# `num_ep`, so changing it keeps the reported mean correct.
for episode in range(num_ep):
    state, info = env.reset()
    tot_reward = 0
    truncated = False

    # The loop relies on the environment setting `truncated` to end an episode.
    while not truncated:
        action = heur_ag.compute_action(state)
        next_state, reward, terminated, truncated, info = env.step(action)
        tot_reward += reward
        state = next_state

    arr_reward.append(tot_reward)

av_reward = sum(arr_reward)/num_ep
print("average cumulative reward :", av_reward)

## Q Network with Heuristic Agent

### Replay Buffer

In [None]:
import random
from collections import namedtuple, deque

# One environment step as stored in the replay buffer.
Transition = namedtuple('Transition',
                        ('state', 'action', 'reward', 'next_state', 'trunc'))


class ReplayBuffer:
    """Fixed-capacity FIFO store of `Transition` records.

    Once `max_size` transitions are held, adding another one drops the oldest.
    A capacity of 0 yields a buffer that silently stays empty.
    """

    def __init__(self, max_size):
        """Create an empty buffer holding at most `max_size` transitions."""
        self.max_size = max_size
        # A bounded deque evicts from the left by itself on overflow, so `add`
        # needs no manual length check.
        self.buffer = deque(maxlen=int(max_size))

    def __len__(self):
        """Return the number of transitions currently stored."""
        return len(self.buffer)

    def add(self, *args):
        """Append one transition; `args` are the `Transition` fields in order."""
        self.buffer.append(Transition(*args))

    def sample(self, batch_size):
        """Return `batch_size` distinct stored transitions chosen at random.

        Raises:
            ValueError: if `batch_size` exceeds the number of stored items
                (raised by `random.sample`).
        """
        return random.sample(self.buffer, batch_size)

### Q Network and update function

In [None]:
class QNetwork(nn.Module):
    """Critic network: a ReLU MLP with two hidden layers over the joined (state, action) input."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the three linear layers; `input_size` is the width of state and action combined."""
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, output_size)
        self.relu = nn.ReLU()

    def forward(self, states, action):
        """Concatenate `states` and `action` along dim 1 and return the network output."""
        joint = torch.cat([states, action], 1)
        hidden = self.relu(self.fc1(joint))
        hidden = self.relu(self.fc2(hidden))
        # the last layer is linear: no activation on the output
        return self.fc3(hidden)
    
def update(batch, critic, criterion, agent, optimizer, gamma):
    """Run one one-step TD regression update on the critic and return the loss.

    The target for each transition is
    `reward + (1 - trunc) * gamma * Q(next_state, agent(next_state))`, where
    the next action comes from the fixed policy `agent`.

    Args:
        batch: object with `state`, `action`, `reward`, `next_state` and
            `trunc` attributes, each a sequence with one entry per transition
            (array-likes for states/actions, scalars for reward/trunc).
        critic: callable module mapping (states, actions) to Q-values.
        criterion: loss applied to (predicted Q, target).
        agent: policy exposing `compute_action(state)`; called once per next state.
        optimizer: optimizer over the critic's parameters.
        gamma: discount factor.

    Returns:
        The critic loss before the optimizer step, as a Python float.
    """
    def as_float_tensor(values):
        # Stack through NumPy first: building a tensor straight from a tuple
        # of arrays goes through a slow element-wise path.
        return torch.as_tensor(np.asarray(values, dtype=np.float32))

    batch_len = len(batch.state)

    state_batch = as_float_tensor(batch.state)
    # Force a (batch, action_dim) layout so scalar actions also concatenate.
    action_batch = as_float_tensor(batch.action).reshape(batch_len, -1)
    reward_batch = as_float_tensor(batch.reward).unsqueeze(1)
    trunc_batch = as_float_tensor(batch.trunc).unsqueeze(1)
    next_state_batch = as_float_tensor(batch.next_state)

    # The policy is queried one state at a time and receives a plain list.
    next_actions = [
        np.asarray(agent.compute_action(np.asarray(next_state).tolist()),
                   dtype=np.float32).reshape(-1)
        for next_state in batch.next_state
    ]
    next_action_batch = as_float_tensor(next_actions)

    # The bootstrap value is a constant for the regression, so the critic's
    # forward pass on the next states needs no autograd graph.
    with torch.no_grad():
        q_next = critic(next_state_batch, next_action_batch)
        targets = reward_batch + (1.0 - trunc_batch) * gamma * q_next

    # critic update
    optimizer.zero_grad()
    q_val = critic(state_batch, action_batch)
    critic_loss = criterion(q_val, targets)
    critic_loss.backward()
    optimizer.step()

    return critic_loss.item()

### Training of the Q Network with the Heuristic Agent

In [None]:
# Set up
env = NormalizedEnv(gym.make("Pendulum-v1"))

# fixed torque
torque = env.action(env.action_space.sample())
print("The torque value is : ", torque)

# the heuristic agent is the fixed policy whose Q-function the critic learns
agent = HeuristicPendulumAgent(env, torque)

buffer_size = 10000
learning_rate = 1e-4
gamma = 0.99

buffer = ReplayBuffer(buffer_size)
batch_size = 128

num_states = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]
hidden_size = 32

# the critic takes the state and the action concatenated as its input
critic = QNetwork(num_states + num_actions, hidden_size, num_actions)

# Use the hyper-parameter defined above instead of repeating the literal, so
# changing `learning_rate` actually changes the optimizer.
optimizer = optim.Adam(critic.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# per-episode histories filled by the training loop
critic_losses = []
rewards = []

In [None]:
# Train the critic on transitions generated by the heuristic agent.
# The episode count lives in one place; it drives the loop and the
# "last episode" log condition.
num_episodes = 1000

for episode in range(num_episodes):
    state, info = env.reset()

    episode_reward = 0
    episode_critic_loss = 0

    step_r = 0  # environment steps taken in this episode
    step_l = 0  # critic updates performed in this episode

    trunc = False

    # The loop relies on the environment setting `trunc` to end an episode.
    while not trunc:
        action = agent.compute_action(state)

        next_state, reward, terminated, trunc, info = env.step(action)

        episode_reward += reward
        step_r += 1

        buffer.add(state, action, reward, next_state, trunc)

        # start learning once the buffer holds more than one batch
        if len(buffer) > batch_size:
            transition = buffer.sample(batch_size)

            # turn a list of Transitions into one Transition of per-field tuples
            batch = Transition(*zip(*transition))

            loss = update(batch, critic, criterion, agent, optimizer, gamma)

            episode_critic_loss += loss
            step_l += 1

        state = next_state

    # Convert the episode totals into per-step averages.
    episode_reward = episode_reward/step_r
    if step_l > 0:
        # no update happens while the buffer is still too small for a batch
        episode_critic_loss = episode_critic_loss/step_l
    if ((episode % 100 == 0) or (episode == num_episodes - 1)):
        sys.stdout.write("episode: {}, reward: {}, critic loss: {} \n".format(episode, episode_reward, episode_critic_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)

# `rewards` is created in the set-up cell, so dividing by its real length
# keeps the mean correct even if this cell is run more than once.
av_cumulative_reward = sum(rewards)/len(rewards)
print("average cumulative reward : ", av_cumulative_reward)

## Minimal DDPG 

### Policy Network 

In [None]:
class PolicyNetwork(nn.Module):
    """Actor network: a ReLU MLP with two hidden layers and a tanh-squashed output."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the three linear layers mapping a state vector to an action."""
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)   # state -> hidden
        self.layer2 = nn.Linear(hidden_size, hidden_size)  # hidden -> hidden
        self.layer3 = nn.Linear(hidden_size, output_size)  # hidden -> action
        self.relu = nn.ReLU()

    def forward(self, state):
        """Return the action for `state`, with every component in [-1, 1]."""
        hidden = self.relu(self.layer1(state))
        hidden = self.relu(self.layer2(hidden))
        # tanh keeps the action inside the normalized range
        return torch.tanh(self.layer3(hidden))

### Gaussian noise 

In [None]:
class GaussianActionNoise:
    """Adds independent zero-mean Gaussian noise of standard deviation `sigma` to actions."""

    def __init__(self, sigma):
        self.sigma = sigma

    def get_noisy_action(self, action):
        """Return `action` plus fresh Gaussian noise, clipped to [-1, 1]."""
        perturbation = self.sigma*torch.randn_like(action)
        return torch.clamp(action + perturbation, -1, 1)

### DDPG Agent

In [None]:
class DDPGAgent:
    """Minimal DDPG agent: an actor, a critic and a replay buffer, with no target networks."""

    def __init__(self, device, env, learning_rate, buffer_size, gamma):
        """Build the networks, the replay buffer and one Adam optimizer per network.

        Args:
            device: torch device the networks live on.
            env: environment; only its observation/action space shapes are read.
            learning_rate: learning rate shared by both optimizers.
            buffer_size: capacity of the replay buffer.
            gamma: discount factor used in the critic target.
        """
        self.device = device

        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.hidden_size = 32

        self.gamma = gamma

        # initialize the networks
        self.actor = PolicyNetwork(self.state_size, self.hidden_size, self.action_size).to(self.device)
        # A Q-value is a scalar, so the critic has a single output whatever
        # the action dimension is.
        self.critic = QNetwork(self.state_size + self.action_size, self.hidden_size, 1).to(self.device)

        self.buffer = ReplayBuffer(buffer_size)

        self.critic_criterion = nn.MSELoss()

        # define optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate)

    def compute_action(self, state, noise, deterministic=True):
        """Return the actor's action for `state`, detached from autograd.

        NOTE: despite its name, `deterministic=True` (the default) passes the
        action through `noise.get_noisy_action`; `deterministic=False` returns
        the raw policy output. The inverted meaning is kept because the
        training and testing loops rely on it.
        """
        state = state.to(self.device)

        self.actor.eval()
        # no graph is needed when acting
        with torch.no_grad():
            action = self.actor(state)
        self.actor.train()

        if deterministic:
            action = noise.get_noisy_action(action)
        return action

    def update(self, batch):
        """Perform one critic step followed by one actor step on `batch`.

        Args:
            batch: object whose `state`, `action`, `reward`, `trunc` and
                `next_state` attributes are sequences of tensors to be
                concatenated along dim 0.

        Returns:
            `(policy_loss, critic_loss)` as Python floats.
        """
        # Get tensors from the batch
        state_batch = torch.cat(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        trunc_batch = torch.cat(batch.trunc).to(self.device)
        next_state_batch = torch.cat(batch.next_state).to(self.device)

        # make reward/trunc column vectors so they broadcast against the Q-values
        reward_batch = reward_batch.unsqueeze(1)
        trunc_batch = trunc_batch.unsqueeze(1)

        # The TD target is treated as a constant: no gradient flows through it.
        with torch.no_grad():
            next_action_batch = self.actor(next_state_batch)
            q_next = self.critic(next_state_batch, next_action_batch)
            targets = reward_batch + (1.0 - trunc_batch) * self.gamma * q_next

        # critic update: regress Q(s, a) onto the TD target
        self.critic_optimizer.zero_grad()
        q_val = self.critic(state_batch, action_batch)
        critic_loss = self.critic_criterion(q_val, targets)
        critic_loss.backward()
        self.critic_optimizer.step()

        # actor update: ascend the critic's value of the actor's own actions
        self.actor_optimizer.zero_grad()
        policy_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
        policy_loss.backward()
        self.actor_optimizer.step()

        return policy_loss.item(), critic_loss.item()

### Training of the DDPG 

In [None]:
# Set up
# run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = NormalizedEnv(gym.make("Pendulum-v1"))

# replay buffer capacity and number of transitions sampled per update
buffer_size = 100000
batch_size = 128

# learning rate (shared by actor and critic optimizers) and discount factor
learning_rate = 1e-4
gamma = 0.99

agent = DDPGAgent(device, env, learning_rate, buffer_size, gamma)

# standard deviation of the Gaussian exploration noise
sigma = 0.3
noise = GaussianActionNoise(sigma)

# per-episode histories filled by the training loop
critic_losses = []
actor_losses = []
rewards = []

In [None]:
# Train the minimal DDPG agent.
# The episode count lives in one place; it drives the loop and the
# "last episode" log condition.
num_episodes = 1000

for episode in range(num_episodes):
    state, info = env.reset()
    # Convert the observation array directly and add a leading batch
    # dimension; wrapping the array in a list first takes a slow path.
    state = torch.as_tensor(state, dtype=torch.float32, device=device).unsqueeze(0)

    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0

    step_r = 0  # environment steps taken in this episode
    step_l = 0  # agent updates performed in this episode

    done = False

    # The loop relies on the environment's truncation flag to end an episode.
    while not done:
        # the default flag value makes the agent add exploration noise
        action = agent.compute_action(state, noise)

        next_state, reward, term, done, info = env.step(action.cpu().numpy()[0])

        episode_reward += reward
        step_r += 1

        # the buffer stores tensors so a batch can be built with torch.cat
        trunc = torch.FloatTensor([done]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.as_tensor(next_state, dtype=torch.float32, device=device).unsqueeze(0)

        agent.buffer.add(state, action, reward, next_state, trunc)

        # start learning once the buffer holds more than one batch
        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # turn a list of Transitions into one Transition of per-field tuples
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

    # Convert the episode totals into per-step averages.
    episode_reward = episode_reward/step_r
    if step_l > 0:
        # no update happens while the buffer is still too small for a batch
        episode_critic_loss = episode_critic_loss/step_l
        episode_actor_loss = episode_actor_loss/step_l
    if ((episode % 100 == 0) or (episode == num_episodes - 1)):
        sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

# `rewards` is created in the set-up cell, so dividing by its real length
# keeps the mean correct even if this cell is run more than once.
av_cumulative_reward = sum(rewards)/len(rewards)
print("The average cumulative reward is : ", av_cumulative_reward)

### Testing of the DDPG

In [None]:
# Evaluate the minimal DDPG agent over 100 episodes without exploration noise.
# NOTE(review): the agent keeps storing transitions and updating its networks
# during this evaluation -- confirm that this is intended.
critic_losses = []
actor_losses = []
rewards = []

for episode in range(100):
    # reset() returns (observation, info), as in every other cell of this notebook
    state, info = env.reset()
    state = torch.FloatTensor([state]).to(device)

    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0

    step_r = 0  # environment steps taken in this episode
    step_l = 0  # agent updates performed in this episode

    trunc = False

    while not trunc:
        # deterministic=False makes compute_action skip the noise
        action = agent.compute_action(state, noise, deterministic=False)

        # step() returns five values: the termination and truncation flags are separate
        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0])

        episode_reward += reward
        step_r += 1

        trunc = torch.FloatTensor([trunc]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.FloatTensor([next_state]).to(device)

        agent.buffer.add(state, action, reward, next_state, trunc)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

        if trunc:
            # convert the episode totals into per-step averages
            episode_reward = episode_reward/step_r
            episode_critic_loss = episode_critic_loss/step_l
            episode_actor_loss = episode_actor_loss/step_l
            if ((episode % 10 == 0) or (episode == 99)):
                sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

av_cumulative_reward = sum(rewards)/100
print("The average cumulative reward is : ", av_cumulative_reward)

## DDPG with target networks

### DDPG Agent

In [None]:
class DDPGAgent:
    """DDPG agent with slowly tracking target copies of the actor and the critic."""

    def __init__(self, device, env, learning_rate, buffer_size, gamma, tau):
        """Build online and target networks, the replay buffer and the optimizers.

        Args:
            device: torch device the networks live on.
            env: environment; only its observation/action space shapes are read.
            learning_rate: learning rate shared by both optimizers.
            buffer_size: capacity of the replay buffer.
            gamma: discount factor used in the critic target.
            tau: interpolation weight of the soft target update.
        """
        self.device = device

        self.state_size = env.observation_space.shape[0]
        self.action_size = env.action_space.shape[0]
        self.hidden_size = 32

        self.gamma = gamma
        self.tau = tau

        # initialize the networks
        self.actor = PolicyNetwork(self.state_size, self.hidden_size, self.action_size).to(self.device)
        self.actor_target = PolicyNetwork(self.state_size, self.hidden_size, self.action_size).to(self.device)

        # A Q-value is a scalar, so the critics have a single output whatever
        # the action dimension is.
        self.critic = QNetwork(self.state_size + self.action_size, self.hidden_size, 1).to(self.device)
        self.critic_target = QNetwork(self.state_size + self.action_size, self.hidden_size, 1).to(self.device)

        self.buffer = ReplayBuffer(buffer_size)

        self.critic_criterion = nn.MSELoss()

        # define optimizers (only the online networks are trained by gradient)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=learning_rate)

        # initialize the targets as exact copies of the online networks
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

    def compute_action(self, state, noise, deterministic=True):
        """Return the online actor's action for `state`, detached from autograd.

        NOTE: despite its name, `deterministic=True` (the default) passes the
        action through `noise.get_noisy_action`; `deterministic=False` returns
        the raw policy output. The inverted meaning is kept because the
        training and testing loops rely on it.
        """
        state = state.to(self.device)

        self.actor.eval()
        # no graph is needed when acting
        with torch.no_grad():
            action = self.actor(state)
        self.actor.train()

        if deterministic:
            action = noise.get_noisy_action(action)
        return action

    def update(self, batch):
        """Perform one critic step, one actor step and a soft target update.

        Args:
            batch: object whose `state`, `action`, `reward`, `trunc` and
                `next_state` attributes are sequences of tensors to be
                concatenated along dim 0.

        Returns:
            `(policy_loss, critic_loss)` as Python floats.
        """
        # Get tensors from the batch
        state_batch = torch.cat(batch.state).to(self.device)
        action_batch = torch.cat(batch.action).to(self.device)
        reward_batch = torch.cat(batch.reward).to(self.device)
        trunc_batch = torch.cat(batch.trunc).to(self.device)
        next_state_batch = torch.cat(batch.next_state).to(self.device)

        # make reward/trunc column vectors so they broadcast against the Q-values
        reward_batch = reward_batch.unsqueeze(1)
        trunc_batch = trunc_batch.unsqueeze(1)

        # The TD target is built from the target networks and is a constant.
        with torch.no_grad():
            next_action_batch = self.actor_target(next_state_batch)
            q_next = self.critic_target(next_state_batch, next_action_batch)
            targets = reward_batch + (1.0 - trunc_batch) * self.gamma * q_next

        # update critic
        self.critic_optimizer.zero_grad()
        q_val = self.critic(state_batch, action_batch)
        critic_loss = self.critic_criterion(q_val, targets)
        critic_loss.backward()
        self.critic_optimizer.step()

        # update actor
        self.actor_optimizer.zero_grad()
        policy_loss = -self.critic(state_batch, self.actor(state_batch)).mean()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks
        update_target_params(self.actor_target, self.actor, self.tau)
        update_target_params(self.critic_target, self.critic, self.tau)

        return policy_loss.item(), critic_loss.item()


def update_target_params(target, source, tau):
    """Soft-update `target` in place: target <- tau * source + (1 - tau) * target.

    Parameters are paired in the order returned by `parameters()`, so both
    modules must have the same architecture.
    """
    # in-place parameter writes must not be recorded by autograd
    with torch.no_grad():
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.copy_(param * tau + target_param * (1.0 - tau))

### Training of the DDPG agent

In [None]:
# Set up
# run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = NormalizedEnv(gym.make("Pendulum-v1"))


# replay buffer capacity and number of transitions sampled per update
buffer_size = 100000
batch_size = 128

learning_rate = 1e-4

gamma = 0.99
# five evenly spaced candidate values for the soft-update weight, from 0.01 to 1
tau_array = np.linspace(0.01, 1, 5)
# index 4 is the last candidate, i.e. tau = 1; with that value the soft update
# replaces the target weights by the online weights at every update
tau = tau_array[4] #choosing the tau from the array above

agent = DDPGAgent(device, env, learning_rate, buffer_size, gamma, tau)

# standard deviation of the Gaussian exploration noise
sigma = 0.3
noise = GaussianActionNoise(sigma)

# per-episode histories filled by the training loop
critic_losses = []
actor_losses = []
rewards = []

In [None]:
# Train the agent created in the set-up cell for 1000 episodes, recording the
# per-step average reward and losses of every episode.
for episode in range(1000):
    state, info = env.reset()
    # wrapping the observation in a list adds a leading batch dimension
    state = torch.FloatTensor([state]).to(device)
    
    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0
    
    # step_r counts environment steps, step_l counts agent updates
    step_r = 0
    step_l = 0
    
    trunc = False
    
    # the episode ends when the environment reports truncation
    while not trunc:
        # the default flag value makes compute_action apply the noise
        action = agent.compute_action(state, noise)
        
        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0]) 

        episode_reward += reward 
        step_r += 1

        # the buffer stores tensors so a batch can be built with torch.cat
        trunc = torch.FloatTensor([trunc]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.FloatTensor([next_state]).to(device)

        agent.buffer.add(state, action, reward, next_state, trunc)

        # start learning once the buffer holds more than one batch
        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # turn a list of Transitions into one Transition of per-field tuples
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

        # `trunc` is a one-element tensor here; its truth value is that element
        if trunc:
            # convert the episode totals into per-step averages
            episode_reward = episode_reward/step_r
            episode_critic_loss = episode_critic_loss/step_l
            episode_actor_loss = episode_actor_loss/step_l
            # log every 100th episode and the last one
            if ((episode % 100 == 0) or (episode == 999)):
                sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

### Testing of the DDPG agent

In [None]:
# Evaluate the agent over 100 episodes without exploration noise and report
# the mean of the per-step average rewards.
# NOTE(review): the agent keeps storing transitions and updating its networks
# during this evaluation -- confirm that this is intended.
critic_losses = []
actor_losses = []
rewards = []

for episode in range(100):
    state, info = env.reset()
    # wrapping the observation in a list adds a leading batch dimension
    state = torch.FloatTensor([state]).to(device)
    
    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0
    
    # step_r counts environment steps, step_l counts agent updates
    step_r = 0
    step_l = 0
    
    trunc = False
    
    while not trunc:
        # deterministic=False makes compute_action skip the noise
        action = agent.compute_action(state, noise, deterministic=False)
        
        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0]) 

        episode_reward += reward 
        step_r += 1

        trunc = torch.FloatTensor([trunc]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.FloatTensor([next_state]).to(device)

        agent.buffer.add(state, action, reward, next_state, trunc)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # turn a list of Transitions into one Transition of per-field tuples
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

        # `trunc` is a one-element tensor here; its truth value is that element
        if trunc:
            # convert the episode totals into per-step averages
            episode_reward = episode_reward/step_r
            episode_critic_loss = episode_critic_loss/step_l
            episode_actor_loss = episode_actor_loss/step_l
            # log every 10th episode and the last one
            if ((episode % 10 == 0) or (episode == 99)):
                sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

print("for tau = ", tau)

# `rewards` holds exactly 100 entries because it is reset at the top of this cell
av_cumulative_reward = sum(rewards)/100

print("average cumulative reward : ", av_cumulative_reward)

## DDPG Agent with OU noise 

### OU noise

In [1]:
class OUActionNoise:
    """Temporally correlated exploration noise (the notebook's "OU noise").

    The internal state evolves as
    `state <- (1 - theta) * state + sigma * N(0, 1)` and is added to the
    action at every call.
    """

    def __init__(self, device, action_space, sigma, theta):
        """Store the parameters and start from a zero noise state.

        Args:
            device: torch device on which the noise state is kept.
            action_space: object whose `shape[0]` gives the action dimension.
            sigma: scale of the fresh Gaussian term added at each step.
            theta: decay applied to the previous noise state at each step.
        """
        self.device = device

        self.sigma = sigma
        self.theta = theta
        self.action_dim = action_space.shape[0]

        # Create the state right away so the noise can be used before the
        # first explicit reset().
        self.reset()

    def reset(self):
        """Set the noise state back to zero."""
        self.state = torch.zeros(self.action_dim).to(self.device)

    def evolve_state(self, action):
        """Advance the noise state by one step and return it.

        `action` is only used as a template for the shape, dtype and device of
        the fresh Gaussian sample.
        """
        x = self.state
        self.state = (1.0 - self.theta)*x + self.sigma*torch.randn_like(action)
        return self.state

    def get_noisy_action(self, action):
        """Return `action` plus the next noise state, clipped to [-1, 1]."""
        ou_noise = self.evolve_state(action)
        noisy_action = action + ou_noise
        noisy_action = torch.clamp(noisy_action, -1, 1)
        return noisy_action

### Training of the DDPG agent

In [None]:
# Set up
# run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

env = NormalizedEnv(gym.make("Pendulum-v1"))

state_space = env.observation_space
action_space = env.action_space

# replay buffer capacity and number of transitions sampled per update
buffer_size = 100000
batch_size = 128

learning_rate = 1e-4

# discount factor and soft-update weight of the target networks
gamma = 0.99
tau = 0.01

# noise scale, and five evenly spaced candidate values for theta from 0 to 1
sigma = 0.3
theta_array = np.linspace(0, 1, 5)
# index 0 is the first candidate, i.e. theta = 0: the previous noise state is
# carried over undamped at every step
theta = theta_array[0] #choosing the theta from the array above

agent = DDPGAgent(device, env, learning_rate, buffer_size, gamma, tau)

# initialize the noise
noise = OUActionNoise(device, action_space, sigma, theta)

# per-episode histories filled by the training loop
critic_losses = []
actor_losses = []
rewards = []

In [None]:
# Train the agent created in the set-up cell for 1000 episodes with the OU
# exploration noise, recording per-step average reward and losses per episode.
for episode in range(1000):
    state, info = env.reset()
    # wrapping the observation in a list adds a leading batch dimension
    state = torch.FloatTensor([state]).to(device)
    
    # start every episode from a zero noise state
    noise.reset()
    
    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0
    
    # step_r counts environment steps, step_l counts agent updates
    step_r = 0
    step_l = 0
    
    trunc = False
    
    # the episode ends when the environment reports truncation
    while not trunc:
        # the default flag value makes compute_action apply the noise
        action = agent.compute_action(state, noise)
        
        next_state, reward, terminated, trunc, _ = env.step(action.cpu().numpy()[0]) 

        episode_reward += reward 
        step_r += 1

        # the buffer stores tensors so a batch can be built with torch.cat
        trunc = torch.FloatTensor([trunc]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.FloatTensor([next_state]).to(device)

        agent.buffer.add(state, action, reward, next_state, trunc)

        # start learning once the buffer holds more than one batch
        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # turn a list of Transitions into one Transition of per-field tuples
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)

            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

        # `trunc` is a one-element tensor here; its truth value is that element
        if trunc:
            # convert the episode totals into per-step averages
            episode_reward = episode_reward/step_r
            episode_critic_loss = episode_critic_loss/step_l
            episode_actor_loss = episode_actor_loss/step_l
            # log every 100th episode and the last one
            if ((episode % 100 == 0) or (episode == 999)):
                sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

### Testing of the DDPG

In [None]:
# Evaluate the agent over 100 episodes without exploration noise and report
# the mean of the per-step average rewards.
# NOTE(review): the agent keeps storing transitions and updating its networks
# during this evaluation -- confirm that this is intended.
critic_losses = []
actor_losses = []
rewards = []

for episode in range(100):
    state, info = env.reset()
    # wrapping the observation in a list adds a leading batch dimension
    state = torch.FloatTensor([state]).to(device)
    
    episode_reward = 0
    episode_critic_loss = 0
    episode_actor_loss = 0
    
    # step_r counts environment steps, step_l counts agent updates
    step_r = 0
    step_l = 0
    
    trunc = False
    
    while not trunc:
        # deterministic=False makes compute_action skip the noise, so the
        # noise object is passed but not used here
        action = agent.compute_action(state, noise, deterministic=False)
        
        next_state, reward, term, trunc, info = env.step(action.cpu().numpy()[0]) 

        episode_reward += reward 
        step_r += 1

        trunc = torch.FloatTensor([trunc]).to(device)
        reward = torch.FloatTensor([reward]).to(device)
        next_state = torch.FloatTensor([next_state]).to(device)

        agent.buffer.add(state, action, reward, next_state, trunc)

        if len(agent.buffer) > batch_size:
            transition = agent.buffer.sample(batch_size)

            # turn a list of Transitions into one Transition of per-field tuples
            batch = Transition(*zip(*transition))

            aloss, closs = agent.update(batch)
            
            episode_critic_loss += closs
            episode_actor_loss += aloss
            step_l += 1

        state = next_state

        # `trunc` is a one-element tensor here; its truth value is that element
        if trunc:
            # convert the episode totals into per-step averages
            episode_reward = episode_reward/step_r
            episode_critic_loss = episode_critic_loss/step_l
            episode_actor_loss = episode_actor_loss/step_l
            # log every 10th episode and the last one
            if ((episode % 10 == 0) or (episode == 99)):
                sys.stdout.write("episode: {}, reward: {}, critic loss: {}, actor loss: {} \n".format(episode, episode_reward, episode_critic_loss, episode_actor_loss))

    rewards.append(episode_reward)
    critic_losses.append(episode_critic_loss)
    actor_losses.append(episode_actor_loss)

print("for tau = ", tau)
print("for theta = ", theta)

# `rewards` holds exactly 100 entries because it is reset at the top of this cell
av_cumulative_reward = sum(rewards)/100

print("average cumulative reward : ", av_cumulative_reward)