<a href="https://colab.research.google.com/github/ArshT/Reinforcement_Learning_Basic/blob/master/DDPG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip3 install box2d-py
import gym
# Presumably a smoke test that gym and Box2D import and build an environment
# after the install above.
# NOTE(review): the training cells further down rebuild `env` with
# gym.make('LunarLanderContinuous-v2') before constructing the agent, so this
# instance is not the one the agent is trained on.
env = gym.make("LunarLander-v2")

Collecting box2d-py
[?25l  Downloading https://files.pythonhosted.org/packages/87/34/da5393985c3ff9a76351df6127c275dcb5749ae0abbe8d5210f06d97405d/box2d_py-2.3.8-cp37-cp37m-manylinux1_x86_64.whl (448kB)
[K     |▊                               | 10kB 14.2MB/s eta 0:00:01[K     |█▌                              | 20kB 20.0MB/s eta 0:00:01[K     |██▏                             | 30kB 11.4MB/s eta 0:00:01[K     |███                             | 40kB 8.7MB/s eta 0:00:01[K     |███▋                            | 51kB 5.5MB/s eta 0:00:01[K     |████▍                           | 61kB 6.4MB/s eta 0:00:01[K     |█████▏                          | 71kB 6.0MB/s eta 0:00:01[K     |█████▉                          | 81kB 6.3MB/s eta 0:00:01[K     |██████▋                         | 92kB 6.2MB/s eta 0:00:01[K     |███████▎                        | 102kB 6.7MB/s eta 0:00:01[K     |████████                        | 112kB 6.7MB/s eta 0:00:01[K     |████████▊                       | 

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import torch as T
from torch.distributions import MultivariateNormal

class OUActionNoise(object):
    """Temporally correlated noise from a discretised Ornstein-Uhlenbeck process.

    Every call advances the internal state by one step of size ``dt``: the
    state is pulled toward ``mu`` at rate ``theta`` and perturbed by Gaussian
    noise scaled by ``sigma * sqrt(dt)``.

    Args:
        mu: NumPy array (or scalar) the process reverts to; its shape fixes
            the shape of the generated noise.
        sigma: Scale of the random perturbation.
        theta: Mean-reversion rate.
        dt: Step size.
        x0: Optional initial state; zeros shaped like ``mu`` when ``None``.
    """

    def __init__(self, mu, sigma=0.15, theta=.2, dt=1e-2, x0=None):
        self.mu = mu
        self.sigma = sigma
        self.theta = theta
        self.dt = dt
        self.x0 = x0
        self.reset()

    def __call__(self):
        """Advance the process by one step and return the new state."""
        previous = self.x_prev
        drift = self.theta * (self.mu - previous) * self.dt
        gaussian = np.random.normal(size=self.mu.shape)
        diffusion = self.sigma * np.sqrt(self.dt) * gaussian
        self.x_prev = previous + drift + diffusion
        return self.x_prev

    def reset(self):
        """Restore the state to ``x0`` (or to zeros when no ``x0`` was given)."""
        if self.x0 is None:
            self.x_prev = np.zeros_like(self.mu)
        else:
            self.x_prev = self.x0

    def __repr__(self):
        template = 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'
        return template.format(self.mu, self.sigma)



class ReplayBuffer(object):
    """Fixed-capacity ring buffer of (state, action, reward, next state, mask).

    Once ``max_size`` transitions have been stored, new ones overwrite the
    oldest. The terminal column stores ``1 - done``, i.e. 0 for a terminal
    transition and 1 otherwise.

    Args:
        max_size: Number of transitions the buffer can hold.
        input_shape: Number of features in a state vector.
        n_actions: Number of components in an action vector.
    """

    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0  # total transitions ever stored (not capped)
        rows = self.mem_size
        self.state_memory = np.zeros((rows, input_shape))
        self.new_state_memory = np.zeros((rows, input_shape))
        self.action_memory = np.zeros((rows, n_actions))
        self.reward_memory = np.zeros((rows, 1))
        self.terminal_memory = np.zeros((rows, 1), dtype=np.float32)

    def store_transition(self, state, action, reward, state_, done):
        """Write one transition into the next slot, wrapping around when full."""
        slot = self.mem_cntr % self.mem_size
        self.state_memory[slot] = state
        self.new_state_memory[slot] = state_
        self.action_memory[slot] = action
        self.reward_memory[slot] = reward
        self.terminal_memory[slot] = 1 - done
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        """Return ``batch_size`` transitions drawn uniformly from the filled slots.

        Indices come from ``np.random.choice`` with its default settings, so
        the same transition can appear more than once in a batch.

        Returns:
            Tuple ``(states, actions, rewards, next_states, terminal_masks)``.
        """
        filled = min(self.mem_cntr, self.mem_size)
        picks = np.random.choice(filled, batch_size)
        return (
            self.state_memory[picks],
            self.action_memory[picks],
            self.reward_memory[picks],
            self.new_state_memory[picks],
            self.terminal_memory[picks],
        )


class Actor(nn.Module):
    """Deterministic policy network mapping a state to a tanh-bounded action.

    Two fully connected hidden layers, each followed by LayerNorm and ReLU,
    feed a linear output layer whose result is squashed by ``tanh``.

    Args:
        alpha: Learning rate of the Adam optimizer stored in ``self.optimizer``.
        input_dims: Number of state features.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        action_dims: Number of action components.
    """

    def __init__(self, alpha, input_dims, fc1_dims, fc2_dims, action_dims):
        super(Actor, self).__init__()

        # Hidden layers are drawn from U(-1/sqrt(fan_in), 1/sqrt(fan_in)).
        # nn.Linear keeps its weight as (out_features, in_features), so the
        # fan-in is `in_features`; `weight.size()[0]` would be the fan-out.
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self._init_uniform(self.fc1, 1 / np.sqrt(self.fc1.in_features))
        self.bn1 = nn.LayerNorm(fc1_dims)  # LayerNorm, despite the "bn" name

        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self._init_uniform(self.fc2, 1 / np.sqrt(self.fc2.in_features))
        self.bn2 = nn.LayerNorm(fc2_dims)

        # The output layer uses a fixed, small range.
        self.mu = nn.Linear(fc2_dims, action_dims)
        self._init_uniform(self.mu, 0.003)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Move the parameters first, then hand them to the optimizer.
        self.to(self.device)
        self.optimizer = optim.Adam(self.parameters(), lr=alpha)

    @staticmethod
    def _init_uniform(layer, bound):
        """Fill ``layer``'s weight and bias in place from U(-bound, bound)."""
        torch.nn.init.uniform_(layer.weight.data, -bound, bound)
        torch.nn.init.uniform_(layer.bias.data, -bound, bound)

    def forward(self, state):
        """Return the action mean for ``state``; every component lies in [-1, 1]."""
        hidden = F.relu(self.bn1(self.fc1(state)))
        hidden = F.relu(self.bn2(self.fc2(hidden)))
        return torch.tanh(self.mu(hidden))


class Critic(nn.Module):
    """Q-network that scores a (state, action) pair with a single value.

    The state goes through two fully connected layers with LayerNorm; the
    action goes through one linear layer with ReLU. The two streams are
    added, passed through ReLU and projected to one output.

    Args:
        beta: Learning rate of the Adam optimizer stored in ``self.optimizer``.
        input_dims: Number of state features.
        fc1_dims: Width of the first state layer.
        fc2_dims: Width of the second state layer and of the action layer.
        action_dims: Number of action components.
    """

    def __init__(self, beta, input_dims, fc1_dims, fc2_dims, action_dims):
        super(Critic, self).__init__()

        # Hidden layers are drawn from U(-1/sqrt(fan_in), 1/sqrt(fan_in)).
        # nn.Linear keeps its weight as (out_features, in_features), so the
        # fan-in is `in_features`; `weight.size()[0]` would be the fan-out.
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self._init_uniform(self.fc1, 1 / np.sqrt(self.fc1.in_features))
        self.bn1 = nn.LayerNorm(fc1_dims)  # LayerNorm, despite the "bn" name

        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self._init_uniform(self.fc2, 1 / np.sqrt(self.fc2.in_features))
        self.bn2 = nn.LayerNorm(fc2_dims)

        # The action stream keeps nn.Linear's default initialisation.
        self.action_value = nn.Linear(action_dims, fc2_dims)

        # The output layer uses a fixed, small range.
        self.q = nn.Linear(fc2_dims, 1)
        self._init_uniform(self.q, 0.0003)

        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        # Move the parameters first, then hand them to the optimizer.
        self.to(self.device)
        self.optimizer = optim.Adam(self.parameters(), lr=beta)

    @staticmethod
    def _init_uniform(layer, bound):
        """Fill ``layer``'s weight and bias in place from U(-bound, bound)."""
        torch.nn.init.uniform_(layer.weight.data, -bound, bound)
        torch.nn.init.uniform_(layer.bias.data, -bound, bound)

    def forward(self, state, action):
        """Return the value estimate for each (state, action) row."""
        state_stream = F.relu(self.bn1(self.fc1(state)))
        # No ReLU here: the non-linearity is applied after the streams merge.
        state_stream = self.bn2(self.fc2(state_stream))

        action_stream = F.relu(self.action_value(action))

        merged = F.relu(torch.add(state_stream, action_stream))
        return self.q(merged)

class Agent(object):
    """DDPG agent: actor and critic with target copies, replay memory and
    Gaussian exploration noise whose standard deviation can be decayed.

    Args:
        alpha: Learning rate passed to the actor networks.
        beta: Learning rate passed to the critic networks.
        input_dims: Number of state features.
        tau: Interpolation factor for the soft target update.
        env: Environment; only ``env.action_space.low/high`` are read here.
        action_std_decay_rate: Amount subtracted from the exploration std on
            every ``decay_action_std`` call.
        min_action_std: Lower bound for the exploration std.
        gamma: Discount factor.
        n_actions: Number of action components.
        max_size: Replay buffer capacity.
        layer1_size: Width of the first hidden layer of every network.
        layer2_size: Width of the second hidden layer of every network.
        batch_size: Number of transitions per learning step.
        action_std_init: Initial exploration std.
    """

    def __init__(self, alpha, beta, input_dims, tau, env, action_std_decay_rate,
                 min_action_std, gamma=0.99, n_actions=2, max_size=1000000, layer1_size=400,
                 layer2_size=300, batch_size=64, action_std_init=0.6):
        self.tau = tau
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory = ReplayBuffer(max_size, input_dims, n_actions)
        self.env = env
        self.action_dims = n_actions
        self.action_std_decay_rate = action_std_decay_rate
        self.min_action_std = min_action_std

        self.actor = Actor(alpha=alpha, input_dims=input_dims, fc1_dims=layer1_size,
                           fc2_dims=layer2_size, action_dims=n_actions)
        self.target_actor = Actor(alpha=alpha, input_dims=input_dims, fc1_dims=layer1_size,
                                  fc2_dims=layer2_size, action_dims=n_actions)

        self.critic = Critic(beta=beta, input_dims=input_dims, fc1_dims=layer1_size,
                             fc2_dims=layer2_size, action_dims=n_actions)
        self.target_critic = Critic(beta=beta, input_dims=input_dims, fc1_dims=layer1_size,
                                    fc2_dims=layer2_size, action_dims=n_actions)

        # The exploration covariance must live on the same device as the
        # actor output, otherwise MultivariateNormal fails when the actor is
        # on CUDA and the variance is on the CPU.
        self.device = self.actor.device
        self.set_action_std(action_std_init)

        # Kept for compatibility; choose_action does not use it.
        self.noise = OUActionNoise(mu=np.zeros(n_actions))
        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

    def set_action_std(self, new_action_std):
        """Set the exploration std and rebuild the per-action variance vector."""
        self.action_std = new_action_std
        variance = self.action_std * self.action_std
        self.action_var = torch.full((self.action_dims,), variance).to(self.device)

    def decay_action_std(self):
        """Lower the exploration std by the decay rate, clamped at the minimum."""
        print("--------------------------------------------------------------------------------------------")

        decayed = round(self.action_std - self.action_std_decay_rate, 4)
        if decayed <= self.min_action_std:
            decayed = self.min_action_std
            print("setting actor output action_std to min_action_std : ", decayed)
        else:
            print("setting actor output action_std to : ", decayed)
        self.set_action_std(decayed)

        print("--------------------------------------------------------------------------------------------")

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, observation):
        """Sample an action around the actor's output and clip it to the bounds.

        Returns:
            Flat NumPy array with one entry per action component.
        """
        self.actor.eval()
        with torch.no_grad():
            state = torch.FloatTensor(observation).to(self.device)
            action_mean = self.actor.forward(state)
            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
            action = MultivariateNormal(action_mean, cov_mat).sample()
        self.actor.train()

        # The first component's bounds are applied to every component.
        low = float(self.env.action_space.low[0])
        high = float(self.env.action_space.high[0])
        action = torch.clamp(action, low, high)

        return action.detach().cpu().numpy().flatten()

    def learn(self):
        """Run one critic step, one actor step and a soft target update.

        Does nothing until the buffer has seen at least ``batch_size``
        transitions.
        """
        if self.memory.mem_cntr < self.batch_size:
            return
        state, action, reward, new_state, done = self.memory.sample_buffer(self.batch_size)

        device = self.critic.device
        reward = torch.tensor(reward, dtype=torch.float).to(device)
        done = torch.tensor(done).to(device)  # holds 1 - done, see ReplayBuffer
        new_state = torch.tensor(new_state, dtype=torch.float).to(device)
        action = torch.tensor(action, dtype=torch.float).to(device)
        state = torch.tensor(state, dtype=torch.float).to(device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()

        # The TD target is a constant for the critic loss, so build it
        # without autograd instead of re-wrapping a tensor in torch.tensor().
        with torch.no_grad():
            target_actions = self.target_actor.forward(new_state)
            critic_value_ = self.target_critic.forward(new_state, target_actions)
            target = reward + self.gamma * critic_value_ * done
            target = target.view(self.batch_size, 1)
        critic_value = self.critic.forward(state, action)

        self.critic.train()
        self.critic.optimizer.zero_grad()
        critic_loss = F.mse_loss(critic_value, target)
        critic_loss.backward()
        self.critic.optimizer.step()

        self.critic.eval()
        self.actor.optimizer.zero_grad()
        mu = self.actor.forward(state)
        self.actor.train()
        # Ascend the critic's estimate of the actor's own actions.
        actor_loss = torch.mean(-self.critic.forward(state, mu))
        actor_loss.backward()
        self.actor.optimizer.step()

        self.update_network_parameters()

    @staticmethod
    def _soft_update(online, target, tau):
        """Set target <- tau * online + (1 - tau) * target, in place."""
        target_params = dict(target.named_parameters())
        with torch.no_grad():
            for name, param in online.named_parameters():
                target_param = target_params[name]
                target_param.copy_(tau * param + (1 - tau) * target_param)

    def update_network_parameters(self, tau=None):
        """Blend the online networks into their targets.

        Args:
            tau: Interpolation factor; ``self.tau`` when ``None``. ``tau=1``
                copies the online parameters verbatim.
        """
        if tau is None:
            tau = self.tau

        self._soft_update(self.critic, self.target_critic, tau)
        self._soft_update(self.actor, self.target_actor, tau)

In [None]:

import gym
import numpy as np

# Train the agent on LunarLanderContinuous-v2 for up to 5000 episodes.
env = gym.make('LunarLanderContinuous-v2')
agent = Agent(
    alpha=0.000025, beta=0.000025, tau=0.001, input_dims=8, n_actions=2,
    layer1_size=400, layer2_size=300, batch_size=64, env=env,
    action_std_init=0.6, action_std_decay_rate=0.05, min_action_std=0.05,
)

# The exploration std is decayed once every this many environment steps.
action_std_decay_freq = int(7.5e4)
# action_std_decay_freq = int(2000)

score_history = []
timesteps = 0
for i in range(5000):
    obs, done, score = env.reset(), False, 0
    # The loop condition ends the episode as soon as `done` is set.
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        timesteps += 1

        if timesteps % action_std_decay_freq == 0:
            agent.decay_action_std()
        # env.render()
    score_history.append(score)

    # Every tenth episode additionally reports the 10-episode average.
    if i % 10 == 0:
        print('episode ', i, 'score %.2f' % score,
              'trailing 10 games avg %.3f' % np.mean(score_history[-10:]),
              'timesteps:', timesteps)

    print('episode ', i, 'score %.2f' % score,
          'trailing 100 games avg %.3f' % np.mean(score_history[-100:]),
          'timesteps:', timesteps)





episode  0 score -293.41 trailing 10 games avg -293.411 timesteps: 112
episode  0 score -293.41 trailing 100 games avg -293.411 timesteps: 112
episode  1 score -246.05 trailing 100 games avg -269.731 timesteps: 215
episode  2 score -420.49 trailing 100 games avg -319.985 timesteps: 329
episode  3 score -240.22 trailing 100 games avg -300.044 timesteps: 446
episode  4 score -275.83 trailing 100 games avg -295.201 timesteps: 558
episode  5 score -206.15 trailing 100 games avg -280.359 timesteps: 739
episode  6 score -348.58 trailing 100 games avg -290.105 timesteps: 926
episode  7 score -240.98 trailing 100 games avg -283.964 timesteps: 1077
episode  8 score -244.96 trailing 100 games avg -279.630 timesteps: 1292
episode  9 score -202.92 trailing 100 games avg -271.959 timesteps: 1503
episode  10 score -371.49 trailing 10 games avg -279.767 timesteps: 1690
episode  10 score -371.49 trailing 100 games avg -281.007 timesteps: 1690
episode  11 score -242.52 trailing 100 games avg -277.800 t

KeyboardInterrupt: ignored

In [2]:
#### Mine: my own version of the DDPG agent (replay memory and optimizers live on the Agent)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as Optim
import numpy as np
from torch.distributions import MultivariateNormal

class Actor(nn.Module):
    """Deterministic policy network mapping a state to a tanh-bounded action.

    Two fully connected hidden layers, each followed by LayerNorm and ReLU,
    feed a linear output layer whose result is squashed by ``tanh``.

    Args:
        input_dims: Number of state features.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        action_dims: Number of action components.
        device: Device the module is moved to; also stored as ``self.device``.
    """

    def __init__(self, input_dims, fc1_dims, fc2_dims, action_dims, device):
        super(Actor, self).__init__()

        # Hidden layers are drawn from U(-1/sqrt(fan_in), 1/sqrt(fan_in)).
        # nn.Linear keeps its weight as (out_features, in_features), so the
        # fan-in is `in_features`; `weight.size()[0]` would be the fan-out.
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self._init_uniform(self.fc1, 1 / np.sqrt(self.fc1.in_features))
        self.bn1 = nn.LayerNorm(fc1_dims)  # LayerNorm, despite the "bn" name

        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self._init_uniform(self.fc2, 1 / np.sqrt(self.fc2.in_features))
        self.bn2 = nn.LayerNorm(fc2_dims)

        # The output layer uses a fixed, small range.
        self.mu = nn.Linear(fc2_dims, action_dims)
        self._init_uniform(self.mu, 0.003)

        self.device = device
        self.to(self.device)

    @staticmethod
    def _init_uniform(layer, bound):
        """Fill ``layer``'s weight and bias in place from U(-bound, bound)."""
        torch.nn.init.uniform_(layer.weight.data, -bound, bound)
        torch.nn.init.uniform_(layer.bias.data, -bound, bound)

    def forward(self, state):
        """Return the action mean for ``state``; every component lies in [-1, 1]."""
        hidden = F.relu(self.bn1(self.fc1(state)))
        hidden = F.relu(self.bn2(self.fc2(hidden)))
        return torch.tanh(self.mu(hidden))


class Critic(nn.Module):
    """Q-network that scores a (state, action) pair with a single value.

    The state goes through two fully connected layers with LayerNorm; the
    action goes through one linear layer with ReLU. The two streams are
    added, passed through ReLU and projected to one output.

    Args:
        input_dims: Number of state features.
        fc1_dims: Width of the first state layer.
        fc2_dims: Width of the second state layer and of the action layer.
        action_dims: Number of action components.
        device: Device the module is moved to; also stored as ``self.device``.
    """

    def __init__(self, input_dims, fc1_dims, fc2_dims, action_dims, device):
        super(Critic, self).__init__()

        # Hidden layers are drawn from U(-1/sqrt(fan_in), 1/sqrt(fan_in)).
        # nn.Linear keeps its weight as (out_features, in_features), so the
        # fan-in is `in_features`; `weight.size()[0]` would be the fan-out.
        self.fc1 = nn.Linear(input_dims, fc1_dims)
        self._init_uniform(self.fc1, 1 / np.sqrt(self.fc1.in_features))
        self.bn1 = nn.LayerNorm(fc1_dims)  # LayerNorm, despite the "bn" name

        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self._init_uniform(self.fc2, 1 / np.sqrt(self.fc2.in_features))
        self.bn2 = nn.LayerNorm(fc2_dims)

        # The action stream keeps nn.Linear's default initialisation.
        self.action_value_layer = nn.Linear(action_dims, fc2_dims)

        # The output layer uses a fixed, small range.
        self.q = nn.Linear(fc2_dims, 1)
        self._init_uniform(self.q, 0.003)

        self.device = device
        self.to(self.device)

    @staticmethod
    def _init_uniform(layer, bound):
        """Fill ``layer``'s weight and bias in place from U(-bound, bound)."""
        torch.nn.init.uniform_(layer.weight.data, -bound, bound)
        torch.nn.init.uniform_(layer.bias.data, -bound, bound)

    def forward(self, state, action):
        """Return the value estimate for each (state, action) row."""
        state_stream = F.relu(self.bn1(self.fc1(state)))
        # No ReLU here: the non-linearity is applied after the streams merge.
        state_stream = self.bn2(self.fc2(state_stream))

        action_stream = F.relu(self.action_value_layer(action))

        merged = F.relu(torch.add(state_stream, action_stream))
        return self.q(merged)
  

class Agent:
    """DDPG agent with built-in replay memory and Gaussian exploration noise.

    Holds the actor and critic, their target copies, one Adam optimizer for
    each online network and a ring buffer of transitions.

    Args:
        alpha: Actor learning rate.
        beta: Critic learning rate.
        input_dims: Number of state features.
        fc1_dims: Width of the first hidden layer of every network.
        fc2_dims: Width of the second hidden layer of every network.
        action_dims: Number of action components.
        tau: Interpolation factor for the soft target update.
        env: Environment; only ``env.action_space.low/high`` are read here.
        action_std_decay_rate: Amount subtracted from the exploration std on
            every ``decay_action_std`` call.
        device: Device for the networks and the exploration variance.
        min_action_std: Lower bound for the exploration std.
        gamma: Discount factor.
        max_size: Replay memory capacity.
        batch_size: Number of transitions per learning step.
        action_std_init: Initial exploration std.
    """

    def __init__(self, alpha, beta, input_dims, fc1_dims, fc2_dims, action_dims, tau, env,
                 action_std_decay_rate, device, min_action_std, gamma=0.99, max_size=1000000,
                 batch_size=64, action_std_init=0.6):
        self.action_dims = action_dims
        self.gamma = gamma
        self.tau = tau
        self.env = env
        self.batch_size = batch_size
        self.device = device

        self.action_std_decay_rate = action_std_decay_rate
        self.min_action_std = min_action_std

        self.actor = Actor(input_dims=input_dims, fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                           action_dims=action_dims, device=device)
        self.target_actor = Actor(input_dims=input_dims, fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                                  action_dims=action_dims, device=device)

        self.critic = Critic(input_dims=input_dims, fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                             action_dims=action_dims, device=device)
        self.target_critic = Critic(input_dims=input_dims, fc1_dims=fc1_dims, fc2_dims=fc2_dims,
                                    action_dims=action_dims, device=device)

        self.critic_optimizer = Optim.Adam(self.critic.parameters(), lr=beta)
        self.actor_optimizer = Optim.Adam(self.actor.parameters(), lr=alpha)

        self.set_action_std(action_std_init)

        # Ring-buffer replay memory; mem_cntr counts every stored transition.
        self.mem_size = max_size
        self.mem_cntr = 0
        self.state_memory = np.zeros((self.mem_size, input_dims))
        self.new_state_memory = np.zeros((self.mem_size, input_dims))
        self.action_memory = np.zeros((self.mem_size, action_dims))
        self.reward_memory = np.zeros((self.mem_size, 1))
        self.terminal_memory = np.zeros((self.mem_size, 1), dtype=np.float32)

        # tau=1 makes the targets exact copies of the online networks.
        self.update_network_parameters(tau=1)

    @staticmethod
    def _soft_update(online, target, tau):
        """Set target <- tau * online + (1 - tau) * target, in place."""
        target_params = dict(target.named_parameters())
        with torch.no_grad():
            for name, param in online.named_parameters():
                target_param = target_params[name]
                target_param.copy_(tau * param + (1 - tau) * target_param)

    def update_network_parameters(self, tau=None):
        """Blend the online networks into their targets.

        Args:
            tau: Interpolation factor; ``self.tau`` when ``None``. ``tau=1``
                copies the online parameters verbatim.
        """
        if tau is None:
            tau = self.tau

        self._soft_update(self.critic, self.target_critic, tau)
        self._soft_update(self.actor, self.target_actor, tau)

    def set_action_std(self, new_action_std):
        """Set the exploration std and rebuild the per-action variance vector."""
        self.action_std = new_action_std
        variance = self.action_std * self.action_std
        self.action_var = torch.full((self.action_dims,), variance).to(self.device)

    def decay_action_std(self):
        """Lower the exploration std by the decay rate, clamped at the minimum."""
        print("--------------------------------------------------------------------------------------------")

        decayed = round(self.action_std - self.action_std_decay_rate, 4)
        if decayed <= self.min_action_std:
            decayed = self.min_action_std
            print("setting actor output action_std to min_action_std : ", decayed)
        else:
            print("setting actor output action_std to : ", decayed)
        self.set_action_std(decayed)

        print("--------------------------------------------------------------------------------------------")

    def remember(self, state, action, reward, new_state, done):
        """Write one transition into the next slot, wrapping around when full."""
        index = self.mem_cntr % self.mem_size
        self.state_memory[index] = state
        self.action_memory[index] = action
        self.reward_memory[index] = reward
        self.new_state_memory[index] = new_state
        # Stored as a mask: 0 for a terminal transition, 1 otherwise.
        self.terminal_memory[index] = float(1 - done)
        self.mem_cntr += 1

    def choose_action(self, observation):
        """Sample an action around the actor's output and clip it to the bounds.

        Returns:
            Flat NumPy array with one entry per action component.
        """
        self.actor.eval()
        with torch.no_grad():
            state = torch.FloatTensor(observation).to(self.actor.device)
            action_mean = self.actor.forward(state)
            cov_mat = torch.diag(self.action_var).unsqueeze(dim=0)
            action = MultivariateNormal(action_mean, cov_mat).sample()
        self.actor.train()

        # The first component's bounds are applied to every component.
        low = float(self.env.action_space.low[0])
        high = float(self.env.action_space.high[0])
        action = torch.clamp(action, low, high)

        return action.detach().cpu().numpy().flatten()

    def learn(self):
        """Run one critic step, one actor step and a soft target update.

        Does nothing until at least ``batch_size`` transitions were stored.
        """
        if self.mem_cntr < self.batch_size:
            return

        # np.random.choice with default settings samples with replacement.
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, self.batch_size)

        device = self.critic.device
        states = torch.tensor(self.state_memory[batch], dtype=torch.float).to(device)
        actions = torch.tensor(self.action_memory[batch], dtype=torch.float).to(device)
        rewards = torch.tensor(self.reward_memory[batch], dtype=torch.float).to(device)
        next_states = torch.tensor(self.new_state_memory[batch], dtype=torch.float).to(device)
        terminals = torch.tensor(self.terminal_memory[batch]).to(device)

        self.target_actor.eval()
        self.target_critic.eval()
        self.critic.eval()

        # The TD target is a constant for the critic loss, so build it
        # without autograd instead of re-wrapping a tensor in torch.tensor().
        with torch.no_grad():
            next_actions = self.target_actor.forward(next_states)
            next_critic_value = self.target_critic.forward(next_states, next_actions)
            targets = rewards + self.gamma * next_critic_value * terminals
            targets = targets.view(self.batch_size, 1)
        critic_value = self.critic.forward(states, actions)

        self.critic.train()
        self.critic_optimizer.zero_grad()
        critic_loss = F.mse_loss(critic_value, targets)
        critic_loss.backward()
        self.critic_optimizer.step()

        self.critic.eval()
        mu = self.actor.forward(states)
        self.actor.train()
        # Ascend the critic's estimate of the actor's own actions.
        actor_loss = -self.critic.forward(states, mu)
        self.actor_optimizer.zero_grad()
        actor_loss.mean().backward()
        self.actor_optimizer.step()

        self.update_network_parameters()

In [4]:

import gym
import numpy as np

# Train on LunarLanderContinuous-v2 until the 100-episode average reaches 210
# or 5000 episodes have been played.
env = gym.make('LunarLanderContinuous-v2')
agent = Agent(
    alpha=0.000025, beta=0.000025, gamma=0.99, tau=0.001,
    input_dims=8, fc1_dims=400, fc2_dims=300, action_dims=2,
    batch_size=64, env=env, device='cpu',
    action_std_init=0.6, action_std_decay_rate=0.05, min_action_std=0.05,
)

# The exploration std is decayed once every this many environment steps.
action_std_decay_freq = int(7.5e4)
# action_std_decay_freq = int(2000)

score_history = []
timesteps = 0
for i in range(5000):
    obs, done, score = env.reset(), False, 0
    # The loop condition ends the episode as soon as `done` is set.
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        agent.remember(obs, act, reward, new_state, int(done))
        agent.learn()
        score += reward
        obs = new_state
        timesteps += 1

        if timesteps % action_std_decay_freq == 0:
            agent.decay_action_std()
        # env.render()
    score_history.append(score)

    # Every tenth episode additionally reports the trailing averages.
    fields = ['episode ', i, 'score %.2f' % score]
    if i % 10 == 0:
        fields.append('trailing 10 games avg %.3f' % np.mean(score_history[-10:]))
        fields.append('trailing 100 games avg %.3f' % np.mean(score_history[-100:]))
    fields += ['timesteps:', timesteps]
    print(*fields)

    avg_100 = np.mean(score_history[-100:])
    if avg_100 >= 210:
        print("####SOLVED!!####")
        break


# Play 50 more episodes with the trained agent (no learning, nothing stored)
# and print each episode's return followed by the mean return.
# NOTE(review): choose_action still samples around the actor output, so these
# episodes use the exploration noise at its current std.
avg_score = 0
print("")
for i in range(50):
    obs, done, score = env.reset(), False, 0
    while not done:
        act = agent.choose_action(obs)
        new_state, reward, done, info = env.step(act)
        score += reward
        obs = new_state
    avg_score += score
    print("Episode Reward:", score)

print("")
print(avg_score / 50)
  
    




episode  0 score -294.15 trailing 10 games avg -294.154 trailing 100 games avg -294.154 timesteps: 101
episode  1 score -195.61 timesteps: 185
episode  2 score -262.57 timesteps: 267
episode  3 score -181.31 timesteps: 328
episode  4 score -114.74 timesteps: 391
episode  5 score -98.45 timesteps: 471
episode  6 score -151.15 timesteps: 525
episode  7 score -384.76 timesteps: 607
episode  8 score -113.63 timesteps: 662
episode  9 score -191.93 timesteps: 743
episode  10 score -134.88 trailing 10 games avg -182.904 trailing 100 games avg -193.018 timesteps: 810
episode  11 score -274.53 timesteps: 901
episode  12 score -392.81 timesteps: 1041
episode  13 score -398.20 timesteps: 1156
episode  14 score -250.20 timesteps: 1262
episode  15 score -218.76 timesteps: 1403
episode  16 score -290.89 timesteps: 1616
episode  17 score -152.73 timesteps: 1752
episode  18 score -361.34 timesteps: 1861
episode  19 score -248.24 timesteps: 1972
episode  20 score -307.17 trailing 10 games avg -289.486 