<a href="https://colab.research.google.com/github/DavoodSZ1993/RL/blob/main/09_DNQ.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import Dependencies

In [1]:
import random
import gym
import numpy as np

from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from tensorflow.keras.optimizers import Adam

import os

### Set Parameters

In [2]:
# Training-run settings.
n_episodes = 1001   # number of episodes to play
batch_size = 32     # transitions sampled from replay memory per training pass

# Location used for the periodic weight checkpoints.
output_dir = './cartpole'

env = gym.make('CartPole-v0')

# Length of the observation vector (cart position, cart velocity,
# pole angle, pole angular velocity).
state_size = env.observation_space.shape[0]
# Number of discrete actions (push left, push right).
action_size = env.action_space.n

# Notebook display of the two sizes.
state_size, action_size

  f"The environment {id} is out of date. You should consider "
  "Initializing wrapper in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."
  "Initializing environment in old step API which returns one bool instead of two. It is recommended to set `new_step_api=True` to use new step API. This will be the default behaviour in future."


(4, 2)

In [None]:
# Create the checkpoint directory. `exist_ok=True` replaces the separate
# existence check (no check-then-create race) and makes re-running the
# cell a no-op when the directory is already there.
os.makedirs(output_dir, exist_ok=True)

### Defining the Agent

In [None]:
class DQNAgent:
  """Deep Q-Network agent with epsilon-greedy exploration and experience replay.

  The agent keeps a bounded buffer of (state, action, reward, next_state, done)
  transitions and a small fully connected Keras network that maps a state to
  one Q-value per action.
  """

  def __init__(self, state_size, action_size):
    """Set hyperparameters and build the Q-network.

    Args:
      state_size: Number of features in one observation (network input width).
      action_size: Number of discrete actions (network output width).
    """
    self.state_size = state_size
    self.action_size = action_size

    # Bounded replay buffer: once full, the oldest transitions are discarded.
    self.memory = deque(maxlen=200)

    self.gamma = 0.95            # discount factor applied to future reward

    self.epsilon = 1.0           # probability of choosing a random action
    self.epsilon_decay = 0.995   # multiplicative decay applied per replay() call
    self.epsilon_min = 0.01      # decay stops once epsilon is no longer above this

    self.learning_rate = 0.01

    self.model = self._build_model()

  def _build_model(self):
    """Return a compiled MLP: state -> 24 -> 24 -> one linear Q-value per action."""
    model = Sequential()

    model.add(Dense(24, input_dim=self.state_size, activation='relu'))
    model.add(Dense(24, activation='relu'))
    model.add(Dense(self.action_size, activation='linear'))

    # `learning_rate` is the supported keyword; `lr` is deprecated and warns.
    model.compile(loss='mse', optimizer=Adam(learning_rate=self.learning_rate))

    return model

  def remember(self, state, action, reward, next_state, done):
    """Append one transition to the replay buffer."""
    self.memory.append((state, action, reward, next_state, done))

  def act(self, state):
    """Pick an action for `state` using the epsilon-greedy rule.

    With probability `epsilon` a uniformly random action is returned;
    otherwise the action with the highest predicted Q-value.

    Args:
      state: Observation in the batched form the model's `predict` accepts.

    Returns:
      Index of the chosen action.
    """
    if np.random.rand() <= self.epsilon:  # explore
      return random.randrange(self.action_size)
    act_values = self.model.predict(state)
    return np.argmax(act_values[0])  # exploit

  def replay(self, batch_size):
    """Train the network on a random sample of remembered transitions.

    For each sampled transition the Q-learning target is
    `reward` if the transition is terminal, else
    `reward + gamma * max_a Q(next_state, a)`. The network is then fitted
    for one epoch so that Q(state, action) moves toward that target.
    Afterwards epsilon is decayed once.

    Args:
      batch_size: Maximum number of transitions to sample. If the buffer
        holds fewer, all of them are used.
    """
    # Clamp so a short buffer does not make random.sample raise ValueError.
    sample_size = min(batch_size, len(self.memory))
    minibatch = random.sample(self.memory, sample_size)

    for state, action, reward, next_state, done in minibatch:
      target = reward
      if not done:
        # Bootstrapped estimate of the return from the next state.
        target = reward + self.gamma * np.amax(self.model.predict(next_state)[0])

      # The fit must run for terminal transitions too; otherwise the
      # end-of-episode reward is never learned.
      target_f = self.model.predict(state)
      target_f[0][action] = target  # only the taken action's Q-value changes

      self.model.fit(state, target_f, epochs=1, verbose=0)

    if self.epsilon > self.epsilon_min:
      self.epsilon *= self.epsilon_decay

  def load(self, name):
    """Load network weights from the file path `name`."""
    self.model.load_weights(name)

  def save(self, name):
    """Save network weights to the file path `name`."""
    self.model.save_weights(name)


In [None]:
# Build the agent (and its Q-network) for the environment's state/action sizes.
agent = DQNAgent(state_size, action_size)

  super(Adam, self).__init__(name, **kwargs)


### Interact with the Environment

In [None]:
# Main training loop: play one episode, store every transition, then run one
# replay pass and periodically checkpoint the weights.
# NOTE: written against the old gym step API (reset() returns the observation,
# step() returns a 4-tuple).
done = False
for e in range(n_episodes):

  state = env.reset()
  state = np.reshape(state, [1, state_size])  # add the batch axis the model expects

  # `step` doubles as the episode score (number of steps survived).
  for step in range(5000):

    action = agent.act(state)
    next_state, reward, done, _ = env.step(action)

    # Penalise the transition that ends the episode.
    reward = reward if not done else -10

    next_state = np.reshape(next_state, [1, state_size])

    agent.remember(state, action, reward, next_state, done)

    state = next_state

    if done:
      print("episode: {}/{}, score: {}, e: {:.2}".format(e, n_episodes, step, agent.epsilon))
      break

  # Train only once the buffer holds more than one batch of transitions.
  if len(agent.memory) > batch_size:
    agent.replay(batch_size)

  if e % 50 == 0:
    # Join with a path separator so the checkpoint lands inside output_dir
    # (plain concatenation produced './cartpoleweights_0000.hdf5').
    agent.save(os.path.join(output_dir, 'weights_{:04d}.hdf5'.format(e)))



episode: 0/1001, score: 19, e: 1.0
episode: 1/1001, score: 24, e: 1.0
episode: 2/1001, score: 15, e: 0.99
episode: 3/1001, score: 35, e: 0.99
episode: 4/1001, score: 9, e: 0.99
episode: 5/1001, score: 13, e: 0.98
episode: 6/1001, score: 37, e: 0.98
episode: 7/1001, score: 13, e: 0.97
episode: 8/1001, score: 43, e: 0.97
episode: 9/1001, score: 12, e: 0.96
episode: 10/1001, score: 18, e: 0.96
episode: 11/1001, score: 35, e: 0.95
episode: 12/1001, score: 20, e: 0.95
episode: 13/1001, score: 40, e: 0.94
episode: 14/1001, score: 12, e: 0.94
episode: 15/1001, score: 11, e: 0.93
episode: 16/1001, score: 13, e: 0.93
episode: 17/1001, score: 54, e: 0.92
episode: 18/1001, score: 12, e: 0.92
episode: 19/1001, score: 28, e: 0.91
episode: 20/1001, score: 11, e: 0.91
episode: 21/1001, score: 22, e: 0.9
episode: 22/1001, score: 10, e: 0.9
episode: 23/1001, score: 12, e: 0.9
episode: 24/1001, score: 16, e: 0.89
episode: 25/1001, score: 8, e: 0.89
episode: 26/1001, score: 15, e: 0.88
episode: 27/1001, 

KeyboardInterrupt: ignored