# Importing Dependencies

In [1]:
import gym
import torch
import numpy as np
from torch import nn
import random
import torch.nn.functional as F
import collections
from torch.optim.lr_scheduler import StepLR

# Defining Hyperparameters

In [2]:
# --- Q-learning target ---
gamma = 0.99             # discount applied to the bootstrapped next-state value in train()

# --- Optimisation ---
lr = 1e-3                # initial Adam learning rate
lr_gamma = 0.9           # StepLR multiplicative decay factor
lr_step = 100            # StepLR step size; the scheduler is stepped once per episode
batch_size = 64          # transitions drawn from the replay memory per gradient step
hidden_dim = 64          # width of both hidden layers of the Q-network

# --- Exploration (epsilon-greedy) ---
eps = 1                  # starting probability of picking a random action
eps_decay = 0.995        # eps is multiplied by this after every episode
eps_min = 0.01           # lower bound for eps

# --- Training schedule ---
num_episodes = 3000      # total number of training episodes
min_episodes = 20        # no gradient updates before this episode index
update_step = 10         # train and sync the target network every `update_step` episodes
update_repeats = 50      # gradient steps performed at each update
max_memory_size = 50000  # maxlen of every replay-memory deque
horizon = np.inf         # an episode is forced to end once its step count exceeds this
seed = 42                # seed passed to torch and to the environment

# --- Evaluation and rendering ---
measure_step = 100       # evaluate the greedy policy every `measure_step` episodes
measure_repeats = 100    # number of evaluation episodes averaged per measurement
render = True            # whether to render during training
render_step = 50         # when rendering, only every `render_step`-th episode is shown

# Creating Environment

In [3]:
# Build the CartPole-v0 environment; the cells below all operate on this single `env` instance.
env =gym.make("CartPole-v0")

### Understanding the Environment

In [4]:
# Describe the action and observation spaces. The rows are built in the same
# order they are printed, so the two `.sample()` calls happen in that order.
space_summary = [
    ('Action_space: ', env.action_space),
    ('Number of actions: ', env.action_space.n),
    ('Action_space sample:', env.action_space.sample()),
    ('Observation_space: ', env.observation_space),
    ('State shape: ', env.observation_space.shape),
    ('Observation_space sample:', env.observation_space.sample()),
]
for label, value in space_summary:
    # The trailing "\n" argument leaves a blank line after every entry.
    print(label, value, "\n")


Action_space:  Discrete(2) 

Number of actions:  2 

Action_space sample: 1 

Observation_space:  Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32) 

State shape:  (4,) 

Observation_space sample: [-3.4024484e+00 -2.5514683e+38 -2.3285833e-01  1.3241576e+38] 



### Testing Untrained Agent

In [5]:
# Baseline: play a few episodes with uniformly random actions and report the
# total reward collected in each one.
episodes = 5
for episode in range(1, episodes + 1):
    state = env.reset()
    score = 0
    done = False

    # Every episode takes at least one step, so `done` is checked after stepping.
    while True:
        env.render()
        action = env.action_space.sample()
        n_state, reward, done, info = env.step(action)
        score += reward
        if done:
            break

    print("episode:{} score:{}".format(episode, score))

episode:1 score:12.0
episode:2 score:15.0
episode:3 score:19.0
episode:4 score:44.0
episode:5 score:17.0


In [6]:
# Close the environment that was rendering during the random rollouts above.
# NOTE(review): the training cell further down calls env.seed/reset/step/render on
# this same `env` after it has been closed — confirm the installed gym version allows that.
env.close()

# Training an agent using Double DQN Algorithm

In [7]:
class QNetwork(nn.Module):
    """Fully connected Q-network: state vector in, one Q-value per action out.

    Two hidden layers of width ``hidden_dim`` with leaky-ReLU activations are
    followed by a linear output layer of width ``action_dim``.
    """

    def __init__(self, action_dim, state_dim, hidden_dim):
        super().__init__()

        # Attribute names double as state_dict keys, so they must stay fc_1/fc_2/fc_3.
        self.fc_1 = nn.Linear(state_dim, hidden_dim)
        self.fc_2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc_3 = nn.Linear(hidden_dim, action_dim)

    def forward(self, inp):
        """Return the Q-values for ``inp`` (last dimension must be ``state_dim``)."""
        hidden = inp
        for layer in (self.fc_1, self.fc_2):
            hidden = F.leaky_relu(layer(hidden))

        # No activation on the output layer: Q-values are unbounded.
        return self.fc_3(hidden)

In [8]:
class Memory:
    """Replay memory made of four parallel, bounded deques.

    ``state[i]``, ``action[i]``, ``rewards[i]`` and ``is_done[i]`` describe the
    i-th stored transition and ``state[i + 1]`` is used as its next state. The
    caller is expected to append the initial state of every episode to
    ``self.state`` itself before calling :meth:`update`.
    """

    def __init__(self, len):
        # NOTE: the parameter is called `len` (shadowing the builtin inside this
        # method only); the name is kept so keyword callers keep working.
        self.rewards = collections.deque(maxlen=len)
        self.state = collections.deque(maxlen=len)
        self.action = collections.deque(maxlen=len)
        self.is_done = collections.deque(maxlen=len)

    def update(self, state, action, reward, done):
        """Store one step.

        Args:
            state: state passed by the caller for this step (the training loop
                passes the state returned by ``env.step``).
            action: action that was taken.
            reward: reward received for the step.
            done: whether the episode ended with this step.
        """
        # If the episode is finished we do not save the new state. Otherwise we
        # would have more states per episode than rewards and actions, which
        # leads to a mismatch when we sample from memory.
        if not done:
            self.state.append(state)
        self.action.append(action)
        self.rewards.append(reward)
        self.is_done.append(done)

    def sample(self, batch_size):
        """Sample ``batch_size`` distinct transitions uniformly at random.

        Only the sampled entries are converted to tensors, so the cost depends
        on the batch size and not on how many transitions are stored.

        Returns:
            Tuple ``(states, actions, next_states, rewards, is_done)`` of tensors
            moved to the module-level ``device``. ``actions`` is int64, the
            others are float32.

        Raises:
            ValueError: if fewer than ``batch_size`` transitions can be sampled.
        """
        n = len(self.is_done)
        # The newest transition is excluded because its next state (index n)
        # has not been stored yet.
        population = range(0, n - 1)
        if batch_size > len(population):
            raise ValueError(
                "cannot sample {} transitions from a memory with {} sampleable entries".format(
                    batch_size, len(population)))
        idx = random.sample(population, batch_size)

        states = torch.as_tensor(np.array([self.state[i] for i in idx]), dtype=torch.float32)
        next_states = torch.as_tensor(np.array([self.state[i + 1] for i in idx]), dtype=torch.float32)
        actions = torch.as_tensor([int(self.action[i]) for i in idx], dtype=torch.long)
        rewards = torch.as_tensor([float(self.rewards[i]) for i in idx], dtype=torch.float32)
        is_done = torch.as_tensor([float(self.is_done[i]) for i in idx], dtype=torch.float32)

        return states.to(device), actions.to(device), next_states.to(device), \
               rewards.to(device), is_done.to(device)

    def reset(self):
        """Drop every stored transition."""
        self.rewards.clear()
        self.state.clear()
        self.action.clear()
        self.is_done.clear()


In [9]:
def select_action(model, env, state, eps):
    """Pick an action epsilon-greedily.

    Args:
        model: callable mapping a state tensor to one value per action.
        env: environment; only ``env.action_space.n`` is read.
        state: current state, convertible with ``torch.Tensor``.
        eps: probability of choosing a uniformly random action.

    Returns:
        The chosen action index as a Python ``int``.
    """
    # Explore: select a random action with probability eps. The network is not
    # evaluated in this case because its output would be thrown away.
    if random.random() <= eps:
        return int(np.random.randint(0, env.action_space.n))

    # Exploit: act greedily with respect to the model's values.
    state = torch.Tensor(state).to(device)
    with torch.no_grad():
        values = model(state)
    return int(np.argmax(values.cpu().numpy()))


def train(batch_size, current, target, optim, memory, gamma):
    """Run one Double-DQN gradient step on ``current``.

    The greedy next action is chosen by ``current`` and its value is read from
    ``target``; the squared error between Q(s, a) and
    ``r + gamma * Q_target(s', argmax_a' Q_current(s', a')) * (1 - done)`` is
    minimised.

    Args:
        batch_size: number of transitions requested from ``memory``.
        current: network being optimised.
        target: network used to evaluate the selected next action.
        optim: optimizer holding the parameters of ``current``.
        memory: object whose ``sample(batch_size)`` returns
            ``(states, actions, next_states, rewards, is_done)`` tensors.
        gamma: discount factor.

    Returns:
        The loss of this step as a Python float.
    """
    states, actions, next_states, rewards, is_done = memory.sample(batch_size)

    # Q(s, a) for the actions that were actually taken; this is the only part
    # of the loss that needs gradients.
    q_value = current(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    # The bootstrap target is a constant with respect to the optimisation, so
    # it is computed without building an autograd graph.
    with torch.no_grad():
        next_q_values = current(next_states)
        next_q_state_values = target(next_states)
        best_next_action = torch.max(next_q_values, 1)[1].unsqueeze(1)
        next_q_value = next_q_state_values.gather(1, best_next_action).squeeze(1)
        # (1 - is_done) removes the bootstrap term for terminal transitions.
        expected_q_value = rewards + gamma * next_q_value * (1 - is_done)

    loss = (q_value - expected_q_value).pow(2).mean()

    optim.zero_grad()
    loss.backward()
    optim.step()

    return loss.item()


def evaluate(Qmodel, env, repeats):
    """
    Play "repeats" episodes acting greedily with respect to Qmodel and return
    the mean total reward per episode. The model is put in eval mode for the
    rollouts and switched to train mode before returning.
    """
    Qmodel.eval()
    total_reward = 0
    for _ in range(repeats):
        obs = env.reset()
        # Each episode takes at least one step; stop once the env reports done.
        while True:
            with torch.no_grad():
                q_values = Qmodel(torch.Tensor(obs).to(device))
            greedy_action = np.argmax(q_values.cpu().numpy())
            obs, step_reward, finished, _ = env.step(greedy_action)
            total_reward += step_reward
            if finished:
                break
    Qmodel.train()
    return total_reward / repeats


def update_parameters(current_model, target_model):
    """Overwrite target_model's parameters and buffers with those of current_model."""
    snapshot = current_model.state_dict()
    target_model.load_state_dict(snapshot)

In [None]:
# Seed every random source the training loop draws from: torch (network
# initialisation), Python's `random` (epsilon test and replay sampling),
# NumPy (random exploratory actions) and the environment itself.
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)
env.seed(seed)

# `device` is read as a module-level global by Memory.sample, select_action and evaluate.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Q_1 is the network being trained, Q_2 the periodically synchronised copy.
Q_1 = QNetwork(action_dim=env.action_space.n, state_dim=env.observation_space.shape[0],
               hidden_dim=hidden_dim).to(device)
Q_2 = QNetwork(action_dim=env.action_space.n, state_dim=env.observation_space.shape[0],
               hidden_dim=hidden_dim).to(device)
# transfer parameters from Q_1 to Q_2
update_parameters(Q_1, Q_2)

# we only train Q_1
for param in Q_2.parameters():
    param.requires_grad = False

optimizer = torch.optim.Adam(Q_1.parameters(), lr=lr)
scheduler = StepLR(optimizer, step_size=lr_step, gamma=lr_gamma)

memory = Memory(max_memory_size)
performance = []  # list of [episode index, mean greedy reward]

# Decay a local copy so the configured `eps` stays untouched and re-running
# this cell restarts exploration from the same value.
epsilon = eps

for episode in range(num_episodes):
    # display the performance
    if episode % measure_step == 0:
        performance.append([episode, evaluate(Q_1, env, measure_repeats)])
        print("Episode: ", episode)
        print("rewards: ", performance[-1][1])
        print("lr: ", scheduler.get_last_lr()[0])
        print("eps: ", epsilon)

    state = env.reset()
    # Memory.update never stores the first state of an episode, so add it here.
    memory.state.append(state)

    done = False
    i = 0
    while not done:
        i += 1
        # NOTE(review): actions are chosen with Q_2 (the synchronised copy), not Q_1.
        action = select_action(Q_2, env, state, epsilon)
        state, reward, done, _ = env.step(action)

        # cut the episode off once it exceeds the configured horizon
        if i > horizon:
            done = True

        # render the environment if render == True
        if render and episode % render_step == 0:
            env.render()

        # save state, action, reward sequence
        memory.update(state, action, reward, done)

    if episode >= min_episodes and episode % update_step == 0:
        for _ in range(update_repeats):
            train(batch_size, Q_1, Q_2, optimizer, memory, gamma)

        # transfer new parameter from Q_1 to Q_2
        update_parameters(Q_1, Q_2)

    # update learning rate and eps
    scheduler.step()
    epsilon = max(epsilon * eps_decay, eps_min)

# The original cell ended with `return Q_1, performance`, which is a SyntaxError
# outside a function. Q_1 and performance are already notebook globals, so
# nothing needs to be returned.


Episode:  0
rewards:  9.43
lr:  0.001
eps:  1




Episode:  100
rewards:  78.16
lr:  0.0009000000000000001
eps:  0.6057704364907278
Episode:  200
rewards:  200.0
lr:  0.0008100000000000001
eps:  0.3669578217261671
Episode:  300
rewards:  200.0
lr:  0.000729
eps:  0.22229219984074702
Episode:  400
rewards:  197.88
lr:  0.0006561000000000001
eps:  0.1346580429260134
Episode:  500
rewards:  199.24
lr:  0.00059049
eps:  0.08157186144027828
Episode:  600
rewards:  199.54
lr:  0.000531441
eps:  0.0494138221100385
Episode:  700
rewards:  199.42
lr:  0.0004782969
eps:  0.029933432588273214
Episode:  800
rewards:  195.81
lr:  0.00043046721
eps:  0.018132788524664028
Episode:  900
rewards:  197.56
lr:  0.000387420489
eps:  0.01098430721937979
Episode:  1000
rewards:  196.65
lr:  0.0003486784401
eps:  0.01
Episode:  1100
rewards:  199.56
lr:  0.00031381059609000004
eps:  0.01
Episode:  1200
rewards:  179.52
lr:  0.00028242953648100003
eps:  0.01


In [None]:
# Training is over: close the environment (the training loop may have been rendering it).
env.close()