In [7]:
import numpy as np
from tqdm import tqdm
# import gym
# from gym import spaces

import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import MultivariateNormal
import matplotlib.pyplot as plt

from ReplayBuffer import ReplayBuffer
from environment import Environment

In [8]:
device = T.device("cuda" if T.cuda.is_available() else "cpu")
print(device)

cpu


In [9]:
env = Environment(state = [0, 1, 0, -1, 0, 0], mu = 0.05, m = 1, g = 9.81, thetamin = 0, thetamax = np.pi, phimin = 0, phimax = 2*np.pi, Tmin = 0, Tmax = 20, dt = 0.02, dphi = 0.0175, dtheta = 0.0175)

In [10]:
class DQN(nn.Module):
    def __init__(self, lr, input_dim, fc1_dims, fc2_dims, output_dim):
        super(DQN, self).__init__()
        self.input_dim = input_dim
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.output_dim = output_dim

        self.fc1 = nn.Linear(input_dim, fc1_dims)
        f1 = 1./np.sqrt(self.fc1.weight.data.size()[0])
        T.nn.init.uniform_(self.fc1.weight.data, -f1, f1)
        T.nn.init.uniform_(self.fc1.bias.data, -f1, f1)
        self.bn1 = nn.LayerNorm(self.fc1_dims)

        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        f2 = 1./np.sqrt(self.fc2.weight.data.size()[0])
        T.nn.init.uniform_(self.fc2.weight.data, -f2, f2)
        T.nn.init.uniform_(self.fc2.bias.data, -f2, f2)
        self.bn2 = nn.LayerNorm(self.fc2_dims)

        f3 = 0.003
        self.fc3 = nn.Linear(fc2_dims, output_dim)
        T.nn.init.uniform_(self.fc3.weight.data, -f3, f3)
        T.nn.init.uniform_(self.fc3.bias.data, -f3, f3)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        self.to(device)
    def forward(self, x):
        x = T.relu(self.bn1(self.fc1(x)))
        x = T.relu(self.bn2(self.fc2(x)))
        x = self.fc3(x)
        return x

In [11]:
class DQN_Agent(object):
    def __init__(self, lr, input_dim, input_dims_buff, fc1_dims, fc2_dims, gamma, n_actions, epsilon, batch_size,
                 buffer_size=1000000, eps_end=0.01, eps_dec=5e-7, N = 100):
        self.lr = lr
        self.input_dim = input_dim
        self.gamma = gamma
        self.n_actions = n_actions
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.eps_min = eps_end
        self.eps_dec = eps_dec

        self.action_space = [i for i in range(n_actions)]
        self.memory = ReplayBuffer(buffer_size, input_dims_buff)
        self.QNetwork = DQN(lr, input_dim, fc1_dims, fc2_dims, n_actions)
        self.QNetwork_target = DQN(lr, input_dim, fc1_dims, fc2_dims, n_actions)
        self.update_network_parameters(tau = 1)
    
    def update_network_parameters(self, tau=None):
        if tau is None:
            tau = self.tau
        qn_params = self.QNetwork.named_parameters()
        target_qn_params = self.QNetwork_target.named_parameters()
        qn_dict = dict(qn_params)
        target_qn_dict = dict(target_qn_params)
        for name in qn_dict:
            qn_dict[name] = tau*qn_dict[name].clone() + (1-tau)*target_qn_dict[name].clone()
        self.QNetwork_target.load_state_dict(qn_dict)

    def choose_action(self, state, epsilon = None):
        if epsilon is None:
            epsilon = self.epsilon
        self.QNetwork.eval()
        if np.random.random() > epsilon:
            state = T.tensor([state], dtype=T.float).to(device)
            actions = self.QNetwork.forward(state)
            action = T.argmax(actions).item()
        else:
            action = np.random.choice(self.n_actions)
        self.QNetwork.train()
        return env.actionspace()[action]

    def remember(self, state, action, reward, new_state, done):
        self.memory.add_sample(state, action, reward, new_state, done)
    
    def learn(self, step):
        if self.memory.count < self.batch_size:
            return
        states, actions, rewards, new_states, dones = self.memory.return_sample(self.batch_size)

        states = T.tensor(states, dtype=np.float32).to(device)
        actions = T.tensor(actions, dtype=np.float32).to(device)
        rewards = T.tensor(rewards, dtype=np.float32).to(device)
        new_states = T.tensor(new_states, dtype=np.float32).to(device)
        dones = T.tensor(dones).to(device)

        self.QNetwork.eval()
        all_qsas = self.QNetwork.forward(states)
        opt_qsas = T.max(all_qsas, dim = 1)
        self.QNetwork_target.eval()
        next_qsas = self.QNetwork_target.forward(new_states)
        next_opt_qsas = T.max(next_qsas, dim = 1)

        bellmanoptimality = []
        for j in range(self.batch_size):
            bellmanoptimality.append(rewards[j] + self.gamma*next_opt_qsas[j]*dones[j])
        bellmanoptimality = T.tensor(bellmanoptimality).to(device)
        bellmanoptimality = bellmanoptimality.view(self.batch_size, 1)

        self.QNetwork.train()
        self.QNetwork.optimizer.zero_grad()
        loss = F.mse_loss(bellmanoptimality, opt_qsas)
        loss.backward()
        self.QNetwork.optimizer.step()

        if step % self.N == 0:
            self.update_network_parameters()

In [12]:
Agent = DQN_Agent(lr=0.0001, input_dim=6, input_dims_buff = [6,3], fc1_dims=256, fc2_dims=256, gamma=0.99, n_actions=3, epsilon=0.3, batch_size=64, buffer_size=1000000, eps_end=0.2, eps_dec=0.0001, N=100)

num_episodes = 50
max_steps = 300

Average_Rewards = []
for i in tqdm(range(num_episodes)):
    done = False
    step = 0
    state = [0, 1, 0, -1, 0, 0]
    total_reward = 0
    while not done and step < max_steps:
        action = Agent.choose_action(state)
        print(step)
        new_state, reward, done = env.infostep(step, action)
        Agent.remember(state, action, reward, new_state, done)
        Agent.learn(step)
        print(step)  
        state = new_state
        total_reward += reward
        step += 1
    Average_Rewards.append(total_reward)
    if epsilon > Agent.eps_min:
        epsilon = epsilon - Agent.eps_dec
    print('episode ', i, 'score %.1f' % total_reward)

  0%|          | 0/50 [00:00<?, ?it/s]

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52


In [None]:
plt.plot(np.arange(len(Average_Rewards)), Average_Rewards)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('Average Reward vs Episode')
plt.show()