-
Notifications
You must be signed in to change notification settings - Fork 0
/
model.py
86 lines (69 loc) · 2.72 KB
/
model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from distributions import MultiHeadCategorical
import torch
from utils import init
import torch.nn as nn
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""

    def forward(self, x):
        """Return ``x`` viewed as ``(x.size(0), -1)``."""
        leading = x.size(0)
        flat = x.view(leading, -1)
        return flat
class Model(nn.Module):
    """Policy / Q-value network built on one shared MLP feature extractor.

    Sub-modules:
        base:      two ``Linear`` + ``ReLU`` layers mapping an observation of
                   width ``state_dim`` to ``hidsize`` features.
        dist:      ``MultiHeadCategorical`` action head (project-local class).
        q_network: a single ``Linear`` layer producing ``action_dim`` values
                   per observation.

    There is no separate state-value head: ``act`` and ``get_value`` derive a
    value as the ``dist.probs``-weighted sum of the ``q_network`` outputs,
    while ``evaluate_actions`` returns the ``q_network`` output selected by
    the supplied action.
    """

    def __init__(self, state_dim, action_dim, device, trainable=True, hidsize=128):
        """Build the network and place its parts on ``device``.

        Args:
            state_dim: width of the observation vector fed to ``base``.
            action_dim: number of outputs of ``q_network``; also passed to
                ``MultiHeadCategorical`` and used as the size of the identity
                matrix for one-hot lookups.
            device: device the sub-modules and the identity matrix are moved to.
            trainable: if true the module is put in ``train()`` mode,
                otherwise in ``eval()`` mode.
            hidsize: width of the feature vector produced by ``base``.
        """
        super().__init__()

        def init_(module):
            # ``init`` is a project helper from ``utils``; presumably it
            # applies the weight initialiser (orthogonal) and the bias
            # initialiser (constant 0) and returns the module -- TODO confirm.
            return init(module,
                        nn.init.orthogonal_,
                        lambda x: nn.init.constant_(x, 0))

        # Shared feature extractor. The first hidden width is fixed at 128;
        # only the second one follows ``hidsize``.
        self.base = nn.Sequential(
            init_(nn.Linear(state_dim, 128)),
            nn.ReLU(),
            init_(nn.Linear(128, hidsize)),
            nn.ReLU(),
        ).to(device)

        # Actor head.
        self.dist = MultiHeadCategorical(hidsize, 1, action_dim, device)

        # Critic head: one value per action.
        self.q_network = nn.Sequential(
            init_(nn.Linear(hidsize, action_dim)),
        ).to(device)

        self.device = device
        # Row ``i`` is the one-hot vector for action ``i``; used by
        # ``evaluate_actions`` to pick the value of the taken action.
        self.identity = torch.eye(action_dim).to(device)

        if trainable:
            self.train()
        else:
            self.eval()

    def act(self, inputs):
        """Sample an action for ``inputs`` without tracking gradients.

        Returns:
            A tuple ``(value, action, action_log_probs)`` where ``value`` is
            the ``dist.probs``-weighted sum of the Q-values over the last
            dimension (kept as a size-1 dimension), ``action`` is the sampled
            action with all size-1 dimensions squeezed out, and
            ``action_log_probs`` is the mean over the last dimension of the
            log-probabilities reported by ``dist`` (kept as a size-1 dimension).
        """
        with torch.no_grad():
            obs_feature = self.base(inputs)
            # The call's return value is discarded; ``dist`` is relied upon to
            # update its own state (``probs``, ``sample`` ...) from the features.
            self.dist(obs_feature)
            action = self.dist.sample()
            action_log_probs = self.dist.log_probs(action)
            action_log_probs = action_log_probs.mean(-1, keepdim=True)
            q_value = self.q_network(obs_feature)
            # Expected Q-value under the current policy.
            value = torch.sum(self.dist.probs * q_value, -1, keepdim=True)
        # NOTE(review): ``squeeze()`` without a dim drops *every* size-1
        # dimension, so a batch of one loses its batch dimension as well.
        # Kept as-is because callers may rely on it.
        return value, action.squeeze(), action_log_probs

    def get_value(self, inputs):
        """Return the ``dist.probs``-weighted sum of Q-values for ``inputs``.

        Unlike ``act`` this does not disable gradient tracking.
        """
        obs_feature = self.base(inputs)
        # Refresh the distribution state so ``self.dist.probs`` matches ``inputs``.
        self.dist(obs_feature)
        q_value = self.q_network(obs_feature)
        value = torch.sum(self.dist.probs * q_value, -1, keepdim=True)
        return value

    def evaluate_actions(self, inputs, action):
        """Score the given ``action`` for ``inputs``.

        Args:
            inputs: observations fed to ``base``.
            action: indices used to look up rows of the identity matrix after
                a trailing ``squeeze(-1)``; must therefore be an index-capable
                (integer) tensor.

        Returns:
            A tuple ``(value, action_log_probs, dist_entropy)`` where ``value``
            is the Q-value of the taken action with a trailing size-1
            dimension, ``action_log_probs`` is the mean over the last
            dimension of ``dist.log_probs(action)`` (kept as a size-1
            dimension), and ``dist_entropy`` is the mean of ``dist.entropy()``.
        """
        obs_features = self.base(inputs)
        q_value = self.q_network(obs_features)
        # One-hot mask of the taken action; multiplying and summing selects
        # that action's Q-value.
        index = self.identity[action.squeeze(-1)]
        value = torch.sum(q_value * index, -1).unsqueeze(-1)

        self.dist(obs_features)
        action_log_probs = self.dist.log_probs(action).mean(-1, keepdim=True)
        dist_entropy = self.dist.entropy().mean()
        return value, action_log_probs, dist_entropy

    def print_grad(self):
        """Print the mean gradient and ``requires_grad`` flag of every parameter.

        Parameters whose ``grad`` is ``None`` (no backward pass yet, or not
        part of the last graph) are reported with the value ``None`` instead
        of raising ``AttributeError``.
        """
        for name, param in self.named_parameters():
            grad = param.grad
            grad_mean = grad.mean() if grad is not None else None
            print('name: ', name, ' value: ', grad_mean, 'p.requires_grad', param.requires_grad)