-
Notifications
You must be signed in to change notification settings - Fork 126
/
policy.py
65 lines (50 loc) · 1.69 KB
/
policy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/bin/python
import sys
sys.path.append("./secret")
import grid_mdp
import random
random.seed(0)
import numpy as np
class Policy:
    """Epsilon-greedy policy with a linear Q-function approximation.

    Q(fea, a) = dot(phi(fea, a), theta), where phi places the state
    feature vector into the weight block belonging to action ``a``
    (one contiguous block per action, all other blocks zero).
    """

    def __init__(self, grid, epsilon):
        """Probe ``grid`` once to size theta; start with zero weights.

        grid: environment exposing ``.actions``, ``.start()``, and
              ``.receive(action) -> (terminal, features, reward)``.
        epsilon: exploration probability in [0, 1].
        """
        self.actions = grid.actions
        # One interaction just to discover the feature-vector length.
        grid.start()
        t, hats, r = grid.receive(self.actions[0])
        # Fix: original built a Python list (with Py2-only `xrange`) and
        # applied a no-op transpose to a 1-D array; np.zeros yields the
        # same flat weight vector directly.
        self.theta = np.zeros(len(hats) * len(self.actions))
        self.epsilon = epsilon

    def get_fea_vec(self, fea, a):
        """Return phi(fea, a): ``fea`` copied into action ``a``'s block.

        An action not found in ``self.actions`` maps to block 0,
        matching the original fallback behavior.
        """
        f = np.zeros(len(self.theta))
        idx = 0
        for i, action in enumerate(self.actions):
            if a == action:
                idx = i
        block = len(fea)
        # Equivalent to the original element-by-element copy loop.
        f[idx * block: idx * block + block] = fea
        return f

    def qfunc(self, fea, a):
        """Approximate Q(fea, a) as the dot product phi(fea, a) . theta."""
        return np.dot(self.get_fea_vec(fea, a), self.theta)

    def epsilon_greedy(self, fea):
        """Sample an action: greedy w.p. 1 - epsilon, else uniform.

        Ties on the maximum Q keep the earliest action (strict ``<``
        comparison, as in the original).
        """
        # Index of the first action attaining the maximal Q value.
        qvals = [self.qfunc(fea, a) for a in self.actions]
        amax = 0
        for i, q in enumerate(qvals):
            if qvals[amax] < q:
                amax = i
        # Mixture distribution: epsilon spread uniformly, remainder on
        # the greedy arm.
        n = len(self.actions)
        pro = [self.epsilon / n] * n
        pro[amax] += 1.0 - self.epsilon
        # Inverse-CDF sampling from `pro`.
        r = random.random()
        s = 0.0
        for i in range(n):
            s += pro[i]
            if s >= r:
                return self.actions[i]
        # Guard against floating-point undershoot of the cumulative sum.
        return self.actions[-1]