-
Notifications
You must be signed in to change notification settings - Fork 262
/
qlearner.py
158 lines (120 loc) · 4.84 KB
/
qlearner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
from collections import OrderedDict
from typing import Dict, Union
from axelrod.action import Action, actions_to_str
from axelrod.player import Player
# Type alias for a payoff value: game scores may be ints or floats.
Score = Union[int, float]
# Module-level shorthand for the two actions (Cooperate, Defect).
C, D = Action.C, Action.D
class RiskyQLearner(Player):
    """A player who learns the best strategies through the q-learning
    algorithm.

    This Q learner is quick to come to conclusions and doesn't care about the
    future.

    Names:

    - Risky Q Learner: Original name by Geraint Palmer
    """

    name = "Risky QLearner"
    classifier = {
        "memory_depth": float("inf"),  # Long memory
        "stochastic": True,
        "long_run_time": False,
        "inspects_source": False,
        "manipulates_source": False,
        "manipulates_state": False,
    }

    # Q-learning hyper-parameters. Subclasses override learning_rate and
    # discount_rate to obtain differently tempered learners.
    learning_rate = 0.9  # weight given to newly observed rewards
    discount_rate = 0.9  # weight given to the estimated value of the next state
    action_selection_parameter = 0.1  # epsilon of the epsilon-soft policy
    memory_length = 12  # number of opponent moves encoded into the state key

    def __init__(self) -> None:
        """Initialises the player by picking a random strategy."""
        super().__init__()
        # Set this explicitly, since the constructor of super will not pick it up
        # for any subclasses that do not override methods using random calls.
        self.classifier["stochastic"] = True
        self.prev_action = None  # type: Action
        self.original_prev_action = None  # type: Action
        self.score = 0
        # Q-values (per state, per action) and state values, keyed by the
        # hashable state string produced by find_state. The empty string is
        # the initial (pre-history) state.
        self.Qs = OrderedDict({"": OrderedDict(zip([C, D], [0, 0]))})
        self.Vs = OrderedDict({"": 0})
        self.prev_state = ""

    def receive_match_attributes(self):
        """Cache the game's payoff matrix so rewards can be looked up by
        (own action, opponent action)."""
        (R, P, S, T) = self.match_attributes["game"].RPST()
        self.payoff_matrix = {C: {C: R, D: S}, D: {C: T, D: P}}

    def strategy(self, opponent: Player) -> Action:
        """Runs a qlearn algorithm while the tournament is running.

        On the first turn a random action is chosen; thereafter the Q-table
        is updated with the reward from the previous round before the next
        action is selected epsilon-softly.
        """
        if len(self.history) == 0:
            self.prev_action = self._random.random_choice()
            self.original_prev_action = self.prev_action
        state = self.find_state(opponent)
        reward = self.find_reward(opponent)
        if state not in self.Qs:
            # Lazily initialise Q and V entries for newly seen states.
            self.Qs[state] = OrderedDict(zip([C, D], [0, 0]))
            self.Vs[state] = 0
        self.perform_q_learning(self.prev_state, state, self.prev_action, reward)
        action = self.select_action(state)
        self.prev_state = state
        self.prev_action = action
        return action

    def select_action(self, state: str) -> Action:
        """
        Selects the action based on the epsilon-soft policy: with
        probability 1 - epsilon pick the greedy (max-Q) action, otherwise
        pick uniformly at random.
        """
        rnd_num = self._random.random()
        p = 1.0 - self.action_selection_parameter
        if rnd_num < p:
            return max(self.Qs[state], key=lambda x: self.Qs[state][x])
        return self._random.random_choice()

    def find_state(self, opponent: Player) -> str:
        """
        Finds the my_state (the opponents last n moves +
        its previous proportion of playing C) as a hashable state

        NOTE(review): despite the docstring, this formats the raw
        cooperation *count* (opponent.cooperations), not a proportion —
        left unchanged here since the state keys feed the learned Q-table
        and changing them would alter behavior; confirm intent upstream.
        """
        prob = "{:.1f}".format(opponent.cooperations)
        action_str = actions_to_str(opponent.history[-self.memory_length :])
        return action_str + prob

    def perform_q_learning(self, prev_state: str, state: str, action: Action, reward):
        """
        Performs the qlearning algorithm: exponentially smooth the Q-value
        of (prev_state, action) towards reward + discounted value of the new
        state, then refresh V(prev_state) as the max over its Q-values.
        """
        self.Qs[prev_state][action] = (1.0 - self.learning_rate) * self.Qs[prev_state][
            action
        ] + self.learning_rate * (reward + self.discount_rate * self.Vs[state])
        self.Vs[prev_state] = max(self.Qs[prev_state].values())

    def find_reward(self, opponent: Player) -> Score:
        """
        Finds the reward gained on the last iteration: the payoff of
        (our previous action, opponent's previous action). On the first
        turn the opponent's "previous" action is drawn at random.

        (Fixed return annotation: the method returns a single Score looked
        up from the payoff matrix, not the matrix itself.)
        """
        if len(opponent.history) == 0:
            opp_prev_action = self._random.random_choice()
        else:
            opp_prev_action = opponent.history[-1]
        return self.payoff_matrix[self.prev_action][opp_prev_action]
class ArrogantQLearner(RiskyQLearner):
    """A Q-learning player, differing from RiskyQLearner only in its
    hyper-parameters.

    This Q learner jumps to quick conclusions and cares about the future.

    Names:

    - Arrogant Q Learner: Original name by Geraint Palmer
    """

    name = "Arrogant QLearner"
    # High learning rate: recent rewards dominate; low discount rate:
    # future state values carry little weight.
    discount_rate = 0.1
    learning_rate = 0.9
class HesitantQLearner(RiskyQLearner):
    """A Q-learning player, differing from RiskyQLearner only in its
    hyper-parameters.

    This Q learner is slower to come to conclusions and does not look ahead much.

    Names:

    - Hesitant Q Learner: Original name by Geraint Palmer
    """

    name = "Hesitant QLearner"
    # Low learning rate: Q-values change slowly; high discount rate:
    # estimated future value weighs heavily in updates.
    discount_rate = 0.9
    learning_rate = 0.1
class CautiousQLearner(RiskyQLearner):
    """A Q-learning player, differing from RiskyQLearner only in its
    hyper-parameters.

    This Q learner is slower to come to conclusions and wants to look ahead
    more.

    Names:

    - Cautious Q Learner: Original name by Geraint Palmer
    """

    name = "Cautious QLearner"
    # Both rates low: updates are gradual and future value is discounted
    # heavily.
    discount_rate = 0.1
    learning_rate = 0.1