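# Aprendizado por Reforço e o Dilema do Prisioneiro
Neste estudo contaremos com a ajuda da biblioteca Axelrod (citar artigo) para simular rodadas do jogo Dilema do Prisioneiro. O jogo foi formulado por Merrill Flood e Melvin Dresher em 1950 e recebeu seu nome e sua forma clássica de Albert W. Tucker.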

# Referências

Axelrod:

                  https://hash.ai/blog/improving-the-prisoners-dilemma-with-q-learning
                  https://github.com/Axelrod-Python/Axelrod
                  https://axelrod.readthedocs.io/en/fix-documentation/tutorials/index.html


Game Theory

In [1]:
!pip install axelrod

Collecting axelrod
  Downloading Axelrod-4.13.0-py2.py3-none-any.whl (348 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.4/348.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: axelrod
Successfully installed axelrod-4.13.0


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import axelrod as axl
from matplotlib import colors
from matplotlib.ticker import PercentFormatter



# The bare string below is an informal description of a payoff scheme. It is
# an expression statement with no effect at runtime (it is neither assigned
# nor printed), so it only serves as an in-cell note.
# NOTE(review): MyQLearner reads its payoffs from
# match_attributes["game"].RPST() rather than from the figures quoted here;
# confirm that the values in this note match the game actually being played.
"""
The payoff is that in case the initiator agent cooperate they are rewarded
         2 and 0 points, if the participant, either cooperates back or betrays
On the other hand, if the initiator decides to betray it will be rewarded
         3 and 1 points, depending on the participant's actions

         Matrix = CC CD
                  DC DD

                  C = Cooperates, D = Defects
                  CD = Initiator cooperated and participant defected


"""
# Prints a single space; the only visible output of this cell.
print(' ')

 


In [3]:
from collections import OrderedDict
from typing import Dict, Union

from axelrod.action import Action, actions_to_str
from axelrod.player import Player

# Numeric type of a single payoff value.
Score = Union[int, float]

# Short aliases for the two possible moves (C and D members of Action).
C, D = Action.C, Action.D

class MyQLearner(Player):
    """Tabular Q-learning player with a decaying exploration rate.

    The state is a string built from the opponent's last ``memory_length``
    moves followed by the opponent's overall cooperation ratio rounded to one
    decimal place. Action values are kept in ``Qs`` (state -> action -> value)
    and the best value per state in ``Vs``.

    Class attributes act as tunable hyper-parameters:

    * ``learning_rate`` / ``discount_rate``: weights of the Q-learning update.
    * ``action_selection_parameter``: initial probability mass reserved for
      random exploration.
    * ``exploration_decay_step`` / ``exploration_decay_period``: amount
      subtracted from the exploration rate, and how many turns pass between
      subtractions.
    * ``initial_value``: optimistic starting value for unseen Q/V entries.
    * ``memory_length``: number of most recent opponent moves in the state.
    """

    name = "My QLearner"
    classifier = {
        "memory_depth": float("inf"),  # Long memory
        "stochastic": True,
        "long_run_time": False,
        "inspects_source": False,
        "manipulates_source": False,
        "manipulates_state": False,
    }
    learning_rate = 0.9
    discount_rate = 0.1
    action_selection_parameter = 0.15
    memory_length = 10
    exploration_decay_step = 0.015
    exploration_decay_period = 5
    initial_value = 5

    def __init__(self) -> None:
        """Set up empty learning tables and the starting exploration rate."""
        super().__init__()

        # Set this explicitly, since the constructor of super will not pick it up
        # for any subclasses that do not override methods using random calls.
        self.classifier["stochastic"] = True

        # select_action() decays the exploration rate on the instance. Start
        # every (re-)initialisation from the class default so that a decayed
        # value does not leak into the next match when the player is reset.
        # NOTE(review): relies on the library re-running __init__ on reset;
        # confirm against the installed Axelrod version.
        self.action_selection_parameter = type(self).action_selection_parameter

        self.prev_action = None  # Action played on the previous turn
        self.original_prev_action = None  # First (randomly drawn) action
        self.score = 0
        self.Qs = OrderedDict(
            {"": OrderedDict(zip([C, D], [self.initial_value, self.initial_value]))}
        )
        self.Vs = OrderedDict({"": self.initial_value})
        self.prev_state = ""

    def receive_match_attributes(self):
        """Build the payoff lookup ``payoff_matrix[my_move][opponent_move]``."""
        (R, P, S, T) = self.match_attributes["game"].RPST()
        self.payoff_matrix = {C: {C: R, D: S}, D: {C: T, D: P}}

    def strategy(self, opponent: Player) -> Action:
        """Update the Q-table with the last outcome and choose the next move."""
        if len(self.history) == 0:
            # No real previous move exists yet: draw one at random so that the
            # first learning update has an action to attribute a reward to.
            self.prev_action = self._random.random_choice()
            self.original_prev_action = self.prev_action
        state = self.find_state(opponent)
        reward = self.find_reward(opponent)
        if state not in self.Qs:
            self.Qs[state] = OrderedDict(
                zip([C, D], [self.initial_value, self.initial_value])
            )
            self.Vs[state] = self.initial_value
        self.perform_q_learning(self.prev_state, state, self.prev_action, reward)
        action = self.select_action(state)
        self.prev_state = state
        self.prev_action = action
        return action

    def select_action(self, state: str) -> Action:
        """Pick the greedy action with probability ``1 - epsilon``, else a random one.

        ``epsilon`` is ``action_selection_parameter``. Every
        ``exploration_decay_period`` turns it is reduced by
        ``exploration_decay_step`` and never allowed to drop below zero. The
        reduction takes effect from the following call onwards.
        """
        rnd_num = self._random.random()
        p = 1.0 - self.action_selection_parameter

        turns_played = len(self.history)
        if turns_played > 0 and turns_played % self.exploration_decay_period == 0:
            # Clamp at 0 so the parameter always remains a valid probability.
            self.action_selection_parameter = max(
                0.0, self.action_selection_parameter - self.exploration_decay_step
            )

        if rnd_num < p:
            return max(self.Qs[state], key=lambda x: self.Qs[state][x])
        return self._random.random_choice()

    def find_state(self, opponent: Player) -> str:
        """Return the current state as a hashable string.

        The string is the opponent's last ``memory_length`` moves followed by
        the opponent's cooperation ratio formatted with one decimal place
        ("0.0" when the opponent has not played yet).
        """
        if len(opponent.history) == 0:
            prob = "0.0"
        else:
            prob = "{:.1f}".format(opponent.cooperations / len(opponent.history))

        action_str = actions_to_str(opponent.history[-self.memory_length:])

        return action_str + prob

    def perform_q_learning(self, prev_state: str, state: str, action: Action, reward):
        """Apply one Q-learning update to ``Qs[prev_state][action]``.

        The new value blends the old estimate with
        ``reward + discount_rate * Vs[state]`` using ``learning_rate``, then
        ``Vs[prev_state]`` is refreshed with the best action value.
        """
        self.Qs[prev_state][action] = (1.0 - self.learning_rate) * self.Qs[prev_state][
            action
        ] + self.learning_rate * (reward + self.discount_rate * self.Vs[state])
        self.Vs[prev_state] = max(self.Qs[prev_state].values())

    def find_reward(self, opponent: Player) -> Score:
        """Return the payoff obtained for ``prev_action`` on the last turn.

        Before the opponent has played, a randomly drawn opponent move is used
        as a stand-in so that the first update still receives a reward.
        """
        if len(opponent.history) == 0:
            opp_prev_action = self._random.random_choice()
        else:
            opp_prev_action = opponent.history[-1]
        return self.payoff_matrix[self.prev_action][opp_prev_action]


In [4]:
# Round-robin tournament: the custom learner against the library's four
# Q-learner variants and Grudger.
players = [
    MyQLearner(),
    axl.ArrogantQLearner(),
    axl.HesitantQLearner(),
    axl.CautiousQLearner(),
    axl.RiskyQLearner(),
    axl.Grudger(),
]

tournament = axl.Tournament(players=players, turns=200, repetitions=5)
results = tournament.play(progress_bar=False)
print('\n')

# One line per player: its name and its score summed over all repetitions.
for player_name, repetition_scores in zip(results.players, results.scores):
    print(player_name, np.sum(repetition_scores))



My QLearner 16776
Arrogant QLearner 11129
Hesitant QLearner 11155
Cautious QLearner 11221
Risky QLearner 11179
Grudger 9075


In [None]:
# Head-to-head evaluation: a fresh MyQLearner plays N_MATCHES matches of 200
# turns against each competitor. Average scores and cooperation counts are
# stored as rows [MyQLearner, competitor] in total_scores / total_coop.
N_MATCHES = 500

competitors = [MyQLearner(), axl.TitForTat(), axl.Cooperator(), axl.Defector()]
n = len(competitors)
total_scores = np.zeros((n, 2))
total_coop = np.zeros((n, 2))

for x, rival in enumerate(competitors):
    cooperation = np.zeros(2)
    scores = np.zeros(2)
    for _ in range(N_MATCHES):
        # A new learner each match, so nothing learned carries over.
        players = [MyQLearner(), rival]

        match2 = axl.Match(players, turns=200, match_attributes={"length": float('inf')})
        match2.play()

        # Both calls return one value per player; query each only once.
        scores += match2.final_score()
        cooperation += match2.cooperation()

    total_coop[x] = cooperation / N_MATCHES
    total_scores[x] = scores / N_MATCHES
    print("Coop: ", cooperation / N_MATCHES)
    print("Placar médio: ", scores / N_MATCHES, "\n")


Coop:  [139.182 140.496]
Placar médio:  [524.222 517.652] 

Coop:  [161.388 161.54 ]
Placar médio:  [542.532 541.772] 



In [None]:
# Grouped bar chart of the average score per match: for every opponent
# strategy, one bar for the learner and one for the opponent.
strats = [player.name for player in competitors]
dict_scores = {
    "Pontuação": list(total_scores[:, 0]),
    "Oponente": list(total_scores[:, 1]),
}

x = np.arange(len(strats))  # the label locations
width = 0.25  # the width of the bars
step = width * 1.2  # distance between neighbouring bars of one group

fig, ax = plt.subplots(figsize=(12, 6), layout='constrained')

for multiplier, (attribute, measurement) in enumerate(dict_scores.items()):
    rects = ax.bar(x + step * multiplier, measurement, width, label=attribute)
    ax.bar_label(rects, padding=3)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Pontuação média')
ax.set_title('Desempenho individual da estratégia ' + strats[0])
# Place each tick at the centre of its group of bars (midpoint between the
# first and last bar offsets) instead of under the second bar.
ax.set_xticks(x + step * (len(dict_scores) - 1) / 2, strats)
ax.legend(loc='upper left', ncols=3)
ax.set_ylim(0, 1100)
plt.savefig("desempenho_" + strats[0] + ".png")

plt.show()

In [None]:
# Grouped bar chart of the average number of cooperations per match: for
# every opponent strategy, one bar for the learner and one for the opponent.
strats = [player.name for player in competitors]
# The first series holds the learner's cooperation counts, so it is labelled
# accordingly (it previously reused the "Pontuação" label of the score chart).
dict_scores = {
    "Cooperação": list(total_coop[:, 0]),
    "Oponente": list(total_coop[:, 1]),
}

x = np.arange(len(strats))  # the label locations
width = 0.25  # the width of the bars
step = width * 1.2  # distance between neighbouring bars of one group

fig, ax = plt.subplots(figsize=(12, 6), layout='constrained')

for multiplier, (attribute, measurement) in enumerate(dict_scores.items()):
    rects = ax.bar(x + step * multiplier, measurement, width, label=attribute)
    ax.bar_label(rects, padding=3)

# Add some text for labels, title and custom x-axis tick labels, etc.
ax.set_ylabel('Cooperação média' )
ax.set_title('Desempenho individual da estratégia ' + strats[0])
# Place each tick at the centre of its group of bars (midpoint between the
# first and last bar offsets) instead of under the second bar.
ax.set_xticks(x + step * (len(dict_scores) - 1) / 2, strats)
ax.legend(loc='upper left', ncols=3)
ax.set_ylim(0, 250)
plt.savefig("coop_" + strats[0] + ".png")

plt.show()

In [None]:
# Runs 25 independent tournaments. Counts how often "My QLearner" is ranked
# first and accumulates, per player index, the total score over all runs.
# The counter is deliberately not called `sum`: in a notebook that name would
# replace the builtin for every cell executed afterwards.
n_participants = 10
first_place_count = 0
sum2 = np.zeros(n_participants)  # read by the plotting cell that follows

for _ in range(25):
    # Same line-up, in the same order, every run: five strategies from
    # axl.axelrod_first_strategies followed by the five Q-learners, so index i
    # of sum2 always refers to the same player.
    tournament_participants = [
        s() for s in axl.axelrod_first_strategies[:n_participants - 5]
    ]
    tournament_participants += [
        MyQLearner(),
        axl.ArrogantQLearner(),
        axl.HesitantQLearner(),
        axl.CautiousQLearner(),
        axl.RiskyQLearner(),
    ]

    tournament = axl.Tournament(
        players=tournament_participants,
        turns=200,
        repetitions=5,
        noise=0.0,
    )

    results = tournament.play(progress_bar=False)

    if results.ranked_names[0] == "My QLearner":
        first_place_count += 1
    # results.scores has one row per player and one column per repetition.
    sum2 += np.sum(results.scores, axis=1)

print("Frequencia de pódio: ", first_place_count)



In [None]:
# Bar chart of each player's average total score over the 25 tournaments,
# ordered from lowest to highest.
fig, ax = plt.subplots(figsize=(15, 8))

# sum2 is indexed like results.players, so names and averages are sorted
# together. Taking the names from the last tournament's ranking while sorting
# the averages separately could attach a bar to the wrong strategy.
mean_scores = sum2 / 25
order = np.argsort(mean_scores)
counts = mean_scores[order]
# Shorten names: drop anything after ":" and keep the part after " by ".
names = [results.players[i].split(":")[0].split(" by ")[-1] for i in order]

# Named bar_colors so the imported matplotlib `colors` module is not clobbered.
bar_colors = ["#1984c5", "#22a7f0", "#63bff0", "#e1a692", "#de6e56", "#e14b31", "#c23728"]
axis = ax.bar(names, counts, label=names, color=bar_colors)
ax.bar_label(axis, padding=2)

plt.ylim([np.min(counts) - 100, np.max(counts) + 150])
ax.set_ylabel('Pontuação média em jogos de 200 rodadas e 5 repetições')
ax.set_title('Estratégias fixas X Agente de Q-Learning')
ax.legend()
plt.savefig("tournament6.png")

plt.show()

In [None]:
# For the most recent tournament, print each player's name, its score per
# repetition and its win count per repetition.
for player_name, player_scores, player_wins in zip(
    results.players, results.scores, results.wins
):
    print(player_name, player_scores, player_wins)

# Experiment notes:
# - random exploration starts at 15% and decays by 1.5% every 5 turns
# - Q-values initialised to 5
# - learning rate 0.5
#   NOTE(review): MyQLearner currently sets learning_rate = 0.9; confirm
#   which value these notes refer to.