# Reinforcement Learning

# 4. Online control

This notebook presents the **online control** of an agent by SARSA and Q-learning.

In [1]:
import numpy as np

In [2]:
from model import TicTacToe, Nim, ConnectFour
from agent import Agent, OnlineControl
from dynamic import ValueIteration

## To do

* Complete the class ``SARSA`` and test it on Tic-Tac-Toe.
* Complete the class ``QLearning`` and test it on Tic-Tac-Toe.
* Compare these algorithms on Tic-Tac-Toe (playing first) and Nim (playing second), using a random adversary, then a perfect adversary. Comment on your results.
* Test these algorithms on Connect 4 against a random adversary. Comment on your results.

## SARSA

In [3]:
class SARSA(OnlineControl):
    """Online control by SARSA."""

    def update_values(self, state=None, horizon=100, epsilon=0.5):
        """Play one episode and update the action-value estimates after each step.

        ``state`` is handed to ``self.model.reset``, ``horizon`` bounds the number
        of steps, and ``epsilon`` is forwarded to ``self.randomize_best_action``.
        """
        self.model.reset(state)
        current = self.model.state
        # Nothing to learn when the episode starts in a terminal state.
        if self.model.is_terminal(current):
            return
        action = self.randomize_best_action(current, epsilon=epsilon)
        for _ in range(horizon):
            key = self.model.encode(current)
            self.action_count[key][action] += 1
            reward, done = self.model.step(action)
            successor = self.model.state
            successor_key = self.model.encode(successor)
            if done:
                # No bootstrap term once the episode has ended.
                target = reward
            else:
                # Bootstrap from the action that will actually be played next.
                following = self.randomize_best_action(successor, epsilon=epsilon)
                target = reward + self.gamma * self.action_value[successor_key][following]
            error = target - self.action_value[key][action]
            visits = self.action_count[key][action]
            # Step size 1 / visits: running average of the sampled targets.
            self.action_value[key][action] += error / visits
            if done:
                return
            current, action = successor, following

## Q-learning

In [4]:
class QLearning(OnlineControl):
    """Online control by Q-learning."""

    def update_values(self, state=None, horizon=100, epsilon=0.5):
        """Play one episode and update the action-value estimates after each step.

        ``state`` is handed to ``self.model.reset``, ``horizon`` bounds the number
        of steps, and ``epsilon`` is forwarded to ``self.randomize_best_action``.
        """
        self.model.reset(state)
        current = self.model.state
        # Nothing to learn when the episode starts in a terminal state.
        if self.model.is_terminal(current):
            return
        for _ in range(horizon):
            key = self.model.encode(current)
            action = self.randomize_best_action(current, epsilon=epsilon)
            self.action_count[key][action] += 1
            reward, done = self.model.step(action)
            successor = self.model.state
            successor_key = self.model.encode(successor)
            if done:
                # No bootstrap term once the episode has ended.
                target = reward
            else:
                # Bootstrap from the first action returned by get_best_actions,
                # regardless of which action is played next.
                greedy = self.get_best_actions(successor)[0]
                target = reward + self.gamma * self.action_value[successor_key][greedy]
            error = target - self.action_value[key][action]
            visits = self.action_count[key][action]
            # Step size 1 / visits: running average of the sampled targets.
            self.action_value[key][action] += error / visits
            if done:
                return
            current = successor

## To do

### TicTacToe

#### Random policy

In [5]:
# Game class instantiated by the following cells, until Game is reassigned.
Game = TicTacToe

In [6]:
# Baseline: mean gain over 1000 runs of an agent created without an explicit policy
# (per the section heading this is the random policy -- presumably Agent's default).
game = Game()
agent = Agent(game)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

0.301

#### SARSA

##### Random adversary

In [7]:
# Train SARSA for n_games episodes, then evaluate the resulting policy.
# Game() is built with its default adversary (random, per the section heading).
game = Game()
Control = SARSA
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the policy extracted from the learned action values over 1000 runs.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

0.814

##### Perfect adversary

In [None]:
# Build the adversary: run ValueIteration on a default game and keep the second
# policy returned by get_perfect_players().
game = Game()
algo_optimal = ValueIteration(game)
_, adversary_policy = algo_optimal.get_perfect_players()
# Recreate the game against that adversary, then train SARSA for n_games episodes.
game = Game(adversary_policy=adversary_policy)
Control = SARSA
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the learned policy over 1000 runs against the same adversary.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

#### Q-learning

##### Random adversary

In [None]:
# Train Q-learning for n_games episodes, then evaluate the resulting policy.
# Game() is built with its default adversary (random, per the section heading).
game = Game()
Control = QLearning
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the policy extracted from the learned action values over 1000 runs.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

##### Perfect adversary

In [None]:
# Build the adversary: run ValueIteration on a default game and keep the second
# policy returned by get_perfect_players().
game = Game()
algo_optimal = ValueIteration(game)
_, adversary_policy = algo_optimal.get_perfect_players()
# Recreate the game against that adversary, then train Q-learning for n_games episodes.
game = Game(adversary_policy=adversary_policy)
Control = QLearning
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the learned policy over 1000 runs against the same adversary.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

### Nim

#### Random policy

In [8]:
# Game class instantiated by the following cells, until Game is reassigned.
Game = Nim

In [9]:
# Baseline on Nim with play_first=False: mean gain over 1000 runs of an agent
# created without an explicit policy (the random policy, per the section heading).
game = Game(play_first=False)
agent = Agent(game)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

0.004

#### SARSA

##### Random adversary

In [10]:
# Train SARSA on Nim (play_first=False) for n_games episodes, then evaluate the policy.
# The game is built with its default adversary (random, per the section heading).
game = Game(play_first=False)
Control = SARSA
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the policy extracted from the learned action values over 1000 runs.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

0.582

##### Perfect adversary

In [12]:
# Build the adversary: run ValueIteration on a Nim game (play_first=False) and keep
# the second policy returned by get_perfect_players().
game = Game(play_first=False)
algo_optimal = ValueIteration(game)
_, adversary_policy = algo_optimal.get_perfect_players()
# Recreate the game against that adversary, then train SARSA for n_games episodes.
game = Game(adversary_policy=adversary_policy, play_first=False)
Control = SARSA
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the learned policy over 1000 runs against the same adversary.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

-0.314

#### Q-learning

##### Random adversary

In [11]:
# Train Q-learning on Nim (play_first=False) for n_games episodes, then evaluate the policy.
# The game is built with its default adversary (random, per the section heading).
game = Game(play_first=False)
Control = QLearning
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the policy extracted from the learned action values over 1000 runs.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

0.694

##### Perfect adversary

In [None]:
# Build the adversary: run ValueIteration on a Nim game (play_first=False) and keep
# the second policy returned by get_perfect_players().
game = Game(play_first=False)
algo_optimal = ValueIteration(game)
_, adversary_policy = algo_optimal.get_perfect_players()
# Recreate the game against that adversary, then train Q-learning for n_games episodes.
game = Game(adversary_policy=adversary_policy, play_first=False)
Control = QLearning
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the learned policy over 1000 runs against the same adversary.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

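Comments: It seems that Q-learning performs better than SARSA: on Nim against a random adversary, Q-learning reaches an average gain of 0.694 versus 0.582 for SARSA (random policy: 0.004). The other Q-learning cells have no recorded output, so this comparison rests on that single experiment.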

### Connect Four

#### Random policy

In [13]:
# Game class instantiated by the following cells.
Game = ConnectFour

In [None]:
# Baseline: mean gain over 1000 runs of an agent created without an explicit policy
# (per the section heading this is the random policy -- presumably Agent's default).
game = Game()
agent = Agent(game)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

#### SARSA

##### Random adversary

In [14]:
# Train SARSA for n_games episodes, then evaluate the resulting policy.
# Game() is built with its default adversary (random, per the section heading).
game = Game()
Control = SARSA
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the policy extracted from the learned action values over 1000 runs.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

0.138

#### Q-Learning

##### Random adversary

In [None]:
# Train Q-learning for n_games episodes, then evaluate the resulting policy.
# Game() is built with its default adversary (random, per the section heading).
game = Game()
Control = QLearning
algo = Control(game)
n_games = 1000
for i in range(n_games):
    algo.update_values(epsilon=0.1)
# Evaluate the policy extracted from the learned action values over 1000 runs.
policy = algo.get_policy()
agent = Agent(game, policy)
gains = agent.get_gains(n_runs=1000)
np.mean(gains)

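Comment: As on Nim, Q-learning is expected to perform better than SARSA. Only the SARSA result (average gain 0.138) is recorded above; the random-policy and Q-learning cells still need to be run to confirm this.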