# Reinforcement Learning

# Value function approximation

This notebook presents value function approximation for large-scale reinforcement learning.

We use a neural network with a single hidden layer (scikit-learn's `MLPRegressor` with its default architecture) to approximate the value function.

In [23]:
import numpy as np
import os
from matplotlib import pyplot as plt
from classes.model import TicTacToe, ConnectFour
from classes.agent import Agent
from sklearn.neural_network import MLPRegressor

## Tic-Tac-Toe

In [24]:
game = TicTacToe()

# use warm start to update the neural network with new samples:
# each later call to fit() starts from the weights left by the previous call
regressor = MLPRegressor(warm_start=True)

# input = player position + adversary position
# (two binary indicators per board cell, assuming Board_Size is the board shape)
input_dim = 2 * np.prod(game.Board_Size)

# init the regressor with a single dummy sample (all-ones input, target 0)
# so that predict() can be called before any real sample has been collected
regressor.fit(np.ones((1, input_dim)), np.zeros(1))

In [25]:
#np.array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [26]:
# One-hot probe row sized from `input_dim` rather than a hard-coded 18-entry
# vector, so the check keeps working for boards other than 3x3 Tic-Tac-Toe.
probe = np.zeros((1, input_dim))
probe[0, 0] = 1
# predict() returns one value per input row, hence the expected shape (1,)
regressor.predict(probe).shape

(1,)

In [27]:
# agent built without an explicit policy
# (presumably falls back to a default policy — confirm in classes.agent)
agent = Agent(game)
state = game.state
# sample one episode; only `states` and `rewards` are used further down
stop, states, rewards = agent.get_episode()

In [28]:
"""X = []
X.append(encode_state(states[0]))
X.append(encode_state(states[1]))
X = np.array(X)
X.shape"""

'X = []\nX.append(encode_state(states[0]))\nX.append(encode_state(states[1]))\nX = np.array(X)\nX.shape'

In [29]:
# display the sample matrix X (the name must have been bound by an earlier cell)
X

NameError: name 'X' is not defined

In [None]:
# sanity check: fit on X with a dummy target of 1 for each of its two rows
regressor.fit(X, np.array([1,1]))

In [None]:
# value estimates returned by the regressor for the rows of X
regressor.predict(X)

array([0.0820594 , 0.03587855])

In [33]:
def encode_state(state):
    """Return the binary code of a state.

    The board is read from ``state[1]`` and flattened; the code is the
    concatenation of two boolean masks of the same length: cells with a
    positive value first, then cells with a negative value.
    """
    cells = state[1].flatten()
    positive_mask = cells > 0
    negative_mask = cells < 0
    return np.concatenate((positive_mask, negative_mask))

In [30]:
def policy(state, regressor, game, player=1):
    """Greedy policy with respect to the current value approximation.

    Parameters
    ----------
    state : tuple
        Pair whose first item is the player to move; the second item is
        encoded by ``encode_state``.
    regressor : object
        Fitted estimator exposing ``predict``; it scores the encoded next states.
    game : object
        Game model exposing ``get_actions(state)`` and
        ``get_next_state(state, action)``.
    player : int, default 1
        Player this policy plays for.

    Returns
    -------
    probs, actions : list, list
        ``([1], [best_action])`` when it is ``player``'s turn and at least one
        action is available, otherwise two empty lists.
    """
    current_player, _ = state
    if current_player != player:
        return [], []

    actions = game.get_actions(state)
    if len(actions) == 0:
        # nothing to choose from: predict / np.argmax would raise on empty input
        return [], []

    next_states = [game.get_next_state(state, action) for action in actions]
    # score all candidate next states with a single predict() call
    codes = np.array([encode_state(next_state) for next_state in next_states])
    values = regressor.predict(codes)
    # NOTE(review): the highest predicted value is always selected, whatever
    # `player` is — confirm this is intended when player = -1.
    return [1], [actions[np.argmax(values)]]

In [31]:
def train(agent, n_games=500):
    """Collect samples, train the regressor through Monte-Carlo learning and return the agent.

    Each of the ``n_games`` episodes played by ``agent`` contributes one sample
    per visited state: the encoded state as input and the last reward of the
    episode as target. The notebook-level ``regressor`` is then fitted on all
    samples (it is updated in place).

    Parameters
    ----------
    agent : Agent
        Agent used to generate the episodes; its ``model`` and ``player``
        attributes are reused for the returned agent.
    n_games : int, default 500
        Number of episodes to sample.

    Returns
    -------
    Agent
        New agent on the same game, following the greedy ``policy`` with
        respect to the updated regressor.
    """
    features, targets = [], []
    for _ in range(n_games):
        _, states, rewards = agent.get_episode()
        # Monte-Carlo target: the final reward is assigned to every state of the episode
        gain = rewards[-1]
        features.extend(encode_state(state) for state in states)
        targets.extend([gain] * len(states))

    if targets:
        # shuffle inputs and targets with one common permutation
        order = np.random.permutation(len(targets))
        X_train = np.array(features)[order]
        y_train = np.array(targets)[order]
        regressor.fit(X_train, y_train)

    game = agent.model
    player = agent.player
    return Agent(game, lambda state: policy(state, regressor, game, player))

In [34]:
# agent following the greedy policy of the current regressor
# (`player` is left at its default value of 1 in `policy`)
agent = Agent(game, policy = lambda state: policy(state, regressor, game))
# gains obtained by this agent (presumably over simulated games — confirm in classes.agent)
gains = agent.get_gains()

In [None]:
# distinct gain values and the number of times each one occurred
np.unique(gains, return_counts=True)

In [None]:
# train with the default number of games; `agent` is rebound to the agent returned by train()
agent = train(agent)

In [None]:
# gains of the agent returned by train()
gains = agent.get_gains()

In [None]:
# distinct gain values and their counts, to compare with the counts obtained before training
np.unique(gains, return_counts=True)

## To do

* Complete the functions ``policy`` and ``train``.
* Observe the strength of the player after training (you might adapt the number of games).
* Do the same with a stronger adversary (e.g., a perfect player).
* Do the same experiments with Connect Four.
* Test another neural network architecture (e.g., with 2 hidden layers).

## To do (optional)

Compare the above algorithm based on Monte-Carlo learning with:
* TD learning
* Q-learning