<a href="https://colab.research.google.com/github/EmmanuelJhno/Reinforcement_learning/blob/master/TP3_2020v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TP3: Model Free

### Description: 

In this session, we are exploring a simple version of a game
(simpler version of Perudo: https://www.youtube.com/watch?v=die0n-eonl8).
Using the rules of the game, we first construct an environment. 
  
Below is a short piece of code in which the game is played using a random strategy.

Also, there are two functions to display the optimal value functions and optimal policies.


### TO DO:

1) Implement MC, SARSA, Q-learning to learn the value function. It is recommended to use the indications of code below.


2) For the 3 cases, display the value function and the optimal policy found. 

3) Create a new environment which takes as parameter a given policy. Then implement iterations where you find the optimal value function for a given adversarial policy and then you play in turn against this policy.
Display the results after some iterations. Comment.

In [0]:
from IPython.display import Image
from IPython.core.display import HTML 


In [0]:
import random
from collections import defaultdict

import gym
import numpy as np
from gym import spaces
from gym.utils import seeding

THE GAME

**Rules**:

- 2 players 
- Each player has 5 coins (head or tail). Each player only sees her coins.  
- After flipping each coin, the game starts.
- The game consists in guessing how many heads are present between all coins (or make the other player guess wrongly).
- Bets start at 0 head.
- The starting player is chosen at random. (Flip a coin)
- Possible actions:
    * the player keeps the current bet and passes.
    * the player adds 1 to the current bet (the estimate of the total number of heads).
- The game stops when one player passes.
- If the bet is strictly bigger than the real number, the last player to play loses (r=-1) and the other wins (r=1). If the bet is smaller or equal, the last player wins (r=1) and the other loses (r=-1).
- AI: initial strategy of the computer: if the bet is smaller than 2 + the number of heads it observes among its own coins, it raises the bet; otherwise it passes.

In [0]:
# Environment and rules

def throw_coin(num_coin, np_random):
    """Flip `num_coin` coins using the supplied random generator.

    Args:
        num_coin: number of coins to flip.
        np_random: object exposing ``rand(n)`` (e.g. a NumPy ``RandomState``).

    Returns:
        The element-wise result of ``draw > 0.5`` for the `num_coin` draws.
    """
    draws = np_random.rand(num_coin)
    outcome = draws > 0.5
    return outcome

def total_faces(list_players):
    """Add up the coin outcomes of every player in `list_players`.

    Each player is an iterable of truthy/falsy coin results; the function
    returns the grand total of all of them (0 for an empty list).
    """
    return sum(sum(coins) for coins in list_players)

class PerudoSimplificado(gym.Env):
    """Simplified two-player Perudo played with coins, as a Gym environment.

    The agent is player 1; player 2 is a scripted opponent whose move is
    played inside `step`. Each player holds 5 coins stored as a boolean
    array; the sum of an array is that player's number of heads.

    Observation: ``(own_heads, current_bet)``.
    Actions: ``0`` keeps the bet and passes (ends the game),
             ``1`` raises the bet by one.
    Reward: ``+1`` on a win, ``-1`` on a loss, ``0`` while the game goes on.
    """

    def __init__(self):
        """Create the spaces, seed the generator and deal a first game."""
        self.action_space = spaces.Discrete(2)
        self.observation_space = spaces.Tuple((
            spaces.Discrete(6),    # my heads: 0..5 with five coins
            spaces.Discrete(11)))  # current bet: 0..10 with ten coins in total

        self.seed()
        # Start the game
        self.reset()

    def seed(self, seed=None):
        """Seed the environment's private random generator and return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        """Play the agent's action and, if the game continues, the opponent's reply.

        Returns:
            ``(observation, reward, done, info)`` with an empty ``info`` dict.
        """
        assert self.action_space.contains(action)

        # Scripted opponent: it raises while the bet is below its own heads
        # plus half the number of coins in a hand (rounded down).
        max_guess_player_2 = total_faces([self.player_2]) + int(len(self.player_1)/2)
        faces_tot = total_faces([self.player_1, self.player_2])

        # No separate "bet above the number of coins" guard is needed: such a
        # bet always exceeds faces_tot, so the branches below already score it.
        if action == 0:  # keep the bet and pass: the standing bet is checked
            done = True
            # The bet holds -> the opponent was right and I lose;
            # the bet is too high -> I was right to stop and I win.
            reward = -1 if self.guess <= faces_tot else 1
        else:  # action == 1, add 1 to the bet
            self.guess += 1
            if self.guess < max_guess_player_2:  # the other player adds 1
                self.guess += 1
                done = False
                reward = 0
            else:  # the other player passes: my bet is checked
                done = True
                reward = 1 if self.guess <= faces_tot else -1

        return self.get_obs(), reward, done, {}

    def get_obs(self):
        """Return ``(number of heads in my hand, current bet)``."""
        return (sum(self.player_1), self.guess)

    def reset(self):
        """Deal new coins to both players, pick who starts and return the observation."""
        self.player_1 = throw_coin(5, self.np_random)
        self.player_2 = throw_coin(5, self.np_random)
        # Flip a coin to see who starts: a bet of 1 means the opponent already
        # raised. The seeded generator is used so that seed() makes whole
        # games reproducible.
        self.guess = 1 if self.np_random.rand() > 0.5 else 0
        return self.get_obs()

## Playing at random

In [0]:
# Instantiate the game and show its observation space, then its action space.
env = PerudoSimplificado()
print(env.observation_space, env.action_space, sep="\n")

In [0]:
# Random policy: play five games choosing every action uniformly at random.
for i_episode in range(5):
    state = env.reset()
    done = False
    while not done:
        # Sample a random action and play one round with it.
        action = env.action_space.sample()
        state, reward, done, info = env.step(action)
        print(state, action)
    # The inner loop only exits once the game is over.
    print('Game over! Your reward: ', reward)
    if reward > 0:
        print('You win :)\n')
    else:
        print('You lost:(\n')

## Graficos:

In [0]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

def plot_values(V):
    """Draw a 3D surface of a state-value function.

    Args:
        V: mapping ``(heads, bet) -> value``, where ``heads`` is the number of
           heads in your hand and ``bet`` the current bet. States missing
           from `V` are drawn with value 0.

    The grid covers heads 0..5 and bets 0..10. Bet 0 is included because the
    game can start on it, so it is a state the agent really observes.
    """
    heads_range = np.arange(0, 6)
    bets_range = np.arange(0, 11)
    X, Y = np.meshgrid(heads_range, bets_range)

    # Look every grid point up in V, defaulting to 0 for unseen states.
    Z = np.array([V[x, y] if (x, y) in V else 0
                  for x, y in zip(np.ravel(X), np.ravel(Y))]).reshape(X.shape)

    fig = plt.figure(figsize=(20, 20))
    ax = fig.add_subplot(211, projection='3d')
    ax.plot_surface(X, Y, Z, rstride=1, cstride=1, cmap=plt.cm.coolwarm, vmin=-1.0, vmax=1.0)
    ax.set_xlabel('heads')
    ax.set_ylabel('bets')
    ax.set_zlabel('value')
    ax.view_init(ax.elev, -120)
    plt.show()

In [0]:
def plot_policy(policy):
    """Draw a policy as a colour-coded 2D grid and print the underlying matrix.

    Args:
        policy: mapping ``(heads, bet) -> action``. Heads run along the x
            axis (0..5) and bets along the y axis (0..10).

    States absent from `policy` receive the placeholder value 25, which lies
    above the colour scale's ``vmax`` and marks "no action defined yet".
    """
    heads_range = np.arange(0, 6)
    bets_range = np.arange(0, 11)

    def action_at(heads, bet):
        # 25 flags a state for which no action has been defined yet.
        return policy[heads, bet] if (heads, bet) in policy else 25

    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(111)

    # One row per bet, one column per number of heads.
    Z = np.array([[action_at(heads, bet) for heads in heads_range] for bet in bets_range])
    flipped = np.flip(Z, 0)
    image = ax.imshow(flipped, cmap=plt.get_cmap('Pastel2', 3), vmin=0, vmax=2,
                      extent=[-0.5, 5.5, -0.5, 10.5])
    plt.xticks(heads_range)
    plt.yticks(bets_range)
    plt.gca().invert_yaxis()
    ax.set_xlabel('heads')
    ax.set_ylabel('bets')
    ax.grid(color='w', linestyle='-', linewidth=1)

    # Colour bar in its own narrow axis to the right of the grid.
    colorbar_axis = make_axes_locatable(ax).append_axes("right", size="5%", pad=0.1)
    cbar = plt.colorbar(image, ticks=[0,1,2], cax=colorbar_axis)
    cbar.ax.set_yticklabels(['0 (pass)','1 (up)', 'unknown'])
    print(Z)

    plt.show()

## Monte Carlo

In [0]:
def get_probs(Q_s, epsilon, nA):
    """Return the epsilon-greedy action probabilities for one state.

    Args:
        Q_s: array of length `nA` with the action values of the state.
        epsilon: exploration rate in [0, 1].
        nA: number of actions.

    Returns:
        Array of length `nA` summing to 1: every action gets ``epsilon / nA``
        and the greedy action gets the remaining ``1 - epsilon`` on top.
    """
    policy_s = np.full(nA, epsilon / nA)
    policy_s[np.argmax(Q_s)] += 1.0 - epsilon
    return policy_s


def generate_episode_from_Q(env, Q, epsilon, nA):
    """Play one episode following the epsilon-greedy policy derived from `Q`.

    States that are not yet in `Q` are played uniformly at random (and are
    not inserted into `Q`).

    Returns:
        List of ``(state, action, reward)`` tuples, one per step.
    """
    episode = []
    state = env.reset()
    done = False
    while not done:
        if state in Q:
            probs = get_probs(Q[state], epsilon, nA)
            action = int(np.random.choice(nA, p=probs))
        else:
            action = int(np.random.choice(nA))
        next_state, reward, done, _ = env.step(action)
        episode.append((state, action, reward))
        state = next_state
    return episode


def update_Q(env, episode, Q, alpha, gamma):
    """Apply a constant-alpha every-visit Monte Carlo update for one episode.

    Args:
        env: environment; only ``env.action_space.n`` is read, to create the
            value array of a state seen for the first time.
        episode: list of ``(state, action, reward)`` tuples.
        Q: mapping ``state -> array of action values``; updated in place.
        alpha: step size.
        gamma: discount factor.

    Returns:
        The same `Q` object.
    """
    G = 0.0
    # Walk backwards so the discounted return is accumulated incrementally.
    for state, action, reward in reversed(episode):
        G = reward + gamma * G
        if state not in Q:
            Q[state] = np.zeros(env.action_space.n)
        Q[state][action] += alpha * (G - Q[state][action])
    return Q

In [0]:
def mc_control(env, num_episodes, alpha, gamma=1.0, eps_start=1.0, eps_decay=.99999, eps_min=0.05):
    """Constant-alpha Monte Carlo control with a decaying epsilon-greedy policy.

    Args:
        env: environment exposing ``action_space.n``, ``reset`` and ``step``.
        num_episodes: number of episodes to play.
        alpha: step size of the value update.
        gamma: discount factor.
        eps_start: initial exploration rate.
        eps_decay: multiplicative decay applied to epsilon before each episode.
        eps_min: lower bound for epsilon.

    Returns:
        ``(policy, Q)`` where `Q` maps each visited state to its array of
        action values and `policy` maps each of those states to its greedy
        action.
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    epsilon = eps_start
    for _ in range(num_episodes):
        epsilon = max(epsilon * eps_decay, eps_min)
        episode = generate_episode_from_Q(env, Q, epsilon, nA)
        Q = update_Q(env, episode, Q, alpha, gamma)
    policy = {state: int(np.argmax(q_s)) for state, q_s in Q.items()}
    return policy, Q

In [0]:
# Run Monte Carlo control, then plot the best action value of every state.
policy, Q = mc_control(env, num_episodes=500000, alpha=0.015)
V = {state: np.max(action_values) for state, action_values in Q.items()}
plot_values(V)

In [0]:
# Display the policy returned by mc_control.
plot_policy(policy=policy)

## SARSA

In [0]:
def update_Q_sarsa(alpha, gamma, Q, state, action, reward, next_state=None, next_action=None):
    """Return the SARSA-updated value of ``Q[state][action]``.

    The TD target is ``reward + gamma * Q[next_state][next_action]``. When
    `next_state` or `next_action` is None the transition is treated as
    terminal and the target is just `reward`. `Q` itself is not modified;
    the caller stores the returned value.
    """
    current = Q[state][action]
    if next_state is None or next_action is None:
        next_value = 0.0
    else:
        next_value = Q[next_state][next_action]
    target = reward + gamma * next_value
    new_value = current + alpha * (target - current)
    return new_value


def epsilon_greedy(Q, state, nA, eps):
    """Pick an action for `state` with an epsilon-greedy rule.

    With probability `eps`, or when `state` has no entry in `Q` yet, a
    uniformly random action in ``0..nA-1`` is returned; otherwise the action
    with the highest value in ``Q[state]``.
    """
    if state not in Q or random.random() < eps:
        return random.randrange(nA)
    return int(np.argmax(Q[state]))

In [0]:
def sarsa(env, num_episodes, alpha, gamma=1.0, epsmin=0.01):
    """Learn action values with on-policy SARSA.

    Epsilon for episode ``i`` (1-based) is ``max(1 / i, epsmin)``.

    Args:
        env: environment exposing ``action_space.n``, ``reset`` and ``step``.
        num_episodes: number of episodes to play.
        alpha: step size.
        gamma: discount factor.
        epsmin: lower bound for the exploration rate.

    Returns:
        Mapping ``state -> array of action values`` for every state touched.
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    for i_episode in range(1, num_episodes + 1):
        eps = max(1.0 / i_episode, epsmin)
        state = env.reset()
        action = epsilon_greedy(Q, state, nA, eps)
        while True:
            next_state, reward, done, _ = env.step(action)
            if done:
                # Terminal transition: no bootstrap term.
                Q[state][action] = update_Q_sarsa(alpha, gamma, Q, state, action, reward)
                break
            next_action = epsilon_greedy(Q, next_state, nA, eps)
            Q[state][action] = update_Q_sarsa(alpha, gamma, Q, state, action, reward,
                                              next_state, next_action)
            state, action = next_state, next_action
    return Q

In [0]:
# Run SARSA, then plot the best action value of every state.
Q_sarsa = sarsa(env, num_episodes=500000, alpha=0.009)
V = {state: np.max(action_values) for state, action_values in Q_sarsa.items()}
plot_values(V)

In [0]:
# Derive the greedy policy from the SARSA action values and display it.
policy_sarsa = {state: np.argmax(action_values) for state, action_values in Q_sarsa.items()}
plot_policy(policy_sarsa)

## Q-learning

In [0]:
def update_Q_sarsamax(alpha, gamma, Q, state, action, reward, next_state=None):
    """Return the Q-learning (sarsamax) updated value of ``Q[state][action]``.

    The TD target is ``reward + gamma * max(Q[next_state])``. When
    `next_state` is None the transition is treated as terminal and the
    target is just `reward`. `Q` itself is not modified; the caller stores
    the returned value.
    """
    current = Q[state][action]
    next_value = np.max(Q[next_state]) if next_state is not None else 0.0
    target = reward + gamma * next_value
    new_value = current + alpha * (target - current)
    return new_value

In [0]:
def q_learning(env, num_episodes, alpha, gamma=1.0,epsmin=0.01):
    """Learn action values with off-policy Q-learning (sarsamax).

    Epsilon for episode ``i`` (1-based) is ``max(1 / i, epsmin)``.

    Args:
        env: environment exposing ``action_space.n``, ``reset`` and ``step``.
        num_episodes: number of episodes to play.
        alpha: step size.
        gamma: discount factor.
        epsmin: lower bound for the exploration rate.

    Returns:
        Mapping ``state -> array of action values`` for every state touched.
    """
    nA = env.action_space.n
    Q = defaultdict(lambda: np.zeros(nA))
    for i_episode in range(1, num_episodes + 1):
        eps = max(1.0 / i_episode, epsmin)
        state = env.reset()
        done = False
        while not done:
            action = epsilon_greedy(Q, state, nA, eps)
            next_state, reward, done, _ = env.step(action)
            # A terminal transition has no bootstrap term.
            bootstrap_state = None if done else next_state
            Q[state][action] = update_Q_sarsamax(alpha, gamma, Q, state, action, reward,
                                                 bootstrap_state)
            state = next_state
    return Q

In [0]:
# Run Q-learning, then plot the best action value of every state.
Q_sarsamax = q_learning(env, num_episodes=500000, alpha=0.01)
V = {state: np.max(action_values) for state, action_values in Q_sarsamax.items()}
plot_values(V)

In [0]:
# Derive the greedy policy from the Q-learning action values and display it.
policy_sarsamax = {state: np.argmax(action_values) for state, action_values in Q_sarsamax.items()}
plot_policy(policy_sarsamax)