In [9]:
!pip install gym[classic_control]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pygame==2.1.0
  Downloading pygame-2.1.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[K     |████████████████████████████████| 18.3 MB 94 kB/s 
Installing collected packages: pygame
Successfully installed pygame-2.1.0


In [16]:
import gym
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
import math
from typing import Tuple

In [17]:
env = gym.make('CartPole-v1', new_step_api = True)

### Discretizer

In [35]:
n_bins = ( 6 , 12 )
lower_bounds = [ env.observation_space.low[2], -math.radians(50) ]
upper_bounds = [ env.observation_space.high[2], math.radians(50) ]

def discretizer( _ , __ , angle, pole_velocity ):
    """Convert continues state intro a discrete state"""
    est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
    est.fit([lower_bounds, upper_bounds ])
    return tuple(map(int,est.transform([[angle, pole_velocity]])[0]))

### Policy evaluation
For a given policy, we iterate until we find the associated value function

### Doubts:
- Do we require the `V = np.zeros` initialization? We can reuse computation from prev values --> faster convergence?
- `env.state = S[state]` should be inside the for loop right. Else the action might be taken from some other state
- Also, are we using `env` or `environment

In [19]:
def policy_evaluation(policy, environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9):
    # Number of evaluation iterations
    evaluation_iterations = 1
    # Initialize a value function for each state as zero
    V = np.zeros(n_bins)
    # Repeat until change in value is below the threshold
    for i in range(int(max_iterations)):
        # Initialize a change of value function as zero
        delta = 0
        # Iterate though each state
        for angle in range(n_bins[0]):
            for velocity in range(n_bins[1]):
                # Here we will accumulate the expected value
                v = 0

                # Try all possible actions which can be taken from this state
                state = (angle, velocity)

                # We construct this from the reverse mapping
                env.state = S[state]

                # action is given by the index
                for action, action_probability in enumerate(policy[state]):
                    # Check how good next state will be
                    obs, reward, terminated, _, _ = environment.step(action)
                    next_state = discretizer(*obs)

                    #  print(environment.step(action))
                    #  print(f"ns={next_state}, r={reward}, t={terminated}")

                    v += action_probability * P[(*state, action, *next_state)] * (R[(*state, action)] + discount_factor * V[next_state])
                    if terminated:
                        environment.reset()
                        break

                    #  for obs, reward, terminated, _ in environment.step(action):
                    #       # Calculate the expected value
                    #       print(*environment.step(action))
                    #       print(obs, reward, terminated)
                    #       next_state = discretizer(*obs)
                    #       print(environment.step(action))
                    #       print(f"ns={next_state}, r={reward}, t={terminated}")
                    #       v += action_probability * P[state+ (action, ) +next_state] * (R[state+ (action, )] + discount_factor * V[next_state]) # TODO: Add state probability computation later

               # Calculate the absolute change of value function
               delta = max(delta, np.abs(V[state] - v))
               # Update value function
               V[state] = v
        evaluation_iterations += 1

        # Terminate if value change is insignificant
        if delta < theta:
                print(f'Policy evaluated in {evaluation_iterations} iterations.')
                return V

### One step look ahead for choosing the next best action from a state in a greedy member if required

In [30]:
def one_step_lookahead(environment, state, V, discount_factor):
    """
    Function computes the action values for different actions
    From our state, if we take an action, how is it gonna add up to our returns
    """
    action_values = np.zeros(environment.action_space.n)
    for action in range(environment.action_space.n):
            for angle in range(n_bins[0]):
                for velocity in range(n_bins[1]):
                    next_state = (angle, velocity)
                    action_values[action] += P[(*state, action, *next_state)] + (R[(*state, action)] + discount_factor * V[next_state])
    return action_values

### Policy iteration

- `stable_policy` has to be made `False` right?
- `>>> np.eye(4)[2]`
- `array([0., 0., 1., 0.])`
- Seems problematic!

In [21]:
def policy_iteration(environment, discount_factor=1.0, max_iterations=1e9):
    # Start with a uniform policy
    policy = np.ones([*n_bins, environment.action_space.n]) / environment.action_space.n
    # so policy[state] will have shape (action_space.n,)
    # Initialize counter of evaluated policies
    evaluated_policies = 1
    # Repeat until convergence or critical number of iterations reached
    for i in range(int(max_iterations)):
        stable_policy = True
        # Evaluate current policy
        V = policy_evaluation(policy, environment, discount_factor=discount_factor)
        
        # Go through each state and try to improve actions that were taken (policy Improvement)
        for angle in range(n_bins[0]):
            for velocity in range(n_bins[1]):
                # Choose the best action in a current state under current policy
                state = (angle, velocity)
                current_action = np.argmax(policy[state]) #TODO
                # Look one step ahead and evaluate if current action is optimal
                # We will try every possible action in a current state
                action_value = one_step_lookahead(environment, state, V, discount_factor)
                # Select a better action
                best_action = np.argmax(action_value)
                # If action didn't change
                if current_action != best_action:
                    stable_policy = False
                    # Greedy policy update
                    policy[state] = np.eye(environment.action_space.n)[best_action]
        
        evaluated_policies += 1
        
        # If the algorithm converged and policy is not changing anymore, then return final policy and value function
        if stable_policy:
            print(f'Evaluated {evaluated_policies} policies.')
            return policy, V

### The value iteration algorithm

In [33]:
def value_iteration(environment, discount_factor=1.0, theta=1e-9, max_iterations=1e9):
        # Initialize state-value function with zeros for each environment state
        V = np.zeros(n_bins)
        for i in range(int(max_iterations)):
            # Early stopping condition
            delta = 0
            # Update each state
            for angle in range(n_bins[0]):
                for velocity in range(n_bins[1]):
                    state = (angle, velocity)
                    # Do a one-step lookahead to calculate state-action values
                    action_value = one_step_lookahead(environment, state, V, discount_factor)
                    # Select best action to perform based on the highest state-action value
                    best_action_value = np.max(action_value)
                    # Calculate change in value
                    delta = max(delta, np.abs(V[state] - best_action_value))
                    # Update the value function for current state
                    V[state] = best_action_value
                    # Check if we can stop
            if delta < theta:
                    print(f'Value-iteration converged at iteration#{i}.')
                    break

        # Create a deterministic policy using the optimal value function
        policy = np.zeros([*n_bins, environment.action_space.n])
        for angle in range(n_bins[0]):
            for velocity in range(n_bins[1]):
                state = (angle, velocity)
                # One step lookahead to find the best action for this state
                action_value = one_step_lookahead(environment, state, V, discount_factor)
                # Select best action based on the highest state-action value
                best_action = np.argmax(action_value)
                # Update the policy to perform a better action at a current state
                policy[(*state, best_action)] = 1.0
                
        return policy, V

### Episodes (essentially the test set)

- `Win` doesnt make sense here

In [34]:
def play_episodes(environment, n_episodes, policy):
    wins = 0
    total_reward = 0
    for episode in range(n_episodes):
        terminated = False
        state = environment.reset()
        state = discretizer(*state)
        while not terminated:
            # Select best action to perform in a current state
            action = np.argmax(policy[state])
            # Perform an action an observe how environment acted in response
            next_state, reward, terminated, info, _ = environment.step(action)
            # Summarize total reward
            total_reward += reward
            # Update current state
            next_state = discretizer(*next_state)
            state = next_state

    average_reward = total_reward / n_episodes
    return total_reward, average_reward

# Number of episodes to play
n_episodes = 10000
# Functions to find best policy
solvers = [('Policy Iteration', policy_iteration),
           ('Value Iteration', value_iteration)]
for iteration_name, iteration_func in solvers:
        # Load a Frozen Lake environment
        environment = gym.make('CartPole-v1', render_mode="rgb_array", new_step_api=True)
        # Search for an optimal policy using policy iteration
        environment.reset()
        policy, V = iteration_func(environment.env)
        # Apply best policy to the real environment
        total_reward, average_reward = play_episodes(environment, n_episodes, policy)
        # print(f'{iteration_name} :: number of wins over {n_episodes} episodes = {wins}')
        print(f'{iteration_name} :: average reward over {n_episodes} episodes = {average_reward} \n\n')

Policy evaluated in 15 iterations.
Evaluated 2 policies.
Policy Iteration :: average reward over 10000 episodes = 23.4893 




  action_values[action] += P[(*state, action, *next_state)] + (R[(*state, action)] + discount_factor * V[next_state])
  delta = max(delta, np.abs(V[state] - best_action_value))


Value-iteration converged at iteration#16.
Value Iteration :: average reward over 10000 episodes = 9.3473 




In [None]:
?env.P

In [None]:
env = gym.make('CartPole-v1', new_step_api = True)

In [None]:
?env.reset

### Here we generate random episodes to approximate model dynamics and reward functions

In [28]:
P = np.zeros((*n_bins, env.action_space.n, *n_bins))
N = np.zeros((*n_bins, env.action_space.n))
R = np.zeros((*n_bins, env.action_space.n))
S = np.zeros((*n_bins, env.observation_space.shape[0]))

MAX_ITER = 10
DISCOUNT = 0.5
# MAX_ITER = 10000

for _ in range(MAX_ITER):
    cur_c = env.reset()
    cur = discretizer(*cur_c)
    S[cur] += cur_c
    S[cur] *= DISCOUNT
    
    done = False
    while not done:
        action = np.random.randint(2)
        # print(action)
        # print(env.step(action))
        obs, reward, done, _, _ = env.step(action)
        next_state = discretizer(*obs)
        S[next_state] += obs
        S[next_state] *= DISCOUNT
        P[(*cur, action, *next_state)] += 1
        N[(*cur, action)] += 1
        if not done:
            R[(*cur, action)] += 1
        cur = next_state

In [None]:
N = N + 1e-10
R / N

array([[[0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ],
        [0.        , 0.        ]],

       [[0.59964727, 0.61339422],
        [0.80952381, 0.9375    ],
        [0.95744681, 0.9245283 ],
        [0.88636364, 0.93877551],
        [0.97619048, 0.94444444],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ]],

       [[1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1.        , 1.        ],
        [1

In [12]:
R = R / N

for angle in range(n_bins[0]):
  for velocity in range(n_bins[1]):
    for action in range(env.action_space.n):
      for angle2 in range(n_bins[0]):
        for velocity2 in range(n_bins[1]):
          P[(angle, velocity)+ (action, ) +(angle2, velocity2)] /= N[(angle, velocity)+ (action, )]

  """Entry point for launching an IPython kernel.
  


In [None]:
print(P)

[[[[[0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]]

   [[0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0.         0.
     0.        ]]]


  [[[0.         0.         0.         ... 0.         0.
     0.        ]
    [0.         0.         0.         ... 0. 

In [None]:
?env.state

In [None]:
S

array([[[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.07261572,  0.26997742, -0.18465293, -0.93597931],
        [ 0.02637376, -0.0463771 , -0.19736523, -0.66543112],
        [ 0.048996  , -0.02417257, -0.18019339, -0.49810254],
        [ 0.06466052, -0.1098226 , -0.17488677, -0.35261735],
      