## Change in Behavior and Transfer with Change in Properties of the Environment

Team Members
- Aditya Anantwar 19CS10006
- Abhinandan De 19CS10069

### DP implementation on CartPole-v1

In [None]:
!pip install gym[classic_control]

In [None]:
%reset -f array

In [None]:
import gym
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer
import math
from typing import Tuple

### Description

#### Action space

The action is an `ndarray` with shape `(1,)` which can take values {0, 1} indicating the direction of the fixed force the cart is pushed with.

#### State Space

The observation is an `ndarray` with shape `(4,)` with the values corresponding to the following positions and velocities:

| Num |      Observation      |         Min         |        Max        |   |
|:---:|:---------------------:|:-------------------:|:-----------------:|---|
| 0   | Cart Position         | -4.8                | 4.8               |   |
| 1   | Cart Velocity         | -Inf                | Inf               |   |
| 2   | Pole Angle            | ~ -0.418 rad (-24°) | ~ 0.418 rad (24°) |   |
| 3   | Pole Angular Velocity | -Inf                | Inf               |   |

#### Rewards

Since the goal is to keep the pole upright for as long as possible, a reward of `+1` for every step taken, including the termination step, is allotted. The threshold for rewards is `475` for v1.

#### Starting State

All observations are assigned a uniformly random value in (-0.05, 0.05).

#### Episode

The episode ends if any one of the following occurs:

- `Termination`: Pole Angle leaves the range ±12°
- `Termination`: Cart Position leaves the range ±2.4 (center of the cart reaches the edge of the display)
- `Truncation`: Episode length is greater than 500 (200 for v0)

In [None]:
# Number of discretization bins per observation dimension, in the order
# (cart position, cart velocity, pole angle, pole angular velocity).
# This tuple is also the shape of every tabular value function below.
n_bins = (6, 2, 12, 12)
# Environment instance used below to read the observation-space bounds.
env = gym.make('CartPole-v1', new_step_api = True)

In [None]:
# Per-dimension bounds used to fit the bins. Position and angle use the limits
# reported by the observation space; the two velocity dimensions (listed as
# unbounded in the state-space table above) get hand-picked limits of +/-3.5.
lower_bounds = [ env.observation_space.low[0], -3.5, env.observation_space.low[2], -3.5 ]
upper_bounds = [ env.observation_space.high[0], 3.5, env.observation_space.high[2], 3.5 ]
# 'ordinal' encoding returns one integer bin index per dimension, and the
# 'uniform' strategy makes all bins of a dimension equally wide.
est = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
# Fitting on just the two bound rows fixes each dimension's range to
# [lower_bound, upper_bound].
est.fit([lower_bounds, upper_bounds ])

def discretizer( position , velocity , angle, pole_velocity):
  """Convert a continuous observation into a tuple of integer bin indices."""
  # The fitted discretizer expects a 2-D batch, so wrap the single observation.
  observation_batch = [[position, velocity, angle, pole_velocity]]
  bin_indices = est.transform(observation_batch)[0]
  return tuple(int(bin_index) for bin_index in bin_indices)

In [None]:
def policy_evaluation(environment, policy, P, R, discount_factor = 1.0, theta = 1e-1, max_iterations = 1e9):
  """
  Iteratively estimate the state-value function of `policy`.

  Every sweep visits each discrete state and applies the Bellman expectation
  backup in place, so states visited later in a sweep already see the values
  updated earlier in that sweep. Sweeping stops once the largest change in a
  sweep drops below `theta` or after `max_iterations` sweeps.

  Args:
    environment: Unused; kept so all solvers share one call signature.
    policy: Array of shape (*state_shape, n_actions) with the probability of
      taking each action in each state. The state-space shape is read from it.
    P: Array indexed as P[(*state, action, *next_state)] holding transition
      probabilities.
    R: Array indexed as R[(*state, action)] holding expected rewards.
    discount_factor: Weight applied to the value of the next state.
    theta: Convergence threshold on the largest per-sweep value change.
    max_iterations: Upper bound on the number of sweeps (truncated to int).

  Returns:
    Array of shape `state_shape` with the estimated value of every state.
  """
  state_shape = policy.shape[:-1]
  V = np.zeros(state_shape)  # Value function array
  eval_iters = 0  # Number of completed sweeps

  for _ in range(int(max_iterations)):
    delta = 0  # Largest value change seen in this sweep

    # np.ndindex walks the states in the same order as nested loops would
    # (last dimension fastest), which matters because V is updated in place.
    for state in np.ndindex(*state_shape):
      v = 0  # Accumulate expected value here

      for action, action_probability in enumerate(policy[state]):
        state_action = (*state, action)
        # P[state_action] has the same shape as V, so a single reduction
        # replaces the explicit loop over every possible next state.
        expected_return = np.sum(P[state_action] * (R[state_action] + discount_factor * V))
        v += action_probability * expected_return

      delta = max(delta, np.abs(V[state] - v))  # absolute change of value function
      V[state] = v  # update value function

    eval_iters += 1
    print(f"iteration: {eval_iters}, delta = {delta}")

    # Terminate once the largest change is below the threshold
    if delta < theta:
      print(f'Policy evaluated in {eval_iters} iterations.')
      return V
  return V

In [None]:
def one_step_lookahead(environment, state, V, P, R, discount_factor):
  """
  Compute the expected return of every action available in `state`.

  For each action the transition probabilities to all next states are
  combined with the immediate reward plus the discounted value of the next
  state.

  Args:
    environment: Object exposing `action_space.n`, the number of actions.
    state: Tuple of integer bin indices identifying the current state.
    V: Array of state values; its shape defines the discrete state space.
    P: Array indexed as P[(*state, action, *next_state)] holding transition
      probabilities.
    R: Array indexed as R[(*state, action)] holding expected rewards.
    discount_factor: Weight applied to the value of the next state.

  Returns:
    1-D array of length `action_space.n` with one value per action.
  """
  n_actions = environment.action_space.n
  action_values = np.zeros(n_actions)
  for action in range(n_actions):
    state_action = (*state, action)
    # P[state_action] has the same shape as V, so the sum over all next
    # states is a single array reduction instead of nested Python loops.
    action_values[action] = np.sum(P[state_action] * (R[state_action] + discount_factor * V))
  return action_values

In [None]:
def policy_iteration(environment, P, R, discount_factor = 1.0, max_iterations = 1e9):
  """
  Find a policy by alternating policy evaluation and greedy improvement.

  Starts from the uniform random policy. Each round evaluates the current
  policy and then makes every state greedy with respect to the resulting
  values. The loop stops when a round leaves the policy unchanged or after
  `max_iterations` rounds.

  Args:
    environment: Object exposing `action_space.n`, the number of actions.
    P: Array indexed as P[(*state, action, *next_state)] holding transition
      probabilities.
    R: Array indexed as R[(*state, action)] holding expected rewards.
    discount_factor: Weight applied to the value of the next state.
    max_iterations: Upper bound on improvement rounds; the same bound is
      passed to each policy evaluation (truncated to int).

  Returns:
    Tuple `(policy, V)`: the policy array of shape (*n_bins, n_actions) and
    the value function from the last evaluation (all zeros if no round ran).
  """
  n_actions = environment.action_space.n
  # start with a uniform policy
  policy = np.ones([*n_bins, n_actions]) / n_actions
  # Defined up front so the final return is valid even when no round runs.
  V = np.zeros(n_bins)
  # Number of policies evaluated so far
  eval_policies = 0

  # Repeat until convergence or critical number of iterations reached
  for _ in range(int(max_iterations)):
    # Evaluate current policy
    V = policy_evaluation(environment, policy, P, R, discount_factor = discount_factor, max_iterations = max_iterations)
    eval_policies += 1

    stable_policy = True
    # Go through each state and make the policy greedy (policy improvement)
    for state in np.ndindex(*n_bins):
      # Look one step ahead and pick the action with the highest value
      action_value = one_step_lookahead(environment, state, V, P, R, discount_factor)
      greedy_row = np.eye(n_actions)[np.argmax(action_value)]

      # Compare whole rows rather than argmax: a uniform row has argmax 0 and
      # would otherwise stay uniform whenever action 0 is the greedy choice,
      # so the next evaluation would not be scoring the greedy policy.
      if not np.array_equal(policy[state], greedy_row):
        stable_policy = False
        policy[state] = greedy_row

    # If the policy did not change in this round it has converged
    if stable_policy:
      print(f'Evaluate {eval_policies} policies.')
      return policy, V
  return policy, V

In [None]:
def value_iteration(environment, P, R, discount_factor = 1.0, theta = 1e-1, max_iterations = 1e9):
  """
  Run value iteration and derive a deterministic greedy policy from it.

  The value table is updated in place, state by state, with the best
  one-step-lookahead value until the largest change within a sweep falls
  below `theta` or `max_iterations` sweeps have run.

  Returns:
    Tuple `(policy, V)` where `policy` is a one-hot array of shape
    (*n_bins, n_actions) and `V` is the final value table of shape `n_bins`.
  """
  # Value table, one entry per discrete state, initialised to zero
  V = np.zeros(n_bins)

  for i in range(int(max_iterations)):
    # Largest change observed during this sweep
    delta = 0
    for state in np.ndindex(*n_bins):
      # Best value reachable from this state with a single action
      best_action_value = one_step_lookahead(environment, state, V, P, R, discount_factor).max()

      change = np.abs(V[state] - best_action_value)
      if change > delta:
        delta = change

      V[state] = best_action_value

    # Stop sweeping once the values have settled
    if delta < theta:
      print(f'Value-iteration converged at iterations {i}.')
      break

  # Turn the value table into a deterministic (one-hot) policy
  policy = np.zeros([*n_bins, environment.action_space.n])
  for state in np.ndindex(*n_bins):
    action_value = one_step_lookahead(environment, state, V, P, R, discount_factor)
    # Mark the highest-valued action as the one to take in this state
    policy[state][np.argmax(action_value)] = 1

  return policy, V

In [None]:
def get_variables(env, n_episodes = 1000):
  """
  Estimate a tabular model of `env` from episodes played with random actions.

  Args:
    env: Environment created with the new step API, i.e. `step` returns
      `(observation, reward, terminated, truncated, info)`.
    n_episodes: Number of random-action episodes to sample.

  Returns:
    Tuple `(P, N, R)`:
      P: transition probabilities indexed as P[(*state, action, *next_state)].
      N: visit counts indexed as N[(*state, action)].
      R: mean reward indexed as R[(*state, action)], where a step earns 1
         unless it terminates the episode.
    Entries of P and R for state-action pairs that were never visited are 0.
  """
  n_actions = env.action_space.n
  P = np.zeros((*n_bins, n_actions, *n_bins))
  N = np.zeros((*n_bins, n_actions))
  R = np.zeros((*n_bins, n_actions))

  for _ in range(n_episodes):
    cur = discretizer(*env.reset())

    terminated = truncated = False
    # A time-limit truncation ends the episode just like a termination does.
    while not (terminated or truncated):
      action = np.random.randint(n_actions)
      obs, _, terminated, truncated, _ = env.step(action)
      next_state = discretizer(*obs)
      P[(*cur, action, *next_state)] += 1
      N[(*cur, action)] += 1
      # Only a failure (termination) forfeits the reward; a truncated step is
      # still a step on which the pole was kept up.
      if not terminated:
        R[(*cur, action)] += 1
      cur = next_state

  # Divide only where there were visits so unvisited pairs stay exactly zero
  # and the visit counts are returned unmodified.
  R = np.divide(R, N, out = np.zeros_like(R), where = N > 0)

  # Append one singleton axis per state dimension so the counts broadcast
  # over the next-state axes of P.
  counts = N.reshape(N.shape + (1,) * len(n_bins))
  P = np.divide(P, counts, out = np.zeros_like(P), where = counts > 0)

  return P, N, R

In [None]:
def play_episodes(environment, n_episodes, policy):
  """
  Play `n_episodes` episodes following `policy` greedily.

  Args:
    environment: Environment created with the new step API, i.e. `step`
      returns `(observation, reward, terminated, truncated, info)`.
    n_episodes: Number of episodes to play.
    policy: Array indexed by discrete state whose last axis holds the action
      probabilities; the highest-probability action is taken.

  Returns:
    Tuple `(total_reward, average_reward)` over all episodes played.
  """
  total_reward = 0
  for _ in range(n_episodes):
    state = discretizer(*environment.reset())

    terminated = truncated = False
    # Stop on truncation as well: otherwise a policy that never drops the
    # pole keeps stepping (and collecting reward) past the episode limit.
    while not (terminated or truncated):
      # Select best action to perform in current state
      action = np.argmax(policy[state])

      # Perform an action and observe how environment acted in response
      observation, reward, terminated, truncated, _ = environment.step(action)

      total_reward += reward

      # Update current state
      state = discretizer(*observation)

  average_reward = total_reward / n_episodes
  return total_reward, average_reward

In [None]:
def run_model(environment, max_iterations, test_episodes):
  """
  Learn a tabular model of `environment`, solve it, and score the policies.

  The model is sampled once and shared by both solvers. Each solver's policy
  is then played for `test_episodes` episodes.

  Args:
    environment: Environment to sample, solve and test on.
    max_iterations: Iteration cap forwarded to each solver.
    test_episodes: Number of episodes used to score each resulting policy.

  Returns:
    Dict mapping solver name ('Policy Iteration', 'Value Iteration') to the
    average reward per test episode.
  """
  # Functions to find the best policy
  solvers = [('Policy Iteration', policy_iteration),
            ('Value Iteration', value_iteration)]

  P, _, R = get_variables(environment)

  rewards = {}

  for iteration_name, iteration_function in solvers:
    environment.reset()
    policy, _ = iteration_function(environment, P, R, max_iterations = max_iterations)

    # Apply best policy for the number of episodes requested by the caller
    _, average_reward = play_episodes(environment, test_episodes, policy)

    rewards[iteration_name] = average_reward

    print(f'{iteration_name} :: average reward over {test_episodes} episodes = {average_reward} \n\n')

  return rewards

In [None]:
MAX_ITERS = 10
N_TEST_EPISODES = 1000
PLOT = True

# Physical constants of the CartPole simulation that are varied one at a time.
attributes = ["gravity", "masscart", "masspole", "length", "force_mag"]

environment = gym.make('CartPole-v1', new_step_api = True)
# gym.make returns the simulation inside wrapper layers. Wrappers forward
# attribute reads to the wrapped env, but setattr on a wrapper only creates an
# attribute on the wrapper itself, so the constants have to be changed on the
# unwrapped environment for the dynamics to actually change.
cartpole = environment.unwrapped


def _set_cartpole_attribute(cartpole_env, name, value):
    """Set one physical constant and refresh the quantities derived from it."""
    setattr(cartpole_env, name, float(value))
    # CartPole caches these two derived quantities when it is constructed and
    # uses them in its dynamics, so they go stale when a mass or the length
    # changes unless they are recomputed here.
    cartpole_env.total_mass = cartpole_env.masspole + cartpole_env.masscart
    cartpole_env.polemass_length = cartpole_env.masspole * cartpole_env.length


for attr in attributes:
    print(f"Tweaking {attr}")

    # obtain original value
    orig_value = getattr(cartpole, attr)

    all_test_rewards = []

    # Five evenly spaced values from 0 up to twice the original value
    new_values = np.round(np.linspace(0, 2 * orig_value, 5), decimals = 2)

    try:
        for value in new_values:
            if attr == "length" and value == 0:
                # NOTE(review): the pole's angular acceleration divides by the
                # length, so a zero length is expected to yield non-finite
                # observations that the discretizer rejects - confirm.
                print(f"Skipping {attr} = {value}: degenerate pole length")
                continue

            _set_cartpole_attribute(cartpole, attr, value)
            all_test_rewards.append(run_model(environment, MAX_ITERS, N_TEST_EPISODES))
    finally:
        # reset to original value, even if one of the runs failed
        _set_cartpole_attribute(cartpole, attr, orig_value)

    with open('rewards.txt', 'a') as f:
        for item in all_test_rewards:
            for key, reward in item.items():
                f.write(f'{attr} {key} {reward}\n')