<a href="https://colab.research.google.com/github/Chia-Yin-Lee/GridWorld-continuous-state-space/blob/main/A2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title connect google drive folder

# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# IPython magic: change the working directory to the project folder on the mounted drive.
%cd /content/drive/MyDrive/SMU_MITB_RL

Mounted at /content/drive
/content/drive/MyDrive/SMU_MITB_RL


In [None]:
#@title import packages

import numpy as np
from tqdm import tqdm
from copy import deepcopy
import math
import time
import os
import random
import imageio

from tensorflow.keras.utils import to_categorical
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.optimizers import Adam

from collections import deque
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as patches
import matplotlib.colors as mcolors
from matplotlib.cm import ScalarMappable
from PIL import Image, ImageChops

### Question 1

### Q1(a) Do you like my code?
Yes
### Q1(b) Modify the given environment according to this problem.

Suppose we want to modify the environment to deal with a gridworld that is no longer formed
by grids. Specifically, it has the following aspects:
- The 'world' has a continuous space, spanning from coordinates 0 to 10 along both the horizontal and vertical dimensions.
- The agent moves a distance of exactly 1 unit each step, which can be in any direction in increments of 10 degrees. Therefore, there is a total of 36 possible actions, unlike previously where only four directions (0/90/180/270 degrees) had been considered.
- There is no wind or cliff. The agent cannot go outside of the gridworld; clip the position to deal with this.
- Movement is deterministic and there is no slip.
- The agent starts at position [2, 2].
- Anywhere within the region [7, 7] to [8, 8] is the goal, for which the agent receives a reward of +100 and the episode terminates. For example, landing at position [7.2, 7.7] or [7.9, 7.3] counts as landing in the goal.
- Anywhere within the region [2, 4] to [8, 5] is the trap, for which the agent receives a reward of -50. If the agent keeps walking within these 6 square units of trap region, it receives the negative reward each time, regardless of whether the steps are successive.
- All moves that do not end in the goal or trap region have a reward of -1.

Modify the given environment according to this problem. In addition, modify the agent such that Q is estimated by a function approximator with three hidden layers, with an appropriate number of nodes in each layer. **The structure of `SlipperyWindyCliffGridWorld`, e.g. the names of each method and each of the inputs and outputs, must remain the same.** You may create the function approximator as a class of its own, expand an existing class (with new methods if necessary), or proceed in any way you deem fit.

In [None]:
#@title Modified environment

class World:
    """Continuous 2-D world with 36 deterministic unit-length moves.

    A state is a two-element list ``[x, y]``; both coordinates are clipped to
    ``[0, max_row]`` and ``[0, max_col]`` respectively. An action is an integer
    index ``a``; the move is one unit in the direction ``a * 10`` degrees.

    Rewards assigned by ``transition`` to the landing position:
      * trap, ``2 <= x <= 8`` and ``4 <= y <= 5``: -50, episode continues
      * goal, ``7 <= x <= 8`` and ``7 <= y <= 8``: +100, episode terminates
      * anywhere else: -1, episode continues
    """

    def __init__(self):

        self.max_row = 10
        self.max_col = 10
        self.initial_pos = [2, 2]

        # no wind, no cliff, no slip
        self.reset()

    def reset(self):
        """Clear the terminal flag and return a fresh copy of ``initial_pos``."""
        self.is_done = False
        self.cur_state = deepcopy(self.initial_pos)
        return self.cur_state

    def _get_state_dim(self):
        """Return ``np.array([10])``.

        NOTE(review): states are two-element ``[x, y]`` lists, so this value
        looks like the extent of one axis rather than a state dimension --
        confirm what callers expect before relying on it.
        """
        return np.array([10])

    def _get_action_dim(self):
        """Return the number of actions as a one-element array (36)."""
        return np.array([36])  # 36 actions

    def _get_next_state(self, x, y, action_idx, distance=1):
        """Return the unclipped ``(new_x, new_y)`` reached from ``(x, y)``.

        The heading is ``action_idx * 10`` degrees, measured with
        ``math.cos`` for the x offset and ``math.sin`` for the y offset.
        """
        angle_radians = math.radians(action_idx * 10)

        delta_x = distance * math.cos(angle_radians)
        delta_y = distance * math.sin(angle_radians)

        return x + delta_x, y + delta_y

    def transition(self, state, action):
        """Apply ``action`` at ``state`` and return ``(reward, next_state, is_done)``.

        The move is deterministic: the action supplied by the caller is the
        one executed. If the episode has already terminated, ``(0, state,
        True)`` is returned and nothing changes.
        """
        if self.is_done:
            return 0, state, True

        # The caller's action must drive the move; resampling it here would
        # make the environment ignore the agent entirely.
        next_state = self._get_next_state(x=state[0], y=state[1], action_idx=action)

        # The agent cannot leave the world: clip each coordinate to its bounds.
        next_state = np.clip(next_state, [0, 0], [self.max_row, self.max_col]).tolist()

        row, col = next_state

        if 2 <= row <= 8 and 4 <= col <= 5:
            reward = -50  # trap
            self.is_done = False
        elif 7 <= row <= 8 and 7 <= col <= 8:
            reward = 100   # goal
            self.is_done = True
        else:
            reward = -1
            self.is_done = False

        return reward, next_state, self.is_done

In [None]:
#@title Play a sample episode

# Roll out one episode with uniformly random action indices and record it.
env = World()

# Each entry is a (state, action, reward) tuple for one step.
episode = []

state = env.reset()
done = False

# Keep stepping until the environment reports a terminal transition.
while not done:

    action = np.random.choice(36)  # randomly choose an action
    # A copy of the state is passed in, and a copy of the tuple is stored,
    # so recorded entries do not alias objects that are reused later.
    reward, next_state, done = env.transition(deepcopy(state), action)
    episode.append(deepcopy((state, action, reward)))
    state = next_state

print(episode)

[([2, 2], 31, -1), ([1.0603073792140916, 1.6579798566743313], 17, -1), ([1.9263327829985304, 2.157979856674331], 10, -1), ([2.6923772261175083, 2.8007674663608704], 25, -1), ([3.3351648358040475, 2.0347230232418925], 11, -1), ([3.977952445490587, 1.2686785801229143], 13, -1), ([4.843977849275025, 0.7686785801229139], 30, -1), ([3.977952445490587, 0.2686785801229138], 25, -1), ([3.111927041706148, 0.7686785801229137], 16, -1), ([4.051619662492056, 1.1106987234485823], 9, -1), ([4.2252678401589865, 0.1258909704363742], 29, -1), ([3.4592233970400086, 0.0], 35, -1), ([2.8164357873534693, 0.766044443118978], 26, -1), ([3.5824802304724472, 1.4088320528055172], 9, -1), ([3.4088320528055167, 2.3936398058177253], 35, -1), ([2.642787609686539, 3.0364274155042645], 24, -1), ([3.642787609686539, 3.0364274155042645], 17, -1), ([3.816435787353469, 2.0516196624920564], 4, -1), ([3.4744156440278, 2.991312283277965], 11, -1), ([3.9744156440278, 3.8573376870624037], 20, -50), ([4.914108264813708, 4.1993

In [None]:
#@title Modified agent, including training

class TemporalDifference:
    """One-step TD control agent with a neural-network Q-function.

    The network maps a state ``[x, y]`` to one Q-value per action. ``train``
    runs SARSA (``on_policy=True``) or Q-learning (``on_policy=False``) with
    an epsilon-greedy behaviour policy.

    Args:
        Env: environment exposing ``reset()``, ``transition(state, action)``
            returning ``(reward, next_state, done)``, ``_get_state_dim()`` and
            ``_get_action_dim()``.
        alpha: learning rate handed to the Adam optimizer.
        gamma: discount factor used in the bootstrapped target.
        epsilon: probability of picking a uniformly random action.
        lambd: stored on the instance only; ``train`` performs one-step
            updates and never reads it.
    """

    def __init__(self, Env, alpha=0.1, gamma=0.9, epsilon=0.1, lambd=0.9):
        self.Env = Env
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.lambd = lambd

        self.state_dim = self.Env._get_state_dim()
        self.action_dim = self.Env._get_action_dim()[0]

        # Initialize the neural network for Q approximation
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network: 2 inputs, 3 hidden layers, one output per action."""
        inputs = Input(shape=(2,))  # Two inputs for x and y coordinates
        x = Dense(24, activation='relu')(inputs)
        x = Dense(24, activation='relu')(x)
        x = Dense(24, activation='relu')(x)
        outputs = Dense(self.action_dim, activation='linear')(x)  # One output per action
        model = Model(inputs=inputs, outputs=outputs)
        model.compile(optimizer=Adam(learning_rate=self.alpha), loss='mse')
        return model

    def _q_values(self, state):
        """Return the vector of Q-values the network predicts for one state."""
        batch = np.asarray([state], dtype=np.float32)
        # verbose=0: called once per step, a progress bar per call is pure noise.
        return self.model.predict(batch, verbose=0)[0]

    def _epsilon_greedy_from_q(self, q_values):
        """Pick an epsilon-greedy action from already computed Q-values."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_dim)  # Explore
        return np.argmax(q_values)  # Exploit

    def epsilon_greedy_policy(self, state):
        """Return a random action with probability epsilon, else the greedy action."""
        if np.random.rand() < self.epsilon:
            return np.random.choice(self.action_dim)  # Explore
        return np.argmax(self._q_values(state))  # Exploit

    def train(self, num_episodes, on_policy=True, max_steps=None):
        """Train for ``num_episodes`` episodes.

        Args:
            num_episodes: number of episodes to run.
            on_policy: ``True`` for SARSA (bootstrap from the action actually
                taken next), ``False`` for Q-learning (bootstrap from the
                maximum next Q-value).
            max_steps: optional cap on steps per episode; ``None`` (default)
                runs each episode until the environment reports termination.

        Returns:
            ``(rewards_per_episode, total_time)``: the summed reward of each
            episode and the elapsed wall-clock time in seconds.
        """
        start_time = time.time()  # Record the start time
        rewards_per_episode = []  # For visualization

        for _ in tqdm(range(num_episodes)):
            episode_reward = 0  # Initialize reward for the episode
            state = self.Env.reset()
            action = self.epsilon_greedy_policy(state)
            done = False
            steps = 0

            while not done and (max_steps is None or steps < max_steps):
                reward, next_state, done = self.Env.transition(state, action)
                episode_reward += reward
                steps += 1

                target = reward
                next_action = None
                if not done:
                    next_q_values = self._q_values(next_state)
                    if on_policy:
                        # SARSA: commit to the next action now and bootstrap from it.
                        next_action = self._epsilon_greedy_from_q(next_q_values)
                        target += self.gamma * next_q_values[next_action]
                    else:
                        target += self.gamma * np.max(next_q_values)

                # Only the taken action's output is moved towards the target;
                # every other output keeps its current prediction.
                target_vec = self._q_values(state)
                target_vec[action] = target

                self.model.fit(
                    np.asarray([state], dtype=np.float32),
                    np.array([target_vec]),
                    epochs=1,
                    verbose=0,
                )

                state = next_state
                if not done:
                    # Advance the behaviour action: without this the agent would
                    # repeat its very first action for the whole episode.
                    if on_policy:
                        action = next_action
                    else:
                        action = self.epsilon_greedy_policy(state)

            rewards_per_episode.append(episode_reward)

        end_time = time.time()  # Record the end time
        total_time = end_time - start_time  # Calculate the total training time

        return rewards_per_episode, total_time

In [None]:
#@title Functions for visualization

def plot_value_function(agent, env, resolution=20):
    """Draw a filled contour plot of max-over-actions Q on a regular grid.

    Args:
        agent: object whose ``model.predict`` maps an array of ``[x, y]`` rows
            to one row of Q-values per state.
        env: object providing ``max_row`` and ``max_col``, the upper bounds of
            the sampled x and y ranges (both ranges start at 0).
        resolution: number of sample points along each axis.
    """
    x = np.linspace(0, env.max_row, resolution)
    y = np.linspace(0, env.max_col, resolution)
    X, Y = np.meshgrid(x, y)

    # Evaluate every grid point in one batched call instead of
    # resolution**2 single-row predictions.
    states = np.column_stack([X.ravel(), Y.ravel()])
    q_values = agent.model.predict(states, verbose=0)
    Z = np.max(q_values, axis=1).reshape(X.shape)  # For the value function

    plt.contourf(X, Y, Z, levels=50, cmap='viridis')
    plt.colorbar()
    plt.xlabel('X Position')
    plt.ylabel('Y Position')
    plt.title('Value Function')
    plt.grid(True)
    plt.show()

def plot_policy(agent, env, resolution = 20):
    """Draw the greedy policy as a field of unit arrows.

    For each point of a regular grid over the state space, the arrow shows the
    direction of the action with the highest predicted Q-value. The greedy
    action is used rather than an epsilon-greedy sample so that the picture is
    deterministic and contains no random exploration arrows.

    Args:
        agent: object whose ``model.predict`` maps an array of ``[x, y]`` rows
            to one row of Q-values per state.
        env: object providing ``max_row`` and ``max_col``, the upper bounds of
            the sampled x and y ranges (both ranges start at 0).
        resolution: number of sample points along each axis.
    """
    x = np.linspace(0, env.max_row, resolution)
    y = np.linspace(0, env.max_col, resolution)
    X, Y = np.meshgrid(x, y)

    # One batched prediction for the whole grid.
    states = np.column_stack([X.ravel(), Y.ravel()])
    q_values = agent.model.predict(states, verbose=0)
    actions = np.argmax(q_values, axis=1).reshape(X.shape)

    # Convert the action into a change in x and y (U, V);
    # each action index corresponds to a 10-degree increment.
    angles = np.radians(actions * 10)
    U = np.cos(angles)
    V = np.sin(angles)

    plt.figure(figsize=(8, 6))
    plt.quiver(X, Y, U, V, pivot='mid')
    plt.xlabel('X Position')
    plt.ylabel('Y Position')
    plt.title('Policy Visualization')
    plt.grid(True)
    plt.show()

In [None]:
#@title Agent 1

# Fresh environment and number of training episodes for this agent.
env = World()
n = 50

# Case 1: Monte Carlo
# NOTE(review): TemporalDifference.train never reads lambd, so lambd=1 does not
# by itself produce Monte Carlo returns; with the default on_policy=True this
# run takes the same code path as the lambd=0 on-policy case -- confirm intent.
agent1 = TemporalDifference(
    env, alpha=0.1, gamma=0.9, epsilon=0.1, lambd=1
)
# train returns the per-episode summed rewards and the elapsed time in seconds.
reward1, total_time1 = agent1.train(num_episodes=n)

# After training your agent:
# plot_value_function(agent1, env)

# Learning curve: summed reward of each episode against episode index.
plt.plot(reward1)
plt.xlabel('Episode')
plt.ylabel('Cumulative Reward')
plt.title('Agent1 Learning Over Time')
plt.show()

In [None]:
# Quiver plot of the action agent1 selects at each point of the default 20 x 20 grid.
plot_policy(agent1, env)

In [None]:
#@title Agent 2

# Fresh environment and number of training episodes for this agent.
env = World()
n = 50

## Case 2: SARSA
# on_policy defaults to True in train, i.e. the target bootstraps from the
# Q-value of an epsilon-greedy next action.
agent2 = TemporalDifference(
    env, alpha=0.1, gamma=0.9, epsilon=0.1, lambd=0
)
# train returns the per-episode summed rewards and the elapsed time in seconds.
reward2, total_time2 = agent2.train(num_episodes=n)

# plot_value_function(agent2, env)

# Learning curve: summed reward of each episode against episode index.
plt.plot(reward2)
plt.xlabel('Episode')
plt.ylabel('Cumulative Reward')
plt.title('Agent2 Learning Over Time')
plt.show()

In [None]:
# Quiver plot of the action agent2 selects at each point of the default 20 x 20 grid.
plot_policy(agent2, env)

In [None]:
#@title Agent 3

# Fresh environment and number of training episodes for this agent.
env = World()
n = 50

## Case 3: Q-Learning
agent3 = TemporalDifference(
    env, alpha=0.1, gamma=0.9, epsilon=0.1, lambd=0
)
# on_policy=False makes the target bootstrap from the maximum next Q-value.
reward3, total_time3 = agent3.train(num_episodes=n, on_policy=False)

# Learning curve: summed reward of each episode against episode index.
plt.plot(reward3)
plt.xlabel('Episode')
plt.ylabel('Cumulative Reward')
plt.title('Agent3 Learning Over Time')
plt.show()

In [None]:
# Quiver plot of the action agent3 selects at each point of the default 20 x 20 grid.
plot_policy(agent3, env)

In [None]:
#@title Agent 4

# Fresh environment and number of training episodes for this agent.
env = World()
n = 50

## Case 4: General TD(λ)
# NOTE(review): TemporalDifference.train never reads lambd, so lambd=0.5 has no
# effect on the updates; with the default on_policy=True this run takes the
# same code path as the lambd=0 on-policy case -- confirm intent.
agent4 = TemporalDifference(
    env, alpha=0.1, gamma=0.9, epsilon=0.1, lambd=0.5
)
# train returns the per-episode summed rewards and the elapsed time in seconds.
reward4, total_time4 = agent4.train(num_episodes=n)

# plot_value_function(agent4, env)

# Learning curve: summed reward of each episode against episode index.
plt.plot(reward4)
plt.xlabel('Episode')
plt.ylabel('Cumulative Reward')
plt.title('Agent4 Learning Over Time')
plt.show()

In [None]:
# Quiver plot of the action agent4 selects at each point of the default 20 x 20 grid.
plot_policy(agent4, env)

In [None]:
# Elapsed training time in seconds, as returned by train, for agent1 and agent4.
print(total_time1, total_time4)

4789.866044521332 5033.943603038788


In [None]:
# Elapsed training time in seconds, as returned by train, for agent2 and agent3.
print(total_time2, total_time3)

4144.591289520264 3684.470736026764
