In [1]:
import numpy as np
import gym
import math
from random import randint
import random

In [2]:
# Shared environment instance; every agent class and loop below reads this global.
env = gym.make('MountainCar-v0')

In [None]:
# Size of the discrete action space (the Q-tables below use 3 as their last dimension).
env.action_space.n

3

In [None]:
# Upper bound of each observation component; used by get_state() for discretisation.
env.observation_space.high

array([0.6 , 0.07], dtype=float32)

In [None]:
# Lower bound of each observation component; used by get_state() for discretisation.
env.observation_space.low

array([-1.2 , -0.07], dtype=float32)

# Exercise 1: Prelims

In [None]:
class MC():
  """First-visit Monte Carlo control agent with a tabular Q function.

  Observations are discretised onto a ``cluster`` grid; action values are
  kept in ``self.q`` and the number of returns averaged into each entry in
  ``self.c``.  The notebook-level ``env`` supplies the observation bounds,
  random actions and environment steps.
  """

  def __init__(self):
    self.q = np.zeros((10,10,3))  # action values: 10x10 state grid, 3 actions
    self.c = np.zeros((10,10,3))  # visit counts used for the running average
    self.e = 0.1                  # exploration rate of the e-greedy policy
    self.gamma = 1                # discount factor applied to the returns
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]        # number of bins per observation component
    self.history = []             # total (undiscounted) reward of each episode

  def get_state(self,obs):
    """Map a continuous observation to a list of grid indices.

    Each component is scaled to [0, 1] with the stored bounds, rounded to
    the nearest bin and clamped into ``[0, cluster[i] - 1]``.
    """
    ratio = [(obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i]) for i in range(len(obs))]
    state = [int(round((self.cluster[i] - 1)*ratio[i])) for i in range(len(obs))]
    state = [min(self.cluster[i] - 1, max(0, state[i])) for i in range(len(obs))]
    return state

  def act(self,s):
    """Return the greedy action for the discretised state ``s`` (a tuple)."""
    a = np.argmax(self.q[s])
    return a

  def e_greedy(self,s):
    """Return a random action with probability ``self.e``, else the greedy one."""
    if np.random.random() < self.e:
      a = env.action_space.sample()
    else:
      a = np.argmax(self.q[s])
    return a

  def mc_sampling(self, ini_s, episode):
    """Run one episode from observation ``ini_s`` and apply first-visit updates.

    Args:
      ini_s: initial observation returned by ``env.reset()``.
      episode: episode index; not used, kept for caller compatibility.

    The episode's total reward is appended to ``self.history``.  Each
    (state, action) pair is updated once, with the discounted return that
    follows its first occurrence in the episode.
    """
    G = 0
    records = []
    first_visit = {}  # (state, action) -> index of its first occurrence
    obs = ini_s
    done = False
    while not done:
      s = tuple(self.get_state(obs))
      a = self.e_greedy(s)
      obs, r, done, _ = env.step(a)
      G += r
      first_visit.setdefault((s, a), len(records))
      records.append((s, a, r))
    self.history.append(G)
    # Walk the episode backwards so the return is built incrementally:
    # G_t = r_t + gamma * G_{t+1}.
    back_G = 0
    for i in range(len(records) - 1, -1, -1):
      s, a, r = records[i]
      back_G = r + self.gamma * back_G
      if first_visit[(s, a)] == i:
        self.update(s, a, back_G)

  def update(self,s,a,G):
    """Fold return ``G`` into the running average stored at ``q[s][a]``."""
    self.q[s][a] = (self.q[s][a] * self.c[s][a] + G) / (self.c[s][a] + 1)
    self.c[s][a] += 1


In [None]:
# Train the Monte Carlo agent: each call plays one full episode from a fresh
# reset and applies the first-visit updates.
agent = MC()
for epicode_count in range(500000):
  agent.mc_sampling(env.reset(), epicode_count)

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("MC | average score: {}".format(np.mean(test)))

MC | average score: -120.4


# Task 1: TD(0)

In [None]:
class SARSA():
  """Tabular one-step SARSA agent on a discretised observation grid.

  Action values live in ``self.q``.  The notebook-level ``env`` supplies the
  observation bounds and random actions.
  """

  def __init__(self):
    self.q = np.zeros((10,10,3))            # action values: 10x10 state grid, 3 actions
    self.maxstep = 500                      # not read anywhere in this class
    self.lr = np.linspace(1.0, 0.02, 1000)  # per-episode learning-rate schedule
    self.e = 0.1                            # initial exploration rate
    self.gamma = 1                          # discount factor
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]                  # number of bins per observation component
    self.history = []                       # filled by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of grid indices.

    Each component is scaled to [0, 1] with the stored bounds, rounded to
    the nearest bin and clamped into ``[0, cluster[i] - 1]``.
    """
    ratio = [(obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i]) for i in range(len(obs))]
    state = [int(round((self.cluster[i] - 1)*ratio[i])) for i in range(len(obs))]
    state = [min(self.cluster[i] - 1, max(0, state[i])) for i in range(len(obs))]
    return state

  def Decay(self, episodas):
    """Return ``(learning_rate, epsilon)`` for episode index ``episodas``.

    The learning rate comes from the ``self.lr`` schedule; indices past the
    end of the schedule reuse its last entry instead of raising IndexError.
    Epsilon decays by a factor 0.9 every 100 episodes.
    """
    lr_index = min(episodas, len(self.lr) - 1)
    return self.lr[lr_index], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Return a random action with probability ``e``, else the greedy one."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def update(self,s,a,s1,a1,r,lr):
    """Move ``q[s][a]`` towards the SARSA target ``r + gamma * q[s1][a1]``."""
    self.q[s][a] += lr * (r + self.gamma * self.q[s1][a1] - self.q[s][a])

  def act(self, state):
    """Return the greedy action for the discretised ``state`` (a tuple)."""
    return np.argmax(self.q[state])

In [None]:
# Training budget and a fresh SARSA agent (rebinds the global `agent`).
n_episodes = 1000
agent = SARSA()

In [None]:
# SARSA training: one e-greedy episode per iteration, updating q after every step.
for episodas in range(n_episodes):
  obs = env.reset()
  # Per-episode learning rate and exploration rate.
  lr,e = agent.Decay(episodas=episodas)
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s,e)
  G = 0  # running total reward of this episode
  while True:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = tuple(agent.get_state(obs))
    # The next action is drawn before the update because SARSA's target uses it.
    a1 = agent.e_greedy(s1,e)
    # NOTE(review): the update runs before the `done` check, so the last
    # transition still bootstraps from q[s1][a1]. If `done` can mean the goal
    # was reached (not just a time limit), the usual target there is `r` alone - confirm.
    agent.update(s,a,s1,a1,r,lr)
    if done:
      agent.history.append(G)
      print("Episoda: {} | Final return: {}".format(episodas,G))
      break
    s,a = s1,a1

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("SARSA | average score: {}".format(np.mean(test)))

SARSA | average score: -165.2


In [None]:
class Expect_SARSA():
  """Tabular Expected SARSA agent on a discretised observation grid.

  Action values live in ``self.q``.  The notebook-level ``env`` supplies the
  observation bounds and random actions.
  """

  def __init__(self):
    self.q = np.zeros((10,10,3))            # action values: 10x10 state grid, 3 actions
    self.maxstep = 500                      # not read anywhere in this class
    self.lr = np.linspace(1.0, 0.02, 1000)  # per-episode learning-rate schedule
    self.e = 0.1                            # initial exploration rate
    self.gamma = 1                          # discount factor
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]                  # number of bins per observation component
    self.history = []                       # filled by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of grid indices.

    Each component is scaled to [0, 1] with the stored bounds, rounded to
    the nearest bin and clamped into ``[0, cluster[i] - 1]``.
    """
    ratio = [(obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i]) for i in range(len(obs))]
    state = [int(round((self.cluster[i] - 1)*ratio[i])) for i in range(len(obs))]
    state = [min(self.cluster[i] - 1, max(0, state[i])) for i in range(len(obs))]
    return state

  def Decay(self, episodas):
    """Return ``(learning_rate, epsilon)`` for episode index ``episodas``.

    Indices past the end of the ``self.lr`` schedule reuse its last entry
    instead of raising IndexError.  Epsilon decays by 0.9 every 100 episodes.
    """
    lr_index = min(episodas, len(self.lr) - 1)
    return self.lr[lr_index], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Return a random action with probability ``e``, else the greedy one."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def update(self,e,s,a,s1,r,lr):
    """Move ``q[s][a]`` towards ``r + gamma * E_pi[q[s1]]``.

    The expectation is taken under the e-greedy policy with exploration
    rate ``e``.  The number of actions is read from the Q-table, so the
    method does not depend on a hard-coded action count.
    """
    q_next = self.q[s1]
    n_actions = q_next.shape[0]
    # e-greedy distribution: e/n on every action, plus the remaining 1-e on the greedy one.
    probs = np.full(n_actions, e / n_actions)
    probs[np.argmax(q_next)] += 1 - e
    expect = np.dot(probs, q_next)
    self.q[s][a] += lr * (r + self.gamma * expect - self.q[s][a])

  def act(self, state):
    """Return the greedy action for the discretised ``state`` (a tuple)."""
    return np.argmax(self.q[state])

In [None]:
# Training budget and a fresh Expected SARSA agent (rebinds the global `agent`).
n_episodes = 1000
agent = Expect_SARSA()

In [None]:
# Expected SARSA training: one e-greedy episode per iteration, updating q after every step.
for episodas in range(n_episodes):
  obs = env.reset()
  # Per-episode learning rate and exploration rate.
  lr,e = agent.Decay(episodas=episodas)
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s,e)
  G = 0  # running total reward of this episode
  while True:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = tuple(agent.get_state(obs))
    # a1 is only the next behaviour action; the update itself does not take it.
    a1 = agent.e_greedy(s1,e)
    # NOTE(review): the update runs before the `done` check, so the last
    # transition still bootstraps from q[s1]. If `done` can mean the goal was
    # reached (not just a time limit), the usual target there is `r` alone - confirm.
    agent.update(e,s,a,s1,r,lr)
    if done:
      agent.history.append(G)
      print("Episoda: {} | Final return: {}".format(episodas,G))
      break
    s,a = s1,a1

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("Expected SARSA | average score: {}".format(np.mean(test)))

Expected SARSA | average score: -162.5


In [None]:
class off_Qlearn():
  """Tabular off-policy Q-learning agent on a discretised observation grid.

  Action values live in ``self.q``.  The notebook-level ``env`` supplies the
  observation bounds and random actions.
  """

  def __init__(self):
    bounds = env.observation_space
    self.q = np.zeros((10,10,3))            # action values: 10x10 state grid, 3 actions
    self.lr = np.linspace(1.0, 0.02, 1000)  # per-episode learning-rate schedule
    self.e = 0.1                            # fixed exploration rate
    self.gamma = 1                          # discount factor
    self.upper_bound = [bounds.high[0],bounds.high[1]]
    self.lower_bound = [bounds.low[0],bounds.low[1]]
    self.cluster = [10,10]                  # number of bins per observation component
    self.history = []                       # filled by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of clamped grid indices."""
    state = []
    for i in range(len(obs)):
      low, high = self.lower_bound[i], self.upper_bound[i]
      top_bin = self.cluster[i] - 1
      # Scale to [0, 1], round to the nearest bin, then clamp into the grid.
      ratio = (obs[i] - low) / (high - low)
      index = int(round(top_bin*ratio))
      state.append(min(top_bin, max(0, index)))
    return state

  def e_greedy(self, state):
    """Return a random action with probability ``self.e``, else the greedy one."""
    if np.random.random() < self.e:
      return env.action_space.sample()
    return np.argmax(self.q[state])

  def update(self,s,a,s1,r,lr):
    """Move ``q[s][a]`` towards the target ``r + gamma * max_a' q[s1][a']``."""
    td_target = r + self.gamma * np.max(self.q[s1])
    self.q[s][a] += lr * (td_target - self.q[s][a])

  def act(self, state):
    """Return the greedy action for the discretised ``state`` (a tuple)."""
    greedy_action = np.argmax(self.q[state])
    return greedy_action

In [None]:
# Training budget and a fresh Q-learning agent (rebinds the global `agent`).
n_episodes = 1000
agent = off_Qlearn()

In [None]:
# Q-learning training: one e-greedy episode per iteration, updating q after every step.
for episodas in range(n_episodes):
  obs = env.reset()
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s)
  G = 0  # running total reward of this episode
  # Learning rate comes straight from the agent's schedule; it raises
  # IndexError if n_episodes exceeds the schedule length.
  lr = agent.lr[episodas]
  while True:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = tuple(agent.get_state(obs))
    # a1 is drawn before the update and reused as the next behaviour action.
    a1 = agent.e_greedy(s1)
    # NOTE(review): the update runs before the `done` check, so the last
    # transition still bootstraps from max q[s1]. If `done` can mean the goal
    # was reached (not just a time limit), the usual target there is `r` alone - confirm.
    agent.update(s,a,s1,r,lr)
    if done:
      agent.history.append(G)
      print("Episoda: {} | Final return: {}".format(episodas,G))
      break
    s,a = s1,a1

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("off Qlearn | average score: {}".format(np.mean(test)))

off Qlearn | average score: -159.0


# Task 2: TD(2), TD(3), TD(4)

In [None]:
class TD2():
  """Tabular 2-step SARSA agent on a discretised observation grid.

  Action values live in ``self.q``.  The notebook-level ``env`` supplies the
  observation bounds and random actions.
  """

  def __init__(self):
    self.q = np.zeros((10,10,3))            # action values: 10x10 state grid, 3 actions
    self.maxstep = 500                      # not read anywhere in this class
    self.lr = np.linspace(1.0, 0.02, 1000)  # per-episode learning-rate schedule
    self.e = 0.1                            # initial exploration rate
    self.gamma = 1                          # discount factor
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]                  # number of bins per observation component
    self.history = []                       # filled by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of grid indices.

    Each component is scaled to [0, 1] with the stored bounds, rounded to
    the nearest bin and clamped into ``[0, cluster[i] - 1]``.
    """
    ratio = [(obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i]) for i in range(len(obs))]
    state = [int(round((self.cluster[i] - 1)*ratio[i])) for i in range(len(obs))]
    state = [min(self.cluster[i] - 1, max(0, state[i])) for i in range(len(obs))]
    return state

  def Decay(self, episodas):
    """Return ``(learning_rate, epsilon)`` for episode index ``episodas``.

    Indices past the end of the ``self.lr`` schedule reuse its last entry
    instead of raising IndexError.  Epsilon decays by 0.9 every 100 episodes.
    """
    lr_index = min(episodas, len(self.lr) - 1)
    return self.lr[lr_index], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Return a random action with probability ``e``, else the greedy one."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def update(self,s,a,s2,a2,r,r1,lr):
    """Move ``q[s][a]`` towards the 2-step return.

    Target: ``r + gamma * r1 + gamma**2 * q[s2][a2]``, where ``r`` and ``r1``
    are the two rewards following ``(s, a)`` and ``(s2, a2)`` is the pair
    reached two steps later.
    """
    g = self.gamma
    target = r + g * r1 + g ** 2 * self.q[s2][a2]
    self.q[s][a] += lr * (target - self.q[s][a])

  def act(self, state):
    """Return the greedy action for the discretised ``state`` (a tuple)."""
    return np.argmax(self.q[state])

In [None]:
# 2-step SARSA training. A two-transition window (s,a,r) -> (s1,a1,r1) -> (s2,a2)
# slides along the episode; each iteration updates the oldest pair (s,a).
n_episodes = 1000
agent = TD2()
for episodas in range(n_episodes):
  obs = env.reset()
  # Per-episode learning rate and exploration rate.
  lr,e = agent.Decay(episodas=episodas)
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s,e)
  # Warm-up step to fill the window. `done` from this step is overwritten
  # below without being checked - presumably an episode never ends this early.
  obs, r, done, _ = env.step(a)
  G = r  # running total reward of this episode
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1,e)
  while True:
    obs, r1, done, _ = env.step(a1)
    G += r1
    s2 = tuple(agent.get_state(obs))
    a2 = agent.e_greedy(s2,e)
    agent.update(s,a,s2,a2,r,r1,lr)
    if done:
      # NOTE(review): the loop stops right after updating (s,a), so the pair
      # still in the window, (s1,a1), gets no update for this visit, and the
      # last update bootstraps from the post-`done` state - confirm intended.
      agent.history.append(G)
      print("Episoda: {} | Final return: {}".format(episodas,G))
      break
    # Slide the window forward by one transition.
    s,a = s1,a1
    s1,a1 = s2,a2
    r = r1

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("TD2 | average score: {}".format(np.mean(test)))

TD2 | average score: -143.5


In [None]:
class TD3():
  """Tabular 3-step SARSA agent on a discretised observation grid.

  Action values live in ``self.q``.  The notebook-level ``env`` supplies the
  observation bounds and random actions.
  """

  def __init__(self):
    self.q = np.zeros((10,10,3))            # action values: 10x10 state grid, 3 actions
    self.maxstep = 500                      # not read anywhere in this class
    self.lr = np.linspace(1.0, 0.02, 1000)  # per-episode learning-rate schedule
    self.e = 0.1                            # initial exploration rate
    self.gamma = 1                          # discount factor
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]                  # number of bins per observation component
    self.history = []                       # filled by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of grid indices.

    Each component is scaled to [0, 1] with the stored bounds, rounded to
    the nearest bin and clamped into ``[0, cluster[i] - 1]``.
    """
    ratio = [(obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i]) for i in range(len(obs))]
    state = [int(round((self.cluster[i] - 1)*ratio[i])) for i in range(len(obs))]
    state = [min(self.cluster[i] - 1, max(0, state[i])) for i in range(len(obs))]
    return state

  def Decay(self, episodas):
    """Return ``(learning_rate, epsilon)`` for episode index ``episodas``.

    Indices past the end of the ``self.lr`` schedule reuse its last entry
    instead of raising IndexError.  Epsilon decays by 0.9 every 100 episodes.
    """
    lr_index = min(episodas, len(self.lr) - 1)
    return self.lr[lr_index], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Return a random action with probability ``e``, else the greedy one."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def update(self,s,a,s3,a3,r,r1,r2,lr):
    """Move ``q[s][a]`` towards the 3-step return.

    Target: ``r + gamma*r1 + gamma**2*r2 + gamma**3*q[s3][a3]``, where
    ``r, r1, r2`` are the three rewards following ``(s, a)`` and
    ``(s3, a3)`` is the pair reached three steps later.
    """
    g = self.gamma
    target = r + g * r1 + g ** 2 * r2 + g ** 3 * self.q[s3][a3]
    self.q[s][a] += lr * (target - self.q[s][a])

  def act(self, state):
    """Return the greedy action for the discretised ``state`` (a tuple)."""
    return np.argmax(self.q[state])

In [None]:
# 3-step SARSA training. A three-transition window slides along the episode;
# each iteration updates the oldest pair (s,a) from r, r1, r2 and q[s3][a3].
n_episodes = 1000
agent = TD3()
for episodas in range(n_episodes):
  obs = env.reset()
  # Per-episode learning rate and exploration rate.
  lr,e = agent.Decay(episodas=episodas)
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s,e)
  # Two warm-up steps to fill the window. `done` from these steps is
  # overwritten without being checked - presumably an episode never ends this early.
  obs, r, done, _ = env.step(a)
  G = r  # running total reward of this episode
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1,e)
  obs, r1, done, _ = env.step(a1)
  G += r1
  s2 = tuple(agent.get_state(obs))
  a2 = agent.e_greedy(s2,e)
  while True:
    obs, r2, done, _ = env.step(a2)
    G += r2
    s3 = tuple(agent.get_state(obs))
    a3 = agent.e_greedy(s3,e)
    agent.update(s,a,s3,a3,r,r1,r2,lr)
    if done:
      # NOTE(review): the loop stops right after updating (s,a), so the pairs
      # still in the window, (s1,a1) and (s2,a2), get no update for this visit,
      # and the last update bootstraps from the post-`done` state - confirm intended.
      agent.history.append(G)
      print("Episoda: {} | Final return: {}".format(episodas,G))
      break
    # Slide the window forward by one transition.
    s,a = s1,a1
    s1,a1 = s2,a2
    s2,a2 = s3,a3
    r = r1
    r1 = r2


Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("TD3 | average score: {}".format(np.mean(test)))

TD3 | average score: -121.5


In [None]:
class TD4():
  """Tabular 4-step SARSA agent on a discretised observation grid.

  Action values live in ``self.q``.  The notebook-level ``env`` supplies the
  observation bounds and random actions.
  """

  def __init__(self):
    self.q = np.zeros((10,10,3))            # action values: 10x10 state grid, 3 actions
    self.maxstep = 500                      # not read anywhere in this class
    self.lr = np.linspace(1.0, 0.02, 1000)  # per-episode learning-rate schedule
    self.e = 0.1                            # initial exploration rate
    self.gamma = 1                          # discount factor
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]                  # number of bins per observation component
    self.history = []                       # filled by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of grid indices.

    Each component is scaled to [0, 1] with the stored bounds, rounded to
    the nearest bin and clamped into ``[0, cluster[i] - 1]``.
    """
    ratio = [(obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i]) for i in range(len(obs))]
    state = [int(round((self.cluster[i] - 1)*ratio[i])) for i in range(len(obs))]
    state = [min(self.cluster[i] - 1, max(0, state[i])) for i in range(len(obs))]
    return state

  def Decay(self, episodas):
    """Return ``(learning_rate, epsilon)`` for episode index ``episodas``.

    Indices past the end of the ``self.lr`` schedule reuse its last entry
    instead of raising IndexError.  Epsilon decays by 0.9 every 100 episodes.
    """
    lr_index = min(episodas, len(self.lr) - 1)
    return self.lr[lr_index], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Return a random action with probability ``e``, else the greedy one."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def update(self,s,a,s4,a4,r,r1,r2,r3,lr):
    """Move ``q[s][a]`` towards the 4-step return.

    Target: ``r + gamma*r1 + gamma**2*r2 + gamma**3*r3 + gamma**4*q[s4][a4]``,
    where ``r .. r3`` are the four rewards following ``(s, a)`` and
    ``(s4, a4)`` is the pair reached four steps later.
    """
    g = self.gamma
    target = r + g * r1 + g ** 2 * r2 + g ** 3 * r3 + g ** 4 * self.q[s4][a4]
    self.q[s][a] += lr * (target - self.q[s][a])

  def act(self, state):
    """Return the greedy action for the discretised ``state`` (a tuple)."""
    return np.argmax(self.q[state])

In [None]:
# 4-step SARSA training. A four-transition window slides along the episode;
# each iteration updates the oldest pair (s,a) from r..r3 and q[s4][a4].
n_episodes = 1000
agent = TD4()
for episodas in range(n_episodes):
  obs = env.reset()
  # Per-episode learning rate and exploration rate.
  lr,e = agent.Decay(episodas=episodas)
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s,e)
  # Three warm-up steps to fill the window. `done` from these steps is
  # overwritten without being checked - presumably an episode never ends this early.
  obs, r, done, _ = env.step(a)
  G = r  # running total reward of this episode
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1,e)
  obs, r1, done, _ = env.step(a1)
  G += r1
  s2 = tuple(agent.get_state(obs))
  a2 = agent.e_greedy(s2,e)
  obs, r2, done, _ = env.step(a2)
  G += r2
  s3 = tuple(agent.get_state(obs))
  a3 = agent.e_greedy(s3,e)
  while True:
    obs, r3, done, _ = env.step(a3)
    G += r3
    s4 = tuple(agent.get_state(obs))
    a4 = agent.e_greedy(s4,e)
    agent.update(s,a,s4,a4,r,r1,r2,r3,lr)
    if done:
      # NOTE(review): the loop stops right after updating (s,a), so the pairs
      # still in the window, (s1,a1)..(s3,a3), get no update for this visit,
      # and the last update bootstraps from the post-`done` state - confirm intended.
      agent.history.append(G)
      print("Episoda: {} | Final return: {}".format(episodas,G))
      break
    # Slide the window forward by one transition.
    s,a = s1,a1
    s1,a1 = s2,a2
    s2,a2 = s3,a3
    s3,a3 = s4,a4
    r = r1
    r1 = r2
    r2 = r3


Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("TD4 | average score: {}".format(np.mean(test)))

TD4 | average score: -145.6


# Comments on comparison

Since the training processes of all the approaches are stochastic, with only 1000 training episodes the performance of the TD(0)- and TD(n)-based agents in the greedy-policy tests was not stable (the final return of the greedy-policy test varies between trials). Therefore, no significant improvement from any specific approach can be confirmed.

# Bonus Task 3: Tree Backup

In [None]:
class TD2_Tree():
  """Tabular 2-step tree-backup agent on a discretised observation grid.

  The backup weights action values by the e-greedy policy probabilities.
  Action values live in ``self.q``.  The notebook-level ``env`` supplies the
  observation bounds and random actions.
  """

  def __init__(self):
    self.q = np.zeros((10,10,3))            # action values: 10x10 state grid, 3 actions
    self.maxstep = 500                      # not read anywhere in this class
    self.lr = np.linspace(1.0, 0.02, 1000)  # per-episode learning-rate schedule
    self.e = 0.1                            # initial exploration rate
    self.gamma = 1                          # discount factor
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]                  # number of bins per observation component
    self.history = []                       # filled by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of grid indices.

    Each component is scaled to [0, 1] with the stored bounds, rounded to
    the nearest bin and clamped into ``[0, cluster[i] - 1]``.
    """
    ratio = [(obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i]) for i in range(len(obs))]
    state = [int(round((self.cluster[i] - 1)*ratio[i])) for i in range(len(obs))]
    state = [min(self.cluster[i] - 1, max(0, state[i])) for i in range(len(obs))]
    return state

  def Decay(self, episodas):
    """Return ``(learning_rate, epsilon)`` for episode index ``episodas``.

    Indices past the end of the ``self.lr`` schedule reuse its last entry
    instead of raising IndexError.  Epsilon decays by 0.9 every 100 episodes.
    """
    lr_index = min(episodas, len(self.lr) - 1)
    return self.lr[lr_index], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Return a random action with probability ``e``, else the greedy one."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def _policy_probs(self, state, e):
    """Return the e-greedy action probabilities for ``state``."""
    q_state = self.q[state]
    n_actions = q_state.shape[0]
    probs = np.full(n_actions, e / n_actions)
    probs[np.argmax(q_state)] += 1 - e
    return probs

  def _expected_value(self, state, e):
    """Return the expectation of ``q[state]`` under the e-greedy policy."""
    return np.dot(self._policy_probs(state, e), self.q[state])

  def _tree_value(self, state, action, G_next, e):
    """Return the tree-backup value of ``state`` given the action taken.

    Actions not taken contribute their current Q estimate; the taken
    ``action`` contributes the deeper return ``G_next`` instead.
    """
    probs = self._policy_probs(state, e)
    not_taken = np.arange(probs.shape[0]) != action
    leaves = np.dot(probs[not_taken], self.q[state][not_taken])
    return leaves + probs[action] * G_next

  def update(self,e,s,a,s1,a1,s2,r,r1,lr):
    """Move ``q[s][a]`` towards the 2-step tree-backup return.

    Args:
      e: exploration rate defining the e-greedy policy used as weights.
      s, a: pair being updated; r is the reward that followed it.
      s1, a1: next state and the action taken there; r1 is its reward.
      s2: state reached after ``a1``; it is fully expanded (expected value).
      lr: learning rate.
    """
    g = self.gamma
    G = r1 + g * self._expected_value(s2, e)
    G = r + g * self._tree_value(s1, a1, G, e)
    self.q[s][a] += lr * (G - self.q[s][a])

  def act(self, state):
    """Return the greedy action for the discretised ``state`` (a tuple)."""
    return np.argmax(self.q[state])

In [None]:
# 2-step tree-backup training. A two-transition window slides along the
# episode; each iteration updates the oldest pair (s,a).
n_episodes = 1000
agent = TD2_Tree()
for episodas in range(n_episodes):
  obs = env.reset()
  # Per-episode learning rate and exploration rate.
  lr,e = agent.Decay(episodas=episodas)
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s,e)
  # Warm-up step to fill the window. `done` from this step is overwritten
  # below without being checked - presumably an episode never ends this early.
  obs, r, done, _ = env.step(a)
  total_G = r  # running total reward of this episode
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1,e)
  while True:
    obs, r1, done, _ = env.step(a1)
    total_G += r1
    s2 = tuple(agent.get_state(obs))
    # a2 is only the next behaviour action; the update itself does not take it.
    a2 = agent.e_greedy(s2,e)
    agent.update(e,s,a,s1,a1,s2,r,r1,lr)
    if done:
      # NOTE(review): the loop stops right after updating (s,a), so the pair
      # still in the window, (s1,a1), gets no update for this visit, and the
      # last update bootstraps from the post-`done` state - confirm intended.
      agent.history.append(total_G)
      print("Episoda: {} | Final return: {}".format(episodas,total_G))
      break
    # Slide the window forward by one transition.
    s,a = s1,a1
    s1,a1 = s2,a2
    r = r1

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("TD2_Tree | average score: {}".format(np.mean(test)))

TD2_Tree | average score: -125.3


In [None]:
class TD3_Tree():
  """Tabular 3-step tree-backup agent on a discretised observation grid.

  The backup weights action values by the e-greedy policy probabilities.
  Action values live in ``self.q``.  The notebook-level ``env`` supplies the
  observation bounds and random actions.
  """

  def __init__(self):
    self.q = np.zeros((10,10,3))            # action values: 10x10 state grid, 3 actions
    self.maxstep = 500                      # not read anywhere in this class
    self.lr = np.linspace(1.0, 0.02, 1000)  # per-episode learning-rate schedule
    self.e = 0.1                            # initial exploration rate
    self.gamma = 1                          # discount factor
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]                  # number of bins per observation component
    self.history = []                       # filled by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of grid indices.

    Each component is scaled to [0, 1] with the stored bounds, rounded to
    the nearest bin and clamped into ``[0, cluster[i] - 1]``.
    """
    ratio = [(obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i]) for i in range(len(obs))]
    state = [int(round((self.cluster[i] - 1)*ratio[i])) for i in range(len(obs))]
    state = [min(self.cluster[i] - 1, max(0, state[i])) for i in range(len(obs))]
    return state

  def Decay(self, episodas):
    """Return ``(learning_rate, epsilon)`` for episode index ``episodas``.

    Indices past the end of the ``self.lr`` schedule reuse its last entry
    instead of raising IndexError.  Epsilon decays by 0.9 every 100 episodes.
    """
    lr_index = min(episodas, len(self.lr) - 1)
    return self.lr[lr_index], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Return a random action with probability ``e``, else the greedy one."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def _policy_probs(self, state, e):
    """Return the e-greedy action probabilities for ``state``."""
    q_state = self.q[state]
    n_actions = q_state.shape[0]
    probs = np.full(n_actions, e / n_actions)
    probs[np.argmax(q_state)] += 1 - e
    return probs

  def _expected_value(self, state, e):
    """Return the expectation of ``q[state]`` under the e-greedy policy."""
    return np.dot(self._policy_probs(state, e), self.q[state])

  def _tree_value(self, state, action, G_next, e):
    """Return the tree-backup value of ``state`` given the action taken.

    Actions not taken contribute their current Q estimate; the taken
    ``action`` contributes the deeper return ``G_next`` instead.
    """
    probs = self._policy_probs(state, e)
    not_taken = np.arange(probs.shape[0]) != action
    leaves = np.dot(probs[not_taken], self.q[state][not_taken])
    return leaves + probs[action] * G_next

  def update(self,e,s,a,s1,a1,s2,a2,s3,r,r1,r2,lr):
    """Move ``q[s][a]`` towards the 3-step tree-backup return.

    Args:
      e: exploration rate defining the e-greedy policy used as weights.
      s, a: pair being updated; r is the reward that followed it.
      s1, a1, r1: first intermediate state, action taken there, its reward.
      s2, a2, r2: second intermediate state, action taken there, its reward.
      s3: state reached after ``a2``; it is fully expanded (expected value).
      lr: learning rate.
    """
    g = self.gamma
    # Build the return from the deepest level of the tree back to (s, a).
    G = r2 + g * self._expected_value(s3, e)
    G = r1 + g * self._tree_value(s2, a2, G, e)
    G = r + g * self._tree_value(s1, a1, G, e)
    self.q[s][a] += lr * (G - self.q[s][a])

  def act(self, state):
    """Return the greedy action for the discretised ``state`` (a tuple)."""
    return np.argmax(self.q[state])

In [None]:
# 3-step tree-backup training. A three-transition window slides along the
# episode; each iteration updates the oldest pair (s,a).
n_episodes = 1000
agent = TD3_Tree()
for episodas in range(n_episodes):
  obs = env.reset()
  # Per-episode learning rate and exploration rate.
  lr,e = agent.Decay(episodas=episodas)
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s,e)
  # Two warm-up steps to fill the window. `done` from these steps is
  # overwritten without being checked - presumably an episode never ends this early.
  obs, r, done, _ = env.step(a)
  total_G = r  # running total reward of this episode
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1,e)
  obs, r1, done, _ = env.step(a1)
  total_G += r1
  s2 = tuple(agent.get_state(obs))
  a2 = agent.e_greedy(s2,e)
  while True:
    obs, r2, done, _ = env.step(a2)
    total_G += r2
    s3 = tuple(agent.get_state(obs))
    # a3 is only the next behaviour action; the update itself does not take it.
    a3 = agent.e_greedy(s3,e)
    agent.update(e,s,a,s1,a1,s2,a2,s3,r,r1,r2,lr)
    if done:
      # NOTE(review): the loop stops right after updating (s,a), so the pairs
      # still in the window, (s1,a1) and (s2,a2), get no update for this visit,
      # and the last update bootstraps from the post-`done` state - confirm intended.
      agent.history.append(total_G)
      print("Episoda: {} | Final return: {}".format(episodas,total_G))
      break
    # Slide the window forward by one transition.
    s,a = s1,a1
    s1,a1 = s2,a2
    s2,a2 = s3,a3
    r = r1
    r1 = r2

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Evaluate the learned greedy policy over 10 episodes and report the mean return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  while not done:
    # Pick the greedy action for the current observation, then advance.
    a = agent.act(tuple(agent.get_state(obs)))
    obs, r, done, _ = env.step(a)
    G += r
  test.append(G)
print("TD3_Tree | average score: {}".format(np.mean(test)))

TD3_Tree | average score: -122.6


In [None]:
class TD4_Tree():
  """Tabular 4-step tree-backup agent on a discretised observation grid.

  Each observation component is bucketed into `cluster[i]` bins and action
  values live in `self.q[bin_0, bin_1, action]`. The backup in `update`
  uses an epsilon-greedy target policy derived from the current Q-table.

  The class reads the notebook-level `env` for the observation bounds, the
  number of actions and for sampling exploratory actions.
  """

  def __init__(self):
    self.maxstep = 500   # not read by any method of this class
    # Per-episode learning rates, annealed linearly from 1.0 down to 0.02.
    self.lr = np.linspace(1.0, 0.02, 1000)
    self.e = 0.1         # initial exploration rate, decayed in Decay()
    self.gamma = 1       # discount factor applied in update()
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]   # number of bins per observation component
    # One Q-value per (bin, bin, action); sized from `cluster` and the
    # environment so the three quantities cannot drift apart.
    self.q = np.zeros(tuple(self.cluster) + (env.action_space.n,))
    self.history = []    # per-episode returns, appended by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of bin indices.

    Each component is rescaled to [0, 1] with the stored bounds, rounded to
    the nearest of `cluster[i]` bins and clamped into the valid index range.
    Callers wrap the result in `tuple(...)` before indexing `self.q`.
    """
    state = []
    for i in range(len(obs)):
      ratio = (obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i])
      index = int(round((self.cluster[i] - 1) * ratio))
      state.append(min(self.cluster[i] - 1, max(0, index)))
    return state

  def Decay(self, episodas):
    """Return `(learning_rate, epsilon)` for episode number `episodas`.

    The learning rate follows the precomputed `self.lr` schedule; episodes
    past the end of the schedule reuse its final value instead of raising
    IndexError. Epsilon is `self.e * 0.9 ** (episodas / 100)`.
    """
    step = min(episodas, len(self.lr) - 1)
    return self.lr[step], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Behaviour policy: random action with probability `e`, else greedy."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def _target_probs(self, state, e):
    """Epsilon-greedy action probabilities of the target policy in `state`."""
    n_actions = self.q.shape[-1]
    greedy = np.argmax(self.q[state])
    return [(1 - e + e / n_actions) if action == greedy else e / n_actions
            for action in range(n_actions)]

  def _backup(self, G, state, action, e):
    """Back the deeper return `G` up through one tree level at `state`.

    The branch of the action actually taken contributes `G` weighted by its
    target-policy probability; every other action contributes its current
    Q-value weighted by its own probability.
    """
    probs = self._target_probs(state, e)
    value = probs[action] * G
    for other, prob in enumerate(probs):
      if other != action:
        value += prob * self.q[state][other]
    return value

  def update(self,e,s,a,s1,a1,s2,a2,s3,a3,s4,r,r1,r2,r3,lr):
    """Apply one 4-step tree-backup update to `q[s][a]`.

    Args:
      e: epsilon of the epsilon-greedy target policy.
      s, a: state/action pair whose value is updated.
      s1, a1, s2, a2, s3, a3: the following three state/action pairs.
      s4: state reached after taking `a3` in `s3`.
      r, r1, r2, r3: rewards received after `a`, `a1`, `a2`, `a3`.
      lr: learning rate (step size).
    """
    # NOTE(review): q[s4] is bootstrapped from even when s4 ended the
    # episode; the training loop calls update() before checking `done`.
    leaf_probs = self._target_probs(s4, e)
    G = r3 + self.gamma * sum(p * q for p, q in zip(leaf_probs, self.q[s4]))
    # Walk back up the tree from the deepest sampled transition to the first.
    for state, action, reward in ((s3, a3, r2), (s2, a2, r1), (s1, a1, r)):
      G = reward + self.gamma * self._backup(G, state, action, e)
    self.q[s][a] += lr * (G - self.q[s][a])

  def act(self, state):
    """Greedy action for `state` under the current Q-table."""
    return np.argmax(self.q[state])

In [None]:
# Train the 4-step tree-backup agent with an epsilon-greedy behaviour policy.
n_episodes = 1000
agent = TD4_Tree()
for episodas in range(n_episodes):
  obs = env.reset()
  lr, e = agent.Decay(episodas=episodas)

  # Warm-up: take three steps so a full 4-step window exists before the
  # first update. NOTE(review): `done` is not checked during the warm-up.
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s, e)
  obs, r, done, _ = env.step(a)
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1, e)
  obs, r1, done, _ = env.step(a1)
  s2 = tuple(agent.get_state(obs))
  a2 = agent.e_greedy(s2, e)
  obs, r2, done, _ = env.step(a2)
  s3 = tuple(agent.get_state(obs))
  a3 = agent.e_greedy(s3, e)
  total_G = r + r1 + r2

  while True:
    obs, r3, done, _ = env.step(a3)
    total_G += r3
    s4 = tuple(agent.get_state(obs))
    a4 = agent.e_greedy(s4, e)
    agent.update(e, s, a, s1, a1, s2, a2, s3, a3, s4, r, r1, r2, r3, lr)
    if done:
      break
    # Slide the 4-step window forward by one transition.
    s, a, s1, a1, s2, a2, s3, a3 = s1, a1, s2, a2, s3, a3, s4, a4
    r, r1, r2 = r1, r2, r3

  # The loop is only left once the episode has finished.
  agent.history.append(total_G)
  print("Episoda: {} | Final return: {}".format(episodas,total_G))


Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [None]:
# Greedy evaluation of the TD4_Tree agent: play 10 episodes without
# exploration and report the mean undiscounted return.
test = []
for i in range(10):
  obs = env.reset()
  G = 0
  done = False
  s = tuple(agent.get_state(obs))
  a = agent.act(s)
  while not done:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = tuple(agent.get_state(obs))
    # The next action is looked up even after the final step; it is simply
    # never executed once `done` is set.
    a = agent.act(s1)
  test.append(G)
print("TD4_Tree | average score: {}".format(np.mean(test)))

TD4_Tree | average score: -148.5


In [4]:
class off_TD2_Tree():
  """Tabular 2-step tree-backup agent with a greedy target policy.

  Each observation component is bucketed into `cluster[i]` bins and action
  values live in `self.q[bin_0, bin_1, action]`. Behaviour is epsilon-greedy
  (`e_greedy`), while the backup in `update` evaluates the greedy policy.

  The class reads the notebook-level `env` for the observation bounds, the
  number of actions and for sampling exploratory actions.
  """

  def __init__(self):
    self.maxstep = 500   # not read by any method of this class
    # Per-episode learning rates, annealed linearly from 1.0 down to 0.02.
    self.lr = np.linspace(1.0, 0.02, 1000)
    self.e = 0.1         # initial exploration rate, decayed in Decay()
    self.gamma = 1       # discount factor applied in update()
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]   # number of bins per observation component
    # One Q-value per (bin, bin, action); sized from `cluster` and the
    # environment so the three quantities cannot drift apart.
    self.q = np.zeros(tuple(self.cluster) + (env.action_space.n,))
    self.history = []    # per-episode returns, appended by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of bin indices.

    Each component is rescaled to [0, 1] with the stored bounds, rounded to
    the nearest of `cluster[i]` bins and clamped into the valid index range.
    Callers wrap the result in `tuple(...)` before indexing `self.q`.
    """
    state = []
    for i in range(len(obs)):
      ratio = (obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i])
      index = int(round((self.cluster[i] - 1) * ratio))
      state.append(min(self.cluster[i] - 1, max(0, index)))
    return state

  def Decay(self, episodas):
    """Return `(learning_rate, epsilon)` for episode number `episodas`.

    The learning rate follows the precomputed `self.lr` schedule; episodes
    past the end of the schedule reuse its final value instead of raising
    IndexError. Epsilon is `self.e * 0.9 ** (episodas / 100)`.
    """
    step = min(episodas, len(self.lr) - 1)
    return self.lr[step], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Behaviour policy: random action with probability `e`, else greedy."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def _greedy_backup(self, G, state, action):
    """Back the deeper return `G` up through one tree level at `state`.

    Under a greedy target policy the taken action has probability 1 when it
    is the greedy one (so `G` passes through unchanged) and probability 0
    otherwise (so `G` is replaced by the greedy action's current Q-value).
    """
    greedy = np.argmax(self.q[state])
    if action == greedy:
      return G
    return self.q[state][greedy]

  def update(self,e,s,a,s1,a1,s2,r,r1,lr):
    """Apply one 2-step tree-backup update to `q[s][a]`.

    Args:
      e: unused; kept so the call signature matches the on-policy variants.
      s, a: state/action pair whose value is updated.
      s1, a1: the following state/action pair.
      s2: state reached after taking `a1` in `s1`.
      r, r1: rewards received after `a` and `a1`.
      lr: learning rate (step size).
    """
    # NOTE(review): q[s2] is bootstrapped from even when s2 ended the
    # episode; the training loop calls update() before checking `done`.
    G = r1 + self.gamma * np.max(self.q[s2])
    G = r + self.gamma * self._greedy_backup(G, s1, a1)
    self.q[s][a] += lr * (G - self.q[s][a])

  def act(self, state):
    """Greedy action for `state` under the current Q-table."""
    return np.argmax(self.q[state])

In [5]:
# Train the off-policy 2-step tree-backup agent; actions are chosen by an
# epsilon-greedy behaviour policy.
n_episodes = 1000
agent = off_TD2_Tree()
for episodas in range(n_episodes):
  obs = env.reset()
  lr, e = agent.Decay(episodas=episodas)

  # Warm-up: take one step so a full 2-step window exists before the first
  # update. NOTE(review): `done` is not checked during the warm-up.
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s, e)
  obs, r, done, _ = env.step(a)
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1, e)
  total_G = r

  while True:
    obs, r1, done, _ = env.step(a1)
    total_G += r1
    s2 = tuple(agent.get_state(obs))
    a2 = agent.e_greedy(s2, e)
    agent.update(e, s, a, s1, a1, s2, r, r1, lr)
    if done:
      break
    # Slide the 2-step window forward by one transition.
    s, a, s1, a1 = s1, a1, s2, a2
    r = r1

  # The loop is only left once the episode has finished.
  agent.history.append(total_G)
  print("Episoda: {} | Final return: {}".format(episodas,total_G))

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [6]:
class off_TD3_Tree():
  """Tabular 3-step tree-backup agent with a greedy target policy.

  Each observation component is bucketed into `cluster[i]` bins and action
  values live in `self.q[bin_0, bin_1, action]`. Behaviour is epsilon-greedy
  (`e_greedy`), while the backup in `update` evaluates the greedy policy.

  The class reads the notebook-level `env` for the observation bounds, the
  number of actions and for sampling exploratory actions.
  """

  def __init__(self):
    self.maxstep = 500   # not read by any method of this class
    # Per-episode learning rates, annealed linearly from 1.0 down to 0.02.
    self.lr = np.linspace(1.0, 0.02, 1000)
    self.e = 0.1         # initial exploration rate, decayed in Decay()
    self.gamma = 1       # discount factor applied in update()
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]   # number of bins per observation component
    # One Q-value per (bin, bin, action); sized from `cluster` and the
    # environment so the three quantities cannot drift apart.
    self.q = np.zeros(tuple(self.cluster) + (env.action_space.n,))
    self.history = []    # per-episode returns, appended by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of bin indices.

    Each component is rescaled to [0, 1] with the stored bounds, rounded to
    the nearest of `cluster[i]` bins and clamped into the valid index range.
    Callers wrap the result in `tuple(...)` before indexing `self.q`.
    """
    state = []
    for i in range(len(obs)):
      ratio = (obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i])
      index = int(round((self.cluster[i] - 1) * ratio))
      state.append(min(self.cluster[i] - 1, max(0, index)))
    return state

  def Decay(self, episodas):
    """Return `(learning_rate, epsilon)` for episode number `episodas`.

    The learning rate follows the precomputed `self.lr` schedule; episodes
    past the end of the schedule reuse its final value instead of raising
    IndexError. Epsilon is `self.e * 0.9 ** (episodas / 100)`.
    """
    step = min(episodas, len(self.lr) - 1)
    return self.lr[step], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Behaviour policy: random action with probability `e`, else greedy."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def _greedy_backup(self, G, state, action):
    """Back the deeper return `G` up through one tree level at `state`.

    Under a greedy target policy the taken action has probability 1 when it
    is the greedy one (so `G` passes through unchanged) and probability 0
    otherwise (so `G` is replaced by the greedy action's current Q-value).
    """
    greedy = np.argmax(self.q[state])
    if action == greedy:
      return G
    return self.q[state][greedy]

  def update(self,e,s,a,s1,a1,s2,a2,s3,r,r1,r2,lr):
    """Apply one 3-step tree-backup update to `q[s][a]`.

    Args:
      e: unused; kept so the call signature matches the on-policy variants.
      s, a: state/action pair whose value is updated.
      s1, a1, s2, a2: the following two state/action pairs.
      s3: state reached after taking `a2` in `s2`.
      r, r1, r2: rewards received after `a`, `a1`, `a2`.
      lr: learning rate (step size).
    """
    # NOTE(review): q[s3] is bootstrapped from even when s3 ended the
    # episode; the training loop calls update() before checking `done`.
    G = r2 + self.gamma * np.max(self.q[s3])
    # Walk back up the tree from the deepest sampled transition to the first.
    for state, action, reward in ((s2, a2, r1), (s1, a1, r)):
      G = reward + self.gamma * self._greedy_backup(G, state, action)
    self.q[s][a] += lr * (G - self.q[s][a])

  def act(self, state):
    """Greedy action for `state` under the current Q-table."""
    return np.argmax(self.q[state])

In [7]:
# Train the off-policy 3-step tree-backup agent; actions are chosen by an
# epsilon-greedy behaviour policy.
n_episodes = 1000
agent = off_TD3_Tree()
for episodas in range(n_episodes):
  obs = env.reset()
  lr, e = agent.Decay(episodas=episodas)

  # Warm-up: take two steps so a full 3-step window exists before the first
  # update. NOTE(review): `done` is not checked during the warm-up.
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s, e)
  obs, r, done, _ = env.step(a)
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1, e)
  obs, r1, done, _ = env.step(a1)
  s2 = tuple(agent.get_state(obs))
  a2 = agent.e_greedy(s2, e)
  total_G = r + r1

  while True:
    obs, r2, done, _ = env.step(a2)
    total_G += r2
    s3 = tuple(agent.get_state(obs))
    a3 = agent.e_greedy(s3, e)
    agent.update(e, s, a, s1, a1, s2, a2, s3, r, r1, r2, lr)
    if done:
      break
    # Slide the 3-step window forward by one transition.
    s, a, s1, a1, s2, a2 = s1, a1, s2, a2, s3, a3
    r, r1 = r1, r2

  # The loop is only left once the episode has finished.
  agent.history.append(total_G)
  print("Episoda: {} | Final return: {}".format(episodas,total_G))

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [11]:
# Greedy evaluation of the off-policy 3-step agent (`off_TD3_Tree`) trained
# in the preceding cell: play 10 episodes without exploration and report the
# mean undiscounted return.
test = []
for i in range(10):
  obs = env.reset()
  s = tuple(agent.get_state(obs))
  a = agent.act(s)
  G = 0
  while True:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = tuple(agent.get_state(obs))
    a = agent.act(s1)
    if done:
      test.append(G)
      break
# Label with the class that was actually evaluated, so this score cannot be
# mistaken for the on-policy TD3_Tree result reported earlier.
print("off_TD3_Tree | average score: {}".format(np.mean(test)))

TD3_Tree | average score: -138.9


In [12]:
class off_TD4_Tree():
  """Tabular 4-step tree-backup agent with a greedy target policy.

  Each observation component is bucketed into `cluster[i]` bins and action
  values live in `self.q[bin_0, bin_1, action]`. Behaviour is epsilon-greedy
  (`e_greedy`), while the backup in `update` evaluates the greedy policy.

  The class reads the notebook-level `env` for the observation bounds, the
  number of actions and for sampling exploratory actions.
  """

  def __init__(self):
    self.maxstep = 500   # not read by any method of this class
    # Per-episode learning rates, annealed linearly from 1.0 down to 0.02.
    self.lr = np.linspace(1.0, 0.02, 1000)
    self.e = 0.1         # initial exploration rate, decayed in Decay()
    self.gamma = 1       # discount factor applied in update()
    self.upper_bound = [env.observation_space.high[0],env.observation_space.high[1]]
    self.lower_bound = [env.observation_space.low[0],env.observation_space.low[1]]
    self.cluster = [10,10]   # number of bins per observation component
    # One Q-value per (bin, bin, action); sized from `cluster` and the
    # environment so the three quantities cannot drift apart.
    self.q = np.zeros(tuple(self.cluster) + (env.action_space.n,))
    self.history = []    # per-episode returns, appended by the training loop

  def get_state(self,obs):
    """Map a continuous observation to a list of bin indices.

    Each component is rescaled to [0, 1] with the stored bounds, rounded to
    the nearest of `cluster[i]` bins and clamped into the valid index range.
    Callers wrap the result in `tuple(...)` before indexing `self.q`.
    """
    state = []
    for i in range(len(obs)):
      ratio = (obs[i] - self.lower_bound[i]) / (self.upper_bound[i] - self.lower_bound[i])
      index = int(round((self.cluster[i] - 1) * ratio))
      state.append(min(self.cluster[i] - 1, max(0, index)))
    return state

  def Decay(self, episodas):
    """Return `(learning_rate, epsilon)` for episode number `episodas`.

    The learning rate follows the precomputed `self.lr` schedule; episodes
    past the end of the schedule reuse its final value instead of raising
    IndexError. Epsilon is `self.e * 0.9 ** (episodas / 100)`.
    """
    step = min(episodas, len(self.lr) - 1)
    return self.lr[step], self.e * np.power(0.9, (episodas/100))

  def e_greedy(self, state, e):
    """Behaviour policy: random action with probability `e`, else greedy."""
    if np.random.random() < e:
      action = env.action_space.sample()
    else:
      action = np.argmax(self.q[state])
    return action

  def _greedy_backup(self, G, state, action):
    """Back the deeper return `G` up through one tree level at `state`.

    Under a greedy target policy the taken action has probability 1 when it
    is the greedy one (so `G` passes through unchanged) and probability 0
    otherwise (so `G` is replaced by the greedy action's current Q-value).
    """
    greedy = np.argmax(self.q[state])
    if action == greedy:
      return G
    return self.q[state][greedy]

  def update(self,e,s,a,s1,a1,s2,a2,s3,a3,s4,r,r1,r2,r3,lr):
    """Apply one 4-step tree-backup update to `q[s][a]`.

    Args:
      e: unused; kept so the call signature matches the on-policy variants.
      s, a: state/action pair whose value is updated.
      s1, a1, s2, a2, s3, a3: the following three state/action pairs.
      s4: state reached after taking `a3` in `s3`.
      r, r1, r2, r3: rewards received after `a`, `a1`, `a2`, `a3`.
      lr: learning rate (step size).
    """
    # NOTE(review): q[s4] is bootstrapped from even when s4 ended the
    # episode; the training loop calls update() before checking `done`.
    G = r3 + self.gamma * np.max(self.q[s4])
    # Walk back up the tree from the deepest sampled transition to the first.
    for state, action, reward in ((s3, a3, r2), (s2, a2, r1), (s1, a1, r)):
      G = reward + self.gamma * self._greedy_backup(G, state, action)
    self.q[s][a] += lr * (G - self.q[s][a])

  def act(self, state):
    """Greedy action for `state` under the current Q-table."""
    return np.argmax(self.q[state])

In [13]:
# Train the off-policy 4-step tree-backup agent; actions are chosen by an
# epsilon-greedy behaviour policy.
n_episodes = 1000
agent = off_TD4_Tree()
for episodas in range(n_episodes):
  obs = env.reset()
  lr, e = agent.Decay(episodas=episodas)

  # Warm-up: take three steps so a full 4-step window exists before the
  # first update. NOTE(review): `done` is not checked during the warm-up.
  s = tuple(agent.get_state(obs))
  a = agent.e_greedy(s, e)
  obs, r, done, _ = env.step(a)
  s1 = tuple(agent.get_state(obs))
  a1 = agent.e_greedy(s1, e)
  obs, r1, done, _ = env.step(a1)
  s2 = tuple(agent.get_state(obs))
  a2 = agent.e_greedy(s2, e)
  obs, r2, done, _ = env.step(a2)
  s3 = tuple(agent.get_state(obs))
  a3 = agent.e_greedy(s3, e)
  total_G = r + r1 + r2

  while True:
    obs, r3, done, _ = env.step(a3)
    total_G += r3
    s4 = tuple(agent.get_state(obs))
    a4 = agent.e_greedy(s4, e)
    agent.update(e, s, a, s1, a1, s2, a2, s3, a3, s4, r, r1, r2, r3, lr)
    if done:
      break
    # Slide the 4-step window forward by one transition.
    s, a, s1, a1, s2, a2, s3, a3 = s1, a1, s2, a2, s3, a3, s4, a4
    r, r1, r2 = r1, r2, r3

  # The loop is only left once the episode has finished.
  agent.history.append(total_G)
  print("Episoda: {} | Final return: {}".format(episodas,total_G))

Episoda: 0 | Final return: -200.0
Episoda: 1 | Final return: -200.0
Episoda: 2 | Final return: -200.0
Episoda: 3 | Final return: -200.0
Episoda: 4 | Final return: -200.0
Episoda: 5 | Final return: -200.0
Episoda: 6 | Final return: -200.0
Episoda: 7 | Final return: -200.0
Episoda: 8 | Final return: -200.0
Episoda: 9 | Final return: -200.0
Episoda: 10 | Final return: -200.0
Episoda: 11 | Final return: -200.0
Episoda: 12 | Final return: -200.0
Episoda: 13 | Final return: -200.0
Episoda: 14 | Final return: -200.0
Episoda: 15 | Final return: -200.0
Episoda: 16 | Final return: -200.0
Episoda: 17 | Final return: -200.0
Episoda: 18 | Final return: -200.0
Episoda: 19 | Final return: -200.0
Episoda: 20 | Final return: -200.0
Episoda: 21 | Final return: -200.0
Episoda: 22 | Final return: -200.0
Episoda: 23 | Final return: -200.0
Episoda: 24 | Final return: -200.0
Episoda: 25 | Final return: -200.0
Episoda: 26 | Final return: -200.0
Episoda: 27 | Final return: -200.0
Episoda: 28 | Final return: -2

In [14]:
# Greedy evaluation of the off-policy 4-step agent (`off_TD4_Tree`) trained
# in the preceding cell: play 10 episodes without exploration and report the
# mean undiscounted return.
test = []
for i in range(10):
  obs = env.reset()
  s = tuple(agent.get_state(obs))
  a = agent.act(s)
  G = 0
  while True:
    obs, r, done, _ = env.step(a)
    G += r
    s1 = tuple(agent.get_state(obs))
    a = agent.act(s1)
    if done:
      test.append(G)
      break
# Label with the class that was actually evaluated, so this score cannot be
# mistaken for the on-policy TD4_Tree result reported earlier.
print("off_TD4_Tree | average score: {}".format(np.mean(test)))

TD4_Tree | average score: -126.2
