In [1]:
import numpy as np

In [4]:
# Lab 1-1
class EpsilonGreedyAgent:
  def __init__(self, num_actions, epsilon = 0.1):
    self.num_actions = num_actions
    self.epsilon = epsilon
    self.action_values = np.zeros(num_actions)
    self.action_counts = np.zeros(num_actions)

  def select_action(self):
    if np.random.rand() < self.epsilon:
      action = np.random.randint(self.num_actions)
    else:
      action = np.argmax(self.action_values)
    return action

  def update_value(self, action, reward):
    self.action_counts[action] += 1
    self.action_values[action] += (1/self.action_counts[action]) * (reward - self.action_values[action])

class MultiArmedBandit:
  def __init__(self, num_arms):
    self.num_arms = num_arms
    self.true_action_values = np.random.normal(0, 1, num_arms)

  def get_reward(self, action):
    return np.random.normal(self.true_action_values[action], 1)

num_arms = 10
num_steps = 1000
agent = EpsilonGreedyAgent(num_arms)
bandit = MultiArmedBandit(num_arms)
total_rewards = 0
for step in range(num_steps):
  action = agent.select_action()
  reward = bandit.get_reward(action)
  agent.update_value(action, reward)
  total_rewards += reward

print('Total rewards obtained: ', total_rewards)
print('Estimated action values: ', agent.action_values)

Total rewards obtained:  1246.5683305227785
Estimated action values:  [ 0.21277904  0.21784347 -0.74642682 -0.49637256 -0.69671679 -2.60967309
 -0.21101     1.69497297 -0.80593511  1.30264879]


In [8]:
# Lab 1-2
class GridWorld:
  def __init__(self):
    self.grid_size = (3, 3)
    self.num_actions = 4
    self.rewards = np.array([
        [0, 0, 0],
        [0, 0, 0],
        [0, 1, 0]
    ])

  def get_reward(self, state):
    return self.rewards[state[0], state[1]]

class ValueFunction:
  def __init__(self, grid_size):
    self.values = np.zeros(grid_size)

  def update_value(self, state, new_value):
    self.values[state[0], state[1]] = new_value

  def get_value(self, state):
    return self.values[state[0], state[1]]

grid_world = GridWorld()
value_function = ValueFunction(grid_world.grid_size)
for i in range(grid_world.grid_size[0]):
  for j in range(grid_world.grid_size[1]):
    state = (i, j)
    value_function.update_value(state, grid_world.get_reward(state))

print('Initial Value Function:')
print(value_function.values)

Initial Value Function:
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 1. 0.]]
