In [7]:
import numpy as np
from matplotlib import pyplot as plt

In [8]:
class Agent:
    """Base class providing epsilon-greedy action selection over one Q-table.

    The class defines no constructor; ``choose_action`` reads ``self.epsilon``,
    ``self.Q`` and ``self.action_space``, so subclasses (or callers) must set
    those attributes before calling it.
    """

    def choose_action(self, state):
        """Pick an action for ``state``.

        A uniform draw from [0, 1) is compared with ``self.epsilon``: when the
        draw is smaller, the action comes from ``self.action_space.sample()``;
        otherwise it is the arg-max of ``self.Q[state]``.
        """
        explore = np.random.uniform(0, 1) < self.epsilon
        if not explore:
            # Greedy branch: index of the largest value stored for this state.
            return np.argmax(self.Q[state])
        return self.action_space.sample()
class DoubleLearningAgent:
    """Base class providing epsilon-greedy action selection over two Q-tables.

    The class defines no constructor; ``choose_action`` reads ``self.epsilon``,
    ``self.Q1``, ``self.Q2`` and ``self.action_space``, so those attributes must
    be set before calling it.
    """

    def choose_action(self,state):
        """Return an action for ``state``.

        A uniform draw from [0, 1) is compared with ``self.epsilon``: when the
        draw is smaller, the action comes from ``self.action_space.sample()``;
        otherwise it is the arg-max of ``self.Q1[state] + self.Q2[state]``.
        The ``+`` must be element-wise (e.g. NumPy rows) for the arg-max to
        index an action.
        """
        if np.random.uniform(0,1) < self.epsilon:
            action = self.action_space.sample()
        else:
            action = np.argmax(self.Q1[state]+self.Q2[state])
        # The original fell off the end here and always returned None.
        return action

In [9]:
class Sarsa(Agent):
    """Agent whose update bootstraps from the value of the supplied next action.

    States are expected to be tuples: the update builds a Q-table index by
    appending the action to the state tuple.
    """

    def __init__(self, epsilon, alpha, gamma, Q, action_space):
        """Store the exploration rate, step size, discount, Q-table and action space."""
        self.epsilon, self.alpha, self.gamma = epsilon, alpha, gamma
        self.Q = Q  # kept by reference; update() modifies it in place
        self.action_space = action_space

    def update(self, prev_state, prev_action, reward, next_state, next_action):
        """Move Q[prev_state, prev_action] toward reward + gamma * Q[next_state, next_action]."""
        current = prev_state + (prev_action,)
        successor = next_state + (next_action,)
        td_error = reward + self.gamma * self.Q[successor] - self.Q[current]
        self.Q[current] += self.alpha * td_error
class QLearner(Agent):
    """Agent whose update bootstraps from the largest value stored for the next state.

    States are expected to be tuples: the update builds a Q-table index by
    appending the action to the state tuple.
    """

    def __init__(self, epsilon, alpha, gamma, Q, action_space):
        """Store the exploration rate, step size, discount, Q-table and action space."""
        self.epsilon, self.alpha, self.gamma = epsilon, alpha, gamma
        self.Q = Q  # kept by reference; update() modifies it in place
        self.action_space = action_space

    def update(self, prev_state, prev_action, reward, next_state, next_action):
        """Move Q[prev_state, prev_action] toward reward + gamma * max(Q[next_state]).

        ``next_action`` is accepted so the signature matches ``Sarsa.update``;
        it is not used in the computation.
        """
        current = prev_state + (prev_action,)
        best_next = np.amax(self.Q[next_state])
        td_error = reward + self.gamma * best_next - self.Q[current]
        self.Q[current] += self.alpha * td_error

In [10]:
import gym
import math
# Create the environment. The bare `env.observation_space` expression that used
# to follow was dropped: in the middle of a cell it displays nothing and has no
# effect. Inspect it in its own cell if needed.
env = gym.make("MountainCar-v0")

# Number of discrete actions; used as the last axis of the Q-table below.
NUM_ACTIONS = env.action_space.n

# Hyperparameters for the training run.
MIN_EXPLORE_RATE = 0.01     # lower bound returned by get_explore_rate
MIN_LEARNING_RATE = 0.2     # lower bound returned by get_learning_rate
NUM_BUCKETS = (180, 14)     # Q-table extent along the (position, velocity) axes
NUM_EPISODES = 1000
MAX_STEPS = 100             # not referenced by the visible cells


def get_explore_rate(t):
    """Exploration rate for step/episode ``t``: 1 - log10((t + 1) / 25), kept within [MIN_EXPLORE_RATE, 1]."""
    decayed = 1.0 - math.log10((t + 1) / 25)
    capped = min(1, decayed)
    return max(MIN_EXPLORE_RATE, capped)


def get_learning_rate(t):
    """Learning rate for step/episode ``t``: 1 - log10((t + 1) / 25), kept within [MIN_LEARNING_RATE, 0.5]."""
    decayed = 1.0 - math.log10((t + 1) / 25)
    capped = min(0.5, decayed)
    return max(MIN_LEARNING_RATE, capped)

# Zero-initialised action-value table: one axis per state bucket dimension,
# followed by one axis over the actions.
q = np.zeros((*NUM_BUCKETS, NUM_ACTIONS))


def get_bucket(state, num_buckets=None):
    """Convert a continuous ``(position, velocity)`` observation into Q-table indices.

    Position is shifted by 1.2 and rounded to one decimal, velocity is shifted
    by 0.07 and rounded to two decimals; both are then scaled by 100.
    (The shifts presumably match the environment's lower observation bounds —
    confirm against the MountainCar documentation.)

    Args:
        state: Two-element sequence ``(position, velocity)``.
        num_buckets: Optional ``(n_position, n_velocity)`` table extent used to
            clamp the result. Defaults to the module-level ``NUM_BUCKETS``.

    Returns:
        Tuple ``(pos_idx, vel_idx)`` of ints, each within ``[0, n - 1]``.
    """
    if num_buckets is None:
        num_buckets = NUM_BUCKETS
    pos, vel = state
    # Round to the nearest integer after scaling: plain int() truncates, so a
    # product such as 28.999999999999996 would land one bucket too low.
    raw_indices = (
        int(round(round(pos + 1.2, 1) * 100)),
        int(round(round(vel + .07, 2) * 100)),
    )
    # Clamp to the table: the unclamped values can reach 180 and 14, which are
    # one past the last valid index of a (180, 14) table.
    return tuple(
        min(max(idx, 0), size - 1)
        for idx, size in zip(raw_indices, num_buckets)
    )


# NOTE: despite its name this variable holds a QLearner. The name is kept so
# that any other cell referring to `sarsaAgent` keeps working.
sarsaAgent = QLearner(1,1,1,q,env.action_space)

agents = [sarsaAgent]

try:
    for agent in agents:
        # The constructor arguments above are placeholders; the real
        # hyperparameters are assigned here before training starts.
        agent.alpha = get_learning_rate(0)
        agent.epsilon = get_explore_rate(0)
        agent.gamma = 0.999

        for episode in range(NUM_EPISODES):
            obv = env.reset()
            state = get_bucket(obv)
            # Choose the first action before the loop. From then on, the action
            # handed to update() as `next_action` is the one executed on the
            # following step. An on-policy learner such as Sarsa needs this;
            # QLearner ignores `next_action` in its update, so it still learns
            # from the greedy target.
            action = agent.choose_action(state)
            done = False
            while not done:
                env.render()
                obv,reward,done,_ = env.step(action)
                next_state = get_bucket(obv)
                next_action = agent.choose_action(next_state)
                agent.update(state,action,reward,next_state,next_action)
                state, action = next_state, next_action

            # Decay the step size and exploration rate once per episode.
            agent.alpha = get_learning_rate(episode)
            agent.epsilon = get_explore_rate(episode)
finally:
    # Release the environment (and its render window) even if training
    # raises or is interrupted from the notebook.
    env.close()
            
        