In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import copy 

import time

## Make Game Envoirment

In [2]:
env = gym.make("MountainCar-v0")

## Essential Functions

In [23]:
def normalize(state):
    state = 2*(state - np.array([-0.3, 0]))/length 
    return np.append(state, state[0]*state[1])

def get_features(state, centroids, sigma):
    state = normalize(state)
    return np.exp(-np.sqrt(np.sum(np.power(state - centroids, 2), axis=-1)) / (2*sigma)).reshape((dim_size**3, 1))

def get_action(features, weights, epsilon):
    if np.random.uniform() < epsilon:
        return np.random.randint(3)
    return np.argmax(get_Value(features, weights))

def get_Value(features, weights):
    return (weights @ features)

## Train

In [None]:
length = env.observation_space.high - env.observation_space.low 

In [8]:
def RBF_SARSA_Linear(render = False, episodes = 100, dim_size = 10, gamma = 1, sigma = 0.15, epsilon = 0.5, decaying_rate = 0.997, alpha = 0.01):
    centroids = np.random.uniform(-1,1, size = (dim_size, dim_size, dim_size, 3))
    weights = np.zeros((3, dim_size**3))
    total_rewards = []
    ep = epsilon
    for i in range(episodes):
        state = env.reset()
        done = False
        action = np.random.randint(3)
        rewards = 0
        ep *= decaying_rate
        while True:
            if (render and i%25 == 0):
                env.render()
            next_state, reward, done, info = env.step(action)
            #print(next_state, reward, done, info)
            features = get_features(state, centroids, sigma)
            Q = get_Value(features, weights)[action]
            if done:
                weights[action] += alpha*(reward - Q)*features.flatten()
                break
            next_action = get_action(features, weights, ep)
            next_features = get_features(next_state, centroids, sigma)
            next_Q = get_Value(next_features, weights)[next_action]
            weights[action] += alpha*(reward + gamma*next_Q - Q)*features.flatten()
            action = next_action
            state = next_state
            #if reward != -1:
                #print(reward, " episode ", i, " state ", state)
            rewards += reward
        total_rewards.append(rewards)
    env.close()
    return weights, centroids, total_rewards

In [9]:
dim_size = 10
w,c, total_rewards = RBF_SARSA_Linear(render = False, episodes = 1000, dim_size = dim_size, gamma = 1, sigma = 0.17,epsilon = 0.3, decaying_rate = 0.999)
np.mean(total_rewards)

-155.064

## Play

In [31]:
s = env.reset()
env.render()


True

In [32]:
s = env.reset()

for _ in range(1000):
    env.render()
    f = get_features(s, c, 0.35)
    a = np.argmax(w @ f)
    s, r, done, info = env.step(a) 
#     if done:
#         break
    time.sleep(0.01)
env.close()