## Installation

In [None]:
%pip install gym==0.21.0
%pip install numpy
%pip install matplotlib

## Environment Setup

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Environment shared by the training and testing cells of this notebook.
env = gym.make('MountainCar-v0')
# Placeholder for the learned Q-table: the training cell assigns it, and
# testing() loads the table from data.csv when it is still None.
Qpts = None

# Training

In [None]:
def rewards_calcul(pos):
    """Return the shaped reward for the car position ``pos``.

    A position at or beyond 0.5 earns 2. Any other position earns
    ``(pos + 1.2) / 1.8 - 1``, which grows linearly with ``pos`` and equals
    -1 at ``pos == -1.2``.
    """
    reached_goal = pos >= 0.5
    return 2 if reached_goal else (pos + 1.2) / 1.8 - 1

In [None]:
def stateSpace(env):
    """Return the size of the discretised state grid for ``env``.

    The span of ``env.observation_space`` (``high - low``) is multiplied by
    10 for the first component and 50 for the second, rounded to the nearest
    integer, and incremented by one.

    Returns:
        Integer NumPy array holding one size per observation component.
    """
    bounds = env.observation_space
    scaled_span = (bounds.high - bounds.low) * np.array([10, 50])
    return np.around(scaled_span, 0).astype(int) + 1

In [None]:
def lower_state(env):
    """Reset ``env`` and return its initial observation as grid indices.

    The observation returned by ``env.reset()`` is shifted by the lower bound
    of the observation space, scaled by 10 (first component) and 50 (second
    component), then rounded to integers.

    Note that calling this function resets the environment as a side effect.
    """
    initial_obs = env.reset()
    offset = initial_obs - env.observation_space.low
    return np.round(offset * np.array([10, 50]), 0).astype(int)

In [None]:
def update(learningRate, nextState, nextState_adj, Q_Table, currentState, action):
    """Return the increment for ``Q_Table[currentState[0], currentState[1], action]``.

    The increment is ``learningRate`` times the temporal-difference error:
    the shaped reward of the next position plus the best action value of the
    next discretised state, minus the current action value. No discount
    factor is applied to the next-state value.

    Args:
        learningRate: Step size applied to the temporal-difference error.
        nextState: Raw next observation; only its first component is used.
        nextState_adj: Grid indices of the next state.
        Q_Table: Action-value table indexed as [index0, index1, action].
        currentState: Grid indices of the current state.
        action: Index of the action that was taken.

    The table itself is not modified here; the caller adds the result.
    """
    best_next_value = np.max(Q_Table[nextState_adj[0], nextState_adj[1]])
    current_value = Q_Table[currentState[0], currentState[1], action]
    td_error = rewards_calcul(nextState[0]) + best_next_value - current_value
    return learningRate * td_error

In [None]:
def training(env, learningrate, epsilon, min_epsilon, episodes):
    """Train a tabular Q-learning agent on ``env`` with epsilon-greedy exploration.

    Args:
        env: Gym-style environment whose ``step`` returns a 4-tuple
            ``(observation, reward, done, info)``.
        learningrate: Step size passed to ``update``.
        epsilon: Initial probability of taking a random action. The same
            value is also used as the per-episode multiplicative decay
            factor (``epsilon *= epsilon_initial``).
        min_epsilon: Decay is applied only while epsilon is above this value.
        episodes: Number of episodes to run.

    Returns:
        Tuple ``(average_rewards, Q_Table, Qinit)``: the mean shaped episode
        return for every completed block of 100 episodes, the learned table,
        and a copy of the randomly initialised table.

    Side effects:
        Prints progress messages and closes ``env`` when training finishes.
    """
    reward_list = []
    average_rewards = []
    eps1 = epsilon
    first = episodes + 1
    size_states = stateSpace(env)

    Q_Table = np.random.uniform(
        low=-1,
        high=0,
        size=(size_states[0], size_states[1], env.action_space.n),
    )
    Qinit = np.copy(Q_Table)

    for i in range(episodes):
        done = False
        tot_reward = 0

        # lower_state() resets the environment itself, so a separate
        # env.reset() beforehand would only throw an episode start away.
        state_adj = lower_state(env)

        while not done:
            # Epsilon-greedy action selection.
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q_Table[state_adj[0], state_adj[1]])
            else:
                action = np.random.randint(0, env.action_space.n)

            # The environment's own reward is ignored: learning and reporting
            # both use the shaped reward from rewards_calcul().
            nextState, _, done, _ = env.step(action)
            shaped_reward = rewards_calcul(nextState[0])

            # Same discretisation as lower_state().
            nextState_adj = (nextState - env.observation_space.low) * np.array([10, 50])
            nextState_adj = np.round(nextState_adj, 0).astype(int)

            if done and nextState[0] >= 0.5:
                # Goal reached: there is no successor state to bootstrap
                # from, so the value is the shaped goal reward itself. Using
                # the raw env.step reward here would make the goal bonus
                # invisible to the Q-table.
                Q_Table[state_adj[0], state_adj[1], action] = shaped_reward
            else:
                delta = update(learningrate, nextState, nextState_adj, Q_Table, state_adj, action)
                Q_Table[state_adj[0], state_adj[1], action] += delta

            # Report only the first episode that reaches the goal.
            if nextState[0] >= 0.5 and i < first:
                first = i
                print('First time reaching goal on epsiode {}'.format(first + 1))

            tot_reward += shaped_reward
            state_adj = nextState_adj

        if epsilon > min_epsilon:
            epsilon *= eps1

        reward_list.append(tot_reward)

        # Summarise and restart the running window every 100 episodes.
        if (i + 1) % 100 == 0:
            ave_reward = np.mean(reward_list)
            average_rewards.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}'.format(i + 1, ave_reward))

    env.close()
    return average_rewards, Q_Table, Qinit
        

In [None]:
# Train for 10000 episodes, then persist the learned table to data.csv.
env.reset()
rewards, Qpts, Qinit = training(
    env,
    learningrate=0.2,
    epsilon=0.999,
    min_epsilon=0,
    episodes=10000,
)
# Collapse the state axes so each CSV row holds the action values of one state.
np.savetxt(
    "data.csv",
    Qpts.reshape(-1, Qpts.shape[-1]),
    delimiter=",",
)

In [None]:
# Each entry of `rewards` is the mean over a block of 100 episodes, so the
# x-axis marks the last episode of every block (100, 200, ...).
plt.plot(np.arange(1, len(rewards) + 1) * 100, rewards)
plt.ylabel('Average Reward')
plt.xlabel('Episodes')
plt.title('Average Reward per Episodes')

# Testing

In [None]:
def load_data():
    """Load the Q-table stored in ``data.csv`` and restore its 3-D shape.

    The target shape is derived from the module-level ``env``: the two grid
    sizes returned by ``stateSpace`` followed by the number of actions.

    Returns:
        The table read from the file, reshaped to
        ``(grid_size_0, grid_size_1, n_actions)``.
    """
    flat_table = np.loadtxt("data.csv", delimiter=",")
    grid = stateSpace(env)
    target_shape = (grid[0], grid[1], env.action_space.n)
    return flat_table.reshape(target_shape)

In [None]:
def testing(env, Q_Table):
    """Run one rendered episode that always takes the highest-valued action.

    Args:
        env: Gym-style environment whose ``step`` returns a 4-tuple
            ``(observation, reward, done, info)``.
        Q_Table: Action-value table indexed as [index0, index1, action], or
            ``None`` to load the table from ``data.csv`` via ``load_data``.

    Side effects:
        Renders every step and closes ``env`` when the episode ends.
    """
    # Identity check on purpose: `Q_Table == None` on a NumPy array compares
    # element-wise, and the resulting array cannot be used as an `if`
    # condition (ValueError: truth value of an array is ambiguous).
    if Q_Table is None:
        Q_Table = load_data()

    # lower_state() resets the environment and discretises the first
    # observation, so no separate env.reset() is needed.
    state_adj = lower_state(env)
    done = False
    while not done:
        env.render()
        action = np.argmax(Q_Table[state_adj[0], state_adj[1]])
        next_state, _, done, _ = env.step(action)
        # Same discretisation as lower_state().
        state_adj = (next_state - env.observation_space.low) * np.array([10, 50])
        state_adj = np.round(state_adj, 0).astype(int)
    env.close()

In [None]:
# Replay one rendered episode with the Q-table held in Qpts
# (testing() falls back to load_data() when Qpts is None).
env.reset()
testing(env, Qpts)