### Mountain Car RL Project

Aaron Collinsworth

## Random

In [None]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import random

from random_mc import random_mc

# Build the Mountain Car environment and reset it once before any runs
env = gym.make('MountainCar-v0')
env.reset()

# max_steps is forwarded to random_mc on every run (presumably a per-run
# step cap -- confirm in random_mc); num_to_avg is how many runs get averaged
max_steps = 10000000
num_to_avg = 50

# Collect whatever random_mc returns for each run
num_steps = []
for run_idx in range(num_to_avg):
    print("Episode {}".format(run_idx))
    steps_taken = random_mc(env, max_steps)
    num_steps.append(steps_taken)

# Arithmetic mean over all collected runs
avg_num_steps = sum(num_steps) / len(num_steps)

print("Avg number of steps required {}".format(avg_num_steps))


The random mountain car agent takes random actions until it reaches the goal state.

## Q-Learning

In [None]:
import numpy as np
import gymnasium as gym
import matplotlib.pyplot as plt
import mountain_car_func as mcf

# The recorder class is imported from the legacy `gym` package, whereas the
# environment below is created through `gymnasium` (bound to the name `gym`
# by the cell's imports). This import only binds `VideoRecorder`.
# NOTE(review): confirm both packages are installed and that this recorder
# works with the installed gymnasium version.
from gym.wrappers.monitoring.video_recorder import VideoRecorder

# Build the Mountain Car environment with 'rgb_array' rendering and reset it
env = gym.make('MountainCar-v0', render_mode='rgb_array')
env.reset()

# File name handed to the recorder as its second argument
before_training = "last_episodes.mp4"
video = VideoRecorder(env, before_training)

# Define Q-learning function
def QLearning(env, learning_rate, discount_factor, epsilon, num_episodes, print_interval=100):
    """Train a tabular Q-learning agent and return the tracked average rewards.

    Args:
        env: Environment whose observation/action spaces size the Q table.
            It is closed before this function returns.
        learning_rate: Step size forwarded to each episode's Q update.
        discount_factor: Discount forwarded to each episode's Q update.
        epsilon: Starting exploration value; each episode returns a value
            reduced by ``epsilon / num_episodes``.
        num_episodes: Number of episodes to run (must be non-zero, since it
            is used as a divisor).
        print_interval: Forwarded unchanged to ``perform_episode``.

    Returns:
        The ``avg_rewards`` list. This function never appends to it itself;
        it is handed to ``perform_episode`` on every episode.

    Note:
        The recorder passed to ``perform_episode`` is the module-level
        ``video`` object, not a parameter of this function.
    """
    # Discretised grid size: observation range scaled by 10 (first
    # component) and 100 (second component), rounded, plus one
    obs_span = env.observation_space.high - env.observation_space.low
    num_states = obs_span * np.array([10, 100])
    num_states = np.round(num_states, 0).astype(int) + 1

    # One random starting value in [-1, 1) per (state cell, action)
    table_shape = (num_states[0], num_states[1], env.action_space.n)
    Q = np.random.uniform(low=-1, high=1, size=table_shape)

    # Reward bookkeeping shared with perform_episode
    reward_list = []
    avg_rewards = []

    # Fixed per-episode reduction of epsilon
    episode_decay = epsilon / num_episodes

    for episode_num in range(num_episodes):
        Q, reward_list, epsilon = perform_episode(
            Q, episode_num, num_episodes, learning_rate, discount_factor,
            episode_decay, reward_list, avg_rewards, print_interval, video,
            epsilon,
        )

    env.close()

    return avg_rewards

def perform_episode(Q_Table, episode_num, num_episodes, learning_rate, discount_factor,episode_decay, reward_list, avg_rewards, print_interval, video, epsilon):
    """Run a single episode, updating the Q table after every step.

    Args:
        Q_Table: Q table passed to action selection and to ``update_q_table``.
        episode_num: Index of this episode.
        num_episodes: Total number of episodes (forwarded to video capture).
        learning_rate: Step size for the Q update.
        discount_factor: Discount for the Q update.
        episode_decay: Amount subtracted from ``epsilon`` once the episode ends.
        reward_list: List that receives this episode's total reward.
        avg_rewards: Forwarded to ``mcf.print_avg_reward``.
        print_interval: Forwarded to ``mcf.print_avg_reward``.
        video: Recorder object forwarded to ``mcf.video_capture``.
        epsilon: Exploration value forwarded to ``mcf.select_action``.

    Returns:
        Tuple ``(Q_Table, reward_list, epsilon)`` with ``epsilon`` already
        reduced by ``episode_decay``.

    Note:
        The environment is the module-level ``env``; it is not a parameter.
        NOTE(review): the loop stops only on the third value returned by
        ``env.step`` and ignores the fourth (``truncated``), so an episode
        keeps stepping until that third flag is set -- confirm this is
        intended.
    """
    # env.reset() returns a sequence here; its first item is the observation
    initial_obs = env.reset()[0]
    state_adj = mcf.discretize_state(env, initial_obs)

    total_reward = 0
    episode_complete = False

    while not episode_complete:
        mcf.video_capture(env, video, num_episodes, episode_num)

        action = mcf.select_action(Q_Table, env, state_adj, epsilon)

        # Take the action; only the completion flag drives the loop
        new_state, reward, episode_complete, _truncated, _info = env.step(action)
        new_state_adj = mcf.discretize_state(env, new_state)

        Q_Table = update_q_table(Q_Table, state_adj, new_state, action, new_state_adj, episode_complete, learning_rate, discount_factor, reward)

        # Carry the running total and the discretised state into the next step
        total_reward += reward
        state_adj = new_state_adj

    # Record the episode total and reduce epsilon for the next episode
    reward_list.append(total_reward)
    epsilon -= episode_decay

    mcf.print_avg_reward(episode_num, avg_rewards, reward_list, print_interval)

    return Q_Table, reward_list, epsilon

def update_q_table(Q_Table, state_adj, new_state, action, new_state_adj, episode_complete, learning_rate, discount_factor, reward):
    """Apply one Q-learning update in place and return the table.

    Args:
        Q_Table: Array indexed as ``[state_adj[0], state_adj[1], action]``.
        state_adj: Two-component discretised state the action was taken from.
        new_state: Raw next observation; only its first component is read.
        action: Index of the action that was taken.
        new_state_adj: Two-component discretised next state.
        episode_complete: Completion flag for the step.
        learning_rate: Fraction of the temporal-difference error to apply.
        discount_factor: Weight on the best next-state Q value.
        reward: Reward received for the step.

    Returns:
        The same ``Q_Table`` object, modified in place.
    """
    cell = (state_adj[0], state_adj[1], action)

    # Completed with the first observation component at or beyond 0.5:
    # store the raw reward, with no bootstrapping from the next state
    if episode_complete and new_state[0] >= 0.5:
        Q_Table[cell] = reward
        return Q_Table

    # Otherwise move the entry toward reward + discounted best next value
    best_next = np.max(Q_Table[new_state_adj[0], new_state_adj[1]])
    delta = learning_rate*(reward + discount_factor*best_next - Q_Table[cell])
    Q_Table[cell] = Q_Table[cell] + delta

    return Q_Table



### Parameter explanation:

learning rate - controls how much each update adjusts the Q-values. Lower learning rates mean that the model will adjust in smaller increments.

discount factor - weight that controls how much the estimated value of future states contributes to the value of the current state.

epsilon - controls the exploration vs. exploitation balance. It is reduced by a fixed amount after every episode, so the agent commits to more greedy choices as training goes on.

num_episodes - number of episodes to train.

In [None]:
# Hyperparameters for the Q-learning run
learning_rate = 0.2
discount_factor = 0.9
epsilon = 0.8
num_episodes = 30

# Reporting interval handed to QLearning. It is named here so the plot's
# x-axis below uses the same value instead of a separate hard-coded 100.
# NOTE(review): num_episodes is smaller than print_interval; whether any
# averages are produced in that case depends on mcf.print_avg_reward -- confirm.
print_interval = 100

# Run Q-learning algorithm
rewards = QLearning(env, learning_rate, discount_factor, epsilon, num_episodes,
                    print_interval=print_interval)

# Plot Rewards
# The x-axis assumes one entry in `rewards` per print_interval episodes
# (presumably how mcf.print_avg_reward fills the list -- verify).
plt.plot(print_interval*(np.arange(len(rewards)) + 1), rewards)
plt.xlabel('Episodes')
plt.ylabel('Average Reward')
plt.title('Average Reward vs Episodes')
plt.savefig('rewards.jpg')
plt.close()

# Finish the recording started when the environment was set up
video.close()

Overall, the Q-learning approach is able to learn and find a working policy in fewer moves than the random approach.

Apologies for the lack of depth in the explanations. I just wanted to prove I got the random portion and Q-learning portion complete. I struggled with the deep Q-learning part and ran out of time. Setting up the environment on Windows was actually pretty hard, and getting a visual inside a Docker container was not working. I eventually was able to save it as a video. Please go easy on the grading lol.