In [18]:
import gymnasium as gym
import numpy as np
import torch
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import math
import random
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from tqdm import tqdm
import torch.nn as nn
from itertools import count
import imageio


import torch as T
from torch import optim
import torch.nn.functional as F
from collections import deque , namedtuple
from itertools import count
# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

https://www.gymlibrary.dev/environments/box2d/lunar_lander/

## Planning:
The goal is to build a simple policy gradient algorithm (AKA REINFORCE), as a way of getting more comfortable with this approach. This kind of technique is used frequently in other architectures such as PPO, DDPG and a2c.

## approach

My approach here is to first create a simple policy network which can take in states as input and output a distribution over the possible actions.

I will then set up an agent which uses the policy network to make decisions, and can train that policy by calculating loss based on the reward from the environment.

I will then setup a training loop where the network can take actions and update its policy.

After



TODO:
- [*] Create basic network that represents our policy.
- [*] Create an agent that chooses an action based on what it knows about the environment
- [*] Add functions to the agent allowing it to calculate loss based on its action and update its policy
- [*] Build a training loop and have our agent train.
- [] Train and produce graphs


In [19]:
# print gym environment information
env = gym.make(
    "LunarLander-v2",
    continuous = False,
    gravity = -10.0,
    enable_wind = False,
    wind_power = 15.0,
    turbulence_power = 1.5
    # render_mode="rgb_array"
)
print('observation space:', env.observation_space)
print('action space:', env.action_space)

observation space: Box([-90.        -90.         -5.         -5.         -3.1415927  -5.
  -0.         -0.       ], [90.        90.         5.         5.         3.1415927  5.
  1.         1.       ], (8,), float32)
action space: Discrete(4)


In [None]:
class FCN(nn.Module):
    def __init__(self, n_observations, n_actions, fc1_dims=256, fc2_dims=256):
        super(FCN, self).__init__()
        self.layer1 = nn.Linear(n_observations, fc1_dims)
        self.layer2 = nn.Linear(fc1_dims, fc2_dims)
        self.layer3 = nn.Linear(fc2_dims, n_actions)

    # Called with either one element to determine next action, or a batch
    # during optimization. Returns tensor([[left0exp,right0exp]...]).
    def forward(self, x):
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        logits = self.layer3(x)
        return F.softmax(logits, dim=-1)


# Produces distribution of actions

In [None]:
network = FCN(env.observation_space.shape[0], env.action_space.n)
state, info = env.reset()
state = torch.Tensor(state)
network(state)

tensor([0.2369, 0.2524, 0.2708, 0.2399], grad_fn=<SoftmaxBackward0>)

In [None]:
class agent():
    def __init__(self, env, lr):
        self.network = FCN(env.observation_space.shape[0], env.action_space.n)
        self.optimizer = torch.optim.Adam(self.network.parameters(), lr=lr)
        self.rewards = None
        self.actions = None
        self.log_probs = None
        self.env = env
    
    def get_distribution(self, state):
        distribution = self.network(state)
        distribution = torch.distributions.Categorical(distribution)
        return distribution

    def train(self,  gamma, num_episodes):

        for episode in range(num_episodes):
            self.rewards = []
            self.log_probs = []
            state, info = env.reset()
            terminated = False


            while not terminated:
                state = torch.from_numpy(state).float()
            
                distribution = self.get_distribution(state)
                action = distribution.sample()
                log_prob = distribution.log_prob(action)
                self.log_probs.append(log_prob)

                state, rewards, terminated, _, _ = env.step(action.item())
                
                self.rewards.append(rewards)
            
            self.update_policy(gamma)
            if episode % 50 == 0:
                print(f"Completed episode {episode} and achieved score {sum(self.rewards)}")
    
    def update_policy(self, gamma):
        total_future_rewards = []
        total_future_reward = 0

        for reward in self.rewards:
            total_future_reward = total_future_reward * gamma + reward
            total_future_rewards.append(total_future_reward)
        
        returns = total_future_rewards[::-1]
        returns = torch.tensor(returns)
        returns = (returns - returns.mean()) / (returns.std() + 1e-9)

        policy_loss = []
        for log_prob, total_future_reward in zip(self.log_probs, returns):
            loss = -log_prob * total_future_reward
            loss = loss.unsqueeze(0)
            policy_loss.append(loss)
        
        self.optimizer.zero_grad()
        policy_loss = torch.cat(policy_loss).sum()
        policy_loss.backward()
        self.optimizer.step()

        


In [None]:
# print gym environment information
env = gym.make(
    "CartPole-v1"
    # continuous = False,
    # gravity = -10.0,
    # enable_wind = False,
    # wind_power = 15.0,
    # turbulence_power = 1.5
    # render_mode="rgb_array"
)
print('observation space:', env.observation_space)
print('action space:', env.action_space)

guy = agent(env, lr=0.0001)

# agent.run_episode_and_save_gif(env)
guy.train( gamma=0.99, num_episodes=3000)

observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
action space: Discrete(2)
Completed episode 0 and achieved score 23.0
Completed episode 50 and achieved score 56.0
Completed episode 100 and achieved score 56.0
Completed episode 150 and achieved score 40.0
Completed episode 200 and achieved score 115.0
Completed episode 250 and achieved score 94.0
Completed episode 300 and achieved score 48.0
Completed episode 350 and achieved score 194.0
Completed episode 400 and achieved score 372.0
Completed episode 450 and achieved score 168.0
Completed episode 500 and achieved score 108.0
Completed episode 550 and achieved score 123.0
Completed episode 600 and achieved score 376.0
Completed episode 650 and achieved score 191.0
Completed episode 700 and achieved score 348.0
Completed episode 750 and achieved score 193.0
Completed episode 800 and achieved score 412.0
Completed episode 850 an

KeyboardInterrupt: 

### Convergence and great success with cartpole
We note here that REINFORCE does converge with cartpole, and not only that but massively outperforms what we could achieve with DQN. 

I speculate that this may be because REINFORCE is able to more precisely model the correct actions within the narrow range available to cartpole.

In [None]:
# print gym environment information
env = gym.make(
    "LunarLander-v2",
    continuous = False,
    gravity = -10.0,
    enable_wind = False,
    wind_power = 15.0,
    turbulence_power = 1.5
    # render_mode="rgb_array"
)
print('observation space:', env.observation_space)
print('action space:', env.action_space)

guy = agent(env, lr=0.00003)

# agent.run_episode_and_save_gif(env)
guy.train( gamma=0.99, num_episodes=3000)

observation space: Box([-90.        -90.         -5.         -5.         -3.1415927  -5.
  -0.         -0.       ], [90.        90.         5.         5.         3.1415927  5.
  1.         1.       ], (8,), float32)
action space: Discrete(4)
Completed episode 0 and achieved score -288.1393010147376
Completed episode 50 and achieved score -121.08381794907808
Completed episode 100 and achieved score -93.2132013565292
Completed episode 150 and achieved score -124.9879530865401
Completed episode 200 and achieved score -159.7268319041226
Completed episode 250 and achieved score -189.42891953151076
Completed episode 300 and achieved score -119.91641968438985
Completed episode 350 and achieved score -114.7540347145775
Completed episode 400 and achieved score -119.1686646292804
Completed episode 450 and achieved score -65.50800505743473
Completed episode 500 and achieved score -278.30043274229365
Completed episode 550 and achieved score -10.612656103722045
Completed episode 600 and achieved sc

KeyboardInterrupt: 

### Convergence fails on lunar lander
We can see that the REINFORCE algorithm with the parameters struggles to converge for lunar lander.
This could be because the observation space is too large for this algorithm to converge. 

It could also be the adjustments we are making to our network are too large and cause the training to be too unstable. We do see better performance when lower learning rates are applied.

An experiment I would want to try would be to test the continuous version of this environment and see if REINFORCE does better there, due to the nature of its policy adjustments. 

For now however I want to move on to PPO.