In [18]:
import gymnasium as gym
import numpy as np
import torch
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import math
import random
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from tqdm import tqdm
import torch.nn as nn

import torch as T
from torch import optim
import torch.nn.functional as F
from collections import deque , namedtuple
from itertools import count
# set up matplotlib
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

plt.ion()
# if gpu is to be used
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
# print gym environment information
env = gym.make(
    "LunarLander-v2",
    continuous = False,
    gravity = -10.0,
    enable_wind = False,
    wind_power = 15.0,
    turbulence_power = 1.5,
    render_mode="rgb_array"
)
print('observation space:', env.observation_space)
print('action space:', env.action_space)


observation space: Box([-1.5       -1.5       -5.        -5.        -3.1415927 -5.
 -0.        -0.       ], [1.5       1.5       5.        5.        3.1415927 5.        1.
 1.       ], (8,), float32)
action space: Discrete(4)


https://www.gymlibrary.dev/environments/box2d/lunar_lander/

## Planning:
The goal is to use DQN to solve the lunar lander problem.

Currently I am unclear on the following:
Exactly how to set up the network to receive inputs and outputs from the system
What kind of loss function should be used in this setup to make sure that it is learning correctly.
How to account for the various kinds of things can be observed in the observation space.

## approach
A good first step would be to try and experiment with basic approaches messing around with the system to get a grasp of it.
Then I'd like to experiment with a basic neural network feeding in maybe 2 of the parameters.

Edit: I think I will try to use the 4 parameters as inputs to the network and see how that goes.
Playing with the network structure and the loss function should be the next step.
Messing around trying to solve it by hand will probably not be very useful.



TODO:
- [] Create agent class that can interact with the environment
- [] Add the neural network to the actions of the agent
- [] Add the loss function to the agent
- [] Add the replay buffer to the agent
- [] Add the epsilon greedy policy to the agent
- [] Add the training loop to the agent

In [19]:
class network(nn.Module):
    def __init__(self, n_observations, fc1_dims=128, fc2_dims=128, n_actions=4):
        super(network, self).__init__()

        self.fc1 = nn.Linear(n_observations, fc1_dims)
        self.fc2 = nn.Linear(fc1_dims, fc2_dims)
        self.pi = nn.Linear(fc2_dims, n_actions)


 
        self.to(self.device)
    
    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        pi = T.softmax(self.pi(x), dim=1)

        return pi
    


In [20]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class replay_memory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [21]:
class Agent():
    def __init__(self) -> None:
        actor_dims = env.observation_space.shape[0]
        n_actions = env.action_space.n
        self.policy_network =  network(input_dims=actor_dims, n_actions=n_actions)
        self.target_network = network(input_dims=actor_dims, n_actions=n_actions)
        self.target_network.load_state_dict(self.policy_network.state_dict())
        self.memory = replay_memory(100000)
        self.steps_done = 0
        self.batch_size = 128

        self.gamma = 0.99
        self.eps_start = 0.9
        self.eps_end = 0.05
        self.eps_decay = 1000
        self.tau = 0.005
        self.LR = 0.001

        self.max_score = -math.inf
        self.min_score = math.inf
        self.avg_score = 0

        self.optimizer = optim.Adam(self.network.parameters(), lr=self.LR, amsgrad=True)
        
        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.chkpt_file = 'tmp/lunar_lander'


    
    def save_checkpoint(self):
        T.save(self.network.state_dict(), self.chkpt_file)

    def load_checkpoint(self):
        self.load_state_dict(T.load(self.chkpt_file))
        
    
    def select_action(self, state):
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \ 
            math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        state = T.tensor([observation], dtype=T.float).to(self.network.device)
        probabilities = self.network.forward(state)
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        log_probs = action_probs.log_prob(action)
        self.memory.append((log_probs))

        return action.item()
    


In [15]:
agent

network(
  (fc1): Linear(in_features=8, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=64, bias=True)
  (pi): Linear(in_features=64, out_features=4, bias=True)
)

In [35]:

# reset environment
state = env.reset()
done = False
images = []
count = 0
total_reward = 0
while not done:
    count +=1
    # render environment
    image = env.render()
    images.append(image)
    # sample random action
    action = env.action_space.sample()
    # take action
    observation, state, reward, done, info = env.step(action)
    state = T.tensor([observation], dtype=T.float).to(self.actor.device)
    result = agent.forward(T.tensor(state).float())

    total_reward += reward

RuntimeError: both arguments to matmul need to be at least 1D, but they are 0D and 2D