# Deep Q-Learning for Lunar Landing

### Importing the libraries

In [2]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

## Part 1 - Building the AI

In [3]:
# The observation space is an 8-dimensional vector:
# the lander's x and y coordinates,
# its velocity along x and y,
# its angle, its angular velocity, and two booleans indicating
# whether each leg is in contact with the ground.

class Brain(nn.Module):
    """Fully connected Q-network: state vector in, one value per action out.

    The network is three linear layers (state_size -> 64 -> 64 -> action_size)
    with ReLU activations after the first two layers.

    Args:
        state_size: Number of input features.
        action_size: Number of output values (one per action).
        seed: Value passed to ``torch.manual_seed`` before the layers are
            created, so that weight initialisation is reproducible.
    """

    def __init__(self, state_size, action_size, seed=42) -> None:
        # Zero-argument super() runs nn.Module.__init__. The one-argument form
        # ``super(Brain).__init__()`` only initialises an unbound super object,
        # leaving the module unregistered so that assigning the first layer
        # raises "cannot assign module before Module.__init__() call".
        super().__init__()
        # torch.manual_seed seeds the global generator and returns it.
        self.seed = torch.manual_seed(seed)
        # Hidden widths of 64 were chosen by experimentation.
        self.fc1 = nn.Linear(state_size, 64)
        self.fc2 = nn.Linear(64, 64)
        # Output layer: one value per action.
        self.fc3 = nn.Linear(64, action_size)

    def forward(self, state):
        """Return the network output for ``state`` (no activation on the last layer)."""
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)

    #Neural Network Built

### Creating the architecture of the Neural Network

## Part 2 - Training the AI

### Setting up the environment

In [None]:
import gymnasium as gym

# Create the environment and read from it the sizes the Q-network needs.
env = gym.make('LunarLander-v2')
state_shape = env.observation_space.shape
state_size = state_shape[0]  # length of the observation vector
number_actions = env.action_space.n  # number of discrete actions

# Report the dimensions that were read from the environment.
print("State shape: ", state_shape)
print("state size: ", state_size)
print('Number of actions: ', number_actions)

### Initializing the hyperparameters

In [5]:
# Learning rate, determined by experimentation.
learning_rate = 0.0005
# Minibatch size used when drawing experiences from the replay buffer.
minibatch_size = 100
# Gamma, the discount factor. A value at or near 0 makes the agent
# short-sighted (it only weighs near-term rewards); a value near 1 makes it
# weigh long-term rewards as well.
discount_factor = 0.99
# Capacity of the replay buffer.
replay_buffer_size = 10 ** 5
# Interpolation factor (tau), determined by experimentation.
tao = 0.001

  and should_run_async(code)


### Implementing Experience Replay

In [6]:
class ReplayMemory(object):
    """Bounded list of experience tuples with random minibatch sampling.

    Each stored event is indexed as
    ``(state, action, reward, next_state, done)`` when a batch is sampled.
    """

    def __init__(self, capacity):
        """Create an empty buffer that keeps at most ``capacity`` events."""
        self.memory = []
        self.capacity = capacity
        # Sampled tensors are moved to the GPU when one is available,
        # otherwise they stay on the CPU.
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    def push(self, event):
        """Append ``event`` and drop the oldest entry once over capacity."""
        self.memory.append(event)
        if len(self.memory) > self.capacity:
            self.memory.pop(0)

    def sample(self, batch_size):
        """Draw ``batch_size`` events at random and stack them into tensors.

        Returns:
            A tuple ``(states, next_state, actions, rewards, dones)`` of
            tensors on ``self.device``. Note the order: next states come
            second, ahead of actions.
        """
        drawn = random.sample(self.memory, k=batch_size)
        batch = [item for item in drawn if item is not None]

        def column(index):
            # Stack field ``index`` of every event into one NumPy array.
            return np.vstack([item[index] for item in batch])

        states = torch.from_numpy(column(0)).float().to(self.device)
        actions = torch.from_numpy(column(1)).long().to(self.device)
        rewards = torch.from_numpy(column(2)).float().to(self.device)
        next_state = torch.from_numpy(column(3)).float().to(self.device)
        # Boolean done flags go through uint8 before becoming floats.
        done_flags = column(4).astype(np.uint8)
        dones = torch.from_numpy(done_flags).float().to(self.device)
        return states, next_state, actions, rewards, dones


  and should_run_async(code)


### Implementing the DQN class

In [None]:
class Agent():
    """DQN agent with a local and a target Q-network and a replay buffer.

    The local network is the one being optimised; the target network is a
    slowly updated copy used to compute the learning targets.

    Args:
        state_size: Length of the observation vector.
        action_size: Number of discrete actions.
    """

    def __init__(self, state_size, action_size):
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.state_size = state_size
        self.action_size = action_size
        # Two Q-networks: local (trained) and target (used for the targets).
        self.local_qnetwork = Brain(state_size, action_size).to(self.device)
        self.target_qnetwork = Brain(state_size, action_size).to(self.device)
        self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr=learning_rate)
        self.memory = ReplayMemory(replay_buffer_size)
        # Step counter deciding when the networks are updated.
        self.t_step = 0

    def step(self, state, action, reward, next_state, dones):
        """Store one transition and learn from a minibatch every 4th call."""
        self.memory.push((state, action, reward, next_state, dones))
        self.t_step = (self.t_step + 1) % 4
        if self.t_step != 0:
            return
        # Only learn once the buffer holds more than one minibatch.
        if len(self.memory.memory) > minibatch_size:
            experiences = self.memory.sample(minibatch_size)
            self.learn(experiences, discount_factor)

    def act(self, state, epsilon=0.0):
        """Choose an action for ``state`` with an epsilon-greedy policy.

        Args:
            state: A single observation (array-like of length ``state_size``).
            epsilon: Probability of picking a uniformly random action instead
                of the greedy one. The default of 0.0 is fully greedy.

        Returns:
            The chosen action as a NumPy integer scalar (supports ``.item()``).
        """
        state = torch.as_tensor(
            state, dtype=torch.float32, device=self.device
        ).unsqueeze(0)  # add a batch dimension of 1
        # Evaluate without tracking gradients, then return to training mode.
        self.local_qnetwork.eval()
        with torch.no_grad():
            action_values = self.local_qnetwork(state)
        self.local_qnetwork.train()
        if random.random() < epsilon:
            return random.choice(np.arange(self.action_size))
        return np.argmax(action_values.cpu().numpy())

    def learn(self, experiences, discount_factor):
        """Run one gradient step on the local network from a sampled batch.

        Args:
            experiences: Tuple ``(states, next_states, actions, rewards, dones)``
                in the order returned by ``ReplayMemory.sample``.
            discount_factor: Gamma applied to the next-state value estimate.
        """
        states, next_states, actions, rewards, dones = experiences
        # Best next-state value according to the target network (no gradient).
        next_q_targets = (
            self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
        )
        # Terminal transitions (dones == 1) contribute only their reward.
        q_targets = rewards + discount_factor * next_q_targets * (1 - dones)
        q_expected = self.local_qnetwork(states).gather(1, actions)
        loss = F.mse_loss(q_expected, q_targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.soft_update(self.local_qnetwork, self.target_qnetwork, tao)

    def soft_update(self, local_model, target_model, interpolation):
        """Blend ``local_model`` weights into ``target_model``.

        Each target parameter becomes
        ``interpolation * local + (1 - interpolation) * target``.
        """
        for target_param, local_param in zip(
            target_model.parameters(), local_model.parameters()
        ):
            target_param.data.copy_(
                interpolation * local_param.data
                + (1.0 - interpolation) * target_param.data
            )

### Initializing the DQN agent

### Training the DQN agent

## Part 3 - Visualizing the results

In [None]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gym.wrappers.monitoring.video_recorder import VideoRecorder

def show_video_of_model(agent, env_name):
    """Run one episode with ``agent`` and save the rendered frames as video.mp4.

    Args:
        agent: Object whose ``act(state)`` returns a value with ``.item()``.
        env_name: Environment id passed to ``gym.make``.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            frames.append(env.render())
            action = agent.act(state)
            state, _, terminated, truncated, _ = env.step(action.item())
            # The episode is over on either signal; checking only
            # ``terminated`` would keep stepping after a time-limit truncation.
            done = terminated or truncated
    finally:
        # Release the environment even if acting or stepping raises.
        env.close()
    imageio.mimsave('video.mp4', frames, fps=30)

# Record one episode of the agent in LunarLander-v2.
# NOTE(review): `agent` is not assigned in any cell visible here — confirm it
# is created in the "Initializing the DQN agent" section before this runs.
show_video_of_model(agent, 'LunarLander-v2')

def show_video():
    """Embed the first ``*.mp4`` in the working directory as an HTML video.

    Prints "Could not find video" when no ``.mp4`` file is found.
    """
    mp4list = glob.glob('*.mp4')
    if not mp4list:
        print("Could not find video")
        return
    # Read-only binary mode inside a context manager: the file is only read,
    # so write access is unnecessary, and the handle is closed right away.
    with open(mp4list[0], 'rb') as video_file:
        encoded = base64.b64encode(video_file.read())
    display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# Display the first .mp4 found in the working directory inline.
show_video()