# Deep Q-Learning for Lunar Landing

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [1]:
# Install Gymnasium together with its Atari extras, plus SWIG and the Box2D extras needed by the Lunar Lander environment
!pip install gymnasium
!pip install "gymnasium[atari, accept-rom-license]"
!apt-get install -y swig
!pip install gymnasium[box2d]

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  swig4.0
Suggested packages:
  swig-doc swig-examples swig4.0-examples swig4.0-doc
The following NEW packages will be installed:
  swig swig4.0
0 upgraded, 2 newly installed, 0 to remove and 35 not upgraded.
Need to get 1,116 kB of archives.
After this operation, 5,542 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig4.0 amd64 4.0.2-1ubuntu1 [1,110 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 swig all 4.0.2-1ubuntu1 [5,632 B]
Fetched 1,116 kB in 1s (1,688 kB/s)
Selecting previously unselected package swig4.0.
(Reading database ... 126371 files and directories currently installed.)
Preparing to unpack .../swig4.0_4.0.2-1ubuntu1_amd64.deb ...
Unpacking swig4.0 (4.0.2-1ubuntu1) ...
Selecting previously unselected package swig.
Preparing to unpack .../swig_4.0.2-1ubu

### Importing the libraries

In [2]:
# this is for operating systems
import os
# this is for generating random numbers
import random
import numpy as np
import torch
# nn is the neural network module of the torch library
import torch.nn as nn
# optim is the optimizer module of the torch library
import torch.optim as optim
import torch.nn.functional as F
# autograd is the automatic differentiation module of the torch library; it computes the gradients that the optimizer uses
import torch.autograd as autograd
from torch.autograd import Variable
from collections import deque, namedtuple

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [3]:
class Network(nn.Module):
  """Fully connected network that maps a state vector to one value per action.

  The architecture is: state_size -> 64 -> 64 -> action_size, with a ReLU
  after each of the two hidden layers and no activation on the output.

  Args:
    state_size: number of input features of a state.
    action_size: number of output units (one per action).
    seed: value passed to ``torch.manual_seed`` before the layers are built.
  """

  def __init__(self, state_size, action_size, seed=42):
    super().__init__()
    # Seeding happens before the layers are created, so two networks built
    # with the same seed and sizes start from the same weights.
    self.seed = torch.manual_seed(seed)
    # Input layer -> first hidden layer.
    self.fc1 = nn.Linear(state_size, 64)
    # First hidden layer -> second hidden layer.
    self.fc2 = nn.Linear(64, 64)
    # Second hidden layer -> output layer (one unit per action).
    self.fc3 = nn.Linear(64, action_size)

  def forward(self, state):
    """Propagate ``state`` through the three layers and return the output."""
    # ReLU(x) = max(0, x) adds the non-linearity between the linear layers.
    hidden = F.relu(self.fc1(state))
    hidden = F.relu(self.fc2(hidden))
    return self.fc3(hidden)

## Part 2 - Training the AI

### Setting up the environment

In [4]:
import gymnasium as gym

# Build the Lunar Lander simulator; `env` is then driven through reset() and step().
env = gym.make('LunarLander-v3')

# Shape of one observation, and its first (only) dimension as the network input size.
state_shape = env.observation_space.shape
state_size = state_shape[0]
# Size of the discrete action space, used as the network output size.
number_actions = env.action_space.n

print('State shape: ', state_shape)
print('State size: ', state_size)
print('Number of actions: ', number_actions)

State shape:  (8,)
State size:  8
Number of actions:  4


### Initializing the hyperparameters

In [5]:
# Step size of the Adam optimizer (chosen empirically).
learning_rate = 0.0005
# Number of transitions drawn from the replay memory for one learning update.
minibatch_size = 100
# Gamma: weight of future rewards in the target; close to 1 so future rewards count.
discount_factor = 0.99
# Maximum number of transitions kept in the replay memory (100,000).
replay_buffer_size = 10 ** 5
# Tau: fraction of the local weights blended into the target network per soft update.
interpolation_parameter = 0.001

### Implementing Experience Replay

In [6]:
class ReplayMemory(object):
  """Fixed-capacity store of transitions with uniform random sampling.

  Each stored event is indexed as ``(state, action, reward, next_state, done)``.
  Once ``capacity`` events are stored, every new event overwrites the oldest
  one in place (ring buffer), so ``push`` stays O(1) instead of shifting the
  whole list. As a consequence ``self.memory`` is not in chronological order
  once the buffer has wrapped around; sampling is random, so order is not used.

  Args:
    capacity: maximum number of events kept; with ``capacity <= 0`` nothing is stored.
  """

  def __init__(self, capacity):
    # Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.capacity = capacity
    self.memory = []
    # Index of the slot that the next push overwrites once the buffer is full.
    self.position = 0

  def push(self, event):
    """Store ``event``, replacing the oldest stored event when the buffer is full."""
    if self.capacity <= 0:
      # Nothing can be retained; keeps the buffer empty instead of failing.
      return
    if len(self.memory) < self.capacity:
      self.memory.append(event)
    else:
      self.memory[self.position] = event
    self.position = (self.position + 1) % self.capacity

  def sample(self, batch_size):
    """Draw ``batch_size`` distinct events at random and return them as tensors.

    Returns:
      A tuple ``(states, next_states, actions, rewards, dones)`` -- note that
      ``next_states`` comes second. Every tensor lives on ``self.device``;
      ``actions`` is a long tensor, the others are float tensors, and each has
      one row per sampled event.

    Raises:
      ValueError: from ``random.sample`` when ``batch_size`` exceeds the number
        of stored events.
    """
    drawn = random.sample(self.memory, k = batch_size)
    # Drop placeholder entries once instead of re-filtering for every field.
    experiences = [e for e in drawn if e is not None]

    def column(index):
      # Stack field `index` of every sampled event into one row per event.
      return np.vstack([e[index] for e in experiences])

    states = torch.from_numpy(column(0)).float().to(self.device)
    actions = torch.from_numpy(column(1)).long().to(self.device)
    rewards = torch.from_numpy(column(2)).float().to(self.device)
    next_states = torch.from_numpy(column(3)).float().to(self.device)
    # Booleans are cast to uint8 first so they can be turned into a 0/1 float tensor.
    dones = torch.from_numpy(column(4).astype(np.uint8)).float().to(self.device)
    return states, next_states, actions, rewards, dones

### Implementing the DQN class

In [7]:
class Agent():
  """DQN agent: stores transitions, picks epsilon-greedy actions and learns.

  Two ``Network`` instances are kept: ``local_qnetwork`` selects actions and is
  trained by the optimizer, ``target_qnetwork`` supplies the bootstrap targets
  and only changes through ``soft_update``.

  Args:
    state_size: number of features of a state.
    action_size: number of available actions.
  """

  # Number of calls to `step` between two learning updates.
  update_every = 4

  def __init__(self, state_size, action_size):
    self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.state_size = state_size
    self.action_size = action_size
    # Network used to select actions; this is the one the optimizer trains.
    self.local_qnetwork = Network(state_size, action_size).to(self.device)
    # Network used to compute the target values.
    self.target_qnetwork = Network(state_size, action_size).to(self.device)
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr = learning_rate)
    self.memory = ReplayMemory(replay_buffer_size)
    # Counter that cycles through 0..update_every-1 to schedule learning updates.
    self.t_step = 0

  def step(self, state, action, reward, next_state, done):
    """Record one transition and, every ``update_every`` calls, learn from a minibatch."""
    self.memory.push((state, action, reward, next_state, done))
    self.t_step = (self.t_step + 1) % self.update_every
    # Learn only when the counter wraps and enough transitions are stored.
    if self.t_step == 0 and len(self.memory.memory) > minibatch_size:
      # Sample exactly `minibatch_size` so the batch follows the hyperparameter
      # that the guard above already uses.
      experiences = self.memory.sample(minibatch_size)
      self.learn(experiences, discount_factor)

  def act(self, state, epsilon = 0.):
    """Choose an action for ``state`` with an epsilon-greedy policy.

    Args:
      state: NumPy array holding a single state (no batch dimension).
      epsilon: probability threshold for exploring; 0 means always greedy.

    Returns:
      A NumPy integer: the argmax of the local network's outputs when the
      random draw exceeds ``epsilon``, otherwise a uniformly random action.
    """
    # unsqueeze(0) adds the batch dimension the network input needs.
    state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
    # Evaluate in inference mode without tracking gradients, then restore training mode.
    self.local_qnetwork.eval()
    with torch.no_grad():
      action_values = self.local_qnetwork(state)
    self.local_qnetwork.train()
    if random.random() > epsilon:
      # Exploit: action with the highest predicted value.
      return np.argmax(action_values.cpu().numpy())
    # Explore: uniformly random action.
    return random.choice(np.arange(self.action_size))

  def learn(self, experiences, discount_factor):
    """Run one gradient step on the local network, then soft-update the target.

    Args:
      experiences: tuple ``(states, next_states, actions, rewards, dones)`` of
        batched tensors, in the order returned by ``ReplayMemory.sample``.
      discount_factor: gamma applied to the bootstrap value of the next state.
    """
    states, next_states, actions, rewards, dones = experiences
    # Best target-network value of each next state. detach() keeps gradients
    # out of the target network; max(1)[0] takes the maximum over the action
    # dimension and unsqueeze(1) restores a column so it lines up with `rewards`.
    next_q_targets = self.target_qnetwork(next_states).detach().max(1)[0].unsqueeze(1)
    # Bellman target: the bootstrap term is dropped where `dones` is 1.
    q_targets = rewards + discount_factor * next_q_targets * (1 - dones)
    # Value predicted by the local network for the action that was actually taken.
    q_expected = self.local_qnetwork(states).gather(1, actions)
    # Mean squared error between the predicted and the target values.
    loss = F.mse_loss(q_expected, q_targets)
    # PyTorch accumulates gradients, so clear them before backpropagating.
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()
    self.soft_update(self.local_qnetwork, self.target_qnetwork, interpolation_parameter)

  def soft_update(self, local_model, target_model, interpolation_parameter):
    """Blend the local weights into the target: target = tau*local + (1 - tau)*target."""
    with torch.no_grad():
      for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
        target_param.copy_(interpolation_parameter * local_param + (1.0 - interpolation_parameter) * target_param)

### Initializing the DQN agent

In [8]:
# Create the DQN agent, sized from the environment's state size and number of actions
agent = Agent(state_size, number_actions)

### Training the DQN agent

In [9]:
# Maximum number of episodes to train for.
number_episodes = 2000
# Upper bound on the steps of one episode, so the agent cannot get stuck in it.
maximum_number_timesteps_per_episode = 1000
# Epsilon starts at 1.0 and is multiplied by the decay value after every
# episode, never dropping below the ending value.
epsilon_starting_value  = 1.0
epsilon_ending_value  = 0.01
epsilon_decay_value  = 0.995
epsilon = epsilon_starting_value
# Scores of the most recent 100 episodes; older scores fall out automatically.
scores_on_100_episodes = deque(maxlen = 100)

for episode in range(1, number_episodes + 1):
  # reset() also returns an info dict, which is not needed here.
  state, _ = env.reset()
  score = 0
  for t in range(maximum_number_timesteps_per_episode):
    action = agent.act(state, epsilon)
    # step() returns (observation, reward, terminated, truncated, info).
    next_state, reward, terminated, truncated, _ = env.step(action)
    # Only a real termination is stored as `done`: a truncated episode did not
    # reach a terminal state, so its target should still bootstrap.
    agent.step(state, action, reward, next_state, terminated)
    state = next_state
    score += reward
    # Stop on either signal; the environment must be reset after truncation too.
    if terminated or truncated:
      break
  scores_on_100_episodes.append(score)
  epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon)
  # Rolling mean over the stored scores, computed once per episode.
  average_score = np.mean(scores_on_100_episodes)
  # '\r' with end="" rewrites the same console line on every episode.
  print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score), end = "")
  # Every 100 episodes, end the line so that summary stays visible.
  if episode % 100 == 0:
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score))
  # Stop once the rolling average reaches 200.
  if average_score >= 200.0:
    # The reported episode count is `episode - 100`, i.e. the episode at which
    # the 100-episode averaging window started.
    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode - 100, average_score))
    # Save the trained local network's weights.
    torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')
    break

Episode 100	Average Score: -159.77
Episode 200	Average Score: -108.04
Episode 300	Average Score: -65.17
Episode 400	Average Score: -4.74
Episode 500	Average Score: 93.78
Episode 600	Average Score: 169.19
Episode 649	Average Score: 200.16
Environment solved in 549 episodes!	Average Score: 200.16


## Part 3 - Visualizing the results

In [10]:
# Helper code that records the trained agent and plays the video inline; it is not part of the DQN implementation itself
import glob
import io
import base64
import imageio
from IPython.display import HTML, display

def show_video_of_model(agent, env_name, video_path='video.mp4', fps=30):
    """Run one greedy episode of ``agent`` in ``env_name`` and save it as a video.

    Args:
        agent: object whose ``act(state)`` returns an action exposing ``.item()``.
        env_name: Gymnasium environment id, created with ``render_mode='rgb_array'``.
        video_path: file the frames are written to.
        fps: frame rate passed to ``imageio.mimsave``.
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            # Capture the frame for the current state before acting on it.
            frames.append(env.render())
            action = agent.act(state)
            state, reward, terminated, truncated, _ = env.step(action.item())
            # Stop on truncation as well; otherwise an episode that hits the
            # time limit without terminating would keep this loop running.
            done = terminated or truncated
    finally:
        # Release the environment even if the rollout raises.
        env.close()
    imageio.mimsave(video_path, frames, fps=fps)

# Record one episode of the trained agent in Lunar Lander and save it as a video file
show_video_of_model(agent, 'LunarLander-v3')

def show_video():
    """Embed the first ``*.mp4`` file of the working directory in the notebook.

    The file is read, base64-encoded and displayed through an HTML ``<video>``
    tag. When no ``.mp4`` file exists, a message is printed instead.
    """
    mp4list = glob.glob('*.mp4')
    if len(mp4list) > 0:
        # glob gives no ordering guarantee; with several videos present the
        # one picked here is whichever comes first in its result.
        mp4 = mp4list[0]
        # Read-only binary mode inside a context manager so the file handle
        # is closed as soon as the bytes have been read.
        with open(mp4, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
    else:
        print("Could not find video")

# Display the recorded video inline in the notebook
show_video()

