# Deep Q-Learning for Lunar Landing

## Part 0 - Installing the required packages and importing the libraries

### Installing Gymnasium

In [1]:
# Core Gymnasium package (provides the `gymnasium` module imported later as `gym`).
!pip install gymnasium
# Atari extras plus ROM licence acceptance.
# NOTE(review): no Atari environment is used in the cells visible here - confirm this is still needed.
!pip install "gymnasium[atari, accept-rom-license]"
# swig is installed before the Box2D extra; presumably it is required to build the Box2D bindings - TODO confirm.
!apt-get install -y swig
# Box2D extra for Gymnasium; presumably what backs the 'LunarLander-v2' environment created below - TODO confirm.
!pip install gymnasium[box2d]

Collecting gymnasium
  Downloading gymnasium-0.29.1-py3-none-any.whl.metadata (10 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading gymnasium-0.29.1-py3-none-any.whl (953 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m953.9/953.9 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium
Successfully installed farama-notifications-0.0.4 gymnasium-0.29.1
Collecting autorom~=0.4.2 (from autorom[accept-rom-license]~=0.4.2; extra == "accept-rom-license"->gymnasium[accept-rom-license,atari])
  Downloading AutoROM-0.4.2-py3-none-any.whl.metadata (2.8 kB)
Collecting shimmy<1.0,>=0.1.0 (from shimmy[atari]<1.0,>=0.1.0; extra == "atari"->gymnasium[accept-rom-license,atari])
  Downloading Shimmy-0.2.1-py3-none-any.whl.metadata (2.3 kB)
Collecting AutoROM.acce

### Importing the libraries

In [2]:
import os
import random
import numpy as np #to work with mathematics
import torch #to import pytorch
import torch.nn as nn #for neural networks
import torch.optim as optim #to import optimizer
import torch.nn.functional as F #to use functions
import torch.autograd as autograd #for stochastic gradient descent
from torch.autograd import Variable #for torch variables
from collections import deque, namedtuple #used during the training

## Part 1 - Building the AI

### Creating the architecture of the Neural Network

In [4]:
class Network(nn.Module):
  """Fully connected Q-network mapping a state vector to one value per action.

  Layout: state_size -> 64 -> 64 -> action_size, with ReLU after the two
  hidden layers and no activation on the output layer.
  """

  def __init__(self,state_size,action_size, seed =42):
    """Create the three linear layers.

    Args:
      state_size: number of input features of the first layer.
      action_size: number of outputs of the last layer.
      seed: value passed to torch.manual_seed before the layers are created.
    """
    super().__init__()
    # Seed first so that the layer initialisation below is repeatable;
    # the object returned by torch.manual_seed is kept on the instance.
    self.seed = torch.manual_seed(seed)
    hidden_units = 64
    self.fc1 = nn.Linear(state_size, hidden_units)
    self.fc2 = nn.Linear(hidden_units, hidden_units)
    self.fc3 = nn.Linear(hidden_units, action_size)

  def forward(self, state):
    """Return the output of the last layer for `state` (no final activation)."""
    hidden = F.relu(self.fc1(state))
    hidden = F.relu(self.fc2(hidden))
    q_values = self.fc3(hidden)
    return q_values





## Part 2 - Training the AI

### Setting up the environment

In [5]:
import gymnasium as gym

# Create the environment once; the cells below reuse `env` for training.
env = gym.make('LunarLander-v2')

# The observation space's shape is read once and its first entry is used as
# the number of input features for the Q-network.
state_shape = env.observation_space.shape
state_size = state_shape[0]

# Size of the discrete action space = number of Q-values the network outputs.
number_actions = env.action_space.n

print('State Shape', state_shape)
print('State size:', state_size)
print('Number of actions', number_actions)

State Shape (8,)
State size: 8
Number of actions 4


### Initializing the hyperparameters

In [9]:
# Step size handed to the Adam optimizer.
learning_rate = 5e-4

# Number of stored experiences sampled for each learning update.
minibatch_size = 100

# Gamma: weight applied to the estimated future value when building Q targets.
discount_factor = 0.99

# Capacity of the replay memory, i.e. how many experiences
# (state, action, reward, next state, done) the agent keeps. Sampling from
# this memory breaks the correlation between consecutive observations, which
# is meant to stabilise and improve training.
replay_buffer_size = 100_000

# Interpolation weight used when softly updating the target network towards
# the local network.
interpolation_parameter = 1e-3

  and should_run_async(code)


### Implementing Experience Replay

In [17]:
class ReplayMemory(object):
  """Fixed-capacity store of experiences with uniform random sampling.

  Each experience is a 5-item sequence:
  (state, action, reward, next_state, done).
  Once `capacity` experiences are stored, every new one overwrites the oldest
  one in place, so `push` costs O(1) instead of shifting the whole list.
  """

  def __init__(self, capacity):
    """Create an empty memory.

    Args:
      capacity: maximum number of experiences kept (expected to be an int).
        With a capacity of zero or less nothing is ever stored.
    """
    # Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.capacity = capacity
    self.memory = []  # stored experiences; never longer than `capacity`
    self.position = 0  # once the memory is full: index of the oldest experience

  def push(self, event):
    """Store `event`, replacing the oldest experience when the memory is full."""
    if self.capacity <= 0:
      # Nothing can be retained; the memory simply stays empty.
      return
    if len(self.memory) < self.capacity:
      self.memory.append(event)
      return
    # Memory is full: overwrite the oldest slot and advance the write cursor.
    # Storage order is therefore rotated rather than chronological, which
    # does not matter because `sample` draws uniformly at random.
    self.memory[self.position] = event
    self.position = (self.position + 1) % self.capacity

  def sample(self, batch_size):
    """Draw `batch_size` distinct experiences and stack them into tensors.

    Returns:
      Tuple (states, next_states, actions, rewards, dones) of tensors on
      `self.device`, one row per sampled experience. Note the order:
      next_states comes second. Actions are int64; all other tensors are
      float32 (dones is converted through uint8 first).

    Raises:
      ValueError: raised by random.sample when `batch_size` exceeds the
        number of stored experiences.
    """
    experiences = [e for e in random.sample(self.memory, k=batch_size) if e is not None]

    def stack(field):
      # One row per experience for the requested field of the tuple.
      return np.vstack([e[field] for e in experiences])

    states = torch.from_numpy(stack(0)).float().to(self.device)
    actions = torch.from_numpy(stack(1)).long().to(self.device)
    rewards = torch.from_numpy(stack(2)).float().to(self.device)
    next_states = torch.from_numpy(stack(3)).float().to(self.device)
    dones = torch.from_numpy(stack(4).astype(np.uint8)).float().to(self.device)
    return states,next_states,actions,rewards,dones




### Implementing the DQN class

In [18]:
class Agent():
  """DQN agent: a trained local Q-network plus a softly updated target Q-network.

  Relies on module-level names defined in other cells: Network, ReplayMemory,
  learning_rate, replay_buffer_size, minibatch_size, discount_factor and
  interpolation_parameter.
  """

  def __init__(self,state_size,action_size):
    """Build both networks, the optimizer and the replay memory.

    Args:
      state_size: number of features in one state vector.
      action_size: number of discrete actions.
    """
    # Use the GPU when PyTorch can see one, otherwise the CPU.
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.state_size = state_size
    self.action_size = action_size

    # Local network: the one being optimised and used to pick actions.
    self.local_qnetwork = Network(state_size, action_size).to(self.device)
    # Target network: only changed through soft_update; supplies Q targets.
    self.target_qnetwork = Network(state_size, action_size).to(self.device)

    # Only the local network's parameters are handed to the optimizer.
    self.optimizer = optim.Adam(self.local_qnetwork.parameters(), lr=learning_rate)
    self.memory = ReplayMemory(replay_buffer_size)
    # Counter that wraps to 0 every fourth call of `step`.
    self.t_step = 0

  def step(self,state,action,reward,next_state,done):
    """Store one experience and, on every fourth call, learn from a minibatch."""
    experience = (state, action, reward, next_state, done)
    self.memory.push(experience)

    self.t_step = (self.t_step + 1) % 4
    if self.t_step != 0:
      return
    # Learn only once strictly more than one minibatch of experiences is stored.
    if len(self.memory.memory) <= minibatch_size:
      return
    sampled = self.memory.sample(minibatch_size)
    self.learn(sampled, discount_factor)

  def act(self,state,epsilon=0.):
    """Pick an action for `state` with an epsilon-greedy policy.

    Args:
      state: NumPy array holding a single state (converted with torch.from_numpy).
      epsilon: exploration threshold; a random action is taken whenever
        random.random() is not greater than it.

    Returns:
      The index of the chosen action.
    """
    # Add a leading batch dimension before feeding the network.
    state_batch = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

    # Evaluate in eval mode without gradient tracking, then restore train mode.
    self.local_qnetwork.eval()
    with torch.no_grad():
      action_values = self.local_qnetwork(state_batch)
    self.local_qnetwork.train()

    if random.random() > epsilon:
      # Exploit: action with the highest predicted value.
      greedy_values = action_values.cpu().data.numpy()
      return np.argmax(greedy_values)
    # Explore: uniformly random action index.
    candidate_actions = np.arange(self.action_size)
    return random.choice(candidate_actions)

  def learn(self,experiences,discount_factor):
    """Run one gradient step on the local network, then soft-update the target.

    Args:
      experiences: tuple (states, next_states, actions, rewards, dones) in the
        order produced by ReplayMemory.sample.
      discount_factor: weight applied to the target network's next-state value.
    """
    states, next_states, actions, rewards, dones = experiences

    # Highest target-network value for each next state, kept as a column;
    # detached so no gradient flows into the target network.
    target_outputs = self.target_qnetwork(next_states).detach()
    next_q_targets = target_outputs.max(1)[0].unsqueeze(1)

    # The (1 - dones) factor removes the future term for finished transitions.
    q_targets = rewards + (discount_factor * next_q_targets*(1-dones))

    # Local-network value of the action that was actually taken.
    local_outputs = self.local_qnetwork(states)
    q_expected = local_outputs.gather(1, actions)

    loss = F.mse_loss(q_expected, q_targets)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()

    self.soft_update(self.local_qnetwork, self.target_qnetwork, interpolation_parameter)

  def soft_update(self,local_model,target_model,interpolation_parameter):
    """Blend the local parameters into the target parameters in place.

    Each target parameter becomes
    interpolation_parameter * local + (1 - interpolation_parameter) * target.
    """
    paired_params = zip(target_model.parameters(), local_model.parameters())
    for target_param, local_param in paired_params:
      blended = interpolation_parameter*local_param.data + (1.0-interpolation_parameter)*target_param.data
      target_param.data.copy_(blended)














### Initializing the DQN agent

In [19]:
# Single agent instance used by the training and visualisation cells below.
agent = Agent(state_size=state_size, action_size=number_actions)

### Training the DQN agent

In [20]:
# Upper bound on the number of training episodes.
number_episodes = 2000
# Upper bound on the number of environment steps within one episode.
max_number_timesteps_per_episode = 1000

# Epsilon-greedy schedule: start fully random and multiply epsilon by the
# decay factor after every episode, never going below the ending value.
epsilon_starting_value = 1.0
epsilon_ending_value = 0.01
epsilon_decay_value = 0.995
epsilon = epsilon_starting_value

# Sliding window holding the scores of the most recent 100 episodes.
scores_on_100_episodes = deque(maxlen=100)

for episode in range(1, number_episodes + 1):
  # env.reset() returns (initial state, info); the info dict is not needed.
  state, _ = env.reset()
  score = 0
  for t in range(max_number_timesteps_per_episode):
    action = agent.act(state, epsilon)
    # Gymnasium's step returns (observation, reward, terminated, truncated, info).
    next_state, reward, terminated, truncated, _ = env.step(action)
    # Store only true termination in the experience, so the learning target
    # keeps the same `done` value the agent was trained with before; a
    # truncation merely ends the episode below.
    agent.step(state, action, reward, next_state, terminated)
    state = next_state
    score += reward
    # The episode is over on termination or on truncation (e.g. a time limit).
    done = terminated or truncated
    if done:
      break

  scores_on_100_episodes.append(score)
  epsilon = max(epsilon_ending_value, epsilon_decay_value * epsilon)

  # Computed once per episode and reused by every report below.
  average_score = np.mean(scores_on_100_episodes)
  # '\r' with end="" rewrites the same output line to give a live progress display.
  print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score), end="")
  if episode % 100 == 0:
    # Every 100 episodes, end the line so the milestone stays visible.
    print('\rEpisode {}\tAverage Score: {:.2f}'.format(episode, average_score))

  # Only declare success on a full 100-episode window; otherwise a few lucky
  # early episodes could trigger it and `episode-100` would be negative.
  window_is_full = len(scores_on_100_episodes) == scores_on_100_episodes.maxlen
  if window_is_full and average_score >= 200.0:
    # The average covers the last 100 episodes, hence the `episode-100` report.
    print('\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(episode-100, average_score))
    torch.save(agent.local_qnetwork.state_dict(), 'checkpoint.pth')  # save the trained weights
    break










Episode 100	Average Score: -167.04
Episode 200	Average Score: -86.62
Episode 300	Average Score: -31.85
Episode 400	Average Score: 20.35
Episode 500	Average Score: 106.50
Episode 600	Average Score: 182.56
Episode 627	Average Score: 200.79
Environment solved in 527 episodes!	Average Score: 200.79


## Part 3 - Visualizing the results

In [23]:
import glob
import io
import base64
import imageio
from IPython.display import HTML, display
from gym.wrappers.monitoring.video_recorder import VideoRecorder

def show_video_of_model(agent, env_name, video_path='video.mp4', fps=30):
    """Run one greedy episode with `agent` and save the rendered frames as a video.

    Args:
        agent: object whose `act(state)` returns an action exposing `.item()`.
        env_name: environment id passed to `gym.make`.
        video_path: file the frames are written to (default 'video.mp4').
        fps: frame rate passed to `imageio.mimsave` (default 30).
    """
    env = gym.make(env_name, render_mode='rgb_array')
    frames = []
    try:
        state, _ = env.reset()
        done = False
        while not done:
            frames.append(env.render())
            action = agent.act(state)
            state, reward, terminated, truncated, _ = env.step(action.item())
            # Stop on truncation as well as termination; otherwise an episode
            # that never terminates would keep this loop running.
            done = terminated or truncated
    finally:
        # Release the environment even if acting or stepping raised.
        env.close()
    imageio.mimsave(video_path, frames, fps=fps)

# Record one episode of the trained agent on LunarLander-v2 (written to video.mp4).
show_video_of_model(agent, 'LunarLander-v2')

def show_video():
    """Embed the first '*.mp4' file of the working directory in the notebook output.

    The file is base64-encoded into an inline HTML <video> element. Prints
    "Could not find video" when no mp4 file is present.
    """
    mp4list = glob.glob('*.mp4')
    if not mp4list:
        print("Could not find video")
        return
    mp4 = mp4list[0]
    # Read-only binary mode, closed deterministically by the context manager.
    with open(mp4, 'rb') as video_file:
        video = video_file.read()
    encoded = base64.b64encode(video)
    display(HTML(data='''<video alt="test" autoplay
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))

# Display the first .mp4 file found in the working directory inline.
show_video()

