In [1]:
#pip install swig
#pip install gym[box2d]
#pip install keras-rl

In [2]:
import numpy as np
import pickle
import torch as torch
import torch.nn as nn
import torch.nn.functional as tnn
import numpy as np

In [3]:
class QNetwork(nn.Module):
    def __init__(self, learning_rate, state_dimensions, fc1_dims, fc2_dims, num_actions):
        super(QNetwork, self).__init__()

        self.state_dimensions = state_dimensions
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.num_actions = num_actions

        # Define the neural network layers
        self.fc1_layer = nn.Linear(*self.state_dimensions, self.fc1_dims)
        self.fc2_layer = nn.Linear(self.fc1_dims, self.fc2_dims)
        self.fc3_layer = nn.Linear(self.fc2_dims, self.num_actions)

        # Optimizer/loss function
        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        self.loss_function = nn.MSELoss()

        # Device specification
        self.device = 'cpu'
        self.to(self.device)

    def fwd(self, state):
        # Forward pass through the network
        x = tnn.relu(self.fc1_layer(state))
        x = tnn.relu(self.fc2_layer(x))
        actions = self.fc3_layer(x)

        return actions

In [4]:
class ReinforcementLearningAgent():
    def __init__(self, discount_factor, exploration_rate, learning_rate, state_dimensions, batch_size, num_actions,
                 max_memory_size=100000, exploration_end=0.01, exploration_decay=5e-4, q_network=None):

        # Initialize the reinforcement learning agent
        self.discount_factor = discount_factor
        self.exploration_rate = exploration_rate
        self.learning_rate = learning_rate
        self.state_dimensions = state_dimensions
        self.action_space = [i for i in range(num_actions)]
        self.max_memory_size = max_memory_size
        self.batch_size = batch_size
        self.memory_counter = 0
        self.exploration_end = exploration_end
        self.exploration_decay = exploration_decay

        # Q-network for evaluation
        self.q_network = q_network if q_network else QNetwork(
            self.learning_rate, num_actions=num_actions, state_dimensions=state_dimensions, fc1_dims=256, fc2_dims=256
        )

        # Replay memory
        self.state_memory = np.zeros((self.max_memory_size, *state_dimensions), dtype=np.float32)
        self.new_state_memory = np.zeros((self.max_memory_size, *state_dimensions), dtype=np.float32)
        self.action_memory = np.zeros(self.max_memory_size, dtype=np.int32)
        self.reward_memory = np.zeros(self.max_memory_size, dtype=np.float32)
        self.terminal_memory = np.zeros(self.max_memory_size, dtype=bool)

    def store_experience(self, current_state, action, reward, new_state, is_terminal):
        # Store a new experience in the replay memory
        index = self.memory_counter % self.max_memory_size
        self.state_memory[index] = current_state
        self.new_state_memory[index] = new_state
        self.reward_memory[index] = reward
        self.action_memory[index] = action
        self.terminal_memory[index] = is_terminal
        self.memory_counter += 1

    def choose_action(self, observation):
        # Choose an action based on epsilon-greedy strategy
        random_value = np.random.random()
        if random_value > self.exploration_rate:
            current_state = torch.tensor([observation]).to(self.q_network.device)
            actions = self.q_network.fwd(current_state)
            chosen_action = torch.argmax(actions).item()
        else:
            chosen_action = np.random.choice(self.action_space)
        return chosen_action

    def update_q_network(self):
        # Update the Q-network using a batch of experiences from the replay memory
        if self.memory_counter < self.batch_size:
            return

        self.q_network.optimizer.zero_grad()

        max_memory = min(self.memory_counter, self.max_memory_size)
        batch_indices = np.random.choice(max_memory, self.batch_size, replace=False)
        batch_index = np.arange(self.batch_size, dtype=np.int32)

        current_state_batch = torch.tensor(self.state_memory[batch_indices]).to(self.q_network.device)
        new_state_batch = torch.tensor(self.new_state_memory[batch_indices]).to(self.q_network.device)
        reward_batch = torch.tensor(self.reward_memory[batch_indices]).to(self.q_network.device)
        terminal_batch = torch.tensor(self.terminal_memory[batch_indices]).to(self.q_network.device)
        action_batch = self.action_memory[batch_indices]

        q_values_current_state = self.q_network.fwd(current_state_batch)[batch_index, action_batch]
        q_values_new_state = self.q_network.fwd(new_state_batch)
        q_values_new_state[terminal_batch] = 0.0

        q_target = reward_batch + self.discount_factor * torch.max(q_values_new_state, dim=1)[0]

        loss = self.q_network.loss_function(q_target, q_values_current_state).to(self.q_network.device)
        loss.backward()
        self.q_network.optimizer.step()

        self.exploration_rate = max(self.exploration_rate - self.exploration_decay, self.exploration_end)

In [5]:
from gym.wrappers import RecordVideo
def train_lunar_lander_agent():

    # Create the LunarLander environment
    lunar_lander_env = gym.make("LunarLander-v2", render_mode='rgb_array')
    lunar_lander_env = RecordVideo(lunar_lander_env, 'video')  # Assuming RecordVideo is a custom wrapper for video recording

    # Initialize the reinforcement learning agent
    landing_agent = ReinforcementLearningAgent(
        discount_factor = 0.99, exploration_rate=1.0, learning_rate=0.0002, state_dimensions=[8],
        num_actions=4, max_memory_size = 1000000, batch_size = 64, exploration_end = 0.04, exploration_decay=0.8
    )

    episode_scores, epsilon_history = [], []
    num_episodes = 300

    # Run training for a specified number of episodes
    for episode in range(num_episodes):
        total_score = 0
        episode_done = False
        current_observation = lunar_lander_env.reset(seed=42)

        while not episode_done:
            # Choose an action based on the agent's policy
            selected_action = landing_agent.choose_action(current_observation)

            # Take the chosen action and observe the new state and reward
            new_observation, reward, episode_terminated, truncated = lunar_lander_env.step(selected_action)
            total_score += reward

            # Check if the episode is done (either terminated or truncated)
            episode_done = episode_terminated or truncated

            # Store the transition in the agent's replay memory and update the Q-network
            landing_agent.store_experience(current_observation, selected_action, reward, new_observation, episode_done)
            landing_agent.update_q_network()

            current_observation = new_observation

        # Record scores and epsilon values for analysis
        episode_scores.append(total_score)
        epsilon_history.append(landing_agent.exploration_rate)
        print('Episode score: ', total_score)

    lunar_lander_env.close()

In [6]:
import sys
sys.path.append('/usr/local/lib/python3.10/dist-packages')
import gym
train_lunar_lander_agent()

  and should_run_async(code)
  deprecation(
  deprecation(
  logger.warn(
  current_state = torch.tensor([observation]).to(self.q_network.device)


Episode score:  -641.5298915229102
Episode score:  -230.1482760617113
Episode score:  -682.1084742371035
Episode score:  -494.2570911835446
Episode score:  -105.1321215580107
Episode score:  -119.25314439205735
Episode score:  -145.74621382358728
Episode score:  -143.66854751800926
Episode score:  -324.0776087396532
Episode score:  -562.1780806885058
Episode score:  -240.6936461764117
Episode score:  -312.1204966813717
Episode score:  -100.17789720392433
Episode score:  194.63070617397045
Episode score:  -250.77126901330712
Episode score:  -338.9177413944261
Episode score:  -259.84674415650943
Episode score:  -86.83894954272257
Episode score:  -97.02410438017736
Episode score:  -316.6907383668686
Episode score:  -245.65779633941656
Episode score:  -172.43830679931364
Episode score:  -386.14951886222093
Episode score:  -155.81112773293557
Episode score:  -153.55647401985516
Episode score:  -135.30200720334702
Episode score:  -144.74100641629175
Episode score:  -135.89541997611292
Episod