In [1]:
from utils import initialize_model, train_stepLR, train_cosine, perform_train
import gym
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from skimage.color import rgb2gray
from skimage.transform import resize
import random
import torch.optim as optim
import torch.optim as optim
import csv

cuda


In [2]:
# Run PyTorch computations on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [3]:
# Collect the id of every environment currently registered with Gym.
all_env = [env_id for env_id in gym.envs.registry.keys()]

# Report the installed Gym version together with the number of registered environments.
print(f'Total Environments in Gym version {gym.__version__} : {len(all_env)}')

Total Environments in Gym version 0.25.2 : 996


## Let's start the environment

In [4]:
# Create the Assault environment (render_mode is set to "rgb_array").
env = gym.make("AssaultDeterministic-v4", render_mode="rgb_array")

# Reset once to obtain a first raw observation to inspect.
obs = env.reset()

# Report the layout of a raw observation.
print(f"Observation shape: {obs.shape}")
print(f"Observation dtype: {obs.dtype}")

# Report how many actions the environment exposes.
num_actions = env.action_space.n
print(f"Number of possible actions: {num_actions}")

Observation shape: (210, 160, 3)
Observation dtype: uint8
Number of possible actions: 7


In [5]:
# Hyperparameters for DQN training.
# NOTE(review): the training cell further down builds the replay buffer and the agent
# from its own literals / constructor defaults rather than from these names -- confirm
# which values are the intended ones (e.g. batch_size here is 1).
learning_rate = 1e-4               # Optimizer learning rate
gamma = 0.99                       # Discount factor
epsilon = 1.0                      # Initial exploration rate
epsilon_decay = 0.995              # Multiplicative epsilon decay
epsilon_min = 0.1                  # Lower bound for epsilon
replay_buffer_size = 100_000       # Maximum number of stored transitions
batch_size = 1                     # Batch size for training
target_update_frequency = 1_000    # Steps between target-network updates
max_episodes = 1_000               # Maximum number of episodes
max_steps_per_episode = 10_000     # Maximum steps per episode

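Now we define the Q-network. It follows the convolutional architecture of the DQN paper: three convolutional layers followed by a fully connected layer that outputs one Q-value per action.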

In [6]:
class DQN(nn.Module):
    """Convolutional Q-network: three conv layers feeding one linear Q-value layer."""

    def __init__(self, input_shape, num_actions):
        """
        Build the layers and size the linear layer from the input resolution.

        :param input_shape: Tuple (channels, height, width) describing one input.
        :param num_actions: Number of actions, i.e. number of Q-values produced.
        """
        super().__init__()

        # Convolutional trunk: 32 8x8 filters (stride 4), 64 4x4 filters (stride 2),
        # 64 3x3 filters (stride 1). No padding is used.
        self.conv1 = nn.Conv2d(input_shape[0], 32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(64, 64, kernel_size=3, stride=1)

        # Walk the spatial dimensions through the same (kernel, stride) pairs to
        # find how many features reach the linear layer.
        height, width = input_shape[1], input_shape[2]
        for kernel, stride in ((8, 4), (4, 2), (3, 1)):
            height = (height - kernel) // stride + 1
            width = (width - kernel) // stride + 1

        # Linear layer mapping the flattened feature map to one Q-value per action.
        self.fc = nn.Linear(height * width * 64, num_actions)

    def forward(self, x):
        """
        Compute Q-values for a batch of inputs.

        :param x: Tensor of shape (batch_size, channels, height, width).
        :return: Tensor of shape (batch_size, num_actions) holding the Q-values.
        """
        for conv in (self.conv1, self.conv2, self.conv3):
            x = F.relu(conv(x))

        # Collapse everything except the batch dimension before the linear layer.
        flat = x.contiguous().view(x.size(0), -1)
        return self.fc(flat)

Preprocessing the observation

In [7]:
def preprocess_observation(obs):
    """
    Convert a raw RGB Atari frame into a normalized 84x84 grayscale frame.

    :param obs: Raw RGB observation from the environment with pixel values in the
        0-255 range (a uint8 array of shape (210, 160, 3) in this notebook).
    :return: 2-D floating-point NumPy array of shape (84, 84) with values in [0, 1].
        No channel axis is added.
    """
    # Normalize exactly once, up front. rgb2gray rescales integer images to [0, 1]
    # on its own, so the previous "grayscale, then divide by 255" sequence scaled
    # uint8 frames twice and squeezed every pixel into [0, 1/255].
    normalized_obs = np.asarray(obs, dtype=np.float32) / 255.0

    # Collapse the colour channels; floating-point input is not rescaled again.
    gray_obs = rgb2gray(normalized_obs)  # Shape: (210, 160)

    # Downsample to the 84x84 network input resolution.
    resized_obs = resize(gray_obs, (84, 84), anti_aliasing=True)  # Shape: (84, 84)

    return resized_obs

In [8]:

class ReplayBuffer:
    """Fixed-capacity store of transitions that overwrites its oldest entry when full."""

    def __init__(self, capacity):
        """
        Create an empty buffer.

        :param capacity: Maximum number of transitions kept at any time.
        """
        self.capacity = capacity
        self.buffer = []
        self.position = 0  # Index that the next overwrite will target once full

    def store(self, state, action, reward, next_state, done):
        """
        Add one transition, replacing the oldest one when the buffer is full.

        :param state: Current state (preprocessed).
        :param action: Action taken.
        :param reward: Reward received.
        :param next_state: Next state (preprocessed).
        :param done: Whether the episode is done.
        """
        entry = (state, action, reward, next_state, done)
        slot = self.position

        if len(self.buffer) >= self.capacity:
            # Full: recycle the slot holding the oldest transition.
            self.buffer[slot] = entry
        else:
            # Still growing: just extend the list.
            self.buffer.append(entry)

        # Advance the write index, wrapping around at capacity.
        self.position = (slot + 1) % self.capacity

    def sample(self, batch_size):
        """
        Draw transitions uniformly at random without replacement.

        :param batch_size: Number of transitions to draw.
        :return: List of `batch_size` stored transition tuples.
        """
        stored = self.buffer
        return random.sample(stored, batch_size)

    def size(self):
        """Return the number of transitions currently stored."""
        stored_count = len(self.buffer)
        return stored_count

In [9]:
class DQNAgent:
    """DQN agent: frame stacking, epsilon-greedy action selection and Q-network updates."""

    def __init__(self, env, replay_buffer, input_shape, num_actions, batch_size=32, gamma=0.99, lr=0.0001,
                 target_update_frequency=1000, n_frames=1, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.1):
        """
        Initialize the DQN agent.

        :param env: The Gym environment.
        :param replay_buffer: ReplayBuffer instance for experience storage.
        :param input_shape: Shape of the input state (e.g., (n_frames, 84, 84)).
        :param num_actions: Number of possible actions in the environment.
        :param batch_size: Batch size for training.
        :param gamma: Discount factor for future rewards.
        :param lr: Learning rate for the optimizer.
        :param target_update_frequency: Training steps between target network updates.
        :param n_frames: Number of most recent frames stacked into one state.
        :param epsilon: Initial exploration rate.
        :param epsilon_decay: Multiplicative decay factor stored for the training loop.
        :param epsilon_min: Lower bound for epsilon stored for the training loop.
        :raises ValueError: If n_frames is smaller than 1.
        """
        if n_frames < 1:
            raise ValueError("n_frames must be at least 1")

        self.env = env
        self.replay_buffer = replay_buffer
        self.batch_size = batch_size
        self.gamma = gamma
        self.target_update_frequency = target_update_frequency
        self.num_actions = num_actions
        self.n_frames = n_frames

        # Q-network and target network (the target starts as an exact copy)
        self.q_network = DQN(input_shape, num_actions).to(device)
        self.target_network = DQN(input_shape, num_actions).to(device)
        self.target_network.load_state_dict(self.q_network.state_dict())
        self.target_network.eval()  # Target network is never optimized directly

        # Optimizer (only the online Q-network is trained)
        self.optimizer = optim.Adam(self.q_network.parameters(), lr=lr)

        # Epsilon-greedy parameters; the defaults match the previously hard-coded values
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min

        # Counts completed training steps; drives the target network refresh
        self.step_count = 0

        # Most recent frames, oldest first; filled by reset_frame_stack / stack_frames
        self.frame_stack = []

    def _stacked_state(self):
        """Return the current frame stack as one float32 array of shape (n_frames, ...)."""
        # Every consumer converts states to float32 tensors anyway, so storing float32
        # yields the same network inputs while halving replay-buffer memory.
        return np.stack(self.frame_stack, axis=0).astype(np.float32, copy=False)

    def reset_frame_stack(self):
        """
        Reset the environment and fill the frame stack with copies of the first frame.

        :return: The initial stacked state.
        """
        initial_frame = preprocess_observation(self.env.reset())
        self.frame_stack = [initial_frame] * self.n_frames  # Duplicate the first frame
        return self._stacked_state()

    def stack_frames(self, new_frame):
        """
        Push a new frame onto the stack, dropping the oldest one.

        If the stack has never been filled, it is seeded with copies of `new_frame`
        instead of failing on an empty list.

        :param new_frame: Preprocessed frame to append.
        :return: The updated stacked state.
        """
        if not self.frame_stack:
            self.frame_stack = [new_frame] * self.n_frames
        else:
            self.frame_stack.pop(0)  # Remove the oldest frame
            self.frame_stack.append(new_frame)  # Add the new frame

        return self._stacked_state()

    def select_action(self, stacked_state):
        """
        Select an action using an epsilon-greedy policy.

        :param stacked_state: Current stacked state, as returned by stack_frames.
        :return: Chosen action.
        """
        if random.random() < self.epsilon:
            return self.env.action_space.sample()  # Explore: random action

        # Exploit: add a batch dimension and pick the action with the largest Q-value
        state_tensor = torch.tensor(stacked_state, dtype=torch.float32).unsqueeze(0).to(device)
        with torch.no_grad():
            q_values = self.q_network(state_tensor)
        return torch.argmax(q_values).item()

    def train_step(self):
        """
        Perform one training step on a batch sampled from the replay buffer.

        :return: The loss as a float, or None when the replay buffer holds fewer than
            batch_size transitions and no update was performed.
        """
        if self.replay_buffer.size() < self.batch_size:
            return None  # Not enough data to train

        # Sample a batch and regroup it field by field
        batch = self.replay_buffer.sample(self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # Convert to tensors
        states = torch.tensor(np.array(states), dtype=torch.float32).to(device)
        actions = torch.tensor(actions, dtype=torch.int64).to(device)
        rewards = torch.tensor(rewards, dtype=torch.float32).to(device)
        next_states = torch.tensor(np.array(next_states), dtype=torch.float32).to(device)
        dones = torch.tensor(dones, dtype=torch.float32).to(device)

        # Q-values of the actions that were actually taken
        q_values = self.q_network(states)
        q_values = q_values.gather(1, actions.unsqueeze(1)).squeeze(1)

        # Bootstrapped targets from the target network; (1 - dones) drops the
        # bootstrap term for terminal transitions
        with torch.no_grad():
            next_q_values = self.target_network(next_states).max(1)[0]
        target_q_values = rewards + self.gamma * next_q_values * (1 - dones)

        # Compute loss and backpropagate
        loss = torch.nn.functional.mse_loss(q_values, target_q_values)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Periodically copy the online weights into the target network
        self.step_count += 1
        if self.step_count % self.target_update_frequency == 0:
            self.target_network.load_state_dict(self.q_network.state_dict())

        return loss.item()

In [10]:
def train_dqn(agent, num_episodes, max_steps_per_episode, csv_filename):
    """
    Train the DQN agent and log one CSV row per episode.

    :param agent: The DQNAgent instance.
    :param num_episodes: Number of episodes to train.
    :param max_steps_per_episode: Maximum steps per episode.
    :param csv_filename: Path of the CSV file to (over)write with the columns
        episode, total_reward, loss, epsilon. The loss column holds the most recent
        training loss of the episode, or 0.0 if no training step ran in it.
    """
    # Start a fresh CSV file containing only the header row
    with open(csv_filename, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["episode", "total_reward", "loss", "epsilon"])

    for episode in range(num_episodes):
        # reset_frame_stack() already resets the environment and fills the stack
        # with the first frame, so the environment must not be reset a second time.
        agent.reset_frame_stack()
        state = np.stack(agent.frame_stack, axis=0)

        total_reward = 0
        loss = 0.0

        for _step in range(max_steps_per_episode):
            # Select an action using the epsilon-greedy policy
            action = agent.select_action(state)

            # Take the action in the environment
            next_frame, reward, done, _ = agent.env.step(action)

            # Update the frame stack with the new preprocessed frame
            next_state = agent.stack_frames(preprocess_observation(next_frame))

            # Store the transition in the replay buffer
            agent.replay_buffer.store(state, action, reward, next_state, done)

            # train_step() returns None while the buffer is still too small; keep
            # the previous value so the formatted log line below cannot fail.
            step_loss = agent.train_step()
            if step_loss is not None:
                loss = step_loss

            total_reward += reward
            state = next_state

            if done:
                break

        # Decay epsilon once per episode, never below the configured minimum
        agent.epsilon = max(agent.epsilon * agent.epsilon_decay, agent.epsilon_min)

        # Append this episode's results; reopening per episode keeps completed rows
        # on disk even if training is interrupted
        with open(csv_filename, mode="a", newline="") as file:
            writer = csv.writer(file)
            writer.writerow([episode + 1, total_reward, loss, agent.epsilon])

        print(f"Episode {episode + 1}/{num_episodes} - Reward: {total_reward:.2f}, Loss: {loss:.4f}, Epsilon: {agent.epsilon:.4f}")

    print(f"Training results saved to {csv_filename}")

In [None]:
# Experiment setup: states are stacks of 4 frames; results are logged to a CSV file.
n_frames = 4
csv_filename = "dqn_results_4frames.csv"
replay_buffer = ReplayBuffer(capacity=100_000)

# Build the agent; all other settings use the DQNAgent constructor defaults.
input_shape = (n_frames, 84, 84)  # (stacked frames, height, width)
num_actions = env.action_space.n
agent = DQNAgent(
    env=env,
    replay_buffer=replay_buffer,
    input_shape=input_shape,
    num_actions=num_actions,
    n_frames=n_frames,
)

# Run training
num_episodes = 1_000
max_steps_per_episode = 10_000
train_dqn(agent, num_episodes, max_steps_per_episode, csv_filename=csv_filename)

# Evaluate the agent
#evaluate_agent(agent, num_episodes=10, max_steps_per_episode=10000)

Episode 1/1000 - Reward: 336.00, Loss: 13.0383, Epsilon: 0.9950
Episode 2/1000 - Reward: 399.00, Loss: 26.6541, Epsilon: 0.9900
Episode 3/1000 - Reward: 168.00, Loss: 13.5869, Epsilon: 0.9851
Episode 4/1000 - Reward: 168.00, Loss: 0.4905, Epsilon: 0.9801
Episode 5/1000 - Reward: 231.00, Loss: 0.1777, Epsilon: 0.9752
Episode 6/1000 - Reward: 252.00, Loss: 0.0997, Epsilon: 0.9704
Episode 7/1000 - Reward: 315.00, Loss: 13.5983, Epsilon: 0.9655
Episode 8/1000 - Reward: 294.00, Loss: 0.1660, Epsilon: 0.9607
Episode 9/1000 - Reward: 252.00, Loss: 13.6294, Epsilon: 0.9559
Episode 10/1000 - Reward: 252.00, Loss: 13.4894, Epsilon: 0.9511
Episode 11/1000 - Reward: 273.00, Loss: 0.1416, Epsilon: 0.9464
Episode 12/1000 - Reward: 189.00, Loss: 0.0726, Epsilon: 0.9416
Episode 13/1000 - Reward: 399.00, Loss: 26.2343, Epsilon: 0.9369
Episode 14/1000 - Reward: 252.00, Loss: 0.2070, Epsilon: 0.9322
Episode 15/1000 - Reward: 294.00, Loss: 26.8117, Epsilon: 0.9276
Episode 16/1000 - Reward: 84.00, Loss: 13

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the per-episode results written during training
df = pd.read_csv("dqn_results_4frames.csv")

# Average the total reward over consecutive blocks of 50 episodes.
# Episode numbers start at 1, so shift by one before the integer division:
# episodes 1-50 form group 0, 51-100 group 1, and so on. Without the shift the
# first group holds only 49 episodes and episode 1000 ends up alone in a group.
df["group"] = (df["episode"] - 1) // 50
grouped = df.groupby("group")
average_rewards = grouped["total_reward"].mean()  # Mean reward per block
average_episodes = grouped["episode"].mean()  # Block midpoint, used as x position

# Plot the block averages; the label matches the 4-frame run stored in the CSV
plt.plot(average_episodes, average_rewards, label="DQN 4-frames")
plt.xlabel("Episode")
plt.ylabel("Average Total Reward")
plt.title("Average Total Reward")
plt.legend()
plt.show()

In [None]:
def evaluate_agent(agent, num_episodes, max_steps_per_episode):
    """
    Evaluate the DQN agent greedily (no exploration) with stacked frames.

    Exploration is disabled only for the duration of the call; the agent's
    epsilon is restored afterwards so later training is not affected.

    :param agent: The DQNAgent instance.
    :param num_episodes: Number of evaluation episodes.
    :param max_steps_per_episode: Maximum steps per episode.
    """
    saved_epsilon = agent.epsilon
    agent.epsilon = 0.0  # Turn off exploration
    total_rewards = []

    try:
        for episode in range(num_episodes):
            # reset_frame_stack() already resets the environment and fills the
            # stack with the first frame, so no second reset is needed here.
            agent.reset_frame_stack()
            state = np.stack(agent.frame_stack, axis=0)

            total_reward = 0

            for _step in range(max_steps_per_episode):
                # Select the best action
                action = agent.select_action(state)

                # Take the action in the environment
                next_frame, reward, done, _ = agent.env.step(action)

                # Update the frame stack with the new preprocessed frame
                state = agent.stack_frames(preprocess_observation(next_frame))

                total_reward += reward

                if done:
                    break

            total_rewards.append(total_reward)
            print(f"Evaluation Episode {episode + 1}/{num_episodes} - Reward: {total_reward:.2f}")
    finally:
        # Restore the exploration rate even if an episode raised
        agent.epsilon = saved_epsilon

    print(f"Average Reward over {num_episodes} Episodes: {np.mean(total_rewards):.2f}")

    



In [None]:
# Run 10 evaluation episodes with the trained agent.
evaluate_agent(agent=agent, num_episodes=10, max_steps_per_episode=10_000)