In [3]:
import gym
import numpy as np
import math
import torch
import random
import matplotlib.pyplot as plt
# import pybullet_envs
import pybullet as p
import simple_driving
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Load the environment and smoke-test it with random actions.
from custom_env import CustomDrivingEnv

# Number of random steps taken by the smoke test.
SMOKE_TEST_STEPS = 200

env = CustomDrivingEnv()
try:
    state = env.reset()
    for _ in range(SMOKE_TEST_STEPS):
        action = env.action_space.sample()  # Replace with your action selection mechanism
        step_result = env.step(action)
        # NOTE(review): the rest of this notebook unpacks FIVE values from
        # env.step(); accept both the 5-value and the 4-value convention here
        # so the smoke test does not fail on the tuple length alone.
        # Confirm which convention CustomDrivingEnv actually implements.
        if len(step_result) == 5:
            state, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            state, reward, done, info = step_result
        if done:
            state = env.reset()
finally:
    # Always release the environment, even when a step raises.
    # `env` is closed from this point on: code that wants to keep interacting
    # with the environment has to build a new CustomDrivingEnv instance.
    env.close()

# Hyperparameters

# --- training schedule ---
EPISODES = 2_500                # number of training episodes
LEARNING_RATE = 2.5e-4          # Adam step size for the Q-network
BATCH_SIZE = 32                 # transitions drawn per learning step
GAMMA = 0.99                    # discount factor applied to the bootstrapped target
NETWORK_UPDATE_ITERS = 5_000    # learning steps between target-network syncs

# --- replay memory ---
MEM_SIZE = 50_000               # capacity of the replay buffer, in transitions
REPLAY_START_SIZE = 10_000      # learning starts once the buffer holds more than this
MEM_RETAIN = 0.1                # fraction of the first buffer slots that is never overwritten

# --- exploration (epsilon is 1.0 until the buffer passes REPLAY_START_SIZE) ---
EPS_START = 0.1                 # epsilon at the start of the exponential decay
EPS_END = 1e-4                  # epsilon the decay converges to
EPS_DECAY = 4 * MEM_SIZE        # decay time constant, measured in learning steps

# --- network architecture ---
FC1_DIMS = 128                  # width of the first hidden layer
FC2_DIMS = 128                  # width of the second hidden layer

# metrics for displaying training status
# (best_reward / average_reward are not referenced by the code shown in this notebook)
best_reward = 0
average_reward = 0
episode_history = []            # episode indices, x-axis of the reward plot
episode_reward_history = []     # total reward per episode, y-axis of the reward plot

# Re-create the `np.bool` alias, which some NumPy releases no longer provide.
# NOTE(review): presumably needed by a dependency that still uses np.bool - confirm.
np.bool = np.bool_


# Environment initialisation
# env = gym.make('SimpleDriving-v0', apply_api_compatibility=True, renders=False, isDiscrete=True, render_mode='tp_camera')
#env = gym.make('SimpleDriving-v0', apply_api_compatibility=True, renders=True, isDiscrete=True)
# env = gym.make('SimpleDriving-v0', apply_api_compatibility=True, renders=False, isDiscrete=True)

class Network(torch.nn.Module):
    """Fully connected Q-network: one observation in, one Q-value per action out.

    The observation is flattened before the first linear layer, so the network
    works for observation spaces of any rank (e.g. a flat vector or an
    H x W x C image). The layer layout (and therefore the ``state_dict`` keys
    ``layers.0`` / ``layers.2`` / ``layers.4``) is unchanged.

    Args:
        env: environment exposing ``observation_space.shape`` and
            ``action_space.n``.
        fc1_dims: width of the first hidden layer; defaults to ``FC1_DIMS``.
        fc2_dims: width of the second hidden layer; defaults to ``FC2_DIMS``.
        learning_rate: Adam learning rate; defaults to ``LEARNING_RATE``.
    """

    def __init__(self, env, fc1_dims=None, fc2_dims=None, learning_rate=None):
        super().__init__()
        # Fall back to the notebook-level hyperparameters only when no
        # explicit override was given.
        fc1_dims = FC1_DIMS if fc1_dims is None else fc1_dims
        fc2_dims = FC2_DIMS if fc2_dims is None else fc2_dims
        learning_rate = LEARNING_RATE if learning_rate is None else learning_rate

        self.input_shape = env.observation_space.shape
        self.action_space = env.action_space.n
        # torch.nn.Linear takes a single feature count. Unpacking a
        # multi-dimensional shape into it would pass the extra sizes as
        # out_features / bias / device, so use the flattened size instead.
        self.input_dims = int(np.prod(tuple(self.input_shape)))

        self.layers = torch.nn.Sequential(
            torch.nn.Linear(self.input_dims, fc1_dims),
            torch.nn.ReLU(),
            torch.nn.Linear(fc1_dims, fc2_dims),
            torch.nn.ReLU(),
            torch.nn.Linear(fc2_dims, self.action_space)
            )

        self.optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        self.loss = torch.nn.MSELoss()

    def forward(self, x):
        """Return Q-values for ``x``.

        ``x`` must end with the observation shape; any leading dimensions
        (e.g. a batch axis) are preserved. A single unbatched observation
        yields a 1-D tensor with one entry per action.
        """
        # Collapse only the trailing observation dimensions into one feature
        # axis. For 1-D observations this is a no-op.
        leading_dims = x.shape[: x.dim() - len(self.input_shape)]
        x = x.reshape(*leading_dims, self.input_dims)
        return self.layers(x)

class ReplayBuffer:
    """Fixed-capacity experience replay memory backed by preallocated arrays.

    ``states`` and ``states_`` share the environment's observation shape and
    dtype, so a transition can be stored without reshaping or casting.
    Floating-point observations are stored as float32.

    Once the buffer is full, new transitions only overwrite the slots after
    the first ``mem_retain`` fraction, so the earliest transitions are kept
    for the lifetime of the buffer.

    Memory use is roughly
    ``2 * mem_size * prod(observation_shape) * itemsize`` bytes. For image
    observations this grows very quickly; lower ``mem_size`` or downsample
    the frames if allocation fails.

    Args:
        env: environment exposing ``observation_space.shape`` (and optionally
            ``observation_space.dtype``).
        mem_size: capacity in transitions; defaults to ``MEM_SIZE``.
        batch_size: number of transitions per sample; defaults to ``BATCH_SIZE``.
        mem_retain: fraction of the first slots that is never overwritten;
            defaults to ``MEM_RETAIN``.

    Raises:
        ValueError: if ``mem_size`` is not positive or ``mem_retain`` leaves
            no overwritable slot.
    """

    def __init__(self, env, mem_size=None, batch_size=None, mem_retain=None):
        # Fall back to the notebook-level hyperparameters only when no
        # explicit override was given.
        self.mem_size = int(MEM_SIZE if mem_size is None else mem_size)
        self.batch_size = int(BATCH_SIZE if batch_size is None else batch_size)
        self.mem_retain = MEM_RETAIN if mem_retain is None else mem_retain

        if self.mem_size <= 0:
            raise ValueError("mem_size must be a positive number of transitions")
        # Number of leading slots that are protected from being overwritten.
        self._retained_slots = int(self.mem_retain * self.mem_size)
        if not 0 <= self._retained_slots < self.mem_size:
            raise ValueError("mem_retain must leave at least one overwritable slot")

        obs_space = env.observation_space
        obs_shape = tuple(obs_space.shape)
        # np.dtype objects can be falsy, so test for None explicitly.
        obs_dtype = getattr(obs_space, "dtype", None)
        obs_dtype = np.dtype(np.float32) if obs_dtype is None else np.dtype(obs_dtype)
        if np.issubdtype(obs_dtype, np.floating):
            obs_dtype = np.dtype(np.float32)

        self.mem_count = 0
        self.states = np.zeros((self.mem_size, *obs_shape), dtype=obs_dtype)
        self.actions = np.zeros(self.mem_size, dtype=np.int64)
        self.rewards = np.zeros(self.mem_size, dtype=np.float32)
        self.states_ = np.zeros((self.mem_size, *obs_shape), dtype=obs_dtype)
        # Despite the name, this holds the "episode continues" mask
        # (True when the transition was NOT terminal); the learner multiplies
        # the bootstrapped value by it.
        self.dones = np.zeros(self.mem_size, dtype=bool)

    def add(self, state, action, reward, state_, done):
        """Store one transition, overwriting an old one when the buffer is full."""
        if self.mem_count < self.mem_size:
            mem_index = self.mem_count
        else:
            # Cycle through the slots after the retained prefix only.
            overwritable = self.mem_size - self._retained_slots
            mem_index = self._retained_slots + self.mem_count % overwritable

        self.states[mem_index] = state
        self.actions[mem_index] = action
        self.rewards[mem_index] = reward
        self.states_[mem_index] = state_
        self.dones[mem_index] = not done

        self.mem_count += 1

    # returns random samples from the replay buffer, number is equal to batch_size
    def sample(self):
        """Return ``(states, actions, rewards, states_, dones)`` for a random batch.

        Indices are drawn with replacement from the filled part of the buffer.
        ``dones`` is the "episode continues" mask described in ``__init__``.

        Raises:
            ValueError: if the buffer is empty.
        """
        filled = min(self.mem_count, self.mem_size)
        if filled == 0:
            raise ValueError("cannot sample from an empty replay buffer")
        batch_indices = np.random.choice(filled, self.batch_size, replace=True)

        states = self.states[batch_indices]
        actions = self.actions[batch_indices]
        rewards = self.rewards[batch_indices]
        states_ = self.states_[batch_indices]
        dones = self.dones[batch_indices]

        return states, actions, rewards, states_, dones

class DQN_Solver:
    """DQN agent: a policy network, a periodically synced target network and
    a replay memory.

    Args:
        env: environment handed to ``ReplayBuffer`` and ``Network``.
    """

    # Hand-tuned exploration distribution, applied only when the environment
    # has exactly nine discrete actions; action index 4 is drawn with
    # probability 0.01. NOTE(review): the meaning of each action index is
    # defined by the environment - confirm the weights match the intent.
    _NINE_ACTION_PRIOR = (0.11, 0.165, 0.11, 0.11, 0.01, 0.11, 0.11, 0.165, 0.11)

    def __init__(self, env):
        self.memory = ReplayBuffer(env)
        self.policy_network = Network(env)
        self.target_network = Network(env)
        # Start both networks from identical weights.
        self.target_network.load_state_dict(self.policy_network.state_dict())
        self.learn_count = 0

    def returning_epsilon(self):
        """Return the current exploration rate.

        Epsilon is 1.0 until the replay memory holds more than
        ``REPLAY_START_SIZE`` transitions; afterwards it decays exponentially
        from ``EPS_START`` towards ``EPS_END`` with the number of learning
        steps taken.
        """
        if self.memory.mem_count > REPLAY_START_SIZE:
            decay = math.exp(-1. * self.learn_count / EPS_DECAY)
            return EPS_END + (EPS_START - EPS_END) * decay
        return 1.0

    def choose_action(self, observation):
        """Pick an action for ``observation`` with an epsilon-greedy policy.

        Returns:
            The chosen action index as a Python ``int``.
        """
        if random.random() < self.returning_epsilon():
            n_actions = self.policy_network.action_space
            # Use the hand-tuned prior when it matches the action count,
            # otherwise explore uniformly over the real action space.
            if n_actions == len(self._NINE_ACTION_PRIOR):
                probabilities = self._NINE_ACTION_PRIOR
            else:
                probabilities = None
            return int(np.random.choice(n_actions, p=probabilities))

        # Greedy branch: add a batch axis and take the best predicted action.
        state = torch.as_tensor(observation, dtype=torch.float32).unsqueeze(0)
        self.policy_network.eval()
        with torch.no_grad():
            q_values = self.policy_network(state)
        return torch.argmax(q_values).item()

    # main training loop
    def learn(self):
        """Run one gradient step on a batch sampled from the replay memory."""
        states, actions, rewards, states_, not_dones = self.memory.sample()
        states = torch.as_tensor(states, dtype=torch.float32)
        actions = torch.as_tensor(actions, dtype=torch.long)
        rewards = torch.as_tensor(rewards, dtype=torch.float32)
        states_ = torch.as_tensor(states_, dtype=torch.float32)
        # The buffer stores the "episode continues" mask (it writes 1 - done),
        # so multiplying by it zeroes the bootstrap term for terminal steps.
        not_dones = torch.as_tensor(not_dones, dtype=torch.float32)
        # Index by the batch that was actually returned rather than by a
        # global batch-size constant.
        batch_indices = torch.arange(len(actions))

        self.policy_network.train(True)
        q_values = self.policy_network(states)[batch_indices, actions]

        self.target_network.eval()
        with torch.no_grad():
            q_values_next_max = self.target_network(states_).max(dim=1)[0]
            q_target = rewards + GAMMA * q_values_next_max * not_dones

        loss = self.policy_network.loss(q_values, q_target)
        # compute gradients and update policy network Q weights
        self.policy_network.optimizer.zero_grad()
        loss.backward()
        self.policy_network.optimizer.step()
        self.learn_count += 1

        # set target network \hat{Q}'s weights to policy network Q's weights every C steps
        if self.learn_count % NETWORK_UPDATE_ITERS == NETWORK_UPDATE_ITERS - 1:
            print("updating target network")
            self.update_target_network()

    def update_target_network(self):
        """Copy the policy network's weights into the target network."""
        self.target_network.load_state_dict(self.policy_network.state_dict())

# Training loop

CHECKPOINT_PATH = "policy_network1.pkl"
SAVE_EVERY = 100  # episodes between checkpoints / progress reports

# The `env` created at the top of the notebook has already had env.close()
# called on it, so build a fresh environment for training.
env = CustomDrivingEnv()

# set manual seeds so we get same behaviour everytime - so that when you change your hyper parameters you can attribute the effect to those changes
env.action_space.seed(0)
random.seed(0)
np.random.seed(0)
torch.manual_seed(0)
episode_batch_score = 0
episodes_in_batch = 0  # episodes accumulated into episode_batch_score
episode_reward = 0
agent = DQN_Solver(env)

## Load the policy_network (resume from an earlier run when a checkpoint exists)
try:
    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    saved_weights = torch.load(CHECKPOINT_PATH)
except FileNotFoundError:
    print("no checkpoint found at", CHECKPOINT_PATH, "- training from scratch")
else:
    agent.policy_network.load_state_dict(saved_weights)
    # The target network was synced with the randomly initialised policy
    # network in the constructor; re-sync it with the loaded weights.
    agent.update_target_network()


for i in range(EPISODES):
    state, info = env.reset()
    episode_reward = 0.0

    while True:
        action = agent.choose_action(state)
        # Assumes the five-value step convention
        # (observation, reward, terminated, truncated, info) - TODO confirm
        # that CustomDrivingEnv returns its values in this order.
        state_, reward, terminated, truncated, info = env.step(action)
        # reward += -abs(state_[1])
        # if done == True:
        #   reward = 50
        # Only a real termination stops bootstrapping; a truncated episode
        # still has a meaningful next-state value.
        agent.memory.add(state, action, reward, state_, terminated)

        # only start learning once replay memory reaches REPLAY_START_SIZE
        if agent.memory.mem_count > REPLAY_START_SIZE:
            agent.learn()

        state = state_

        episode_batch_score += reward
        episode_reward += reward

        # End the episode on termination OR truncation; otherwise a truncated
        # episode would be stepped forever.
        if terminated or truncated:
            break

    episode_history.append(i)
    episode_reward_history.append(episode_reward)
    episodes_in_batch += 1

    # save our model every batches of 100 episodes so we can load later. (note: you can interrupt the training any time and load the latest saved model when testing)
    if i % SAVE_EVERY == 0 and agent.memory.mem_count > REPLAY_START_SIZE:
        torch.save(agent.policy_network.state_dict(), CHECKPOINT_PATH)
        # Average over the episodes actually accumulated since the last reset.
        print("average total reward per episode batch since episode ", i, ": ", episode_batch_score / float(episodes_in_batch))
        episode_batch_score = 0
        episodes_in_batch = 0
    elif agent.memory.mem_count < REPLAY_START_SIZE:
        print("waiting for buffer to fill...")
        episode_batch_score = 0
        episodes_in_batch = 0

fig, ax = plt.subplots()
ax.plot(episode_history, episode_reward_history)
ax.set(title="Training reward per episode", xlabel="Episode", ylabel="Total reward")
plt.show()


MemoryError: Unable to allocate 172. GiB for an array with shape (50000, 480, 640, 3) and data type float32

In [None]:
## Test the learnt policy
env = gym.make('SimpleDriving-v0', apply_api_compatibility=True, renders=True, isDiscrete=True)

try:
    agent = DQN_Solver(env)

    # NOTE: torch.load unpickles the file - only load checkpoints you trust.
    agent.policy_network.load_state_dict(torch.load("policy_network1.pkl"))

    #frames = []
    state, info = env.reset()
    agent.policy_network.eval()

    while True:
        with torch.no_grad():
            q_values = agent.policy_network(torch.tensor(state, dtype=torch.float32))
        action = torch.argmax(q_values).item() # select action with highest predicted q-value
        # With apply_api_compatibility=True, step() returns
        # (observation, reward, terminated, truncated, info).
        state, reward, terminated, truncated, info = env.step(action)
        #frames.append(np.fliplr(np.rot90(env.render(mode="rgb_array"), 3)))
        # Stop on truncation as well, otherwise a time-limited episode would
        # be stepped forever.
        if terminated or truncated:
            break
finally:
    # Release the environment even if loading or the rollout fails.
    env.close()

# # display_video(frames)
