Project Reinforcement Learning


In [3]:
import gymnasium as gym
import ale_py
import time

from utils import *
from model import *
from preprocessing import *

In [None]:


gym.register_envs(ale_py)

env = gym.make("ALE/Bowling-v5", render_mode="human")
observation, info = env.reset(seed=42)

print("Observation shape:", observation.shape)
print("Observation data type:", observation.dtype)
print("Action space:", env.action_space)

num_rounds = 1
for round_num in range(num_rounds):
    print(f"\n--- Round {round_num + 1} ---")
    terminated = False
    truncated = False
    total_reward = 0
    step = 0
    while not terminated and not truncated:
        action = env.action_space.sample()
        next_observation, reward, terminated, truncated, info = env.step(action)
        total_reward += reward
        step += 1
        print(f"Step: {step}, Action: {action}, Reward: {reward}, Terminated: {terminated}, Truncated: {truncated}")
        env.render()
        time.sleep(0.1)  # Add a small delay to see what's happening

        # In Bowling, an episode typically ends after all frames are played.
        # We'll rely on the environment's 'terminated' flag to signal the end of the game.
        if terminated or truncated:
            print(f"Round {round_num + 1} finished after {step} steps with total reward: {total_reward}")
            observation, info = env.reset()
            break

    if round_num == num_rounds - 1:
        print("\n--- End of 10 Rounds ---")
        break

env.close()

A.L.E: Arcade Learning Environment (version 0.11.0+dfae0bd)
[Powered by Stella]


Observation shape: (210, 160, 3)
Observation data type: uint8
Action space: Discrete(6)

--- Round 1 ---
Step: 1, Action: 3, Reward: 0.0, Terminated: False, Truncated: False
Step: 2, Action: 1, Reward: 0.0, Terminated: False, Truncated: False
Step: 3, Action: 1, Reward: 0.0, Terminated: False, Truncated: False
Step: 4, Action: 3, Reward: 0.0, Terminated: False, Truncated: False
Step: 5, Action: 4, Reward: 0.0, Terminated: False, Truncated: False
Step: 6, Action: 2, Reward: 0.0, Terminated: False, Truncated: False
Step: 7, Action: 3, Reward: 0.0, Terminated: False, Truncated: False
Step: 8, Action: 4, Reward: 0.0, Terminated: False, Truncated: False
Step: 9, Action: 0, Reward: 0.0, Terminated: False, Truncated: False
Step: 10, Action: 5, Reward: 0.0, Terminated: False, Truncated: False
Step: 11, Action: 4, Reward: 0.0, Terminated: False, Truncated: False
Step: 12, Action: 5, Reward: 0.0, Terminated: False, Truncated: False
Step: 13, Action: 5, Reward: 0.0, Terminated: False, Truncated: 

KeyboardInterrupt: 

In [4]:
import gymnasium as gym
import ale_py  # needed to register ALE envs

gym.register_envs(ale_py)  # required for Gymnasium to recognize ALE/Bowling-v5

env = gym.make("ALE/Bowling-v5", render_mode="rgb_array")

utils call e/ou preprocessing

This cell creates the Atari Bowling environment and wraps it with a custom preprocessing class. The environment is configured to return RGB frames (`render_mode="rgb_array"`), which are then converted to grayscale, resized to 84×84 pixels, and stacked across the last 4 frames. This processed output is suitable for training deep reinforcement learning agents that require compact, temporally-aware visual input.


In [5]:
raw_env = gym.make("ALE/Bowling-v5", render_mode="rgb_array")
env = ManualPreprocessWrapper(raw_env, frame_stack=4)

This cell initializes the Rainbow DQN agent by creating a neural network that takes 4 stacked grayscale frames of size 84×84 as input and outputs action-value distributions for the Atari Bowling environment. It also configures the model to run on a GPU if available, enabling efficient training.

In [6]:
from model import RainbowDQN
import torch

# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define input shape and action space
input_shape = (4, 84, 84)  # 4 stacked frames of 84x84 grayscale images
num_actions = env.action_space.n  # number of discrete actions in the env

# Main agent (online network)
agent = RainbowDQN(input_shape, num_actions).to(device)

# Target network (used in Bellman updates)
target_agent = RainbowDQN(input_shape, num_actions).to(device)
target_agent.load_state_dict(agent.state_dict())  # Copy weights initially
target_agent.eval()  # Disable noisy layers during inference

RainbowDQN(
  (features): Sequential(
    (0): Conv2d(4, 32, kernel_size=(8, 8), stride=(4, 4))
    (1): ReLU()
    (2): Conv2d(32, 64, kernel_size=(4, 4), stride=(2, 2))
    (3): ReLU()
    (4): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1))
    (5): ReLU()
  )
  (value_stream): Sequential(
    (0): NoisyLinear()
    (1): ReLU()
    (2): NoisyLinear()
  )
  (advantage_stream): Sequential(
    (0): NoisyLinear()
    (1): ReLU()
    (2): NoisyLinear()
  )
  (softmax): Softmax(dim=2)
)

Exploration vs. Exploitation:
- This project uses Noisy Networks (Noisy Nets) for exploration, a feature of the Rainbow DQN algorithm. Rather than manually decaying an ε in ε-greedy policies, Noisy Nets inject learnable, parameterized noise into the model’s layers. This encourages diverse behavior early in training and naturally reduces randomness as the agent becomes more confident, effectively balancing exploration and exploitation over time.

In [7]:
from torch.amp import autocast  # ✅ NEW AMP import
from torch.cuda.amp import GradScaler

# -------------------- CONFIGURATION --------------------
TOTAL_TIMESTEPS = 30_000
BATCH_SIZE = 64
LEARNING_RATE = 2e-4
EVAL_EVERY = 25_000
MAX_STEPS_PER_EPISODE = 1000
N_STEP = 2
GAMMA = 0.99
WARMUP_STEPS = 500
TARGET_UPDATE_FREQ = 20_000
LOG_FREQ = 5000
REWARD_LOG_FILE = "rainbow_bowling_rewards.csv"
SUCCESS_THRESHOLD = 5.0
# ------------------------------------------------------

# -------------------- SETUP --------------------
buffer = PrioritizedReplayBuffer()
n_step_buffer = deque(maxlen=N_STEP)
n_gamma = GAMMA ** N_STEP

state, _ = env.reset()
episode_reward = 0
episode_count = 0
episode_step_count = 0
best_reward = float('-inf')

optimizer = torch.optim.Adam(agent.parameters(), lr=LEARNING_RATE)
scaler = GradScaler()

for t in range(TOTAL_TIMESTEPS):
    if t < WARMUP_STEPS:
        action = env.action_space.sample()
    else:
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
        action = agent.act(state_tensor)

    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    episode_step_count += 1
    done = done or (episode_step_count >= MAX_STEPS_PER_EPISODE)

    if reward != 0:
        print(f"🔥 Non-zero reward at step {t}: {reward}")

    n_step_buffer.append((state, action, reward, next_state, done))
    if len(n_step_buffer) == N_STEP:
        ns, na, nr, ns2, nd = compute_n_step_transition(n_step_buffer)
        buffer.add(ns, na, nr, ns2, nd)

    episode_reward, episode_count, state = track_reward(
        episode_reward, episode_count, reward, done, t, env, next_state
    )

    if done:
        episode_step_count = 0
        n_step_buffer.clear()

    # Training step
    if len(buffer) > BATCH_SIZE:
        s, a, r, s2, d, weights, idxs = buffer.sample(BATCH_SIZE)
        s = s.to(device)
        s2 = s2.to(device)
        a = a.to(device)
        r = r.to(device)
        d = d.to(device)
        weights = weights.to(device)

        with autocast("cuda"):  # ✅ Correct AMP usage
            dist = agent(s)
            dist = dist[range(BATCH_SIZE), a]

            with torch.no_grad():
                next_dist = target_agent(s2)
                next_q = torch.sum(next_dist * agent.supports, dim=2)
                next_action = torch.argmax(next_q, dim=1)
                next_dist = next_dist[range(BATCH_SIZE), next_action]

                Tz = r.unsqueeze(1) + GAMMA * agent.supports.unsqueeze(0) * (1 - d.unsqueeze(1))
                Tz = Tz.clamp(agent.Vmin, agent.Vmax)

                b = (Tz - agent.Vmin) / ((agent.Vmax - agent.Vmin) / agent.num_atoms)
                l = b.floor().clamp(0, agent.num_atoms - 1).long()
                u = b.ceil().clamp(0, agent.num_atoms - 1).long()

                offset = torch.arange(BATCH_SIZE, device=device).unsqueeze(1) * agent.num_atoms
                l_idx = (l + offset).view(-1)
                u_idx = (u + offset).view(-1)

                proj_dist = torch.zeros(BATCH_SIZE * agent.num_atoms, device=device)
                next_dist = next_dist.view(-1)
                b = b.view(-1)
                l = l.view(-1)
                u = u.view(-1)

                proj_dist.index_add_(0, l_idx, next_dist * (u.float() - b))
                proj_dist.index_add_(0, u_idx, next_dist * (b - l.float()))
                m = proj_dist.view(BATCH_SIZE, agent.num_atoms)

            log_dist = torch.log(dist + 1e-6)
            loss_per_sample = -(m * log_dist).sum(1)
            loss = (loss_per_sample * weights).mean()

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        agent.reset_noise()

        td_error = loss_per_sample.detach()
        buffer.update_priorities(idxs, td_error)

    if t % TARGET_UPDATE_FREQ == 0:
        target_agent.load_state_dict(agent.state_dict())

    if t % EVAL_EVERY == 0 and episode_count > 5:
        avg_reward = np.mean(episode_rewards[-10:])
        print(f"📊 Step {t}: Avg Last 10 Ep Reward = {avg_reward:.2f}")

    if t % LOG_FREQ == 0:
        print(f"🌀 Step {t}, Buffer size: {len(buffer)}")

# -------------------- Final Evaluation & Reporting --------------------

save_rewards_csv(REWARD_LOG_FILE)
plot_rewards("Rainbow DQN – Bowling-v5 Reward Curve", window=10)
estimate_convergence_verbose(episode_rewards, threshold=SUCCESS_THRESHOLD, window=10)
compute_success_rate(episode_rewards, threshold=SUCCESS_THRESHOLD)
evaluate_agent(agent, env, episodes=5, device=device)


AssertionError: Torch not compiled with CUDA enabled

In [None]:
from torch.amp import autocast  # ✅ NEW AMP import
from torch.cuda.amp import GradScaler

# -------------------- CONFIGURATION --------------------
TOTAL_TIMESTEPS = 30_000
BATCH_SIZE = 64
LEARNING_RATE = 5e-5
EVAL_EVERY = 25_000
MAX_STEPS_PER_EPISODE = 1000
N_STEP = 3
GAMMA = 0.99
WARMUP_STEPS = 500
TARGET_UPDATE_FREQ = 20_000
LOG_FREQ = 5000
REWARD_LOG_FILE = "rainbow_bowling_rewards.csv"
SUCCESS_THRESHOLD = 5.0
# ------------------------------------------------------

# -------------------- SETUP --------------------
buffer = PrioritizedReplayBuffer()
n_step_buffer = deque(maxlen=N_STEP)
n_gamma = GAMMA ** N_STEP

state, _ = env.reset()
episode_reward = 0
episode_count = 0
episode_step_count = 0
best_reward = float('-inf')

optimizer = torch.optim.Adam(agent.parameters(), lr=LEARNING_RATE)
scaler = GradScaler()

for t in range(TOTAL_TIMESTEPS):
    if t < WARMUP_STEPS:
        action = env.action_space.sample()
    else:
        state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0).to(device)
        action = agent.act(state_tensor)

    next_state, reward, terminated, truncated, _ = env.step(action)
    done = terminated or truncated
    episode_step_count += 1
    done = done or (episode_step_count >= MAX_STEPS_PER_EPISODE)

    if reward != 0:
        print(f"🔥 Non-zero reward at step {t}: {reward}")

    n_step_buffer.append((state, action, reward, next_state, done))
    if len(n_step_buffer) == N_STEP:
        ns, na, nr, ns2, nd = compute_n_step_transition(n_step_buffer)
        buffer.add(ns, na, nr, ns2, nd)

    episode_reward, episode_count, state = track_reward(
        episode_reward, episode_count, reward, done, t, env, next_state
    )

    if done:
        episode_step_count = 0
        n_step_buffer.clear()

    # Training step
    if len(buffer) > BATCH_SIZE:
        s, a, r, s2, d, weights, idxs = buffer.sample(BATCH_SIZE)
        s = s.to(device)
        s2 = s2.to(device)
        a = a.to(device)
        r = r.to(device)
        d = d.to(device)
        weights = weights.to(device)

        with autocast("cuda"):  # ✅ Correct AMP usage
            dist = agent(s)
            dist = dist[range(BATCH_SIZE), a]

            with torch.no_grad():
                next_dist = target_agent(s2)
                next_q = torch.sum(next_dist * agent.supports, dim=2)
                next_action = torch.argmax(next_q, dim=1)
                next_dist = next_dist[range(BATCH_SIZE), next_action]

                Tz = r.unsqueeze(1) + GAMMA * agent.supports.unsqueeze(0) * (1 - d.unsqueeze(1))
                Tz = Tz.clamp(agent.Vmin, agent.Vmax)

                b = (Tz - agent.Vmin) / ((agent.Vmax - agent.Vmin) / agent.num_atoms)
                l = b.floor().clamp(0, agent.num_atoms - 1).long()
                u = b.ceil().clamp(0, agent.num_atoms - 1).long()

                offset = torch.arange(BATCH_SIZE, device=device).unsqueeze(1) * agent.num_atoms
                l_idx = (l + offset).view(-1)
                u_idx = (u + offset).view(-1)

                proj_dist = torch.zeros(BATCH_SIZE * agent.num_atoms, device=device)
                next_dist = next_dist.view(-1)
                b = b.view(-1)
                l = l.view(-1)
                u = u.view(-1)

                proj_dist.index_add_(0, l_idx, next_dist * (u.float() - b))
                proj_dist.index_add_(0, u_idx, next_dist * (b - l.float()))
                m = proj_dist.view(BATCH_SIZE, agent.num_atoms)

            log_dist = torch.log(dist + 1e-6)
            loss_per_sample = -(m * log_dist).sum(1)
            loss = (loss_per_sample * weights).mean()

        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        agent.reset_noise()

        td_error = loss_per_sample.detach()
        buffer.update_priorities(idxs, td_error)

    if t % TARGET_UPDATE_FREQ == 0:
        target_agent.load_state_dict(agent.state_dict())

    if t % EVAL_EVERY == 0 and episode_count > 5:
        avg_reward = np.mean(episode_rewards[-10:])
        print(f"📊 Step {t}: Avg Last 10 Ep Reward = {avg_reward:.2f}")

    if t % LOG_FREQ == 0:
        print(f"🌀 Step {t}, Buffer size: {len(buffer)}")

# -------------------- Final Evaluation & Reporting --------------------

save_rewards_csv(REWARD_LOG_FILE)
plot_rewards("Rainbow DQN – Bowling-v5 Reward Curve", window=10)
estimate_convergence_verbose(episode_rewards, threshold=SUCCESS_THRESHOLD, window=10)
compute_success_rate(episode_rewards, threshold=SUCCESS_THRESHOLD)
evaluate_agent(agent, env, episodes=5, device=device)
