In [1]:
import numpy as np
import pufferlib, pufferlib.vector
from pufferlib.environments import classic_control

In [2]:
num_envs = 12

In [3]:
vecenv = pufferlib.vector.make(
    classic_control.env_creator("CartPole-v1"),
    num_envs=num_envs,
    backend=pufferlib.vector.Multiprocessing,
)

In [7]:
env_rewards = [list() for _ in range(num_envs)]
env_rewards

returns = []

from itertools import count

dones = np.array([False] * num_envs)
truncateds = np.array([False] * num_envs)
dones, truncateds

obs, _ = vecenv.reset()
episodes = 0
for t in count():
    obs, rewards, dones, truncateds, _ = vecenv.step(vecenv.action_space.sample())
    for i, reward in enumerate(rewards):
        env_rewards[i].append(reward)
    for i in np.where(dones | truncateds)[0]:
        returns.append(sum(env_rewards[i]))
        env_rewards[i] = []
        episodes += 1
    if episodes >= 1_000:
        break

np.mean([np.sum(rets) for rets in returns]), t

(22.759, 1987)

In [8]:
import torch
from torch import nn

In [9]:
class QNetwork(nn.Module):
    def __init__(self, n_input, n_hiddens, n_actions):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(n_input, n_hiddens),
            nn.ReLU(),
            nn.Linear(n_hiddens, n_actions),
        )

    def forward(self, x):
        device = next(self.mlp.parameters()).device
        x = torch.tensor(np.array(x), dtype=torch.float32, device=device)
        
        return self.mlp(x)

In [13]:
import random

In [14]:
def select_actions(q_values, epsilon):
    batch_size, n_actions = q_values.shape
    if random.random() < epsilon:
        return np.random.choice(range(n_actions), batch_size)
    return np.array(q_values.argmax(dim=-1).cpu())

In [16]:
from collections import deque

In [17]:
class ReplayBuffer:
    def __init__(self, capacity: int):
        self.capacity = capacity
        self.queue = deque(maxlen=capacity)

    def store(self, transition):
        self.queue.append(transition)

    def extend(self, transitions):
        for transition in transitions:
            self.store(transition)

    def sample(self, n_samples: int):
        return [random.choice(self.queue) for _ in range(n_samples)]

    def __len__(self):
        return len(self.queue)

    def __repr__(self):
        return self.queue.__repr__()

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
from torch import optim

from itertools import count
from tqdm.auto import tqdm

In [20]:
def train(
    n_target_update_eps = 60,
    n_episodes = 8_000,
    update_batch_size = 128,
    learning_rate = 1e-4,
    epsilon_start = 1.0,
    epsilon_end = 0.0,
    epsilon_decay_percent = 0.9,
):
    net = QNetwork(n_input, n_hiddens, n_actions).to(device)
    target_net = QNetwork(n_input, n_hiddens, n_actions).to(device)
    target_net.load_state_dict(net.state_dict())
    target_net.eval()
    
    env_rewards = [list() for _ in range(num_envs)]
    env_rewards
    returns = []
    
    gamma = 1.0
    buffer_size = 100_000
    epsilon_decay = epsilon_decay_percent * n_episodes
    epsilon_decay_rate = (epsilon_start - epsilon_end) / epsilon_decay
    epsilon = epsilon_start
    
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)
    loss_fn = nn.SmoothL1Loss()
    buffer = ReplayBuffer(capacity=buffer_size)
    
    obs, _ = vecenv.reset()
    obs = obs.copy()
    episodes = 0
    bar = tqdm(
        desc="Episodes",
        total=n_episodes,
        initial=0,
    )
    
    debug = False
    debug_seen = False
    
    for t in count():
        with torch.no_grad():
            q_values = net(obs)
            actions = select_actions(q_values, epsilon)
    
        if debug and debug_seen:
            print("Ob seen:\t", obs[0])
            print("Action taken:\t", actions[0])
        
        next_obs, rewards, dones, truncateds, _ = vecenv.step(actions)
    
        if debug and debug_seen:
            print("Ob seen:\t", obs[0])
        buffer.extend(zip(obs.copy(), actions.copy(), rewards.copy(), next_obs.copy(), (dones | truncateds).copy()))
        obs = next_obs.copy()
    
        for i, reward in enumerate(rewards):
            env_rewards[i].append(reward)
    
        if len(buffer) >= update_batch_size:
            obs_s, actions_s, rewards_s, next_obs_s, dones_s = zip(*buffer.sample(update_batch_size))
            if debug:
                print("\nOb:\t\t", obs_s[0])
                print("Action:\t", actions_s[0])
                print("Reward:\t", rewards_s[0])
                print("Next ob:\t", next_obs_s[0])
                print("Done:\t\t", dones_s[0])
        
                print("\nBuffer length:", len(buffer))
                print("\nBuffer items:", buffer.queue[0])
    
            actions_t = torch.tensor(actions_s, device=device).unsqueeze(1)
            q_values_t = net(obs_s).gather(1, actions_t).squeeze(1)
            rewards_t = torch.tensor(rewards_s, device=device) / 500.
            dones_t = torch.tensor(dones_s, device=device)
            with torch.no_grad():
                target_q_values_t = target_net(next_obs_s).max(dim=1).values
    
            target = rewards_t + gamma * ~dones_t * target_q_values_t
            loss = loss_fn(target, q_values_t)
    
            if debug:
                print("Actions T:\t", actions_t[0])
                print("Qs T:\t\t", q_values_t[0])
                print("Target Qs:\t", target_q_values_t[0])
                print("TD Target:\t", target[0])
                with torch.no_grad():
                    td_error = target - q_values_t
                print("TD Error:\t", td_error[0])
    
    
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    
            if debug:
                with torch.no_grad():
                    q_values_t = net(obs_s).gather(1, actions_t).squeeze(1)
                    target_q_values_t = target_net(next_obs_s).max(dim=1).values
                    target = rewards_t + gamma * ~dones_t * target_q_values_t
                    td_error = target - q_values_t
                    
                print("Actions T:\t", actions_t[0])
                print("Qs T:\t\t", q_values_t[0])
                print("TD Target:\t", target[0])
                print("Target Qs:\t", target_q_values_t[0])
                print("TD Error:\t", td_error[0])
        
                break
            
        for i in np.where(dones | truncateds)[0]:
            returns.append(sum(env_rewards[i]))
            env_rewards[i] = []
            episodes += 1
            if epsilon >= epsilon_end:
                epsilon -= epsilon_decay_rate
            bar.update()
            
        if episodes >= n_episodes:
            bar.close()
            #debug = True
    
        if episodes % n_target_update_eps == 0:
            target_net.load_state_dict(net.state_dict())
    
    mean_last_100 = np.mean([np.sum(rets) for rets in returns[-100:]])
    max_last_100 = np.max([np.sum(rets) for rets in returns[-100:]])
    return net, mean_last_100, max_last_100

In [35]:
from carbs import CARBS
from carbs import CARBSParams
from carbs import LogSpace
from carbs import LogitSpace
from carbs import ObservationInParam
from carbs import ParamDictType
from carbs import Param

ImportError: cannot import name 'CARBS' from 'carbs' (unknown location)

In [25]:
n_target_update_eps = 60
n_episodes = 600
update_batch_size = 128
learning_rate = 1e-4
epsilon_start = 1.0
epsilon_end = 0.05
epsilon_decay_percent = 0.8



net, mean_last_100, max_last_100 = train(
    n_target_update_eps = n_target_update_eps,
    n_episodes = n_episodes,
    update_batch_size = update_batch_size,
    learning_rate = learning_rate,
    epsilon_start = epsilon_start,
    epsilon_end = epsilon_end,
    epsilon_decay_percent = epsilon_decay_percent,
)
mean_last_100, max_last_100

Episodes:   0%|          | 0/8000 [00:00<?, ?it/s]


Ob:		 [0.64082915 1.         0.12894282 0.19858733]
Action:	 1
Reward:	 1.0
Next ob:	 [ 0.6629998   1.          0.13291457 -0.05080111]
Done:		 False

Buffer length: 100000

Buffer items: (array([0.44415563, 0.73636013, 0.10414605, 0.32727414], dtype=float32), 1, 1.0, array([0.45888284, 0.9298571 , 0.11069153, 0.06916493], dtype=float32), False)
Actions T:	 tensor([1], device='cuda:0')
Qs T:		 tensor(0.4918, device='cuda:0', grad_fn=<SelectBackward0>)
Target Qs:	 tensor(0.4954, device='cuda:0')
TD Target:	 tensor(0.4974, device='cuda:0')
TD Error:	 tensor(0.0056, device='cuda:0')
Actions T:	 tensor([1], device='cuda:0')
Qs T:		 tensor(0.4920, device='cuda:0')
TD Target:	 tensor(0.4974, device='cuda:0')
Target Qs:	 tensor(0.4954, device='cuda:0')
TD Error:	 tensor(0.0054, device='cuda:0')


(110.59, 136.0)

In [26]:
import gymnasium as gym
import numpy as np
from datetime import datetime

timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
video_folder = f"./videos/DQN_scratch_CartPole_{timestamp}"

env = gym.make("CartPole-v1", render_mode="rgb_array")
env = gym.wrappers.RecordVideo(env, video_folder)

epsilon = epsilon_end

for _ in range(9):
    ob, _ = env.reset()
    ob = np.expand_dims(ob, 0)
    ret = 0
    done, truncated = False, False
    while not (done or truncated):
        with torch.no_grad():
            q_values = net(ob)
        actions = select_actions(q_values, epsilon)
        ob, reward, done, truncated, _ = env.step(actions[0])
        ob = np.expand_dims(ob, 0)
        ret += reward
    
    print(ret)

env.close()

MoviePy - Building video /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-0.mp4.
MoviePy - Writing video /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-0.mp4



                                                                        

MoviePy - Done !
MoviePy - video ready /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-0.mp4
107.0




MoviePy - Building video /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-1.mp4.
MoviePy - Writing video /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-1.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-1.mp4
107.0
112.0
107.0
114.0
109.0
105.0




109.0
MoviePy - Building video /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-8.mp4.
MoviePy - Writing video /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-8.mp4



                                                              

MoviePy - Done !
MoviePy - video ready /home/fitti/projects/puffer/videos/DQN_scratch_CartPole_20250216_111825/rl-video-episode-8.mp4
102.0


