In [1]:
import gym, random
import numpy as np
import torch, os
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
from collections import deque

b:\Pytorch\venv\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
b:\Pytorch\venv\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll


In [2]:
env = gym.make("CartPole-v1", render_mode='rgb_array')
state, _ = env.reset()
print("Initial state:", state)
print("Observation space:", env.observation_space)
print("Action space:", env.action_space)

Initial state: [ 0.03433528 -0.0112807  -0.00254615  0.01934811]
Observation space: Box([-4.8000002e+00 -3.4028235e+38 -4.1887903e-01 -3.4028235e+38], [4.8000002e+00 3.4028235e+38 4.1887903e-01 3.4028235e+38], (4,), float32)
Action space: Discrete(2)


In [3]:
class QNetwork(nn.Module):
    def __init__(self, num_features=4, num_actions=2, hidden_features=128) -> None:
        super().__init__()
        self.fc1 = nn.Linear(in_features=num_features, out_features=hidden_features)
        self.fc2 = nn.Linear(in_features=hidden_features, out_features=hidden_features * 2)
        self.fc3 = nn.Linear(in_features=hidden_features * 2, out_features=num_actions)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [4]:
class ReplayBuffer():
    def __init__(self, max_buffer_size=10000, batch_size=16):
        self.buffer = deque(maxlen=max_buffer_size)
        self.batch_size = batch_size
        
    def __len__(self):
        return len(self.buffer)
    
    def add_sample(self, element: tuple):
        self.buffer.append(element)
    
    def get_batch(self):
        return random.sample(self.buffer, k=self.batch_size) if len(self.buffer) > self.batch_size else list(self.buffer)

In [None]:
N_BUFFER_SIZE = 1000
N_TRAINING_STEPS = 20000
N_START_LEARNING = 1000
TARGET_UPDATE_FREQUENCY = 50
LEARNING_RATE = 5e-4
GAMMA = 0.99
BATCH_SIZE = 32
EPSILON = 0.95
DECAY = 0.9
MIN_EPSILON = 0.01
epsilon = 1.0

state_size = env.observation_space.shape[0]
action_size = env.action_space.n

q_net = QNetwork(num_features=state_size, num_actions=action_size)
target_q_net = QNetwork(num_features=state_size, num_actions=action_size)
buffer = ReplayBuffer(batch_size=BATCH_SIZE, max_buffer_size=N_BUFFER_SIZE)

target_q_net.load_state_dict(q_net.state_dict())
optimizer = torch.optim.Adam(q_net.parameters(), lr=LEARNING_RATE)

In [6]:
def select_action(state, q_net, action_size, epsilon=0.5):
    if random.random() < epsilon:
        return random.randrange(action_size)
    else:
        state_tensor = torch.from_numpy(state).float().unsqueeze(0)
        with torch.no_grad():
            logits = q_net(state_tensor)
            return torch.argmax(logits, dim=-1).item()

In [7]:
state, _ = env.reset()
episode_reward = 0
episode = 0

for step in range(N_TRAINING_STEPS):
    epsilon = max(MIN_EPSILON, epsilon * DECAY)
    action = select_action(
        state=state,
        q_net=q_net,
        action_size=action_size,
        epsilon=epsilon
    )
    
    new_state, reward, terminated, truncated, info = env.step(action=action)
    done = truncated or terminated
    episode_reward += reward
    buffer.add_sample((state, action, reward, new_state, done))
    
    if step % 100 and step < N_BUFFER_SIZE:
        print(f"Buffer filled so far: {len(buffer)}")
    
    if step > N_START_LEARNING:
        q_net.train()
        target_q_net.train()
        batch = buffer.get_batch()
        batch_len = len(batch)
        states = torch.zeros(batch_len, state_size)
        actions = torch.zeros(batch_len, 1, dtype=torch.int64)
        rewards = torch.zeros(batch_len, 1)
        next_states = torch.zeros(batch_len, state_size)
        dones = torch.zeros(batch_len, 1)
        
        for idx, i in enumerate(batch):
            states[idx] = torch.tensor(i[0], dtype=torch.float32)
            actions[idx] = torch.tensor(i[1], dtype=torch.int64)
            rewards[idx] = torch.tensor(i[2], dtype=torch.float32)
            next_states[idx] = torch.tensor(i[3], dtype=torch.float32)
            dones[idx] = torch.tensor(i[4], dtype=torch.float32)
        
        predicted_q_values = q_net(states)
        extracted_q_values = predicted_q_values.gather(1, actions).squeeze(1)
        
        with torch.no_grad():
            next_q_values = target_q_net(next_states)
            max_next_q = next_q_values.max(dim=1)[0]
            
        target = rewards.squeeze(1) + GAMMA * max_next_q * (1 - dones.squeeze(1))
        
        loss = F.mse_loss(extracted_q_values, target)
        optimizer.zero_grad()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(q_net.parameters(), max_norm=1.0)
        
        optimizer.step()
        
        if step % TARGET_UPDATE_FREQUENCY == 0:
            target_q_net.load_state_dict(q_net.state_dict())
            print(f"[STEP]: {step}, [LOSS]: {loss.item():.4f}, [EPISODE]: {episode}, [REWARD]: {episode_reward}")
    if done:
        state, _ = env.reset()
        episode += 1
        episode_reward = 0
    else:
        state = new_state

Buffer filled so far: 2
Buffer filled so far: 3
Buffer filled so far: 4
Buffer filled so far: 5
Buffer filled so far: 6
Buffer filled so far: 7
Buffer filled so far: 8
Buffer filled so far: 9
Buffer filled so far: 10
Buffer filled so far: 11
Buffer filled so far: 12
Buffer filled so far: 13
Buffer filled so far: 14
Buffer filled so far: 15
Buffer filled so far: 16
Buffer filled so far: 17
Buffer filled so far: 18
Buffer filled so far: 19
Buffer filled so far: 20
Buffer filled so far: 21
Buffer filled so far: 22
Buffer filled so far: 23
Buffer filled so far: 24
Buffer filled so far: 25
Buffer filled so far: 26
Buffer filled so far: 27
Buffer filled so far: 28
Buffer filled so far: 29
Buffer filled so far: 30
Buffer filled so far: 31
Buffer filled so far: 32
Buffer filled so far: 33
Buffer filled so far: 34
Buffer filled so far: 35
Buffer filled so far: 36
Buffer filled so far: 37
Buffer filled so far: 38
Buffer filled so far: 39
Buffer filled so far: 40
Buffer filled so far: 41
Buffer f

In [None]:
def evaluate_model(q_net, env, num_episodes=10, render=False, record_video=False, video_folder="/RL/videos"):
    q_net.eval()  # Set to evaluation mode
    
    episode_rewards = []
    episode_lengths = []
    success_count = 0  

    # If recording video, wrap the environment.
    # We need a new env instance for the wrapper to work correctly each time
    # This ensures render_mode="rgb_array" is set for video capture.
    if record_video:
        # Create the video folder if it doesn't exist
        if not os.path.exists(video_folder):
            os.makedirs(video_folder, exist_ok=True)
            
        # Wrap this new environment for video recording
        env = gym.wrappers.RecordVideo(env, video_folder=video_folder, episode_trigger=lambda x: True) # Record all episodes
        print(f"Recording videos to: {video_folder}")
    elif render:
        pass

    for episode in range(num_episodes):
        # The env.reset() method now returns (observation, info) in Gymnasium
        state, info = env.reset()
        episode_reward = 0
        steps = 0
        done = False
        
        while not done:
            # Only render for human viewing if not recording video,
            # as RecordVideo handles its own internal rgb_array rendering.
            if render and not record_video: 
                try:
                    env.render()
                except Exception as e:
                    print(f"Warning: Could not render environment. Error: {e}")
            
            # Select action greedily (no exploration)
            with torch.no_grad():
                # Ensure state is a float tensor and has a batch dimension
                state_tensor = torch.from_numpy(state).float().unsqueeze(0)
                q_values = q_net(state_tensor)
                action = torch.argmax(q_values, dim=-1).item()
            
            # Gymnasium step returns (observation, reward, terminated, truncated, info)
            state, reward, terminated, truncated, info = env.step(action)
            done = terminated or truncated # An episode is done if it's terminated or truncated
            episode_reward += reward
            steps += 1
        
        episode_rewards.append(episode_reward)
        episode_lengths.append(steps)
        
        # CartPole is considered "solved" if it can balance for 195+ steps
        if steps >= 195:
            success_count += 1
            
        if episode % 20 == 0:
            print(f"Episode {episode}: Reward = {episode_reward}, Steps = {steps}")
    
    # Crucially, close the environment to finalize video recording if a wrapper was used,
    # or just to clean up resources for the original environment.
    env.close() 
    
    # If the environment was wrapped for video recording, the original_env still needs to be closed
    if record_video:
        env.close()

    # Calculate metrics
    metrics = {
        'mean_reward': np.mean(episode_rewards),
        'std_reward': np.std(episode_rewards),
        'min_reward': np.min(episode_rewards),
        'max_reward': np.max(episode_rewards),
        'mean_length': np.mean(episode_lengths),
        'std_length': np.std(episode_lengths),
        'success_rate': success_count / num_episodes,
        'episode_rewards': episode_rewards, # Raw list of rewards
        'episode_lengths': episode_lengths  # Raw list of lengths
    }
    
    return metrics

In [9]:
evaluate_model(q_net, env, record_video=True)

Creating DIR
Recording videos to: B:/Pytorch/RL/videos


  logger.warn(


MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-0.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-0.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-0.mp4
Episode 0: Reward = 500.0, Steps = 500
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-1.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-1.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-1.mp4
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-2.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-2.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-2.mp4
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-3.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-3.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-3.mp4
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-4.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-4.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-4.mp4
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-5.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-5.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-5.mp4
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-6.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-6.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-6.mp4
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-7.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-7.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-7.mp4
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-8.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-8.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-8.mp4
MoviePy - Building video B:\Pytorch\RL\videos\rl-video-episode-9.mp4.
MoviePy - Writing video B:\Pytorch\RL\videos\rl-video-episode-9.mp4



                                                                          

MoviePy - Done !
MoviePy - video ready B:\Pytorch\RL\videos\rl-video-episode-9.mp4


{'mean_reward': 500.0,
 'std_reward': 0.0,
 'min_reward': 500.0,
 'max_reward': 500.0,
 'mean_length': 500.0,
 'std_length': 0.0,
 'success_rate': 1.0,
 'episode_rewards': [500.0,
  500.0,
  500.0,
  500.0,
  500.0,
  500.0,
  500.0,
  500.0,
  500.0,
  500.0],
 'episode_lengths': [500, 500, 500, 500, 500, 500, 500, 500, 500, 500]}