In [13]:
import keyboard
from itertools import count

import numpy as np
import gymnasium as gym

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

# Random action

In [6]:
env = gym.make("CartPole-v1", render_mode="human")
env.reset()

for t in range(100):
    action = env.action_space.sample()
    (next_state, reward, done, truncated, info) = env.step(action)
    print(f"Next State: {next_state} Action: {action} Reward: {reward}")
 
    if done:
        break

env.close()

Next State: [ 0.04549076 -0.20544994  0.04736772  0.32751587] Action: 0 Reward: 1.0
Next State: [ 0.04138176 -0.01103323  0.05391804  0.05013882] Action: 1 Reward: 1.0
Next State: [ 0.04116109  0.18327579  0.05492081 -0.22505693] Action: 1 Reward: 1.0
Next State: [ 0.04482661  0.37757155  0.05041968 -0.49992254] Action: 1 Reward: 1.0
Next State: [ 0.05237804  0.18177642  0.04042123 -0.19178595] Action: 0 Reward: 1.0
Next State: [ 0.05601357 -0.01389977  0.03658551  0.11336919] Action: 0 Reward: 1.0
Next State: [ 0.05573557 -0.20952633  0.03885289  0.41736642] Action: 0 Reward: 1.0
Next State: [ 0.05154505 -0.40517673  0.04720022  0.7220404 ] Action: 0 Reward: 1.0
Next State: [ 0.04344151 -0.21073838  0.06164103  0.4445794 ] Action: 1 Reward: 1.0
Next State: [ 0.03922674 -0.01654025  0.07053261  0.17194644] Action: 1 Reward: 1.0
Next State: [ 0.03889594 -0.2125971   0.07397154  0.4860199 ] Action: 0 Reward: 1.0
Next State: [ 0.034644   -0.01859265  0.08369194  0.21753684] Action: 1 Rewa

# Custom environment

In [7]:
class CustomCartPole(gym.Env):
    def __init__(self):
        self.env = gym.make("CartPole-v1", render_mode="human")
        self.env._max_episode_steps = 300
        self.new_x_limit = 10.0
        self.new_theta_limit = 0.7

    def reset(self, **kwargs):
        return self.env.reset(**kwargs)

    def step(self, action):
        obs, reward, done, truncated, info = self.env.step(action)

        x, x_dot, theta, theta_dot = obs

        if abs(x) > self.new_x_limit or abs(theta) > self.new_theta_limit:
            done = True

        return obs, reward, done, truncated, info

    def render(self):
        return self.env.render()

    def close(self):
        self.env.close()

# Balancing based on angel

In [8]:
env = CustomCartPole()

state, info = env.reset()

for t in range(300):

    if state[3] < 0:
        action = 0
    else:
        action = 1

    (state, reward, done, truncated, info) = env.step(action)
    print(f"State: {state} Action: {action} Reward: {reward}")
    
    if done:
        break

env.close()

State: [-0.04851669 -0.1758074   0.0155233   0.26271942] Action: 0 Reward: 1.0
State: [-0.05203284  0.01908956  0.02077769 -0.02502714] Action: 1 Reward: 1.0
State: [-0.05165105 -0.1763241   0.02027715  0.2741383 ] Action: 0 Reward: 1.0
State: [-0.05517753  0.01850275  0.02575991 -0.01208084] Action: 1 Reward: 1.0
State: [-0.05480748 -0.17697898  0.0255183   0.28861704] Action: 0 Reward: 1.0
State: [-0.05834706  0.01776996  0.03129064  0.00409032] Action: 1 Reward: 1.0
State: [-0.05799166  0.21242951  0.03137245 -0.27855814] Action: 1 Reward: 1.0
State: [-0.05374307  0.01687438  0.02580128  0.02385221] Action: 0 Reward: 1.0
State: [-0.05340558  0.211617    0.02627833 -0.26057976] Action: 1 Reward: 1.0
State: [-0.04917324  0.01612996  0.02106673  0.04027449] Action: 0 Reward: 1.0
State: [-0.04885064  0.21094358  0.02187222 -0.24568793] Action: 1 Reward: 1.0
State: [-0.04463177  0.01551618  0.01695846  0.05381298] Action: 0 Reward: 1.0
State: [-0.04432144  0.21039093  0.01803472 -0.23347

In [14]:
class DQN(nn.Module):
    def __init__(self, n_observations, n_actions):
        super(DQN, self).__init__()
        
        hidden_sizes = [256, 128, 64]
        layers = []
        
        layers.append(nn.Linear(n_observations, hidden_sizes[0]))
        layers.append(nn.ReLU())
        
        for i in range(len(hidden_sizes) - 1):
            layers.append(nn.Linear(hidden_sizes[i], hidden_sizes[i + 1]))
            layers.append(nn.ReLU())
        
        layers.append(nn.Linear(hidden_sizes[-1], n_actions))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        return self.model(x)

In [15]:
env = gym.make("CartPole-v1", render_mode="human")
env.unwrapped.theta_threshold_radians = np.deg2rad(45)
state, info = env.reset()
n_observations = len(state)
n_actions = env.action_space.n

policy_net = DQN(n_observations, n_actions).to(device)
policy_net.load_state_dict(torch.load("cartpole_policy_8.pth", map_location=torch.device('cpu'), weights_only=False))

<All keys matched successfully>

In [None]:
env = gym.make("CartPole-v1", render_mode="human")
env.unwrapped.theta_threshold_radians = np.deg2rad(45)
state, info = env.reset()
state = torch.tensor(state, dtype=torch.float32, device="cpu").unsqueeze(0)

for t in count():
    action = policy_net(state).max(1).indices.view(1, 1)

    if keyboard.is_pressed("left"):
        action = torch.tensor([[0]], dtype=torch.int64)
    elif keyboard.is_pressed("right"):
        action = torch.tensor([[1]], dtype=torch.int64)

    observation, reward, terminated, truncated, _ = env.step(action.item())
    state = torch.tensor(observation, dtype=torch.float32, device="cpu").unsqueeze(0)
    
    env.render()

    if terminated:
        print('Iteration: ', t+1, ' steps')
        break

env.close()

AttributeError: 'int' object has no attribute 'item'

: 