In [1]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
learning_rate = 2e-4  # step size handed to the Adam optimizer
gamma = 0.98          # discount factor applied when accumulating returns

In [2]:
class Policy_net(nn.Module):
    """Small MLP policy: 4-dim observation -> probabilities over 2 actions.

    Architecture: Linear(4, 128) -> ReLU -> Linear(128, 2) -> softmax.
    """

    def __init__(self):
        super().__init__()
        # Retained for backward compatibility; nothing in the visible notebook
        # reads it (transitions are stored in `episode_memory` instead).
        self.data = []

        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        """Return action probabilities for `x`.

        Args:
            x: float tensor whose last dimension has size 4; either a single
                observation of shape (4,) or a batch of shape (..., 4).

        Returns:
            Tensor of shape (..., 2) whose last axis sums to 1.
        """
        x = F.relu(self.fc1(x))
        # Normalise over the action axis (last dim). For a 1-D observation this
        # matches the previous dim=0 behaviour, but it also stays correct for
        # batched input, where dim=0 would have normalised across the batch.
        return F.softmax(self.fc2(x), dim=-1)

In [3]:
class episode_memory:
    """Append-only buffer holding the items recorded during one episode."""

    def __init__(self):
        # Items are kept in insertion order in a plain list.
        self.data = list()

    def put_data(self, item):
        """Store `item` at the end of the buffer."""
        self.data += [item]


In [4]:
# Policy network whose parameters are optimised below.
pi = Policy_net()

# Buffer for the transitions collected during the current episode.
memory = episode_memory()

# Adam over all policy parameters, using the notebook-level learning rate.
optimizer = optim.Adam(params=pi.parameters(), lr=learning_rate)

In [5]:
def train_net():
    """Run one policy-gradient update from the transitions stored in `memory`.

    Reads the module-level `memory`, `optimizer` and `gamma`. Each entry of
    `memory.data` is a `(reward, action_probability)` pair. The entries are
    walked from last to first to accumulate the discounted return
    `R = r + gamma * R`, and the per-step losses `-log(prob) * R` are summed
    into a single loss that is back-propagated once. The buffer is emptied
    afterwards.

    If the buffer is empty the function returns without touching the
    optimizer (previously this raised AttributeError because the loss was
    still the plain integer 0).
    """
    if not memory.data:
        return

    optimizer.zero_grad()

    discounted_return = 0.0
    step_losses = []
    # Iterate backwards so each step's return can reuse the one after it.
    for reward, action_prob in reversed(memory.data):
        discounted_return = reward + gamma * discounted_return
        step_losses.append(-torch.log(action_prob) * discounted_return)

    # Sum all step losses and update once per episode.
    total_loss = torch.stack(step_losses).sum()
    total_loss.backward()
    optimizer.step()
    memory.data = []

In [6]:
# Train the policy on CartPole-v1, one policy-gradient update per episode.
env = gym.make("CartPole-v1")
score = 0.0
print_interval = 100
episodes_since_report = 0

# Drop transitions left over from a previously interrupted run so they do not
# leak into the first update of this run.
memory.data = []

try:
    for n_epi in range(10000):
        s, _ = env.reset()
        done = False

        while not done:
            prob = pi(torch.from_numpy(s).float())
            # Sample an action index from the policy's distribution.
            a = int(np.random.choice(2, p=prob.detach().numpy()))
            s_prime, r, terminated, truncated, info = env.step(a)
            # An episode ends on either signal; stepping after truncation
            # without a reset is not valid.
            done = terminated or truncated
            # Keep the probability tensor (with its graph) for the loss.
            memory.put_data((r, prob[a]))
            s = s_prime
            score += r

        train_net()
        episodes_since_report += 1

        if n_epi % print_interval == 0 and n_epi != 0:
            # Divide by the number of episodes actually accumulated; the first
            # report covers episodes 0..print_interval inclusive.
            print(f"# of Episode : {n_epi}, avg_score {score / episodes_since_report}")
            score = 0.0
            episodes_since_report = 0
finally:
    # Release the environment even if training is interrupted.
    env.close()

# of Episode : 100, avg_score 24.81
# of Episode : 200, avg_score 31.99
# of Episode : 300, avg_score 37.19
# of Episode : 400, avg_score 45.23
# of Episode : 500, avg_score 54.51
# of Episode : 600, avg_score 68.48
# of Episode : 700, avg_score 84.18
# of Episode : 800, avg_score 108.14
# of Episode : 900, avg_score 144.7
# of Episode : 1000, avg_score 181.58
# of Episode : 1100, avg_score 189.56
# of Episode : 1200, avg_score 208.06
# of Episode : 1300, avg_score 211.81


KeyboardInterrupt: 

In [8]:
import time

# Roll out the current policy for 300 steps in a rendered window.
env = gym.make("CartPole-v1", render_mode = "human")
try:
    state, info = env.reset()

    for i in range(300):
        # No gradients are needed for a pure rollout.
        with torch.no_grad():
            prob = pi(torch.from_numpy(state).float())
        a = int(np.random.choice(2, p=prob.numpy()))
        state, reward, terminated, truncated, info = env.step(a)

        env.render()
        time.sleep(0.01)

        # Start a new episode on either end-of-episode signal.
        if terminated or truncated:
            state, info = env.reset()
finally:
    # Close the render window even if the rollout raises.
    env.close()

In [13]:
# Inspect the (reward, action probability) pairs currently held in the buffer.
memory.data

[(1.0, tensor(0.5038, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.8103, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.5362, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.7897, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.5643, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.7704, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.5883, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.7523, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.6083, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.2646, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.9240, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.7341, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.6221, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.7284, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.3724, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.8780, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.6479, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.7009, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.6594, grad_fn=<SelectBackward0>)),
 (1.0, tensor(0.6890, grad_fn=<