In [2]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
learning_rate = 0.0002
gamma = 0.98

In [3]:
class Policy_net(nn.Module):
    def __init__(self):
        super(Policy_net, self).__init__()
        self.data = []

        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim = 0)

        return x

In [4]:
class episode_memory():
    def __init__(self):
        self.data = []
    
    def put_data(self, item):
        self.data.append(item)


In [5]:
pi = Policy_net()
memory = episode_memory()
optimizer = optim.Adam(pi.parameters(), lr  = learning_rate)

In [6]:
def train_net():
    R = 0
    optimizer .zero_grad()
    total_loss = 0
    for r, prob in memory.data[::-1]:
        R = r + gamma * R
        loss = -torch.log(prob) * R
        total_loss += loss

    total_loss.backward() # total_loss 한번에 업데이트
    optimizer.step()
    memory.data = []

In [8]:
env = gym.make("CartPole-v1")
score = 0.0
print_interval = 100

for n_epi in range(2000):
    s, _ = env.reset()
    done = False

    while not done:
        prob = pi(torch.from_numpy(s).float())
        a = np.random.choice(a = 2, size = 1, p = prob.detach().numpy()).squeeze()
        s_prime, r, done, truncated, info = env.step(a)
        memory.put_data((r, prob[a]))
        s = s_prime
        score += r
    
    train_net()

    if n_epi % print_interval == 0 and n_epi != 0:
        print(f"# of Episode : {n_epi}, avg_score {score / print_interval}")
        score = 0.0

env.close()

# of Episode : 100, avg_score 43.65
# of Episode : 200, avg_score 58.1
# of Episode : 300, avg_score 60.9
# of Episode : 400, avg_score 62.11
# of Episode : 500, avg_score 94.74
# of Episode : 600, avg_score 120.83
# of Episode : 700, avg_score 148.0
# of Episode : 800, avg_score 148.24
# of Episode : 900, avg_score 195.58
# of Episode : 1000, avg_score 194.41
# of Episode : 1100, avg_score 232.82
# of Episode : 1200, avg_score 265.55
# of Episode : 1300, avg_score 273.33
# of Episode : 1400, avg_score 289.38
# of Episode : 1500, avg_score 322.11
# of Episode : 1600, avg_score 322.62
# of Episode : 1700, avg_score 343.44
# of Episode : 1800, avg_score 342.45
# of Episode : 1900, avg_score 397.2


In [9]:
import time
env = gym.make("CartPole-v1", render_mode = "human")
state, info = env.reset()

for i in range(300):
    prob = pi(torch.from_numpy(state).float())
    a = np.random.choice(a = 2, size = 1, p = prob.detach().numpy()).squeeze()
    state, reward, terminated, truncated, info = env.step(a)

    env.render()
    time.sleep(0.01)

    if terminated:
        state, info = env.reset()
env.close()