In [2]:
import gymnasium as gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
learning_rate = 0.0002
gamma = 0.98

In [3]:
class Policy_net(nn.Module):
    def __init__(self):
        super(Policy_net, self).__init__()
        self.data = []

        self.fc1 = nn.Linear(8, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 4)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x

In [4]:
class episode_memory():
    def __init__(self):
        self.data = []
    
    def put_data(self, item):
        self.data.append(item)


In [5]:
pi = Policy_net()
memory = episode_memory()
optimizer = optim.Adam(pi.parameters(), lr  = learning_rate)

In [6]:
def train_net():
    R = 0
    optimizer .zero_grad()
    total_loss = 0
    for r, prob in memory.data[::-1]:
        R = r + gamma * R
        loss = -torch.log(prob) * R
        total_loss += loss

    total_loss.backward() # total_loss 한번에 업데이트
    optimizer.step()
    memory.data = []

In [7]:
env = gym.make("LunarLander-v3", continuous=False, gravity=-10.0, enable_wind=False, wind_power=15.0, turbulence_power=1.5)
score = 0.0
print_interval = 100

for n_epi in range(2000):
    s, _ = env.reset()
    done = False

    while not done:
        prob = pi(torch.from_numpy(s).float())
        prob = F.softmax(prob, dim=-1)  # 소프트맥스 적용
        a = np.random.choice(a = 4, size = 1, p = prob.detach().numpy()).squeeze()
        s_prime, r, done, truncated, info = env.step(a)
        memory.put_data((r, prob[a]))
        s = s_prime
        score += r
    
    train_net()

    if n_epi % print_interval == 0 and n_epi != 0:
        print(f"# of Episode : {n_epi}, avg_score {score / print_interval}")
        score = 0.0

env.close()

# of Episode : 100, avg_score -188.88330634038357
# of Episode : 200, avg_score -192.14610825724733
# of Episode : 300, avg_score -171.1451542648543
# of Episode : 400, avg_score -154.14753394698337
# of Episode : 500, avg_score -150.47231911675846
# of Episode : 600, avg_score -136.10382780752877
# of Episode : 700, avg_score -208.41212380023507
# of Episode : 800, avg_score -136.77742686822322
# of Episode : 900, avg_score -127.05655944567269
# of Episode : 1000, avg_score -122.49306234675922
# of Episode : 1100, avg_score -127.58162893277377
# of Episode : 1200, avg_score -119.96953826384751
# of Episode : 1300, avg_score -114.9069602236778
# of Episode : 1400, avg_score -118.44639814193393
# of Episode : 1500, avg_score -117.19628687896316
# of Episode : 1600, avg_score -110.17599420126668
# of Episode : 1700, avg_score -136.64870181144295
# of Episode : 1800, avg_score -112.9949714189574
# of Episode : 1900, avg_score -221.4893500818093


In [9]:
import time
env = gym.make("LunarLander-v3", render_mode = "human", continuous=False, gravity=-10.0, enable_wind=False, wind_power=15.0, turbulence_power=1.5)
state, info = env.reset()

for i in range(500):
    prob = pi(torch.from_numpy(state).float())
    prob = F.softmax(prob, dim=-1)  # 소프트맥스 적용
    a = np.random.choice(a = 4, size = 1, p = prob.detach().numpy()).squeeze()
    state, reward, terminated, truncated, info = env.step(a)

    env.render()
    time.sleep(0.01)

    if terminated:
        state, info = env.reset()
env.close()