In [1]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
import cv2
import numpy as np
import os

# **DQN 실행**

In [2]:
class Qnet(nn.Module):
    def __init__(self):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, 2)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def sample_action(self, obs, epsilon):
        out = self.forward(obs)
        coin = random.random()
        if coin < epsilon:
            return random.randint(0,1)
        else :
            return out.argmax().item()

In [4]:
env = gym.make('CartPole-v1', max_episode_steps=1000, render_mode='human')
q = Qnet()
q.load_state_dict(torch.load("./cartpole_pth/dqn_cartpole300_.pth"))
print("Model loaded from dqn_cartpole.pth")

s, _ = env.reset()
done = False
total_reward = 0

while not done:
    a = q.sample_action(torch.from_numpy(s).float(), epsilon=0.0)
    s, r, done, truncated, info = env.step(a)
    total_reward += r
    if done or truncated:
        break

print("Total reward achieved: {:.1f}".format(total_reward))
env.close()

  q.load_state_dict(torch.load("./cartpole_pth/dqn_cartpole300_.pth"))


Model loaded from dqn_cartpole.pth
Total reward achieved: 452.0


In [42]:
env = gym.make('CartPole-v1', max_episode_steps=1000, render_mode='rgb_array')

q = Qnet()
q.load_state_dict(torch.load("./cartpole_pth/dqn_cartpole500_.pth"))
print("Model loaded from dqn_cartpole500_.pth")
s, _ = env.reset(seed=32)
done = False
total_reward = 0

output_folder = "./videos"  # 저장할 폴더 경로
os.makedirs(output_folder, exist_ok=True)  # 폴더가 없으면 생성
video_filename = os.path.join(output_folder, "dqn_cartpole.avi")  # 파일 경로 설정

frame_width, frame_height = env.render().shape[1], env.render().shape[0]
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # 코덱 설정 (XVID, MP4V 등)
fps = 30  # 초당 프레임 수
video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (frame_width, frame_height))

# 에피소드 실행
while not done:
    a = q.sample_action(torch.from_numpy(s).float(), epsilon=0.0)
    s, r, done, truncated, info = env.step(a)
    total_reward += r

    # 현재 프레임을 저장
    frame = env.render()
    video_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # OpenCV는 BGR 포맷 사용
    video_writer.write(video_frame)

    if done or truncated:
        break

# 리소스 정리
env.close()
video_writer.release()
print("Total reward achieved: {:.1f}".format(total_reward))
print(f"Video saved as {video_filename}")

  q.load_state_dict(torch.load("./cartpole_pth/dqn_cartpole500_.pth"))


Model loaded from dqn_cartpole500_.pth
Total reward achieved: 1000.0
Video saved as ./videos\dqn_cartpole.avi


## REINFORCE 실행

In [24]:
#Hyperparameters
learning_rate = 0.0002
gamma         = 0.98

In [25]:
class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()
        self.data = []

        self.fc1 = nn.Linear(4, 128)
        self.fc2 = nn.Linear(128, 2)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.softmax(self.fc2(x), dim=0)
        return x

    def put_data(self, item):
        self.data.append(item)

    def train_net(self):
        R = 0
        self.optimizer.zero_grad()
        for r, prob in self.data[::-1]:
            R = r + gamma * R
            loss = -torch.log(prob) * R
            loss.backward()
        self.optimizer.step()
        self.data = []

In [40]:
env = gym.make('CartPole-v1', max_episode_steps=1000, render_mode='human')
pi = Policy()
pi.load_state_dict(torch.load("./cartpole_pth/reinforce_cartpole500_.pth")) 
print("Model loaded from reinforce_cartpole.pth")

s, _ = env.reset()
done = False
total_reward = 0

while not done:
    prob = pi(torch.from_numpy(s).float())
    a = torch.argmax(prob).item()  
    s, r, done, truncated, info = env.step(a)
    total_reward += r

    if done or truncated:
        break

print("Total reward achieved: {:.1f}".format(total_reward))
env.close()

  pi.load_state_dict(torch.load("./cartpole_pth/reinforce_cartpole500_.pth"))


Model loaded from reinforce_cartpole.pth
Total reward achieved: 1000.0


In [41]:
env = gym.make('CartPole-v1', max_episode_steps=1000, render_mode='rgb_array')

pi = Policy()
pi.load_state_dict(torch.load("./cartpole_pth/reinforce_cartpole500_.pth")) 
print("Model loaded from reinforce_cartpole.pth")

s, _ = env.reset(seed=32)
done = False
total_reward = 0

output_folder = "./videos"  # 저장할 폴더 경로
os.makedirs(output_folder, exist_ok=True)  # 폴더가 없으면 생성
video_filename = os.path.join(output_folder, "reinforce_cartpole.avi")  # 파일 경로 설정

frame_width, frame_height = env.render().shape[1], env.render().shape[0]
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # 코덱 설정 (XVID, MP4V 등)
fps = 30  # 초당 프레임 수
video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (frame_width, frame_height))

# 에피소드 실행
while not done:
    prob = pi(torch.from_numpy(s).float())
    a = torch.argmax(prob).item()  
    s, r, done, truncated, info = env.step(a)
    total_reward += r

    # 현재 프레임을 저장
    frame = env.render()
    video_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # OpenCV는 BGR 포맷 사용
    video_writer.write(video_frame)

    if done or truncated:
        break

# 리소스 정리
env.close()
video_writer.release()
print("Total reward achieved: {:.1f}".format(total_reward))
print(f"Video saved as {video_filename}")

Model loaded from reinforce_cartpole.pth


  pi.load_state_dict(torch.load("./cartpole_pth/reinforce_cartpole500_.pth"))


Total reward achieved: 1000.0
Video saved as ./videos\reinforce_cartpole.avi


## PPO 실행

In [43]:
#Hyperparameters
learning_rate = 0.0005
gamma         = 0.98
lmbda         = 0.95
eps_clip      = 0.1
K_epoch       = 3
T_horizon     = 20

In [44]:
class PPO(nn.Module):
    def __init__(self):
        super(PPO, self).__init__()
        self.data = []

        self.fc1   = nn.Linear(4,256)
        self.fc_pi = nn.Linear(256,2)
        self.fc_v  = nn.Linear(256,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim = 0):
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob

    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v

    def put_data(self, transition):
        self.data.append(transition)

    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, prob_a_lst, done_lst = [], [], [], [], [], []
        for transition in self.data:
            s, a, r, s_prime, prob_a, done = transition

            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r])
            s_prime_lst.append(s_prime)
            prob_a_lst.append([prob_a])
            done_mask = 0 if done else 1
            done_lst.append([done_mask])

        s,a,r,s_prime,done_mask, prob_a = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
                                          torch.tensor(r_lst), torch.tensor(s_prime_lst, dtype=torch.float), \
                                          torch.tensor(done_lst, dtype=torch.float), torch.tensor(prob_a_lst)
        self.data = []
        return s, a, r, s_prime, done_mask, prob_a

    def train_net(self):
        s, a, r, s_prime, done_mask, prob_a = self.make_batch()

        for i in range(K_epoch):
            td_target = r + gamma * self.v(s_prime) * done_mask
            delta = td_target - self.v(s)
            delta = delta.detach().numpy()

            advantage_lst = []
            advantage = 0.0
            for delta_t in delta[::-1]:
                advantage = gamma * lmbda * advantage + delta_t[0]
                advantage_lst.append([advantage])
            advantage_lst.reverse()
            advantage = torch.tensor(advantage_lst, dtype=torch.float)

            pi = self.pi(s, softmax_dim=1)
            pi_a = pi.gather(1,a)
            ratio = torch.exp(torch.log(pi_a) - torch.log(prob_a))  # a/b == exp(log(a)-log(b))

            surr1 = ratio * advantage
            surr2 = torch.clamp(ratio, 1-eps_clip, 1+eps_clip) * advantage
            loss = -torch.min(surr1, surr2) + F.smooth_l1_loss(self.v(s) , td_target.detach())

            self.optimizer.zero_grad()
            loss.mean().backward()
            self.optimizer.step()

In [46]:
env = gym.make('CartPole-v1', max_episode_steps=1000, render_mode='human')
ppo = PPO()
ppo.load_state_dict(torch.load("./cartpole_pth/ppo_cartpole5002_.pth"))  # 모델 불러오기
print("Model loaded from ppo_cartpole.pth")

s, _ = env.reset()
done = False
total_reward = 0

while not done:
    prob = ppo.pi(torch.from_numpy(s).float())
    a = torch.argmax(prob).item()  # 가장 높은 확률의 행동 선택
    s, r, done, truncated, info = env.step(a)
    total_reward += r

    if done or truncated:
        break

print(f"Total reward achieved: {total_reward}")
env.close()

  ppo.load_state_dict(torch.load("./cartpole_pth/ppo_cartpole5001_.pth"))  # 모델 불러오기


Model loaded from ppo_cartpole.pth
Total reward achieved: 1000.0


In [47]:
env = gym.make('CartPole-v1', max_episode_steps=1000, render_mode='rgb_array')

ppo = PPO()
ppo.load_state_dict(torch.load("./cartpole_pth/ppo_cartpole5002_.pth"))  # 모델 불러오기
print("Model loaded from ppo_cartpole.pth")

s, _ = env.reset(seed=32)
done = False
total_reward = 0

output_folder = "./videos"  # 저장할 폴더 경로
os.makedirs(output_folder, exist_ok=True)  # 폴더가 없으면 생성
video_filename = os.path.join(output_folder, "ppo_cartpole.avi")  # 파일 경로 설정

frame_width, frame_height = env.render().shape[1], env.render().shape[0]
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # 코덱 설정 (XVID, MP4V 등)
fps = 30  # 초당 프레임 수
video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (frame_width, frame_height))

# 에피소드 실행
while not done:
    prob = ppo.pi(torch.from_numpy(s).float())
    a = torch.argmax(prob).item()  # 가장 높은 확률의 행동 선택
    s, r, done, truncated, info = env.step(a)
    total_reward += r

    # 현재 프레임을 저장
    frame = env.render()
    video_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # OpenCV는 BGR 포맷 사용
    video_writer.write(video_frame)

    if done or truncated:
        break

# 리소스 정리
env.close()
video_writer.release()
print("Total reward achieved: {:.1f}".format(total_reward))
print(f"Video saved as {video_filename}")

  ppo.load_state_dict(torch.load("./cartpole_pth/ppo_cartpole5002_.pth"))  # 모델 불러오기


Model loaded from ppo_cartpole.pth
Total reward achieved: 1000.0
Video saved as ./videos\ppo_cartpole.avi


## Actor_Critirc 모델 실행

In [48]:
class ActorCritic(nn.Module):
    def __init__(self):
        super(ActorCritic, self).__init__()
        self.data = []

        self.fc1 = nn.Linear(4,256)
        self.fc_pi = nn.Linear(256,2)
        self.fc_v = nn.Linear(256,1)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def pi(self, x, softmax_dim = 0):
        x = F.relu(self.fc1(x))
        x = self.fc_pi(x)
        prob = F.softmax(x, dim=softmax_dim)
        return prob

    def v(self, x):
        x = F.relu(self.fc1(x))
        v = self.fc_v(x)
        return v

    def put_data(self, transition):
        self.data.append(transition)

    def make_batch(self):
        s_lst, a_lst, r_lst, s_prime_lst, done_lst = [], [], [], [], []
        for transition in self.data:
            s,a,r,s_prime,done = transition
            s_lst.append(s)
            a_lst.append([a])
            r_lst.append([r/100.0])
            s_prime_lst.append(s_prime)
            done_mask = 0.0 if done else 1.0
            done_lst.append([done_mask])

        s_batch, a_batch, r_batch, s_prime_batch, done_batch = torch.tensor(s_lst, dtype=torch.float), torch.tensor(a_lst), \
                                                               torch.tensor(r_lst, dtype=torch.float), torch.tensor(s_prime_lst, dtype=torch.float), \
                                                               torch.tensor(done_lst, dtype=torch.float)
        self.data = []
        return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def train_net(self):
        s, a, r, s_prime, done = self.make_batch()
        td_target = r + gamma * self.v(s_prime) * done
        delta = td_target - self.v(s)

        pi = self.pi(s, softmax_dim=1)
        pi_a = pi.gather(1,a)
        loss = -torch.log(pi_a) * delta.detach() + F.smooth_l1_loss(self.v(s), td_target.detach())

        self.optimizer.zero_grad()
        loss.mean().backward()
        self.optimizer.step()

In [49]:
env = gym.make('CartPole-v1', max_episode_steps=1000, render_mode='human')
model = ActorCritic()
model.load_state_dict(torch.load("./cartpole_pth/actor_cartpole.pth"))  # 저장된 모델 불러오기
print("Model loaded from actor_critic_cartpole.pth")

s, _ = env.reset()
done = False
total_reward = 0

while not done:
    prob = model.pi(torch.from_numpy(s).float())
    a = torch.argmax(prob).item()  # 가장 높은 확률의 행동 선택
    s, r, done, truncated, info = env.step(a)
    total_reward += r

    if done or truncated:
        break

print(f"Total reward achieved: {total_reward}")
env.close()


  model.load_state_dict(torch.load("./cartpole_pth/actor_cartpole.pth"))  # 저장된 모델 불러오기


Model loaded from actor_critic_cartpole.pth
Total reward achieved: 1000.0


In [50]:
env = gym.make('CartPole-v1', max_episode_steps=1000, render_mode='rgb_array')

model = ActorCritic()
model.load_state_dict(torch.load("./cartpole_pth/actor_cartpole.pth"))  # 저장된 모델 불러오기
print("Model loaded from actor_critic_cartpole.pth")

s, _ = env.reset(seed=32)
done = False
total_reward = 0

output_folder = "./videos"  # 저장할 폴더 경로
os.makedirs(output_folder, exist_ok=True)  # 폴더가 없으면 생성
video_filename = os.path.join(output_folder, "actor_cartpole.avi")  # 파일 경로 설정

frame_width, frame_height = env.render().shape[1], env.render().shape[0]
fourcc = cv2.VideoWriter_fourcc(*'XVID')  # 코덱 설정 (XVID, MP4V 등)
fps = 30  # 초당 프레임 수
video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (frame_width, frame_height))

# 에피소드 실행
while not done:
    prob = ppo.pi(torch.from_numpy(s).float())
    a = torch.argmax(prob).item()  # 가장 높은 확률의 행동 선택
    s, r, done, truncated, info = env.step(a)
    total_reward += r

    # 현재 프레임을 저장
    frame = env.render()
    video_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # OpenCV는 BGR 포맷 사용
    video_writer.write(video_frame)

    if done or truncated:
        break

# 리소스 정리
env.close()
video_writer.release()
print("Total reward achieved: {:.1f}".format(total_reward))
print(f"Video saved as {video_filename}")

  model.load_state_dict(torch.load("./cartpole_pth/actor_cartpole.pth"))  # 저장된 모델 불러오기


Model loaded from actor_critic_cartpole.pth
Total reward achieved: 1000.0
Video saved as ./videos\actor_cartpole.avi
