In [1]:
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import matplotlib.pyplot as plt
import cv2
import numpy as np
import os

In [2]:
# DQN hyperparameters.
gamma = 0.98            # discount factor
learning_rate = 1e-3    # tuned down from 0.005
batch_size = 64         # raised from 32
buffer_limit = 50_000   # replay buffer capacity

In [9]:
class Qnet(nn.Module):
    """Three-layer fully connected Q-network.

    Maps an observation vector of size ``input_dim`` to one Q-value per
    action (``output_dim`` values).
    """

    def __init__(self, input_dim, output_dim):
        """Create the layers.

        Args:
            input_dim: Size of the observation vector.
            output_dim: Number of discrete actions (one Q-value each).
        """
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, output_dim)

    def forward(self, x):
        """Return the Q-values for observation tensor ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def sample_action(self, obs, epsilon):
        """Pick an action epsilon-greedily.

        Args:
            obs: Observation tensor accepted by ``forward``.
            epsilon: Probability of choosing a uniformly random action.

        Returns:
            The chosen action index as a Python int.
        """
        coin = random.random()
        if coin < epsilon:
            # Explore over every action the network outputs, not just {0, 1}:
            # the output layer's width is the size of the action space.
            return random.randrange(self.fc3.out_features)
        # Greedy branch: only an index is needed, so skip building the
        # autograd graph.
        with torch.no_grad():
            return self.forward(obs).argmax().item()

In [10]:
# Roll out the trained DQN agent greedily for five episodes with on-screen rendering.
env = gym.make("MountainCar-v0", render_mode="human")  # enable human rendering

try:
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # Build the network and load the saved parameters.
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state_dict needs.
    q = Qnet(state_dim, action_dim)
    q.load_state_dict(
        torch.load("./mountaincar_pth/dqn_mcar370__.pth", weights_only=True)
    )
    q.eval()

    for episode in range(5):  # run 5 episodes
        state, _ = env.reset()
        done = False
        truncated = False
        total_reward = 0

        # An episode ends either on termination or on truncation (time limit).
        while not (done or truncated):
            # Convert the state to a batched tensor.
            state_tensor = torch.FloatTensor(state).unsqueeze(0)

            # Choose the greedy action.
            with torch.no_grad():
                action = q(state_tensor).argmax().item()

            # Apply the action in the environment.
            next_state, reward, done, truncated, info = env.step(action)
            total_reward += reward
            state = next_state

        print(f"Episode {episode + 1} finished with total reward: {total_reward}")
finally:
    # Always release the render window, even if loading or a step fails.
    env.close()

  q.load_state_dict(torch.load("./mountaincar_pth/dqn_mcar370__.pth"))


Episode 1 finished with total reward: -117.0
Episode 2 finished with total reward: -117.0
Episode 3 finished with total reward: -118.0
Episode 4 finished with total reward: -116.0
Episode 5 finished with total reward: -115.0


In [3]:
# Record one greedy DQN episode to an AVI file.
env = gym.make('MountainCar-v0', render_mode='rgb_array')
video_writer = None

try:
    # Derive the network dimensions from this environment so the cell does not
    # depend on variables defined by another cell.
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    q = Qnet(state_dim, action_dim)
    q.load_state_dict(
        torch.load("./mountaincar_pth/dqn_mcar370__.pth", weights_only=True)
    )
    q.eval()

    s, _ = env.reset(seed=32)
    done = False
    truncated = False
    total_reward = 0

    output_folder = "./videos"  # folder the video is saved to
    os.makedirs(output_folder, exist_ok=True)  # create the folder if missing
    video_filename = os.path.join(output_folder, "dqn_mountainCar.avi")  # output path

    # Render once and read both dimensions from the same frame.
    frame_height, frame_width = env.render().shape[:2]
    fourcc = cv2.VideoWriter_fourcc(*'XVID')  # codec (XVID, MP4V, ...)
    fps = 30  # frames per second
    video_writer = cv2.VideoWriter(video_filename, fourcc, fps, (frame_width, frame_height))
    if not video_writer.isOpened():
        # Fail loudly instead of silently producing no usable video file.
        raise RuntimeError(f"Could not open video writer for {video_filename}")

    # Run the episode until it terminates or is truncated.
    while not (done or truncated):
        with torch.no_grad():
            a = q.sample_action(torch.from_numpy(s).float(), epsilon=0.0)
        s, r, done, truncated, info = env.step(a)
        total_reward += r

        # Store the current frame.
        frame = env.render()
        video_frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)  # OpenCV uses BGR
        video_writer.write(video_frame)
finally:
    # Clean up resources even when something above raised.
    env.close()
    if video_writer is not None:
        video_writer.release()

print("Total reward achieved: {:.1f}".format(total_reward))
print(f"Video saved as {video_filename}")

NameError: name 'Qnet' is not defined

## REINFORCE

In [4]:
#Hyperparameters
gamma = 0.98            # discount factor
learning_rate = 2e-4    # Adam step size

In [5]:
class Policy(nn.Module):
    """Two-layer policy network trained with REINFORCE.

    Stores ``(reward, action_probability)`` pairs for one episode in
    ``self.data`` and updates itself from them in ``train_net``. Reads the
    module-level ``learning_rate`` and ``gamma`` values.
    """

    def __init__(self, input_dim, output_dim):
        """Create the layers and the Adam optimizer.

        Args:
            input_dim: Size of the observation vector.
            output_dim: Number of discrete actions.
        """
        super(Policy, self).__init__()
        self.data = []

        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, output_dim)
        self.optimizer = optim.Adam(self.parameters(), lr=learning_rate)

    def forward(self, x):
        """Return action probabilities for observation tensor ``x``.

        The softmax is taken over the last axis, so a single 1-D observation
        and a batch of observations are both normalized per observation.
        """
        x = F.relu(self.fc1(x))
        # dim=-1 equals dim=0 for 1-D input but stays correct for batches.
        x = F.softmax(self.fc2(x), dim=-1)
        return x

    def put_data(self, item):
        """Append one ``(reward, probability)`` pair to the episode buffer."""
        self.data.append(item)

    def train_net(self):
        """Run one REINFORCE update from the stored episode and clear it."""
        R = 0
        self.optimizer.zero_grad()
        # Walk the episode backwards so R accumulates the discounted return.
        for r, prob in self.data[::-1]:
            R = r + gamma * R
            loss = -torch.log(prob) * R
            loss.backward()  # gradients accumulate across steps
        self.optimizer.step()
        self.data = []

In [10]:
# Roll out the trained REINFORCE policy greedily for five episodes with on-screen rendering.
env = gym.make("MountainCar-v0", render_mode="human")  # enable human rendering

try:
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.n

    # Build the network and load the saved parameters.
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state_dict needs.
    pi = Policy(state_dim, action_dim)
    pi.load_state_dict(
        torch.load("./mountaincar_pth/reinforce_mcar11710.pth", weights_only=True)
    )
    pi.eval()

    for episode in range(5):  # run 5 episodes
        s, _ = env.reset()
        done = False
        truncated = False
        total_reward = 0

        # An episode ends either on termination or on truncation (time limit).
        while not (done or truncated):
            # Inference only: no autograd graph is needed to pick an action.
            with torch.no_grad():
                prob = pi(torch.from_numpy(s).float())
            a = torch.argmax(prob).item()
            s, r, done, truncated, info = env.step(a)
            total_reward += r

        print(f"Episode {episode + 1} finished with total reward: {total_reward}")
finally:
    # Always release the render window, even if loading or a step fails.
    env.close()

  pi.load_state_dict(torch.load("./mountaincar_pth/reinforce_mcar11710.pth"))


Episode 1 finished with total reward: -200.0
Episode 2 finished with total reward: -200.0
Episode 3 finished with total reward: -200.0
Episode 4 finished with total reward: -200.0
Episode 5 finished with total reward: -200.0
