In [1]:
import os
import sys
import argparse
import json
import gym
import cv2
import numpy as np
def rgb2gray(rgb):

    r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]
    gray = 0.2989 * r + 0.5870 * g + 0.1140 * b

    return gray

In [2]:
def preprocess( screen):
    preprocessed= cv2.resize(screen, (84,84))  # 84 * 84 로 변경
    preprocessed = np.dot(preprocessed[..., :3], [0.299, 0.587, 0.114])  # Gray scale 로 변경
    # preprocessed: np.array = preprocessed.transpose((2, 0, 1))  # (C, W, H) 로 변경
    preprocessed = preprocessed.astype('float32') / 255.

    return torch.tensor(preprocessed)

In [3]:
import os
import sys
import argparse
import json
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from utils.replay_memory import ReplayBuffer
from utils.save_tensorboard import *
from models.dqn_image import DQN as Qnet

device='cuda'


# Hyperparameters
with open("configs/dqn.json", "r") as config_json:
    config = json.load(config_json)

learning_rate = config['learning_rate']
gamma = config['gamma']
buffer_limit = 10000
batch_size = config['batch_size']
n_episodes = config['n_episodes']
min_mem_size = config['min_mem_size']

def main():
    env = gym.make('CartPole-v1')
    Summary_Writer=mk_SummaryWriter("experiments",'DQN')
    q = Qnet().to(device)
    q_target = Qnet().to(device)
    q_target.load_state_dict(q.state_dict())
    memory = ReplayBuffer(buffer_limit, device)

    print_interval = 20
    score = 0.0
    max_score = -9999
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)

    for n_epi in range(n_episodes):
        epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))  # Linear annealing from 8% to 1%
        s = env.reset()
        done = False
        height,width=84,84
        history_initial=torch.zeros(4,height,width)
        while not done:
            a = q.sample_action(history_initial.float().to(device).unsqueeze(0), epsilon)

            s_prime, r, done, info = env.step(a)

            s_image=env.render(mode='rgb_array')

            s_image=preprocess( s_image)
            history_new=torch.cat((s_image.unsqueeze(0),history_initial[1:4]),0)    
            
            

            done_mask = 0.0 if done else 1.0
            memory.put((history_initial, a, r / 100.0, history_new, done_mask))
            history_initial=history_new
            s = s_prime
            score += r
            if done:
                break
            if max_score < score:
                max_score = score

        if memory.size() > min_mem_size:
            for i in range(10):
                s, a, r, s_prime, done_mask = memory.sample_b(batch_size)

                q_out = q(s)
                q_a = q_out.gather(1, a)

                max_q_prime = q_target(s_prime).max(1)[0].unsqueeze(1)
                target = r + gamma * max_q_prime * done_mask
                loss = F.smooth_l1_loss(q_a, target)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        if n_epi % print_interval == 0 and n_epi != 0:
            q_target.load_state_dict(q.state_dict())
            print(f"[Episode {n_epi:5d}] Score: {score / print_interval:6.2f} | Max score: {max_score / print_interval:6.2f} | Buffer size: {memory.size():5d} | Epsilon: {epsilon * 100:2.1f}%")
            add_scalar("Score",score/print_interval,n_epi,Summary_Writer)
            add_scalar("Max Score",max_score/print_interval,n_epi,Summary_Writer)
            add_scalar("Buffer Size",memory.size() /print_interval,n_epi,Summary_Writer)
            add_scalar("Epsilon",epsilon ,n_epi,Summary_Writer)
            score = 0.0


    env.close()
    Summary_Writer.close()




In [4]:
main()

Path Already Exists
[Episode    20] Score:  12.25 | Max score:  12.20 | Buffer size:   245 | Epsilon: 7.9%
[Episode    40] Score:  15.45 | Max score:  15.40 | Buffer size:   554 | Epsilon: 7.8%
[Episode    60] Score:  11.55 | Max score:  15.40 | Buffer size:   785 | Epsilon: 7.7%
[Episode    80] Score:  10.35 | Max score:  15.40 | Buffer size:   992 | Epsilon: 7.6%
[Episode   100] Score:  12.50 | Max score:  15.40 | Buffer size:  1242 | Epsilon: 7.5%
[Episode   120] Score:  12.05 | Max score:  15.40 | Buffer size:  1483 | Epsilon: 7.4%
[Episode   140] Score:  13.35 | Max score:  15.40 | Buffer size:  1750 | Epsilon: 7.3%
[Episode   160] Score:  11.75 | Max score:  15.40 | Buffer size:  1985 | Epsilon: 7.2%
[Episode   180] Score:  13.50 | Max score:  15.40 | Buffer size:  2255 | Epsilon: 7.1%
[Episode   200] Score:  13.85 | Max score:  15.40 | Buffer size:  2532 | Epsilon: 7.0%
[Episode   220] Score:  20.20 | Max score:  20.15 | Buffer size:  2936 | Epsilon: 6.9%
[Episode   240] Score: 

In [5]:
env.close()

NameError: name 'env' is not defined

In [None]:
q = Qnet().to(device)

In [None]:
history_initial=torch.zeros(4,84,84)

In [None]:
q.sample_action(history_initial.float().to(device).unsqueeze(0), 0)

In [None]:
q(history_initial.float().to(device).unsqueeze(0))