In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import Normal
import matplotlib.pyplot as plt
import numpy as np
import time
from collections import deque
import random

In [2]:
class Actor(nn.Module):
    """Gaussian policy network for a one-dimensional continuous action.

    A three-layer ReLU trunk (64 -> 32 -> 16 units) feeds two separate
    linear heads that produce the mean and the standard deviation of the
    action distribution.
    """

    def __init__(self, state_space):
        """Build the network.

        Args:
            state_space: number of features in one observation vector.
        """
        super().__init__()
        # Shared feature trunk.
        self.fc1 = nn.Linear(state_space, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        # One scalar output per head: mean and (pre-activation) std.
        self.mu_head = nn.Linear(16, 1)
        self.std_head = nn.Linear(16, 1)

    def forward(self, x):
        """Return ``(mu, std)`` for the observation(s) ``x``.

        ``mu`` lies strictly inside (-2, 2) and ``std`` is strictly positive.
        """
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = F.relu(layer(hidden))

        # tanh is bounded to (-1, 1); doubling it keeps the mean within (-2, 2).
        mu = 2 * torch.tanh(self.mu_head(hidden))
        # softplus is a smooth ReLU, used so the std never collapses to exactly 0.
        std = F.softplus(self.std_head(hidden))
        return mu, std
    
class Critic(nn.Module):
    """State-value network: maps an observation to a single scalar V(s).

    Three ReLU hidden layers (64 -> 32 -> 16 units) followed by a linear
    output layer with no activation.
    """

    def __init__(self, state_space):
        """Build the network.

        Args:
            state_space: number of features in one observation vector.
        """
        super().__init__()
        self.fc1 = nn.Linear(state_space, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, 1)

    def forward(self, x):
        """Return the value estimate for ``x``; the last dimension has size 1."""
        h1 = F.relu(self.fc1(x))
        h2 = F.relu(self.fc2(h1))
        h3 = F.relu(self.fc3(h2))
        # The output is left unbounded (no activation on the last layer).
        return self.fc4(h3)

In [3]:
def log_pdf(mu, std, action): ## origin style
    """Hand-written Gaussian log-density of ``action`` under N(mu, std**2).

    Args:
        mu: tensor of means; must have at least two dimensions because the
            result is summed over ``dim=1``.
        std: tensor of standard deviations, same shape as ``mu``. It is
            clamped to [0.01, 1.0] before use.
        action: tensor of actions, broadcastable against ``mu``.

    Returns:
        Tensor holding the log-densities summed over ``dim=1`` with that
        dimension kept (size 1).
    """
    # Bound the std: fixing it at 1 would also work, but a range leaves the
    # policy some flexibility while preventing a division by ~0 below.
    std = std.clamp(0.01, 1.0)
    var = std**2  # variance
    # log N(a | mu, var) = -(a - mu)^2 / (2 var) - 0.5 * log(2 * pi * var).
    # The previous version used asin(1) as "pi", but asin(1) == pi / 2, which
    # made every log-density too large by 0.5 * ln(2).
    log_policy_pdf = -0.5*(action - mu)**2/var - 0.5*torch.log(2*np.pi*var)
    return torch.sum(log_policy_pdf, dim=1, keepdim=True)

In [4]:
def log_pdf_pytorch_ver(mu, std, action): ## torch style
    """Gaussian log-density of ``action`` computed with ``torch.distributions``.

    Args:
        mu: tensor of means.
        std: tensor of standard deviations; clamped to [0.01, 1.0] before use.
        action: tensor of actions to evaluate.

    Returns:
        Element-wise log-probabilities (no reduction is applied).
    """
    # Bound the std: a fixed value of 1 would also work, but a range leaves
    # the policy some flexibility.
    bounded_std = torch.clamp(std, min=0.01, max=1.0)
    # Normal (Gaussian) distribution parameterised by mean and std;
    # log_prob evaluates the log-density at the given actions.
    return Normal(mu, bounded_std).log_prob(action)

In [5]:
def get_action(mu, std):
    """Draw one stochastic action from N(mu, std**2), limited to [-2, 2].

    Args:
        mu: tensor of means.
        std: tensor of standard deviations; clamped to [0.01, 1.0] before use.

    Returns:
        A sampled tensor with the same shape as ``mu``, clipped to [-2.0, 2.0].
    """
    # Bound the std: a fixed value of 1 would also work, but a range leaves
    # the policy some flexibility.
    policy = Normal(mu, torch.clamp(std, min=0.01, max=1.0))
    # Draw one sample from the Gaussian defined by the estimated mu and std.
    sampled = policy.sample()
    # Clip to the range accepted when the action is passed to the environment.
    return torch.clamp(sampled, min=-2.0, max=2.0)

In [6]:
def td_target(critic, states, rewards, masks, gamma=0.99, next_states=None):
    """Compute one-step TD targets ``r_t + gamma * V(s_{t+1}) * mask_t``.

    Args:
        critic: callable mapping a batch of states to value estimates that can
            be flattened to one value per row.
        states: batch of states, one row per transition, in time order.
        rewards: 1-D tensor with one reward per transition.
        masks: per-transition multipliers for the bootstrap term (0 cuts the
            bootstrap, 1 keeps it). May be a tensor or a plain sequence.
        gamma: discount factor.
        next_states: optional batch of successor states. When given, row ``t``
            bootstraps from ``critic(next_states[t])``. When omitted, row ``t``
            bootstraps from ``critic(states[t + 1])`` and the last row
            bootstraps from 0, so the last target is just its reward.

    Returns:
        Tensor shaped like ``rewards`` holding the targets. It carries no
        gradient history.
    """
    masks = torch.as_tensor(masks, dtype=rewards.dtype, device=rewards.device)

    # Targets are regression labels, so the critic is evaluated without
    # recording an autograd graph.
    with torch.no_grad():
        if next_states is None:
            values = critic(states).view(-1)
            # Shift by one step: V(s_{t+1}) for every row, 0 after the last row.
            next_values = torch.cat([values[1:], values.new_zeros(1)])
        else:
            next_values = critic(next_states).view(-1)

    return rewards + gamma * next_values.to(rewards.dtype) * masks

In [7]:
def train(actor, critic, actor_optimizer, critic_optimizer, replay_buffer):
    """Run one advantage actor-critic update on a batch of transitions.

    Args:
        actor: policy network returning ``(mu, std)`` for a batch of states.
        critic: value network returning one value per state.
        actor_optimizer: optimizer over the actor's parameters.
        critic_optimizer: optimizer over the critic's parameters.
        replay_buffer: sequence of ``(state, action, reward, next_state, mask)``
            tuples in time order. ``action`` may be a Python float or a
            one-element tensor.

    Returns:
        None. The actor and critic are updated in place; an empty buffer is a
        no-op.
    """
    if len(replay_buffer) == 0:
        return

    # Split the transitions column by column. Going through
    # np.array(replay_buffer) would build a ragged object array, which recent
    # NumPy versions reject with a ValueError.
    # next_state is stored but not needed here: td_target is called without it.
    states, actions, rewards, _next_states, masks = zip(*replay_buffer)

    # Convert every column to a float32 tensor.
    states = torch.as_tensor(np.vstack(states), dtype=torch.float32)
    actions = torch.tensor([float(a) for a in actions], dtype=torch.float32).unsqueeze(1)
    rewards = torch.tensor([float(r) for r in rewards], dtype=torch.float32)
    masks = torch.tensor([float(m) for m in masks], dtype=torch.float32)

    # Actor loss: policy gradient weighted by the advantage.
    mu, std = actor(states)
    # Flatten [batch, 1] -> [batch] so it lines up with the advantages.
    log_prob = log_pdf_pytorch_ver(mu, std, actions).view(-1)
    target_y = td_target(critic, states, rewards, masks).detach()
    values = critic(states).view(-1)

    # The advantage is a fixed weight for the actor; no gradient reaches the critic.
    advantages = (target_y - values).detach()
    actor_loss = torch.sum(-log_prob * advantages)

    # Critic loss: regress V(s) onto the fixed TD targets (input first, target second).
    critic_loss = F.mse_loss(values, target_y)

    # The two losses share no graph, so each network is stepped independently.
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

In [8]:
episode = 100000

actor_learning_rate = 0.0001
critic_learning_rate = 0.001
env = gym.make('Pendulum-v0')

state_space = env.observation_space.shape[0]
action_space = env.action_space.shape[0]

actor = Actor(state_space) # the action output size is fixed to 1 inside Actor
actor_optimizer = optim.Adam(actor.parameters(), lr=actor_learning_rate) # Adam optimizer for the actor
critic = Critic(state_space)
critic_optimizer = optim.Adam(critic.parameters(), lr=critic_learning_rate) # Adam optimizer for the critic

batch_size = 32
# On-policy: the buffer holds exactly one batch and is emptied after every update.
replay_buffer = deque(maxlen=batch_size)

log_interval = 20 # number of episodes averaged in each progress line
score = 0
step = 0

try:
    for ep in range(episode):
        state = env.reset() # x0

        while True: # one episode

            # Acting only needs a sample; train() recomputes the forward pass,
            # so no autograd graph is required here.
            with torch.no_grad():
                mu, std = actor(torch.Tensor(state))
                action = get_action(mu, std)

            next_state, reward, done, _ = env.step([action.item()])

            mask = 0 if done else 1

            # Store the action as a plain float and the reward rescaled by
            # (reward + 8) / 8 (presumably to keep it near [-1, 1] -- verify
            # against the environment's reward range).
            replay_buffer.append((state, action.item(), (reward+8)/8, next_state, mask))

            state = next_state
            score += reward
            step += 1

            # Train as soon as a full batch is collected. Checking the buffer
            # length (not the step counter) ensures no transition is evicted
            # from the bounded deque before it has been used.
            if len(replay_buffer) == batch_size:
                train(actor, critic, actor_optimizer, critic_optimizer, replay_buffer)
                replay_buffer.clear()

            if done:
                break

        # Report after every `log_interval` completed episodes so the average
        # covers exactly that many episodes.
        if (ep + 1) % log_interval == 0:
            print('episode: ',ep + 1,' score: ',score/log_interval)
            score = 0
finally:
    # Release the environment even when the run is interrupted manually.
    env.close()

episode:  20  score:  -1414.9114498134768
episode:  40  score:  -1268.4410006454098
episode:  60  score:  -1236.5634665889506
episode:  80  score:  -1273.5225299809592
episode:  100  score:  -1256.0191640440298
episode:  120  score:  -1236.814480385676
episode:  140  score:  -1265.0243436459402
episode:  160  score:  -1241.9399420233751
episode:  180  score:  -1227.6943113938005
episode:  200  score:  -1269.0137055581654
episode:  220  score:  -1213.6920068734462
episode:  240  score:  -1195.2041180740205
episode:  260  score:  -1175.238928880251
episode:  280  score:  -1203.7430331281507
episode:  300  score:  -1261.3430795308261
episode:  320  score:  -1246.4221620508508
episode:  340  score:  -1243.3974725664043
episode:  360  score:  -1254.3187033049019
episode:  380  score:  -1214.7394183657777
episode:  400  score:  -1184.1807715009531
episode:  420  score:  -1207.684997886287
episode:  440  score:  -1255.6060666622975
episode:  460  score:  -1157.316347155787
episode:  480  scor

KeyboardInterrupt: 