In [1]:
import gym
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from torch.distributions import Normal
import matplotlib.pyplot as plt
import numpy as np
import time
from collections import deque
import random

In [2]:
class Actor(nn.Module):
    """Deterministic policy network mapping a state to a bounded action.

    Four fully connected layers (state_dim -> 64 -> 32 -> 16 -> action_dim)
    with ReLU activations; the final layer is squashed by ``tanh`` and
    scaled so every action component lies in ``[-max_action, max_action]``.

    Args:
        state_dim: Number of features in the state vector (default 3).
        action_dim: Number of action components produced (default 1).
        max_action: Absolute bound of each action component (default 2.0).

    The defaults reproduce the original hard-coded network, and the layer
    attribute names are unchanged so existing ``state_dict`` checkpoints
    still load.
    """

    def __init__(self, state_dim=3, action_dim=1, max_action=2.0):
        super(Actor, self).__init__()
        # Plain Python attribute (not a parameter/buffer), so it does not
        # appear in state_dict().
        self.max_action = max_action
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 16)
        self.fc4 = nn.Linear(16, action_dim)

    def forward(self, x):
        """Return the action for state tensor ``x`` (last axis = state_dim)."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        # tanh yields [-1, 1]; scaling stretches it to [-max_action, max_action].
        x = torch.tanh(self.fc4(x)) * self.max_action

        return x

class Critic(nn.Module):
    """Action-value network Q(s, a) with separate state and action branches.

    The state passes through ``fc1`` (ReLU) and ``fc2``; the action passes
    through ``fc1_1``. Neither branch output is activated before the two
    32-wide embeddings are concatenated and fed to ``fc3`` (ReLU) and
    ``fc4``, which emits a single Q-value.

    Args:
        state_dim: Number of features in the state vector (default 3).
        action_dim: Number of action components (default 1).

    The defaults reproduce the original hard-coded network, and the layer
    attribute names are unchanged so existing ``state_dict`` checkpoints
    still load.
    """

    def __init__(self, state_dim=3, action_dim=1):
        super(Critic, self).__init__()
        # State branch.
        self.fc1 = nn.Linear(state_dim, 64)
        self.fc2 = nn.Linear(64, 32)

        # Action branch, then the joint head over the 32 + 32 concatenation.
        self.fc1_1 = nn.Linear(action_dim, 32)
        self.fc3 = nn.Linear(64, 16)
        self.fc4 = nn.Linear(16, 1)

    def forward(self, x1, a1):
        """Return Q-values for states ``x1`` and actions ``a1``.

        Both inputs must share their leading (batch) dimensions; the output
        has those same leading dimensions and a trailing size of 1.
        """
        x1 = F.relu(self.fc1(x1))
        x1 = self.fc2(x1)
        a1 = self.fc1_1(a1)
        # Concatenate on the feature axis; -1 equals the former dim=1 for
        # batched 2-D input and additionally supports unbatched 1-D input.
        x1a1 = torch.cat([x1, a1], dim=-1)  # [32 + 32]
        x = F.relu(self.fc3(x1a1))
        x = self.fc4(x)

        return x

In [3]:
def soft_update(net, target_net, tau):
    """Blend ``net``'s parameters into ``target_net`` in place.

    Each target parameter becomes ``tau * source + (1 - tau) * target``.
    Parameters are paired in the order returned by ``parameters()``; ``net``
    itself is not modified.
    """
    paired_params = zip(net.parameters(), target_net.parameters())
    for source, destination in paired_params:
        blended = tau * source.data + (1.0 - tau) * destination.data
        destination.data.copy_(blended)

In [4]:
def ou_noise(x, dim):
    """Advance an Ornstein-Uhlenbeck-style noise value by one step.

    Args:
        x: Previous noise value (scalar or array).
        dim: ``size`` argument passed to ``np.random.normal`` for the
            Gaussian sample.

    Returns:
        ``x`` pulled towards 0 by the drift term, plus a scaled Gaussian
        sample drawn from NumPy's global random state.
    """
    reversion_rate, long_run_mean, step, volatility = 0.15, 0, 1e-1, 0.2
    drift = reversion_rate * (long_run_mean - x) * step
    diffusion = volatility * np.sqrt(step) * np.random.normal(size=dim)
    return x + drift + diffusion

In [6]:
def _stack_actions(actions):
    """Stack per-transition actions into a float32 (batch, action_dim) array.

    Accepts Python floats, NumPy arrays, or torch tensors (including tensors
    still attached to an autograd graph, which are detached first).
    """
    rows = []
    for action in actions:
        if torch.is_tensor(action):
            action = action.detach().cpu().numpy()
        rows.append(np.asarray(action, dtype=np.float32).reshape(-1))
    return np.stack(rows)


def train(actor, critic, actor_target, critic_target, actor_optimizer, critic_optimizer, random_mini_batch):
    """Run one DDPG update step on a sampled mini-batch.

    Args:
        actor, critic: Online networks that are optimized.
        actor_target, critic_target: Target networks, soft-updated at the end.
        actor_optimizer, critic_optimizer: Optimizers for the online networks.
        random_mini_batch: Sequence of transitions
            ``(state, action, reward, next_state, mask)`` where ``mask`` is 0
            when no bootstrapping from ``next_state`` is wanted and 1 otherwise.

    Returns:
        None. The networks and optimizers are updated in place.
    """
    gamma = 0.99  # discount factor
    tau = 0.001   # interpolation rate for the soft target update

    # Split the batch column-wise. Going through zip() instead of
    # np.array(batch) avoids building a ragged object array, which newer
    # NumPy versions reject and which would also have to coerce action
    # tensors that may still carry autograd history.
    states, actions, rewards, next_states, masks = zip(*random_mini_batch)

    # Convert every column to a float32 tensor.
    states = torch.as_tensor(np.vstack(states), dtype=torch.float32)
    actions = torch.as_tensor(_stack_actions(actions))  # (batch, action_dim)
    rewards = torch.as_tensor(np.asarray(rewards, dtype=np.float32).reshape(-1))
    next_states = torch.as_tensor(np.vstack(next_states), dtype=torch.float32)
    masks = torch.as_tensor(np.asarray(masks, dtype=np.float32).reshape(-1))

    # Actor loss: maximize the critic's value of the actor's own actions.
    actor_loss = -critic(states, actor(states)).mean()

    # Critic loss: regress Q(s, a) onto the one-step bootstrapped target.
    # The target is a constant for the optimization, so no graph is built.
    with torch.no_grad():
        next_q = critic_target(next_states, actor_target(next_states)).squeeze(1)
        target = rewards + masks * gamma * next_q
    q_value = critic(states, actions).squeeze(1)
    critic_loss = F.mse_loss(q_value, target)

    # Backward passes. actor_loss.backward() also deposits gradients on the
    # critic's parameters; critic_optimizer.zero_grad() below clears them
    # before the critic's own backward pass.
    actor_optimizer.zero_grad()
    actor_loss.backward()
    actor_optimizer.step()

    critic_optimizer.zero_grad()
    critic_loss.backward()
    critic_optimizer.step()

    # Soft target update.
    soft_update(actor, actor_target, tau)
    soft_update(critic, critic_target, tau)

In [7]:
# --- Hyperparameters ---------------------------------------------------------
episode = 100000                     # maximum number of training episodes
replay_buffer = deque(maxlen=20000)  # bounded FIFO experience replay memory
batch_size = 32

actor_learning_rate = 0.0001
critic_learning_rate = 0.001

# --- Networks and optimizers -------------------------------------------------
actor = Actor()
actor_target = Actor()

critic = Critic()
critic_target = Critic()

actor_optimizer = optim.Adam(actor.parameters(), lr=actor_learning_rate)  # Adam optimizer for the actor
critic_optimizer = optim.Adam(critic.parameters(), lr=critic_learning_rate)  # Adam optimizer for the critic

# Target networks start as exact copies of the online networks.
actor_target.load_state_dict(actor.state_dict())
critic_target.load_state_dict(critic.state_dict())

env = gym.make('Pendulum-v0')

action_dim = env.action_space.shape[0]

report_every = 20  # number of episodes averaged per printed score

buffer_size = 0
score = 0
try:
    for ep in range(episode):
        state = env.reset()  # initial observation
        pre_noise = np.zeros(action_dim)  # exploration noise restarts at 0 each episode

        while True:  # one episode
            # Acting needs no gradients; skipping the graph also keeps the
            # replay buffer from retaining autograd history.
            with torch.no_grad():
                action = actor(torch.Tensor(state))
            noise = ou_noise(pre_noise, dim=action_dim)
            action = action + torch.Tensor(noise)
            action = action.clamp(-2.0, 2.0)  # keep the action inside the range fed to the environment
            action_value = action.item()

            next_state, reward, done, info = env.step([action_value])

            # Only a genuine terminal state should stop bootstrapping. When the
            # episode merely hit the time limit (flagged by Gym's TimeLimit
            # wrapper), the next state still has value, so keep mask = 1.
            # If the flag is absent this reduces to `0 if done else 1`.
            truncated = info.get('TimeLimit.truncated', False)
            mask = 0 if (done and not truncated) else 1

            # Store the action as a plain float; the reward is rescaled as (reward + 8) / 8.
            replay_buffer.append((state, action_value, (reward+8)/8, next_state, mask))
            buffer_size += 1
            if buffer_size > 1000:  # start learning after a warm-up of 1000 transitions
                random_mini_batch = random.sample(replay_buffer, batch_size)
                train(actor, critic, actor_target, critic_target, actor_optimizer, critic_optimizer, random_mini_batch)

            state = next_state
            pre_noise = noise

            score += reward

            if done:
                break

        # Report after every `report_every` completed episodes so each printed
        # average covers exactly that many episodes.
        if (ep + 1) % report_every == 0:
            print('episode: ',ep + 1,' score: ',score/float(report_every))
            score = 0
finally:
    # Release the environment even when training is interrupted.
    env.close()



episode:  20  score:  -1406.9022310501853
episode:  40  score:  -825.8678481687184
episode:  60  score:  -506.02751262980166
episode:  80  score:  -291.8653441491679
episode:  100  score:  -270.8370024087031
episode:  120  score:  -243.3118427430693
episode:  140  score:  -250.3051475037155
episode:  160  score:  -319.59869510794954
episode:  180  score:  -212.58626011630753
episode:  200  score:  -252.76458413678498
episode:  220  score:  -304.80492169043436
episode:  240  score:  -199.1944630808312
episode:  260  score:  -247.6958883625196
episode:  280  score:  -218.8486907882529
episode:  300  score:  -184.71597529668662
episode:  320  score:  -156.41751413777357
episode:  340  score:  -158.6209014059237
episode:  360  score:  -217.83326021881172
episode:  380  score:  -161.25415322803894
episode:  400  score:  -165.38380129048724
episode:  420  score:  -166.87638305254106
episode:  440  score:  -168.15407811947327
episode:  460  score:  -138.43651353175744
episode:  480  score:  -

KeyboardInterrupt: 