In [1]:
import gym
import numpy as np
import matplotlib.pyplot as plt
from gym.envs.registration import register
import random

In [2]:
# Build the CartPole environment and keep the base environment object
# (`.unwrapped`) rather than the wrapped one returned by `gym.make`.
env = gym.make('CartPole-v0').unwrapped

In [3]:
from collections import deque
import random 

class ReplayMemory(object):
    """Bounded FIFO buffer of ``(state, action, reward, next_state, done)`` transitions.

    Storage is a ``collections.deque`` created with ``maxlen=capacity``, so
    once the buffer is full every append drops the oldest transition.
    """

    def __init__(self, capacity):
        """Create an empty buffer that holds at most ``capacity`` transitions."""
        self.deque = deque(maxlen=capacity)

    @staticmethod
    def _as_batch(array):
        """Return ``array`` with a leading batch axis if it is 1-D, else unchanged."""
        if np.ndim(array) == 1:
            return np.expand_dims(array, 0)
        return array

    def append(self, state, action, reward, next_state, done):
        """Store one transition.

        ``state`` and ``next_state`` are each promoted to shape ``(1, n)``
        when they arrive as 1-D vectors, so that ``pop_episode`` can
        concatenate them along axis 0. The two arrays are checked
        independently: an already-batched ``state`` must not be expanded
        again just because ``next_state`` happens to be 1-D.
        """
        self.deque.append(
            (self._as_batch(state), action, reward, self._as_batch(next_state), done)
        )

    def pop_episode(self):
        """Remove and return every stored transition, oldest first.

        Returns:
            A 5-tuple ``(states, actions, rewards, next_states, dones)``.
            ``states`` and ``next_states`` are arrays built with
            ``np.concatenate``; the other three are tuples.

        Raises:
            ValueError: if the buffer is empty.
        """
        if not self.deque:
            raise ValueError("cannot pop an episode from an empty ReplayMemory")

        # Iterating a deque goes left-to-right, i.e. the same order that
        # repeated popleft() calls would produce.
        transitions = list(self.deque)
        self.deque.clear()

        state, action, reward, next_state, done = zip(*transitions)
        return np.concatenate(state), action, reward, np.concatenate(next_state), done

    def reset(self):
        """Discard every stored transition."""
        self.deque.clear()

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.deque)

# Policy Gradient

In [4]:
import numpy as np
import torch
import torch.nn as nn

In [5]:
class PolicyNet(nn.Module):
    """Two-layer MLP that maps a state to a probability distribution over actions.

    Architecture: ``Linear(input_dim, hidden_dim) -> ReLU ->
    Linear(hidden_dim, output_dim) -> softmax``.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Build the network.

        Args:
            input_dim: size of one state vector.
            hidden_dim: width of the hidden layer.
            output_dim: number of actions (size of the output distribution).
        """
        super(PolicyNet, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim

        self.lin = nn.Sequential(
            nn.Linear(input_dim, hidden_dim), nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def convert_to_tensor(self, x):
        """Copy array-like ``x`` into a new float32 tensor."""
        return torch.tensor(x, dtype=torch.float)

    def forward(self, state):
        """Return action probabilities for ``state``.

        Args:
            state: a tensor or array-like holding one state (1-D) or a batch
                of states (2-D, one state per row).

        Returns:
            Tensor of probabilities; a 1-D input yields shape
            ``(1, output_dim)``, a batch yields ``(batch, output_dim)``.
        """
        if isinstance(state, torch.Tensor):
            # `torch.tensor` is a factory function, not the tensor class, so
            # comparing `type(state)` against it can never match. Existing
            # tensors are passed through here (only cast to float) so they
            # are not copied or cut off from the autograd graph.
            state = state.to(dtype=torch.float)
        else:
            state = self.convert_to_tensor(state)

        if state.dim() == 1:
            # Out-of-place on purpose: the caller's tensor must keep its shape.
            state = state.unsqueeze(0)

        logits = self.lin(state)
        # Normalise over the action axis (the last one).
        return torch.softmax(logits, dim=-1)

In [6]:
# Network dimensions. Both the input and the output width are read from the
# environment so they cannot drift from it; for CartPole the observation is
# a 4-element vector, matching the value that used to be hard-coded here.
input_dim = env.observation_space.shape[0]
hidden_dim = 64
out_dim = env.action_space.n

In [7]:
# Instantiate the policy network from the dimensions configured above.
policy_net = PolicyNet(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=out_dim,
)

In [9]:
class Fitter():
    """Monte-Carlo policy-gradient (REINFORCE-style) trainer.

    One training iteration plays a full episode with the current policy,
    turns the collected rewards into normalised discounted returns, and
    takes one Adam step on ``sum(-log pi(a_t|s_t) * return_t)``.
    """

    # Reward stored for the transition on which the environment reports `done`.
    TERMINAL_REWARD = -10
    # An episode whose raw reward sum exceeds this is treated as "solved".
    REWARD_CAP = 10000
    # Keeps the return normalisation finite when the returns have zero spread.
    _EPS = 1e-8

    def __init__(self, policy_net, env=None, gamma=0.99, lr=0.01, capacity=10000):
        """Set up the optimiser and the episode buffer.

        Args:
            policy_net: module mapping states to action probabilities.
            env: environment to roll out in. When ``None`` (the default) the
                notebook-level ``env`` variable is used, as before.
            gamma: discount factor for the returns.
            lr: Adam learning rate.
            capacity: maximum number of transitions kept per episode.
        """
        self.gamma = gamma
        self.policy_net = policy_net
        self.lr = lr
        self.optim = torch.optim.Adam(self.policy_net.parameters(), lr=self.lr)
        self.replay_memory = ReplayMemory(capacity=capacity)
        self.env = env

    def run_episode(self):
        """Play one episode and store its transitions in ``self.replay_memory``.

        The rollout assumes the classic Gym interface: ``reset()`` returns
        the observation and ``step()`` returns a 4-tuple.

        Returns:
            ``True`` if the episode was cut short because the reward sum
            exceeded ``REWARD_CAP``; ``False`` if it ran until ``done``.
        """
        # Fall back to the notebook-level `env` when none was injected.
        environment = self.env if self.env is not None else env

        # reset episode
        s = environment.reset()
        self.replay_memory.reset()
        self.r_sum = 0

        done = False
        while not done:
            # Sampling needs no gradients; compute_loss re-evaluates the
            # policy on the whole episode with the graph enabled.
            with torch.no_grad():
                a_prob = self.policy_net(s)  # pi(a|s)
            a = torch.distributions.Categorical(probs=a_prob).sample().item()
            s_new, r, done, _ = environment.step(a)
            self.r_sum += r  # track the raw reward, before the penalty below

            if done:
                r = self.TERMINAL_REWARD

            self.replay_memory.append(s, a, r, s_new, done)
            s = s_new

            if self.r_sum > self.REWARD_CAP:
                print('very nice!')
                return True

        print('Total reward : %s'%self.r_sum)
        return False

    def compute_loss(self):
        """Drain the episode buffer and return the policy-gradient loss.

        Returns:
            Scalar float32 tensor ``sum_t -log pi(a_t|s_t) * A_t`` where
            ``A_t`` is the discounted return from step ``t``, standardised
            over the episode.
        """
        S, A, R, S_next, D = self.replay_memory.pop_episode()
        A = torch.tensor(A, dtype=torch.long)

        # Discounted returns G_t = r_t + gamma * G_{t+1}, accumulated
        # backwards and then flipped into time order (linear time).
        returns = []
        running = 0.0
        for r in reversed(R):
            running = r + self.gamma * running
            returns.append(running)
        returns.reverse()

        # Standardise the returns. The epsilon prevents a division by zero
        # (and a NaN loss) when every return in the episode is identical,
        # e.g. for a one-step episode.
        returns = np.asarray(returns, dtype=np.float64)
        advantages = (returns - returns.mean()) / (returns.std() + self._EPS)
        gt_tensor = torch.tensor(advantages, dtype=torch.float)

        # Evaluate the policy owned by this trainer, not a module-level one.
        a_prob = self.policy_net(S)  # pi(.|s_t) for every step
        a_dist = torch.distributions.Categorical(probs=a_prob)
        log_policy_dist = a_dist.log_prob(A)  # log pi(a_t|s_t) of the taken actions

        loss = torch.sum(-log_policy_dist * gt_tensor)
        return loss

    def train(self, n_episode):
        """Run up to ``n_episode`` episodes, updating the policy after each.

        Stops early as soon as ``run_episode`` reports the reward cap was
        exceeded; that final episode is not used for an update.
        """
        for _ in range(n_episode):
            if self.run_episode():
                # Runtime message (Korean): "Ending training."
                print('학습을 종료합니다.')
                break

            loss = self.compute_loss()

            self.optim.zero_grad()
            loss.backward()
            self.optim.step()

In [10]:
# Trainer bound to the policy network created above.
fitter = Fitter(policy_net=policy_net)

In [11]:
# Upper bound on the number of training episodes; training may stop earlier.
n_episode = 1000

fitter.train(n_episode=n_episode)

Total reward : 16.0
Total reward : 33.0
Total reward : 10.0
Total reward : 18.0
Total reward : 13.0
Total reward : 52.0
Total reward : 44.0
Total reward : 20.0
Total reward : 45.0
Total reward : 19.0
Total reward : 37.0
Total reward : 22.0
Total reward : 41.0
Total reward : 14.0
Total reward : 57.0
Total reward : 38.0
Total reward : 27.0
Total reward : 39.0
Total reward : 15.0
Total reward : 43.0
Total reward : 46.0
Total reward : 121.0
Total reward : 58.0
Total reward : 59.0
Total reward : 21.0
Total reward : 17.0
Total reward : 47.0
Total reward : 57.0
Total reward : 63.0
Total reward : 152.0
Total reward : 38.0
Total reward : 15.0
Total reward : 73.0
Total reward : 76.0
Total reward : 21.0
Total reward : 30.0
Total reward : 31.0
Total reward : 70.0
Total reward : 13.0
Total reward : 24.0
Total reward : 180.0
Total reward : 13.0
Total reward : 18.0
Total reward : 18.0
Total reward : 101.0
Total reward : 37.0
Total reward : 38.0
Total reward : 17.0
Total reward : 41.0
Total reward : 5