In [None]:
import numpy as np
from collections import deque
import torch
import torch.nn as nn
import torch.nn.functional as F
from hparams import HyperParams as hp
from env import Env

In [None]:
# Problem dimensions: the actor and critic both consume `num_inputs` features,
# and the actor produces `num_actions` action means.
num_inputs = num_actions = 1
# Number of episodes rolled out by the rollout loop further down.
episodes = 2

## Model

In [None]:
class Actor(nn.Module):
    """Gaussian policy network: a tanh MLP with two hidden layers of width ``hp.hidden``.

    ``forward`` returns the action mean together with a fixed (non-learned)
    standard deviation of one, expressed both as ``std`` and ``logstd``.
    """

    def __init__(self, num_inputs, num_outputs):
        """Create the layers.

        Args:
            num_inputs: Size of the last dimension of the tensor given to ``forward``.
            num_outputs: Number of action means produced per input row.
        """
        # nn.Module.__init__ has to run before attributes are assigned on the
        # instance; assigning first only happens to work for plain values and
        # raises as soon as the attribute is a Parameter or sub-module.
        super().__init__()
        self.num_inputs = num_inputs
        self.num_outputs = num_outputs
        self.fc1 = nn.Linear(num_inputs, hp.hidden)
        self.fc2 = nn.Linear(hp.hidden, hp.hidden)
        self.fc3 = nn.Linear(hp.hidden, num_outputs)
        # Scale the output layer down (weights x0.1, bias x0) so the initial
        # action means are small in magnitude.
        self.fc3.weight.data.mul_(0.1)
        self.fc3.bias.data.mul_(0.0)

    def forward(self, x):
        """Map inputs to the parameters of the action distribution.

        Args:
            x: Float tensor whose last dimension is ``num_inputs``.

        Returns:
            Tuple ``(mu, std, logstd)``; ``logstd`` is all zeros and ``std`` is
            ``exp(logstd)`` (all ones), both shaped like ``mu``.
        """
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        mu = self.fc3(x)
        # The spread is constant: log-std 0 everywhere, i.e. std 1.
        logstd = torch.zeros_like(mu)
        std = torch.exp(logstd)
        return mu, std, logstd


class Critic(nn.Module):
    """Value network: a tanh MLP with two hidden layers of width ``hp.hidden``
    that outputs a single value per input row."""

    def __init__(self, num_inputs):
        """Create the layers; ``num_inputs`` is the size of the input's last dimension."""
        super().__init__()
        width = hp.hidden
        self.fc1 = nn.Linear(num_inputs, width)
        self.fc2 = nn.Linear(width, width)
        self.fc3 = nn.Linear(width, 1)
        # Shrink the output layer: weights scaled by 0.1, bias scaled by 0.
        head = self.fc3
        head.weight.data.mul_(0.1)
        head.bias.data.mul_(0.0)

    def forward(self, x):
        """Return the value estimate for ``x`` (last dimension of size 1)."""
        hidden = torch.tanh(self.fc2(torch.tanh(self.fc1(x))))
        return self.fc3(hidden)

## Utils

In [None]:
def get_action(mu, std):
    """Sample an action from a Gaussian and return it as a NumPy array.

    Args:
        mu: Tensor of means.
        std: Tensor of standard deviations, combined element-wise with ``mu``
            by ``torch.normal``.

    Returns:
        ``numpy.ndarray`` holding one sample per element.
    """
    action = torch.normal(mu, std)
    # detach() is the supported replacement for the legacy `.data` attribute,
    # and cpu() lets the conversion succeed for tensors that live off the CPU
    # (it is a no-op for CPU tensors).
    return action.detach().cpu().numpy()

In [None]:
# Quick check of get_action: the first component is sampled with std 2, the
# second with std 0.
get_action(mu=torch.tensor([1.0, 1.0]), std=torch.tensor([2.0, 0.0]))

## Main

### Evaluation

In [None]:
# Create the environment, then the policy and value networks sized from the
# notebook-level dimensions.
env = Env()
actor = Actor(num_inputs=num_inputs, num_outputs=num_actions)
critic = Critic(num_inputs=num_inputs)

In [None]:
# Roll out the current policy for `episodes` episodes, storing every
# transition in `memory` and every episode's total reward in `scores`.
# eval() puts both modules in evaluation mode.
actor.eval()
critic.eval()
memory = deque()
scores = []
max_steps = 10  # upper bound on the number of steps taken per episode

for episode in range(episodes):
    print('episode {} start:'.format(episode))
    state = env.reset()
    score = 0

    for i in range(max_steps):
        state_batch = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        # Nothing is backpropagated through the rollout, so do not record an
        # autograd graph for the policy forward pass.
        with torch.no_grad():
            mu, std, _ = actor(state_batch)
        # get_action returns a batch of one; keep the single sampled action.
        action = get_action(mu, std)[0]
        # NOTE(review): the sampled action is stored below but is not passed to
        # env.step() -- confirm against Env whether step() should receive it.
        next_state, reward, done, _ = env.step()
        # mask is 0 on the step that ends the episode and 1 otherwise;
        # get_returns multiplies the carried-over return by it.
        mask = 0 if done else 1

        memory.append([state, action, reward, mask])
        score += reward
        state = next_state

        if done:
            break

    scores.append(score)

score_avg = np.mean(scores)
print('{} episode score is {:.2f}'.format(episodes, score_avg))
print(memory)

### Training

In [None]:
# Switch both networks to training mode. The tuple is the cell's last
# expression, so the notebook displays the value returned by each train() call.
actor.train(), critic.train()
# TODO: the training call below is disabled. train_model is only defined in a
# later cell, and actor_optim / critic_optim are not created in the cells
# visible here.
# train_model(actor, critic, memory, actor_optim, critic_optim)

In [None]:
def train_model(actor, critic, memory, actor_optim, critic_optim):
    """Run one critic update and one actor update from the collected transitions.

    Args:
        actor: Policy network, forwarded to ``train_actor``.
        critic: Value network, forwarded to ``train_critic``.
        memory: Sequence of ``[state, action, reward, mask]`` rows.
        actor_optim: Optimizer forwarded to ``train_actor``.
        critic_optim: Optimizer forwarded to ``train_critic``.

    Returns:
        The tensor produced by ``get_returns`` for the stored rewards and masks.

    Raises:
        ValueError: If ``memory`` contains no transitions.

    Note:
        ``train_critic`` and ``train_actor`` are not defined in the part of the
        notebook visible here; they must exist before this function is called.
    """
    transitions = list(memory)
    if not transitions:
        raise ValueError("memory is empty; collect at least one transition before training")

    # Pull each column straight out of the rows instead of going through
    # np.array(memory): the rows mix arrays with scalars, and building such a
    # ragged array raises ValueError on NumPy >= 1.24 (older versions only warn).
    states = np.vstack([row[0] for row in transitions])
    actions = [row[1] for row in transitions]
    rewards = [row[2] for row in transitions]
    masks = [row[3] for row in transitions]

    returns = get_returns(rewards, masks)
    train_critic(critic, states, returns, critic_optim)
    train_actor(actor, returns, states, actions, actor_optim)
    return returns


In [None]:
def get_returns(rewards, masks, gamma=None):
    """Compute discounted returns and standardise them.

    Walking backwards through the transitions, each return is
    ``rewards[t] + gamma * next_return * masks[t]``, so a mask of 0 stops the
    return of the following transition from being carried over.

    Args:
        rewards: Sequence of per-step rewards.
        masks: Sequence of per-step masks, same length as ``rewards``.
        gamma: Discount factor. Defaults to ``hp.gamma`` when omitted.

    Returns:
        Float tensor shaped like ``rewards`` holding the returns with their
        mean subtracted and divided by their standard deviation. When the
        standard deviation is undefined (fewer than two entries) or zero
        (all returns equal) the division is skipped, so the result is the
        centred returns (all zeros) rather than NaN.
    """
    if gamma is None:
        gamma = hp.gamma

    rewards = torch.Tensor(rewards)
    masks = torch.Tensor(masks)
    returns = torch.zeros_like(rewards)

    running_returns = 0
    for t in reversed(range(len(rewards))):
        running_returns = rewards[t] + gamma * running_returns * masks[t]
        returns[t] = running_returns

    centered = returns - returns.mean()
    # std() is NaN for a single element, so there is nothing to scale by.
    if returns.numel() < 2:
        return centered
    spread = returns.std()
    # Identical returns give a zero spread; dividing would produce NaN.
    if spread == 0:
        return centered
    return centered / spread

In [None]:
# Show the transitions stored in `memory` by the rollout loop.
memory

# 