# In [1]:  (Jupyter notebook cell marker left over from the export)
import gym
import collections
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.distributions import Categorical

# Training hyperparameters shared by the model below.
GAMMA = 0.98  # discount factor applied to the bootstrapped next-state value
LR = 1e-3  # learning rate handed to the Adam optimizer

# Set Model
class QActorCritic(nn.Module):
    """One-step actor-critic with a shared hidden layer.

    The network maps a 4-dimensional observation through one shared
    256-unit ReLU layer into two heads: a 2-way softmax policy (``pi``)
    and a scalar state value (``v``).  Transitions are buffered with
    ``put_data`` and consumed, all at once, by ``train_net``.

    Args:
        lr: Adam learning rate.  Defaults to the module-level ``LR``.
        gamma: Discount factor.  Defaults to the module-level ``GAMMA``.
    """

    def __init__(self, lr=None, gamma=None):
        super().__init__()
        # Resolve the defaults lazily so the module-level constants are only
        # consulted when the caller did not supply explicit values.
        self.lr = LR if lr is None else lr
        self.gamma = GAMMA if gamma is None else gamma

        # Buffer of (s, a, r, s_prime, done) tuples awaiting the next update.
        self.data = []

        self.fc1 = nn.Linear(4, 256)
        self.fc_pi = nn.Linear(256, 2)
        self.fc_v = nn.Linear(256, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=self.lr)

    def pi(self, x, softmax_dim=0):
        """Return action probabilities for observation(s) ``x``.

        Use ``softmax_dim=0`` for a single observation and ``softmax_dim=1``
        for a batch whose first dimension indexes samples.
        """
        hidden = F.relu(self.fc1(x))
        return F.softmax(self.fc_pi(hidden), dim=softmax_dim)

    def v(self, x):
        """Return the state-value estimate for observation(s) ``x``."""
        hidden = F.relu(self.fc1(x))
        return self.fc_v(hidden)

    def put_data(self, transition):
        """Buffer one ``(s, a, r, s_prime, done)`` transition."""
        self.data.append(transition)

    def _stack_states(self, states):
        """Stack per-step observations into a ``(N, obs_dim)`` float tensor."""
        if not states:
            return torch.empty((0, self.fc1.in_features), dtype=torch.float)
        # Converting each observation first avoids building a tensor straight
        # from a Python list of numpy arrays.
        return torch.stack(
            [torch.as_tensor(s, dtype=torch.float) for s in states]
        )

    def make_batch(self):
        """Drain the buffer and return it as batched tensors.

        Returns:
            ``(s, a, r, s_prime, done_mask)`` where ``s`` and ``s_prime`` are
            float tensors with one row per transition, ``a`` is an ``(N, 1)``
            long tensor, ``r`` is the ``(N, 1)`` reward divided by 100, and
            ``done_mask`` is ``(N, 1)`` with 0.0 where ``done`` was truthy
            and 1.0 otherwise.
        """
        transitions, self.data = self.data, []

        s_batch = self._stack_states([t[0] for t in transitions])
        s_prime_batch = self._stack_states([t[3] for t in transitions])
        a_batch = torch.tensor(
            [[t[1]] for t in transitions], dtype=torch.long
        ).reshape(-1, 1)
        r_batch = torch.tensor(
            [[t[2] / 100.0] for t in transitions], dtype=torch.float
        ).reshape(-1, 1)
        # The mask zeroes the bootstrap term for transitions that ended the
        # episode.
        done_batch = torch.tensor(
            [[0.0 if t[4] else 1.0] for t in transitions], dtype=torch.float
        ).reshape(-1, 1)

        return s_batch, a_batch, r_batch, s_prime_batch, done_batch

    def train_net(self):
        """Run one optimizer step on every buffered transition.

        Does nothing when the buffer is empty.
        """
        if not self.data:
            return

        s, a, r, s_prime, done_mask = self.make_batch()

        v_s = self.v(s)
        # One-step bootstrapped target; detached so it acts as a constant
        # for both losses.
        td_target = (r + self.gamma * self.v(s_prime) * done_mask).detach()

        pi_a = self.pi(s, softmax_dim=1).gather(1, a)
        # Weight the log-probability by the action-dependent Q estimate.
        # Weighting by V(s) alone, which is the same for every action, gives
        # a policy gradient whose expectation is zero.
        pi_loss = -torch.log(pi_a) * td_target
        v_loss = F.smooth_l1_loss(v_s, td_target)

        loss = (pi_loss + v_loss).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

def _reset_env(env):
    """Reset ``env`` and return only the initial observation."""
    result = env.reset()
    # Gym >= 0.26 returns (observation, info); older releases return the
    # bare observation.
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Step ``env`` and return ``(s_prime, r, terminated, truncated)``."""
    result = env.step(action)
    if len(result) == 5:
        # Gym >= 0.26: (obs, reward, terminated, truncated, info).
        s_prime, r, terminated, truncated, _ = result
    else:
        # Older Gym: (obs, reward, done, info); the TimeLimit wrapper flags
        # time-limit endings through the info dict.
        s_prime, r, done, info = result
        truncated = bool(info.get("TimeLimit.truncated", False))
        terminated = bool(done) and not truncated
    return s_prime, r, bool(terminated), bool(truncated)


def main(n_episodes=10000, print_interval=20):
    """Train a ``QActorCritic`` agent on CartPole-v1 and print average scores.

    Args:
        n_episodes: Number of episodes to run.
        print_interval: Report the average score whenever the episode index
            is a non-zero multiple of this value.
    """
    env = gym.make('CartPole-v1')
    model = QActorCritic()
    score = 0.0
    episodes_since_report = 0

    try:
        for n_epi in range(n_episodes):
            s = _reset_env(env)
            episode_over = False

            while not episode_over:
                # Sampling needs no autograd graph; train_net recomputes pi.
                with torch.no_grad():
                    prob = model.pi(torch.as_tensor(s, dtype=torch.float))
                a = Categorical(prob).sample().item()

                s_prime, r, terminated, truncated = _step_env(env, a)
                # Only a true termination disables bootstrapping; a
                # time-limit truncation still bootstraps from s_prime.
                model.put_data((s, a, r, s_prime, terminated))
                # Train on every step, including the last one, so the
                # terminal transition is not carried into the next episode.
                model.train_net()

                s = s_prime
                score += r
                episode_over = terminated or truncated

            episodes_since_report += 1
            if n_epi % print_interval == 0 and n_epi != 0:
                # Divide by the real episode count: the first report covers
                # episodes 0..print_interval inclusive.
                print("[EPISODE]: {}, [avg score]: {:.1f}".format(
                    n_epi, score / episodes_since_report))
                score = 0.0
                episodes_since_report = 0
    finally:
        env.close()

# Start training only when this file is executed directly, not on import.
if __name__ == "__main__":
    main()

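# --- Captured notebook output (not part of the program) ---
#   deprecation(
#   deprecation(
#
# ValueError: too many values to unpack (expected 2)
#
# Note: the only two-target unpack on the executed path is
# `s, _ = env.reset()`, so this error most likely came from running the cell
# against a Gym release older than 0.26, where `reset()` returns the bare
# 4-element CartPole observation instead of an `(observation, info)` pair.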