In [1]:
import random
import gym
import numpy as np
import timeit
import matplotlib.pyplot as plt
import os
import subprocess

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import torchvision
import torchvision.transforms as transforms

import torchsummary
from torchsummary import summary
from collections import deque

def play(env, g):
    state = env.reset()
    step = 0
    done = False
    while done is not True:
        env.render()
        step += 1
        action = g.act(state)
        next_state, reward, done, info = env.step(action)
        state = next_state
        if done:
            print('step = {}, reward = {}'.format(step, reward))

class Qnet(nn.Module):
    def __init__(self, state_size, action_size):
        super(Qnet, self).__init__()
        self.fc1 = nn.Linear(state_size, 32)
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, action_size)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        out = self.fc3(x)
        return out

class agent(object):
    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        
        self.memories = deque(maxlen = 1024)

        self.batch_size = 32 # for speed up
        self.gamma = 0.99
        self.model_online = Qnet(state_size, action_size)#.to('cuda')
        #self.model_target = Qnet(state_size, action_size).to('cuda')

        print(self.model_online)
        summary(self.model_online, (state_size, ))

        self.optimizer_online = optim.Adam(self.model_online.parameters(), lr=0.0001)
        #self.optimizer_target = optim.Adam(self.model_target.parameters(), lr=0.0001)

    def store(self, state_torch, action, reward, next_state_torch, done):
        #self.states.append(state_torch)
        #self.rewards.append(reward)
        #label = np.zeros(self.action_size)
        #label[action] = 1
        #self.labels.append(label)
        #self.next_states.append(next_state_torch)
        terminal = 1
        if done:
            terminal = 0

        transition = [state_torch, action, reward, next_state_torch, terminal]
        self.memories.append(transition)

    def train(self):
        if(len(self.memories) < self.batch_size):
            return

        batch_data = random.sample(self.memories, self.batch_size)
        state_torch = [data[0] for data in batch_data]
        action = [data[1] for data in batch_data]
        reward = [data[2] for data in batch_data]
        next_state_torch = [data[3] for data in batch_data]
        terminal = [data[4] for data in batch_data]


        batch_state_torch = torch.cat(state_torch)
        batch_next_state_torch = torch.cat(next_state_torch)
        batch_action = torch.tensor(action)#.to('cuda') #unsqueeze(0).to('cuda')
        reward_torch = torch.tensor(reward)#.to('cuda')
        terminal_torch = torch.tensor(terminal)#.to('cuda')
        
        self.model_online.eval()
        result = self.model_online(batch_state_torch)
        #print("result")
        #print(result)
        state_action_torch = torch.gather(result, 1, batch_action.unsqueeze(1))
        #print("state_action_torch")
        #print(state_action_torch)
        next_state_action_torch = self.model_online(batch_next_state_torch)
        #print(next_state_action_torch)
        next_state_action_torch = torch.max(next_state_action_torch, 1)[0].detach()
        #print(next_state_action_torch)
        
        #print("reward_torch")
        #print(reward_torch)
        Y = reward_torch + (self.gamma * next_state_action_torch * terminal_torch)
        #print(Y)

        self.model_online.train()
        loss = F.mse_loss(state_action_torch, Y.unsqueeze(1)) / self.batch_size
        self.optimizer_online.zero_grad()
        loss.backward()
        self.optimizer_online.step()


    def act(self, state):
        state_torch = torch.from_numpy(state).type(torch.FloatTensor).unsqueeze(0)#.to('cuda')
        self.model_online.eval()
        Qfunc_s_a = self.model_online(state_torch)
        action = Qfunc_s_a.data.max(1)[1].item()
        return action

    def act_epsilon(self, state_torch, epsilon):
        
        #print(state)
        #state_torch = torch.from_numpy(state).type(torch.FloatTensor).unsqueeze(0).to('cuda')
        #print(state_torch)
        #state_torch = torch.unsqueeze(state_torch, 0)
        #print(state_torch)
        #state_torch = state_torch.to('cuda')
        #print("state_torch")
        #print(state_torch)
        self.model_online.eval()
        Qfunc_s_a = self.model_online(state_torch)
        #print("Qfunc_s_a")
        #print(Qfunc_s_a)
        
        if random.random() < epsilon:
            action = np.random.choice(range(self.action_size))
        else:
            action = Qfunc_s_a.data.max(1)[1].item()
            #print(action)
        return action


output = subprocess.check_output("date +%y%m%d_%H%M%S", shell=True)
output = output.decode('utf-8').replace('\n','')
result_filename = "score_result_" + output + ".csv"
result_file = open(result_filename, mode='w')

env = gym.make('LunarLander-v2')
state = env.reset()
score = 0
total_score = 0
episode = 0
state_size = 8
action_size = env.action_space.n

g = agent(state_size, action_size)
start_time = timeit.default_timer()
result_file.write("episode,score,total_score,eval_score\n")
epsilon = 0.5

while episode <= 3000:  # episode loop
    episode = episode + 1
    state = env.reset()
    score = 0
    done = False
    
    while not done:
        state_torch = torch.from_numpy(state).type(torch.FloatTensor).unsqueeze(0)#.to('cuda')
        action = g.act_epsilon(state_torch, epsilon * (0.998**episode)) #epsilon * (1 / episode))
        
        next_state, reward, done, info = env.step(action)
        next_state_torch = torch.from_numpy(next_state).type(torch.FloatTensor).unsqueeze(0)#.to('cuda')

        g.store(state_torch, action, reward, next_state_torch, done)
        g.train()

        state = next_state

        score = score + reward
        total_score = total_score + reward

    eval_score = ((total_score + 554120) / 483370) * 100.
    result_file.write('{},{:.2f},{:.2f},{:.2f}\n'.format(episode, score, total_score, eval_score))

    if episode % 25 == 0:
        print('Episode: {} Score: {:.2f} Total score: {:.2f} Eval score : {:.2f}'.format(episode, score, total_score, eval_score))
        print('25 Episode time : {:.2f}s'.format((timeit.default_timer() - start_time)))
        start_time = timeit.default_timer()
        play(env, g)


# TEST     
episode = 0
state = env.reset()
step = 0
while episode < 10:  # episode loop
    play(env, g)
    episode += 1
env.close()

result_file.close()

Qnet(
  (fc1): Linear(in_features=8, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=32, bias=True)
  (fc3): Linear(in_features=32, out_features=4, bias=True)
)
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 32]             288
            Linear-2                   [-1, 32]           1,056
            Linear-3                    [-1, 4]             132
Total params: 1,476
Trainable params: 1,476
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.01
Estimated Total Size (MB): 0.01
----------------------------------------------------------------


AttributeError: 'numpy.int64' object has no attribute 'float'