In [1]:
%matplotlib inline

In [2]:
import os

import gym
import math
import random
import numpy as np
import matplotlib
import get_env
import matplotlib.pyplot as plt
from itertools import count
from PIL import Image

# from priority_experience_replay import Memory

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T


# Build the environment through the imported `get_env` module, requesting the
# "buckets" observation type and a 20x20 reward grid (a smaller grid is being tried).
env = get_env.get_env(observation_type = "buckets", reward_grid_size = (20,20)) # trying a smaller grid

# Set up matplotlib: IPython's display helper is only imported when the
# active backend name contains 'inline'.
is_ipython = 'inline' in matplotlib.get_backend()
if is_ipython:
    from IPython import display

# Turn on matplotlib interactive mode.
plt.ion()

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

pygame 2.0.1 (SDL 2.0.14, Python 3.7.6)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [3]:
class DDQN(nn.Module):
    """Dueling Q-network: a shared MLP trunk feeding a value head and an advantage head.

    The two heads are combined as ``Q = V + A - mean(A)`` so that the value
    and advantage streams are identifiable.

    Args:
        n_inputs: number of features per state after flattening (default 18).
        n_actions: number of discrete actions, i.e. Q-values per state
            (default 9).
    """

    def __init__(self, n_inputs=18, n_actions=9):
        super(DDQN, self).__init__()
        # Layer names and creation order are kept stable so that saved
        # state_dicts and seeded initialisation remain compatible.
        self.inputlayer= torch.nn.Linear(n_inputs, 32, bias=True)
        self.hl1= torch.nn.Linear(32, 16, bias=True)
        self.hl2= torch.nn.Linear(16, 8, bias=True)
        self.value= torch.nn.Linear(8, 1, bias=True)
        self.adv= torch.nn.Linear(8, n_actions, bias=True)

    def forward(self, x):
        """Return Q-values of shape ``(batch, n_actions)``.

        Called with either a single state (batch of one) to pick the next
        action, or with a batch of states during optimization. Any trailing
        dimensions of ``x`` are flattened per batch element.
        """
        # Follow the device of this module's own parameters rather than a
        # notebook-level global, so the network works wherever it has been moved.
        x = x.to(self.inputlayer.weight.device)
        # Flatten but conserve the batch dimension; reshape (unlike view)
        # also accepts non-contiguous inputs.
        x = x.reshape(x.size(0), -1)
        x = F.relu(self.inputlayer(x))
        x = F.relu(self.hl1(x))
        x = F.relu(self.hl2(x))
        v = self.value(x)
        a = self.adv(x)
        # Dueling aggregation: centre the advantages before adding the value.
        q = v + a - a.mean(dim=1, keepdim=True)
        return q

In [4]:

from collections import deque

class Memory(object):
    """Bounded FIFO replay buffer with uniform random sampling."""

    def __init__(self, capacity):
        # A deque with maxlen drops its oldest entry automatically once full.
        self.memory = deque(maxlen=capacity)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.memory)

    def add(self, experience):
        """Store one transition at the newest end of the buffer."""
        self.memory.append(experience)

    def sample(self, batch_size):
        """Return ``batch_size`` stored transitions drawn without replacement."""
        stored = self.memory
        return random.sample(stored, batch_size)

def append_experience(experience):
    """Push one transition onto the module-level replay buffer ``memory``."""
    replay_buffer = memory
    replay_buffer.add(experience)

In [5]:
# Number of transitions drawn from the replay buffer per optimization step;
# optimize_model() is a no-op until the buffer holds at least this many.
BATCH_SIZE = 128
# Discount factor applied to the bootstrapped next-state value.
GAMMA = 0.999
# Exploration schedule used by select_action():
#   eps = EPS_END + (EPS_START - EPS_END) * exp(-episode / EPS_DECAY)
EPS_START = 0.5
EPS_END = 0.05
EPS_DECAY = 5
# The target network is synced with the policy network (and checkpointed)
# every TARGET_UPDATE episodes.
TARGET_UPDATE = 2
# Frame-skip setting: controls for how many consecutive environment steps
# the training loop reuses one selected action.
NUM_SKIP = 40
ALLOW_0 = 50 # for episodes < ALLOW_0, a greedy action 0 (no movement) is replaced by a random action

# Online network and its target copy; the target starts with identical
# weights and is kept in eval mode.
policy_net = DDQN().to(device)
target_net = DDQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only the policy network is trained; the replay buffer keeps at most
# 20000 transitions.
optimizer = optim.RMSprop(policy_net.parameters(), lr=0.001)
memory = Memory(20000)


def select_action(state, episode):
    """Pick an action epsilon-greedily, with the early-episode ALLOW_0 rule.

    With probability ``1 - eps`` the greedy action of ``policy_net`` is used,
    where ``eps`` decays exponentially in ``episode`` from ``EPS_START``
    towards ``EPS_END``. If that greedy action is 0 while
    ``episode < ALLOW_0``, a uniformly random action is returned instead.

    Returns:
        A ``(1, 1)`` long tensor holding the action index.
    """
    explore_prob = EPS_END + (EPS_START - EPS_END) * math.exp(-1. * episode / EPS_DECAY)
    if random.random() > explore_prob:
        with torch.no_grad():
            greedy = policy_net(state).max(1)[1].view(1, 1)
        # Keep the greedy choice unless it is the early-episode "no movement" case.
        if not (greedy == 0 and episode < ALLOW_0):
            return greedy
    # Exploration, or the overridden greedy action 0: uniform over 9 actions.
    return torch.tensor([[random.randrange(9)]], device=device, dtype=torch.long)

In [6]:
def optimize_model():
    """Run one gradient step on ``policy_net`` from a random replay mini-batch.

    Returns immediately while the replay buffer holds fewer than
    ``BATCH_SIZE`` transitions. Otherwise the Huber loss between
    ``Q(s_t, a_t)`` and ``r + GAMMA * max_a Q_target(s_{t+1}, a) * (1 - done)``
    is minimised, with every gradient element clamped to ``[-1, 1]``.
    """
    if len(memory) < BATCH_SIZE:
        return

    # Transpose the sampled (state, action, reward, next_state, done) tuples
    # into one tuple per field, then concatenate each field into a batch.
    fields = list(zip(*memory.sample(BATCH_SIZE)))
    state_batch, action_batch, reward_batch, next_states_batch, dones_batch = (
        torch.cat(fields[i]) for i in range(5)
    )

    # Q(s_t, a_t): evaluate all actions, then pick the column of the action
    # that was actually taken.
    q_taken = policy_net(state_batch).gather(1, action_batch)

    # Bootstrapped target from the older target_net; no gradient flows
    # through it, and finished transitions contribute the reward only.
    with torch.no_grad():
        best_next_q = target_net(next_states_batch).max(1)[0]
    not_done = 1 - dones_batch
    td_target = ((best_next_q * GAMMA) * not_done + reward_batch).unsqueeze(1)

    # Huber loss
    loss = F.smooth_l1_loss(q_taken, td_target)

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for weights in policy_net.parameters():
        weights.grad.data.clamp_(-1, 1)
    optimizer.step()

In [7]:
def motivation(state, action):
    """Small shaping bonus for moving (any action other than 0).

    Column 0 of ``state`` is attenuated by ``exp(-0.1 * column 1)``; the bonus
    is 0.001 times the largest attenuated entry, and zero when ``action`` is 0.
    A buckets-style state of size (8, 2) is assumed.
    """
    attenuated = state[:, 0] * torch.exp(-0.1 * state[:, 1])
    is_moving = action != 0
    return 0.001 * is_moving * torch.max(attenuated)

In [8]:
def make_state(observation, info):
    """Build the network input tensor from a raw observation and env info.

    Column 0 of ``observation`` is scaled by 1/100, column 1 by 1/1000, and
    ``info['relative_coordinates']`` by 1/10000. The scaled observation is
    flattened and concatenated with the scaled coordinates.

    Args:
        observation: 2-D array-like with at least two columns. It is not
            modified.
        info: mapping that contains ``'relative_coordinates'``.

    Returns:
        A float32 tensor of shape ``(1, n_features)`` on ``device``.
    """
    # Work on a float copy: dividing the caller's array in place would
    # rescale the caller's data (compounding if the same array is passed
    # again) and raises on integer arrays.
    scaled = np.array(observation, dtype=np.float64)
    scaled[:,0] /= 100
    scaled[:,1] /= 1000
    rel_cordinates = np.asarray(info['relative_coordinates'], dtype=np.float64) / 10000
    x = np.concatenate([scaled.flatten(), rel_cordinates])
    # Add the batch axis in numpy; wrapping an ndarray in a Python list
    # makes torch.tensor take a slow element-by-element path.
    return torch.tensor(x[np.newaxis, :], device=device, dtype=torch.float32)

In [9]:
# Save folder for value networks. exist_ok avoids the check-then-create race
# and makes the cell safe to re-run.
os.makedirs('targetnet_ddqn', exist_ok=True)

1. ALLOW_0: for all episodes < ALLOW_0, take a random action whenever the greedy action is 0 (no movement).
2. Kill the agent if it fails to collect a berry within the first minute (more than 400*60 environment steps with no berry picked).
3. Kill the agent if its cumulative reward drops below 0.

In [13]:
# Training loop: runs `num_episodes` episodes, filling the replay buffer and
# optimizing the policy network after every action block. Per-episode
# statistics are collected in the three lists below.
env.verbose = False
episode_durations = []
episode_rewards = []
episode_berrypicked = []
num_episodes = 250
# An agent that has picked no berry after this many environment steps is killed.
MAX_BERRYLESS_STEPS = 400*60

for i_episode in range(num_episodes):

    # Initialize the environment and state
    observation, done = env.reset()
    info = env.get_info()

    state = make_state(observation, info)
    num_steps = 0

    while True:

        # Select an action using the real episode index, so that the epsilon
        # schedule and the ALLOW_0 rule inside select_action take effect.
        # A constant large value here would pin epsilon near EPS_END and
        # bypass ALLOW_0 entirely.
        action = select_action(state, episode=i_episode)

        # Repeat the selected action for a block of environment steps.
        # NOTE(review): the block length is NUM_SKIP-1, not NUM_SKIP - confirm
        # that this is intended.
        # NOTE(review): every stored transition starts from the state at the
        # beginning of the block but carries only that single step's reward -
        # confirm that this is the intended replay layout.
        for _ in range(NUM_SKIP-1):
            next_observation, env_reward, done, next_info = env.step(action.item())
            num_steps += 1

            reward = torch.tensor([env_reward], device=device, dtype=torch.float32)
            done = torch.tensor([done], device=device, dtype=torch.int)

            # Observe new state
            next_state = make_state(next_observation, next_info)

            # Store the transition in memory
            append_experience((state, action, reward, next_state, done))

            if done: break

        # Move to the next state
        state = next_state

        env.render()

        # Episode termination, checked in priority order: starvation,
        # environment-reported end, then the berryless time limit.
        if env.cummulative_reward < 0:
            print("episode: ", i_episode, "agent died of hunger. Picked:", env.get_numBerriesPicked())
            episode_over = True
        elif done:
            print("episode: ", i_episode, " reward: ", env.cummulative_reward, "Picked:", env.get_numBerriesPicked())
            episode_over = True
        elif num_steps > MAX_BERRYLESS_STEPS and env.get_numBerriesPicked() == 0:
            print("episode: ", i_episode, "agent killed for being berryless for too long.")
            episode_over = True
        else:
            episode_over = False

        if episode_over:
            # Record the same statistics whatever ended the episode.
            episode_durations.append(num_steps)
            episode_rewards.append(env.cummulative_reward)
            episode_berrypicked.append(env.get_numBerriesPicked())
            break

        # Perform one step of the optimization (on the policy network)
        optimize_model()

    # Update the target network, copying all weights and biases from the
    # policy network, and checkpoint it.
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
        torch.save(target_net.state_dict(), f'targetnet_ddqn/targetnet_ep{i_episode}.pth')

print('Complete')
env.close()
plt.ioff()
plt.show()

In [None]:
# Stand-alone cleanup cell: closes the environment. The training cell already
# calls env.close() on normal completion; presumably this cell is for runs
# that were interrupted before reaching that call - confirm.
env.close()