# Thesis

In [1]:
import torch
import numpy as np
import matplotlib.pyplot as plt

from grid_cells import get_grid_cells, purge_delinquent_cells
from environment import GridCellWorld
from utils import get_coords, to_tensor, device, get_loc_batch
from agent import Agent
from tqdm import tqdm

In [2]:
def eval_locomotion(agent, env, n_ep=200, maxlen=50):
    """Roll out ``agent`` in ``env`` and report average return and length.

    Each episode starts from ``env.reset(end_radius=0.25)`` and runs until
    ``env.next_state`` reports ``done`` or ``maxlen`` steps have been taken.
    Actions come from ``agent.choose_action(state, evaluate=True)``.

    Args:
        agent: object exposing ``choose_action(state, evaluate=True)``.
        env: object exposing ``reset(end_radius=...)`` and
            ``next_state(action) -> (state, reward, done)``.
        n_ep: number of evaluation episodes.
        maxlen: maximum number of steps per episode.

    Returns:
        A ``(mean_return, mean_length)`` tuple, where the return of an
        episode is the sum of its per-step rewards.

    Raises:
        ZeroDivisionError: if ``n_ep`` is 0 (nothing to average).
    """
    episode_returns = []
    episode_lengths = []
    print("Starting evaluation")
    for _ in tqdm(range(n_ep)):
        state = env.reset(end_radius=0.25)
        step_rewards = []
        n_steps = 0  # stays 0 when maxlen <= 0 and the loop never runs
        for n_steps in range(1, maxlen + 1):
            action = agent.choose_action(state, evaluate=True)
            state, r, finished = env.next_state(action)
            step_rewards.append(r)
            if finished:
                break
        episode_returns.append(sum(step_rewards))
        episode_lengths.append(n_steps)
    mean_return = sum(episode_returns) / len(episode_returns)
    mean_length = sum(episode_lengths) / len(episode_lengths)
    return mean_return, mean_length

def eval_position(agent, size=4096, bs=256):
    """Estimate the agent's position-prediction error on sampled batches.

    Draws ``size // bs`` batches with ``get_loc_batch`` from the module-level
    ``c`` and ``gc`` tensors, feeds the inputs through ``agent.actor`` and
    compares the second element of its output with the batch targets.

    Args:
        agent: object whose ``actor(x)`` returns an indexable result; element
            ``[1]`` is compared against the targets (presumably the predicted
            location -- confirm against the actor definition).
        size: total number of samples to evaluate; rounded down to a whole
            number of batches.
        bs: batch size handed to ``get_loc_batch``.

    Returns:
        Mean over batches of the summed squared error of each batch. When
        ``size < bs`` no batch is drawn and ``np.mean`` of the empty list
        yields ``nan``.
    """
    losses = list()
    # Evaluation only: disable autograd so no graph is built for the
    # forward passes (the result is converted to NumPy right away).
    with torch.no_grad():
        for _ in range(size // bs):
            x, y = get_loc_batch(c, gc, bs=bs)
            pred = agent.actor(x)[1]
            batch_loss = torch.sum((pred - y) ** 2)
            losses.append(batch_loss.cpu().numpy())
    return np.mean(losses)

In [3]:
# --- Spatial setup -------------------------------------------------------
# `resolution`, `MIN` and `MAX` are handed to get_coords() further down.
resolution = 400
MIN = -1
MAX = 1
# Only referenced by the commented-out get_grid_cells(...) call below.
N = 100

# --- Agent settings (forwarded by keyword to Agent(...)) -----------------
bs = 8                # presumably the training batch size -- confirm in Agent
actor_hidden = 128
critic_hidden = 256
lr_a = 1e-4
lr_c = 1e-4
grad_norm = 0.2
buffer_length = 50

In [4]:
coords = get_coords(resolution, MIN, MAX)

# NOTE(review): the two commented-out calls below look like the recipe that
# produced the cached "grid_cells.npy" file -- confirm before relying on it.
#   grid_cells = get_grid_cells(np.linspace(40, 150, 30), N, resolution)
#   grid_cells = purge_delinquent_cells(grid_cells)
grid_cells = np.load("grid_cells.npy")

# Tensor versions used by the environment; only every 4th entry along the
# last axis of `grid_cells` is kept.
c = to_tensor(coords)
gc = to_tensor(grid_cells[:, :, ::4])

In [5]:
env = GridCellWorld(gc, c, debug=False)

# Derive the agent's first size argument from the same strided view that was
# converted into `gc`. `::4` keeps ceil(n / 4) entries, whereas `n // 4`
# rounds down and would be one short whenever n is not a multiple of 4.
n_cells_used = grid_cells[:, :, ::4].shape[-1]

# The second positional argument (2) is presumably the action dimension --
# confirm against Agent.__init__.
agent = Agent(
    n_cells_used,
    2,
    action_amp=0.1,
    bs=bs,
    actor_hidden=actor_hidden,
    critic_hidden=critic_hidden,
    lr_a=lr_a,
    lr_c=lr_c,
    grad_norm=grad_norm,
    buffer_length=buffer_length,
)
# state_trans = torch.randn(2, 3000, device=device)

In [None]:
n_episodes = 30000
episode_max_len = 20

# Per-episode statistics, consumed by the evaluation/plotting cells below.
rewards = []
lengths = []
for ep in tqdm(range(n_episodes)):
    ep_rewards = []
    s = env.reset(end_radius=0.25)
    loc = env.state  # read after reset, paired with the state `s`
    # s = s @ state_trans
    step = 0  # stays 0 if episode_max_len <= 0
    for step in range(1, episode_max_len + 1):
        a = agent.choose_action(s)
        s_next, r, done = env.next_state(a)
        # s_next = s_next @ state_trans
        # Store the transition, then update the agent once per env step.
        agent.remember(s, a, r, s_next, done, loc)
        agent.learn()
        s = s_next
        loc = env.state
        ep_rewards.append(r)
        if done:
            break
    lengths.append(step)
    rewards.append(sum(ep_rewards).cpu())

In [None]:
# eval_locomotion returns (mean episode reward, mean episode length).
mean_reward, mean_length = eval_locomotion(agent, env)
print("Agent reward - %.03f\nlengths - %.03f" % (mean_reward, mean_length))
print(f"Position loss - {eval_position(agent):.03f}")

In [None]:
# Three panels: reward per episode, length per episode, and a histogram of
# the indices of episodes that stopped before hitting episode_max_len.
episode_idx = np.arange(len(lengths))
ended_early = np.asarray(lengths) < episode_max_len

_, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(18, 6))
ax1.plot(to_tensor(rewards).cpu())
ax2.scatter(episode_idx, lengths)
ax3.hist(episode_idx[ended_early])
plt.show()