In [1]:
import sys
if not '..' in sys.path:
    sys.path.append('..')
from simplegrid.cow import Action
import numpy as np
import random

from unittest.mock import MagicMock
from shared.experiment_settings import ExperimentSettings
from simplegrid.deep_cow import DeepCow
from simplegrid.dqn_agent import DQNAgent
from simplegrid.world import World as World, MapFeature

In [2]:
# Seed the Python and NumPy random number generators so that repeated
# "Restart & Run All" passes start from the same random state.
# NOTE(review): any RNG owned by the DQN backend is not seeded here — confirm
# whether DQNAgent needs its own seed for fully reproducible outputs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Experiment configuration: a 5x5 world with no initial creatures and a
# `layers` setting of [12].
settings = ExperimentSettings('')
settings.world_size = 5
settings.start_num_creatures = 0
settings.layers = [12]

# Clear the class-level agent before constructing the cow.
# NOTE(review): presumably this makes DeepCow build a fresh agent — confirm.
DeepCow.agent = None
deepcow = DeepCow(x=2, y=2, energy=100, settings=settings)

# The world's second constructor argument is replaced by a MagicMock stand-in.
world = World(settings, MagicMock())

In [3]:
def training_record(world, cow, grass_fraction=0.25, water_fraction=0.10):
    """Reset the world, let the cow take one step, and return that transition.

    Args:
        world: Environment; it is reset with the given fractions before the
            cow is placed in it.
        cow: Creature that is moved to (2, 2), added to the world and stepped
            once.
        grass_fraction: Forwarded to ``world.reset``.
        water_fraction: Forwarded to ``world.reset``.

    Returns:
        Tuple ``(cow.state, cow.action_idx, reward, next_state)`` where
        ``next_state`` is ``None`` when ``world.process_action`` reports the
        step as done, and the cow's internal encoding of the follow-up
        observation otherwise.
    """
    world.reset(MagicMock(), grass_fraction=grass_fraction, water_fraction=water_fraction)
    cow.x, cow.y = 2, 2
    world.add_new_creature(cow)

    chosen_action = cow.step(world.get_observation(cow))
    # The first element of the result (a new creature, if any) is not needed here.
    _, reward, done = world.process_action(cow, chosen_action)

    # A finished step has no successor state to encode.
    next_state = None if done else cow.to_internal_state(world.get_observation(cow))
    return cow.state, cow.action_idx, reward, next_state

# Smoke test: generate a single (state, action index, reward, next state) record.
training_record(world, deepcow)

(array([1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]),
 0,
 0.8666666666666667,
 array([0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0.]))

In [12]:
def run_scenario(scenario, world, cow):
    """Return the agent's highest-valued action for a hand-written text scene.

    Args:
        scenario: Text scene, converted with
            ``MapFeature.text_scene_to_environment`` and installed as
            ``world.cells``.
        world: World whose cells are replaced and to which the cow is added.
        cow: Creature placed at (2, 2); its agent supplies the predictions.

    Returns:
        Tuple ``(action, value)``: the ``Action`` for the arg-max entry of the
        first prediction row, and that entry's predicted value.
    """
    world.cells = MapFeature.text_scene_to_environment(scenario)
    cow.x = cow.y = 2
    # NOTE(review): unlike training_record, the world is not reset before the
    # cow is added — confirm that adding the same cow repeatedly is harmless.
    world.add_new_creature(cow)

    encoded_state = cow.to_internal_state(world.get_observation(cow))
    predictions = cow.agent.predict(encoded_state)
    best = np.argmax(predictions[0])
    # The arg-max index is shifted by one before the Action lookup.
    # NOTE(review): presumably Action values start at 1 — confirm.
    return Action(best + 1), predictions[0][best]

# Query the agent on a 5x5 text scene whose only non-'.' cell is a '#'
# in the fourth row, third column.
run_scenario('.....\n'
             '.....\n'
             '.....\n'
             '..#..\n'
             '.....\n',
             world, deepcow
            )

(DOWN, 0.85918033)

In [6]:
def score(world, deepcow, episodes=1000, death_penalty=10):
    """Average single-step outcome of ``deepcow`` over freshly generated records.

    Each episode calls ``training_record`` once. A record whose next state is
    ``None`` subtracts ``death_penalty``; any other record adds its reward.

    Args:
        world: World passed through to ``training_record``.
        deepcow: Cow passed through to ``training_record``.
        episodes: Number of single-step episodes to sample. Defaults to 1000,
            the value that used to be hard-coded in both the loop bound and
            the divisor.
        death_penalty: Amount subtracted for a record without a next state.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        The accumulated score divided by ``episodes``.

    Raises:
        ValueError: If ``episodes`` is not positive (the average would be
            undefined).
    """
    if episodes <= 0:
        raise ValueError('episodes must be a positive integer')

    # `total` rather than `score`, so the local does not shadow this function.
    total = 0
    for _ in range(episodes):
        _state, _action, reward, next_state = training_record(world, deepcow)
        if next_state is None:
            total -= death_penalty
        else:
            total += reward
    return total / episodes

# Average score of the cow ahead of the replay-training cells further down.
score(world, deepcow)

-1.8555333333333692

In [7]:
# Collect 10000 independent single-step records (each call resets the world),
# then display the first one as a sanity check.
records = [training_record(world, deepcow) for _ in range(10000)]
records[0]

(array([1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 2,
 -0.13333333333333333,
 array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [8]:
# Feed every collected record into the agent's memory, one at a time.
agent = deepcow.agent
for state, action_idx, reward, next_state in records:
    agent.remember(state, action_idx, reward, next_state)

In [11]:
# Ten epochs of replay; each epoch runs as many replay() calls as there are
# whole batches in `records`.
for epoch in range(10):
    batches_this_epoch = len(records) // agent.batch_size
    for _batch in range(batches_this_epoch):
        loss = agent.replay()
    # Only the loss of the epoch's final replay() call is reported.
    print(epoch, loss)

0 0.1218265960633289
1 0.08151522567107652
2 0.08624613663802544
3 0.07802835010224954
4 0.0956244656505684
5 0.09001865031120057
6 0.06336958744407942
7 0.07285346956147502
8 0.07842030387449389
9 0.08086788182845339


In [30]:
# Re-score the cow after the replay-training loop, for comparison with the earlier score.
score(world, deepcow)

-2.4280666666666937

In [None]:
# Same text scene as the earlier run_scenario call, queried again after training.
run_scenario('.....\n'
             '.....\n'
             '.....\n'
             '..#..\n'
             '.....\n',
             world, deepcow
            )