In [None]:
%matplotlib inline

import time
import copy
import tqdm
import random
import pickle

import gym
import gym_sokoban

from utils import *
from network import *
from exploration import *

from dqn import *
from ppo import *
from replay import *

from astar import *
from mcts import *
from mcts_improved import *

from manage import *
startSession()

In [None]:
# create test Sokoban problems

# default problems first, followed by a second batch of bigger problems
sokoban_test = Sokoban(count=1000)
bigger_problems = Sokoban(count=1000, size=(14, 19), steps=100)
sokoban_test.all_envs += bigger_problems.all_envs
with open('data/sokoban_test.pkl', 'wb') as f:
    pickle.dump(sokoban_test, f)

# small 7x7 set: start from an empty collection and add 50 problems
# for each box count from 1 to 3
sokoban_test_small = Sokoban(count=0)
for i in (1, 2, 3):
    small_batch = Sokoban(boxes=i, count=50, size=(7, 7))
    sokoban_test_small.all_envs += small_batch.all_envs
with open('data/sokoban_test_small.pkl', 'wb') as f:
    pickle.dump(sokoban_test_small, f)

In [None]:
# load small test envs
# NOTE: unpickling runs arbitrary code, so only load files written by this project
with open('data/sokoban_test_small.pkl', 'rb') as handle:
    sokoban_test_small = pickle.load(handle)


# select envs to test on
def select(boxes):
    """Pick indices of small test envs, a fixed number per box count.

    Walks `sokoban_test_small` in order and takes the first envs that still
    fit the requested quota.

    Args:
        boxes: list where boxes[k] is the number of envs wanted with k + 1 boxes.
            The list is not modified.

    Returns:
        List of indices into `sokoban_test_small.all_envs`, sorted by the
        number of boxes of the referenced env (stable, so original order is
        kept within one box count).
    """
    remaining = list(boxes)  # work on a copy so the caller's quota list survives
    indizes = []
    for i in range(len(sokoban_test_small.all_envs)):
        if sum(remaining) == 0: break  # every quota is filled

        env = sokoban_test_small.get(i)[0]
        slot = env.num_boxes - 1
        # envs whose box count has no quota slot are skipped instead of raising
        if 0 <= slot < len(remaining) and remaining[slot] > 0:
            remaining[slot] -= 1
            indizes.append(i)
            #draw(env, "images/exploration/test_{}.png".format(i))

    indizes.sort(key = lambda entry: sokoban_test_small.all_envs[entry][0].num_boxes)
    return indizes

In [None]:
# test time usage of gym action and env copy

time_copy = 0
time_step = 0

# Number of envs to time. Derived from the small test set right here so that
# this cell does not rely on a variable that is only defined by a later cell.
num_envs = len(sokoban_test_small.all_envs)

for i in range(num_envs):
    env = sokoban_test_small.get(i)[0]

    # copy
    start = time.perf_counter()  # monotonic, high-resolution clock for short intervals
    copy.deepcopy(env)
    time_copy += time.perf_counter() - start

    # step
    start = time.perf_counter()
    env.step(0, 'tiny_rgb_array')
    time_step += time.perf_counter() - start

# ff takes ~0.01 seconds to solve most problems
if num_envs > 0:  # avoid dividing by zero on an empty test set
    print(time_copy / num_envs, time_step / num_envs)

<h2>Fast Forward</h2>

In [None]:
# analyze solutions for generated sokoban problems

def analyze(problems):
    """Plot how often each plan length occurs, separately per box count.

    Prints the longest plan length per box count, draws one curve per box
    count plus a dotted vertical line at its average plan length, saves the
    figure to 'data/sokoban_train.png' and shows it.

    Args:
        problems: object whose `all_envs` is a sequence of (env, plan) pairs;
            `env.num_boxes` must be 1, 2 or 3.
    """
    num_categories = 3
    samples = [(env.num_boxes - 1, len(plan)) for env, plan in problems.all_envs]
    longest = max((solution for _, solution in samples), default=0)

    # histogram[length][boxes - 1]; sized from the data so that plans of 500 or
    # more steps no longer index past a fixed-size table
    length = np.zeros((max(500, longest + 1), num_categories))
    max_length = [0] * num_categories
    for category, solution in samples:
        if solution > max_length[category]:
            max_length[category] = solution
        length[solution][category] += 1

    print(max_length)
    x = np.arange(max(max_length) + 1)
    x_max = min(100, x[-1])  # plotted range is capped at length 100
    for i in range(num_categories):
        label = '1 Box' if i == 0 else '{} Boxes'.format(i + 1)
        x_range = min(x_max, max_length[i])
        count = np.sum(length[:len(x), i])
        if count > 0:  # a box count without problems has no average to mark
            average_length = np.sum(length[:len(x), i] * x) / count
            plt.axvline(average_length, linestyle=':', color='C{}'.format(i))
        # NOTE(review): the slice excludes index x_range itself, so the bucket
        # at the maximum length is not drawn - confirm this is intended
        plt.plot(x[:x_range], length[:x_range, i], label=label, color='C{}'.format(i))

    plt.ylabel('Count')
    plt.xlabel('Length')
    plt.legend()
    plt.savefig('data/sokoban_train.png', bbox_inches='tight', dpi=200)
    plt.show()


# run the analysis on the pickled training problems
with open('data/sokoban_train.pkl', 'rb') as f:
    analyze(pickle.load(f))

In [None]:
# planner behaviour

# Replay the stored plan of every problem with more than two boxes and re-plan
# after each step. For the first state where 80% of the new plan length still
# exceeds the number of stored steps left, draw both states and print the lengths.
for i in range(len(sokoban_test_small.all_envs)):
    env, plan, _ = sokoban_test_small.get(i)
    if env.num_boxes <= 2:
        continue

    for action, move in enumerate(plan):
        done = env.step(action_translator[move], 'tiny_rgb_array')[2]
        remaining = len(plan) - action - 1  # stored steps left after this one
        if done: assert(remaining == 0)  # the env may only finish on the last plan step

        plan_new = solve(env)
        if (len(plan_new) * 0.8) > remaining:
            draw(sokoban_test_small.get(i)[0], 'images/behaviour/state-{}.png'.format(i))
            draw(env, 'images/behaviour/state-{}-sub.png'.format(i))
            print(len(plan), remaining)
            print(len(plan_new))
            break

<h2>Search</h2>

In [None]:
def evaluate_s(callback, problems, max_boxes=3):
    """Run a search callback on every problem and print one LaTeX table row per box count.

    Each row contains: box count, solved ratio, mean (std) search time in
    seconds over solved problems, and mean (std) of len(found path) / len(stored plan)
    over solved problems. Entries without data are printed as nan.

    Args:
        callback: callable taking a deep copy of the env and returning the found
            path; a falsy result counts as unsolved.
        problems: object whose `all_envs` is a sequence of (env, plan) pairs.
        max_boxes: number of box-count categories (env.num_boxes in 1..max_boxes).
    """
    def _mean_std(values):
        # mean/std of an empty list would emit RuntimeWarnings; report nan quietly
        if len(values) == 0:
            return float('nan'), float('nan')
        return np.round(np.average(values), 2), np.round(np.std(values), 2)

    total_envs = np.zeros((max_boxes,))
    total_solved = np.zeros((max_boxes,))
    total_time = [[] for _ in range(max_boxes)]
    total_quality = [[] for _ in range(max_boxes)]

    # NOTE(review): `tqdm` must resolve to the progress-bar callable here; a plain
    # `import tqdm` binds the module, so this relies on a later import - confirm
    for env, plan in tqdm(problems.all_envs):
        envc = copy.deepcopy(env)  # keep the stored problem untouched

        start = time.perf_counter()  # monotonic clock, suited for short intervals
        path = callback(envc)
        end = time.perf_counter()

        category = env.num_boxes - 1
        total_envs[category] += 1
        if path:
            total_solved[category] += 1
            total_time[category].append(end - start)
            total_quality[category].append(len(path) / len(plan))

    for i in range(max_boxes):
        if total_envs[i] > 0:
            solved_ratio = np.round(total_solved[i] / total_envs[i], 2)
        else:
            solved_ratio = float('nan')  # no problems with this box count
        time_mean, time_std = _mean_std(total_time[i])
        quality_mean, quality_std = _mean_std(total_quality[i])

        output = "& {} & {} & {} ({}) & {} ({})"
        print(output.format(i + 1, solved_ratio, time_mean, time_std, quality_mean, quality_std))

In [None]:
Config.early_stop = False


# (label, search function, extra keyword arguments) for every search variant;
# each one is evaluated on the small test set with the same cutoff
search_variants = [
    ('astar, BFS as no heuristic is used', search_astar, {}),
    ('mcts', search_mcts, {'prune': False}),
    ('mcts, pruned', search_mcts, {'prune': True}),
    ('mcts_improved', search_mcts_improved, {'prune': False}),
    ('mcts_improved, pruned', search_mcts_improved, {'prune': True}),
]

for label, search, options in search_variants:
    print(label)
    # defaults bind the current variant to the lambda
    evaluate_s(lambda env, search=search, options=options: search(env, cutoff=1e4, **options),
               sokoban_test_small)

<h2>Exploration</h2>

In [None]:
def evaluate_e(env, plan, exploration, ff=True, max_iterations=None):
    """Train a fresh DQN agent on one env until an evaluation episode finishes it.

    Every iteration runs one learning episode (at most 1.5 * len(plan) steps)
    followed by one evaluation episode (at most 2 * len(plan) steps). All
    episodes run on deep copies of `env`.

    Args:
        env: the problem to learn.
        plan: stored plan for `env`; its length sets the step limits and the
            reference for the returned quality.
        exploration: exploration strategy handed to `DQNAgent`.
        ff: if true, every 10th iteration additionally replays `plan` through
            `dqn_episode(..., path=plan, train=False)`.
        max_iterations: optional upper bound on the number of iterations; the
            default None keeps looping until the evaluation episode is done.

    Returns:
        (iterations, steps / len(plan)) once an evaluation episode reports
        done, or False if `max_iterations` is reached first.
    """
    agent = DQNAgent(exploration=exploration)
    replay = Replay(augment=False)

    iterations = 0
    while max_iterations is None or iterations < max_iterations:
        iterations += 1
        dqn_episode(copy.deepcopy(env), agent, replay, max_steps=int(len(plan) * 1.5))

        if ff and iterations % 10 == 0: # add ff transitions
            dqn_episode(copy.deepcopy(env), agent, replay, path=plan, train=False) # comparable gradient steps

        # evaluation; the episode result is (total_reward, done, steps, total_loss)
        _, done, steps, _ = dqn_episode(copy.deepcopy(env), agent, replay,
                                        max_steps=2 * len(plan), evaluation=True)

        if done: return iterations, steps / len(plan) # learning episodes, solution quality
        #agent.exploration.anneal()

    return False  # iteration budget exhausted without a solving evaluation episode

In [None]:
runs = 5 # runs per env
settings = [EGreedy(epsilon=0.5), EGreedy(epsilon=0.25), Boltzmann(tau=0.5), Boltzmann(tau=0.25), UCB1()]
stats = []

for e in select([2] * 3): # test on 2 envs for every box number
    env, plan, _ = sokoban_test_small.get(e)
    print('ff ratio', len(plan) / len(search_astar(env)))

    # collect data: one result per (run, setting), stored run after run,
    # every run starting from fresh copies of the exploration settings
    results = []
    for i in range(runs):
        print(e, i)

        for setting in copy.deepcopy(settings):
            outcome = evaluate_e(env, plan, setting)
            results.append(outcome)
            print(outcome)


    # aggregate data, number of iterations until evaluation solve and path quality (solution / ff)
    num_settings = len(settings)
    for i in range(num_settings):
        mode = results[i::num_settings]  # all runs of setting i

        iterations_per_run = [x[0] for x in mode]
        quality_per_run = [x[1] for x in mode]
        for series in (iterations_per_run, quality_per_run):
            stats.append(series)
            print(series)


with open('data/test_exploration.pkl', 'wb') as f:
    pickle.dump(stats, f)

In [None]:
with open('data/test_exploration.pkl', 'rb') as f:
    stats = pickle.load(f)

indizes = select([2] * 3)
settings = len(stats) // len(indizes)  # stat series stored per env


# plot results
cell = 0
for i in range(len(indizes)):
    # images
    cell += 1
    plt.subplot(3, 4, cell)
    plt.imshow(sokoban_test_small.get(indizes[i])[0].get_image(mode='rgb_array'))
    plt.axis('off')


    cell += 1
    ax1 = plt.subplot(3, 4, cell)
    ax2 = ax1.twiny()
    plt.yticks([])
    plt.ylim(-1, 5)
    y = np.arange(settings / 2)[::-1]

    # the series of one env alternate: iterations at even offsets, quality at odd offsets
    first = i * settings
    iteration_series = stats[first:first + settings:2]
    quality_series = stats[first + 1:first + settings + 1:2]

    # iterations
    x = [np.average(series) for series in iteration_series]
    std = [np.std(series) for series in iteration_series]
    ax1.errorbar(x, y, xerr=std, linestyle='None', marker='o', capsize=3, color='C0')
    ax1.spines['bottom'].set_color('C0')

    # quality
    x = [np.average(series) for series in quality_series]
    std = [np.std(series) for series in quality_series]
    ax2.errorbar(x, y, xerr=std, linestyle='None', marker='+', capsize=0, color='C1')
    ax2.spines['top'].set_color('C1')

# pad has to be passed by keyword: current Matplotlib versions reject it positionally
plt.tight_layout(pad=0.2)
plt.savefig('data/test_exploration.png', bbox_inches='tight', dpi=200)
plt.show()


# print latex table
# one group of rows per exploration setting, one row per box count
for i in range(settings // 2):
    for b in range(3): # boxes
        iterations, quality = [], []
        for j in range(2): # 2 envs per box
            # first series of setting i for the j-th env with b + 1 boxes
            index = (b * 2 + j) * settings + 2 * i
            iterations.extend(stats[index])
            quality.extend(stats[index + 1])

        iterations_mean = np.round(np.average(iterations), 2)
        iterations_std = np.round(np.std(iterations), 2)
        quality_mean = np.round(np.average(quality), 2)
        quality_std = np.round(np.std(quality), 2)

        output = '& {} & {} ({}) & {} ({})'
        print(output.format(b + 1, iterations_mean, iterations_std, quality_mean, quality_std))
    print()

<h2>Network</h2>

In [None]:
# load evaluation envs

with open('data/sokoban_train.pkl', 'rb') as handle:
    sokoban_problems = pickle.load(handle)

# keep only the first 10% of the stored problems
num_problems = len(sokoban_problems.all_envs)
split = int(0.1 * num_problems)
sokoban_problems.all_envs = sokoban_problems.all_envs[:split]

In [None]:
# average trained networks

def average_checkpoints(model, checkpoint_paths):
    """Set `model`'s weights to the element-wise mean of the given checkpoints.

    The mean is accumulated tensor by tensor. The weight tensors of a network
    generally differ in shape, and packing such a list into a single
    `np.array` (a ragged array) raises an error on recent NumPy versions.

    Args:
        model: network whose weights are overwritten with the average.
        checkpoint_paths: weight files to average; each is loaded into a
            freshly created network.
    """
    share = 1 / len(checkpoint_paths)
    averaged = [np.zeros_like(weight) for weight in model.get_weights()]
    for path in checkpoint_paths:
        model_temp, _ = create()
        model_temp.load_weights(path)
        averaged = [total + share * weight
                    for total, weight in zip(averaged, model_temp.get_weights())]
    model.set_weights(averaged)


if True:
    # policy network
    model, _ = create()
    models = ['2020-07-12_03-54-48', '2020-07-10_17-34-19', '2020-07-10_19-12-07']
    average_checkpoints(model, ['models/il_action_exploration/{}.h5'.format(i) for i in models])
    model.save_weights('models/il_action_exploration.h5')
else:
    # q-value network
    model, _ = create(activation_out='linear')
    models = ['2020-07-09_06-55-58', '2020-07-11_06-40-27', '2020-07-11_17-51-45']
    average_checkpoints(model, ['models/rl_qaction_dqn/{}.h5'.format(i) for i in models])
    model.save_weights('models/rl_qaction_dqn.h5')

print(evaluate(model, sokoban_problems))

In [None]:
# evaluate networks greedily

all_solved = []

# (keyword arguments for create, weight files): the policy networks come first,
# then the q-value networks, so the order of all_solved matches the file order
network_groups = [
    ({}, ['models/il_action.h5', 'models/il_action_exploration.h5', 'models/rl_action_ppo.h5']),
    ({'activation_out': 'linear'}, ['models/il_qaction.h5', 'models/rl_qaction_dqn.h5']),
]

for create_kwargs, nets in network_groups:
    model, _ = create(**create_kwargs)
    for net in nets:
        stats = []  # filled by evaluate; one list per network is kept in all_solved
        model.load_weights(net)
        print(net, evaluate(model, sokoban_problems, stats=stats))
        all_solved.append(stats)


# analyze policy correlation
policy_a, policy_b = 1, 4  # positions in all_solved of the two networks to compare

policy_a = all_solved[policy_a]
policy_b = all_solved[policy_b]

# values[row][boxes - 1]: problems listed only for a, for both, only for b
ONLY_A, BOTH, ONLY_B = 0, 1, 2
values = [[0,0,0] for _ in range(3)]
for i in policy_a:
    index = sokoban_problems.all_envs[i][0].num_boxes - 1
    row = BOTH if i in policy_b else ONLY_A
    values[row][index] += 1

for i in policy_b:
    if i in policy_a:
        continue  # already counted as "both" above
    index = sokoban_problems.all_envs[i][0].num_boxes - 1
    values[ONLY_B][index] += 1


print(values) # [[11, 66, 73], [347, 229, 141], [2, 12, 22]]

# grouped bar chart: one group per box count, one bar per row of values
barWidth = 0.25
labels = ['only policy', 'both', 'only q-value']
boxes = np.arange(3)
for i, (row_values, row_label) in enumerate(zip(values, labels)):
    plt.bar(boxes + i * barWidth, row_values, width=barWidth, color='C{}'.format(i),
            edgecolor='white', label=row_label)

plt.xticks(boxes + barWidth, ['1 Box', '2 Boxes', '3 Boxes'])

plt.legend()
plt.savefig('images/policy_correlation.png', bbox_inches='tight', dpi=200)
plt.show()

In [None]:
# evaluate combined value + policy performance

# policy network
model_p = create()[0]
model_p.load_weights('models/il_action_exploration.h5')

# q-value network with a linear output
model_v = create(activation_out='linear')[0]
model_v.load_weights('models/rl_qaction_dqn.h5')

class combined_model():
    """Model-like wrapper that mixes the policy and the q-value network.

    Only `predict` is provided. It reads the globals `model_p` and `model_v`
    and can be called on the class itself or on an instance.
    """

    @staticmethod
    def predict(data):
        """Return the mean of the policy output and the softmaxed (tau=0.1) q-values for `data`."""
        return 0.5 * (model_p.predict(data) + softmax(model_v.predict(data), tau=0.1))

# the class itself is passed as the model; evaluate only needs its predict
combined_result = evaluate(combined_model, sokoban_problems)
print(combined_result)

In [None]:
# policy entropy

model, _ = create()
model.load_weights('models/il_action_exploration.h5')

#model, _ = create(activation_out='linear')
#model.load_weights('models/rl_qaction_dqn.h5')

# Follow the greedy policy on every problem and record the entropy of the
# predicted action distribution at each step; only runs that did not finish
# the problem are plotted.
for env, plan in sokoban_problems.all_envs:
    trace = set()  # hashes of visited states, set for constant-time lookup
    entropy = []
    env = copy.deepcopy(env)
    done = False  # reset per problem so an empty plan cannot reuse a stale value

    for j in range(2 * len(plan)):
        env_h = hash(str(strip(env)))
        if env_h in trace: break  # state seen before: the greedy policy is looping
        trace.add(env_h)

        env_b = binary(strip(env))[0]
        policy = model.predict(np.expand_dims(env_b, axis=0))[0]
        #policy = softmax(policy, tau=0.1)
        entropy.append(-sum(policy * np.log(policy + 1e-10)))  # epsilon avoids log(0)

        action = np.argmax(policy) # best predicted action
        _, reward, done, info = env.step(action_translator[action], 'tiny_rgb_array')

        if done: break
    if not done:
        # only unfinished runs reach this point, so the colour is always 'C1'
        plt.plot(np.arange(len(entropy)), entropy, color='C1')

plt.show()

In [None]:
# evaluate policy accuracy

model, _ = create()
model.load_weights('models/il_action_exploration.h5')

# accuracy[boxes - 1] = [predictions matching the plan, predictions made]
accuracy = np.zeros((3,2))
for i in range(len(sokoban_problems.all_envs)):
    env, plan, _ = sokoban_problems.get(i)
    for action in plan:
        # predict on the current state, then advance the env along the stored plan
        state = np.expand_dims(binary(strip(env))[0], axis=0)
        output = model.predict(state)[0]

        row = env.num_boxes - 1
        if np.argmax(output) == action:
            accuracy[row][0] += 1
        accuracy[row][1] += 1

        finished = env.step(action_translator[action], 'tiny_rgb_array')[2]
        if finished: break

print(accuracy[:, 0] / accuracy[:, 1])

In [None]:
# evaluate length heuristic with astar

Config.early_stop = False


def run_astar(w):
    """Evaluate astar with the current global `model` dict and weight `w` on sokoban_problems."""
    evaluate_s(lambda env: search_astar(env, model, cutoff=1e3, w=w), sokoban_problems)


# no heuristic entry at all
model = {}
run_astar(1.0)

# the Sokoban class as heuristic entry
model['heuristic'] = Sokoban
run_astar(1.0)

# length networks: the same network is reused, only the weights are swapped
model['heuristic'] = create(shape_out=1, activation_out='linear')[0]
for net in ['models/il_length.h5', 'models/il_length_mae.h5', 'models/il_length_exploration.h5']:
    model['heuristic'].load_weights(net)
    for w in (1.0, 3.0):
        run_astar(w)

In [None]:
# evaluate network and search combinations

Config.early_stop = True


# The `if True` / `if False` switches select which networks are handed to the search.
model = {}
if True:
    model['policy'] = create()[0]
    model['policy'].load_weights('models/il_action_exploration.h5')
if False:
    model['policy'] = combined_model
if False:
    # the q-value weights are used with a linear output everywhere else in this
    # notebook, so the value network is built the same way here
    model['value'] = create(activation_out='linear')[0]
    model['value'].load_weights('models/rl_qaction_dqn.h5')
if True:
    model['heuristic'] = create(shape_out=1, activation_out='linear')[0]
    model['heuristic'].load_weights('models/il_length_exploration.h5')

evaluate_s(lambda env: search_astar(env, model, cutoff=1e10, w=3.0), sokoban_problems)
#evaluate_s(lambda env: search_mcts_improved(env, model, cutoff=1e3), sokoban_problems)