In [None]:
%matplotlib inline

import os
import copy
import math
import pickle
import datetime
import gym
import gym_sokoban
import tensorflow as tf

from utils import *
from astar import *
from replay import *
from network import *
from generator import *

from manage import *
# Start the session before any model is built.
# NOTE(review): `startSession` is presumably provided by the `manage` star import
# directly above and presumably configures the TensorFlow session — confirm in manage.py.
startSession()

# Root directory for TensorBoard logs; the training cells create one
# time-stamped sub-directory per run below it.
logs_base_dir = "logs"

<a href="http://localhost:6006/#scalars" target="_blank">TensorBoard</a>

In [None]:
# Launch TensorBoard inside the notebook.
# `%load_ext` registers the TensorBoard notebook extension that provides the
# `%tensorboard` magic used below.
%load_ext tensorboard

# Make sure the log directory exists before TensorBoard is pointed at it
# (exist_ok=True keeps re-running this cell harmless).
os.makedirs(logs_base_dir, exist_ok=True)
# `{logs_base_dir}` is interpolated by IPython from the Python variable of the same name.
%tensorboard --logdir {logs_base_dir}

In [None]:
# Generate the pool of Sokoban training environments and persist it.
# NOTE: running this cell overwrites data/sokoban_train.pkl, the same file the
# plan-improvement cell further down rewrites with shortened solutions.

# NOTE(review): `count=10000` is presumably the number of generated environments — confirm in Sokoban.
sokoban_train = Sokoban(count=10000)
with open('data/sokoban_train.pkl', 'wb') as f:
    pickle.dump(sokoban_train, f)

In [None]:
# Load the previously generated training environments.
# If unpickling fails, upgrade numpy to a version newer than 1.17.
# NOTE: pickle.load executes arbitrary code from the file; only load files you created yourself.

with open('data/sokoban_train.pkl', 'rb') as f:
    sokoban_train = pickle.load(f)
# Environments with index < split (the first 10%) are used for validation,
# all remaining environments for training.
split = int(0.1 * len(sokoban_train.all_envs)) # validation, train split

In [None]:
# Analyse the stored solutions against the plans of the ff solver (`solve`)
# and against an A* search (`search_astar`).

# Disabled by default: set to True to recompute the share of environments whose
# stored plan is shorter than the ff plan.
if False:
    improved_train = []
    improved_validation = []
    # Walk ALL environments; starting at `split` would make the validation
    # branch below unreachable and always report a validation share of 0.
    for i in tqdm(range(len(sokoban_train.all_envs))):
        env, plan, _ = sokoban_train.get(i)
        plan_ff = solve(env)
        if not plan_ff: continue # no reference plan, nothing to compare against
        ratio = len(plan) / len(plan_ff)

        if ratio == 1: # search optimal solution
            plan_opt = search_astar(env, cutoff=10000)
            if plan_opt:
                ratio_optimal = len(plan_opt) / len(plan_ff)
                if 0 < ratio_optimal < 1: print(i, ratio_optimal)

        if ratio < 1: # improved by policy
            if i < split: improved_validation += [ratio]
            else: improved_train += [ratio]

    print(len(improved_validation) / split, len(improved_train) / (len(sokoban_train.all_envs) - split)) # 0.372 0.389


# Hand-picked environment indices; assign the list to inspect to `s`.
improved_validation_envs = [11, 113, 141, 152, 223, 356, 465, 580, 855] # improved validation envs
improved_train_envs = [1203, 1834, 3603, 3124, 5906, 7009, 7293, 7531, 7660, 7830, 8080, 8276, 8372, 9479] # improved train envs
unimproved_envs = [65, 1017, 1023, 1024] # bad unimproved solutions
s = unimproved_envs

for i in s:
    env, plan, _ = sokoban_train.get(i)
    plan_ff = solve(env)
    plan_as = search_astar(env, cutoff=10000)

    # Both solvers may come back without a plan (the statistics loop above
    # guards for that as well), so only report ratios that can be computed.
    if plan_ff:
        print(i, len(plan), len(plan) / len(plan_ff))
        if plan_as: print(i, len(plan_as), len(plan_as) / len(plan_ff))
    else:
        print(i, len(plan), 'no ff plan found')
    if not plan_as: print(i, 'no astar plan found within cutoff')

    # Print the plans as action names instead of action indices.
    print([list(action_dict.keys())[action] for action in plan])
    if plan_ff: print([list(action_dict.keys())[action] for action in plan_ff])
    if plan_as: print([list(action_dict.keys())[action] for action in plan_as])
    draw(env, 'images/sol_{}.png'.format(i))

<h2>Imitation Learning</h2>

In [None]:
# Shorten the stored training solution paths by greedily following the learned policy.

model, _ = create()
model.load_weights('models/il_action.h5')

improved = 0
for i in tqdm(range(len(sokoban_train.all_envs))):
    env, plan, _ = sokoban_train.get(i)
    plan_learned = []

    # The policy gets one step less than the stored plan, so every solution
    # it reaches within this budget is strictly shorter.
    remaining = len(plan) - 1
    while remaining > 0:
        remaining -= 1
        env_b = binary(strip(env))[0]
        scores = model.predict(np.expand_dims(env_b, axis=0))[0]
        action = np.argmax(scores) # best predicted action
        plan_learned.append(action)

        solved = env.step(action_translator[action], 'tiny_rgb_array')[2]
        if not solved:
            continue

        # better solution found
        improved += 1
        print(i, len(plan_learned) / len(plan))
        stored = sokoban_train.all_envs[i]
        sokoban_train.all_envs[i] = (stored[0], plan_learned) # save new solution
        break
    print(plan_learned)

print("envs improved", improved, improved / len(sokoban_train.all_envs))

# Persist the pool including the shortened plans (overwrites the generated file).
with open('data/sokoban_train.pkl', 'wb') as f:
    pickle.dump(sokoban_train, f)

In [None]:
# Create the imitation-learning data set: one (state, action, remaining length)
# sample per step of every stored solution.

imitation_data = {
    'state': [],
    'action': [],
    'length': [],
    'state_val': [],
    'action_val': [],
    'length_val': [],
}

print('Validation envs 0 -', split)

# walk env solution paths
max_length = 40 # longer solutions are very sparse
# length_index[k] collects the training samples with k + 1 remaining steps
length_index = [[] for _ in range(max_length)]
for i in tqdm(range(len(sokoban_train.all_envs))):
    validation = i < split
    env, plan, _ = sokoban_train.get(i)
    plan_length = len(plan)
    done = False # reset per env so an empty plan cannot reuse the flag of the previous env
    for action in plan:
        if plan_length <= max_length: # only save states with at most max_length remaining steps
            action_hot = np.eye(4)[action]
            if validation:
                # validation states are stored binary-encoded and later passed to Keras as arrays
                imitation_data['state_val'].append(binary(strip(env))[0])
                imitation_data['action_val'].append(action_hot)
                imitation_data['length_val'].append(plan_length)
            else:
                # training states stay stripped; they are encoded later by the consumers of this file
                length_index[plan_length - 1].append((strip(env), action_hot))

        done = env.step(action_translator[action], 'tiny_rgb_array')[2]
        plan_length -= 1
    if not done: print("solving error")
    else: assert plan_length == 0

# oversample to balance length: every length bucket contributes as many
# samples as there are states with one remaining step
state_count = len(length_index[0])
for i in tqdm(range(max_length)):
    bucket = length_index[i]
    if not bucket: # nothing to cycle through; `j % 0` would raise ZeroDivisionError
        print('no samples with remaining length', i + 1)
        continue
    for j in range(state_count):
        x = j % len(bucket) # cycle through the bucket until state_count samples are drawn

        imitation_data['state'].append(bucket[x][0])
        imitation_data['action'].append(bucket[x][1])
        imitation_data['length'].append(i + 1)

for i in ['state_val', 'action_val', 'length_val']:
    imitation_data[i] = np.array(imitation_data[i])

with open('data/imitation_data.pkl', 'wb') as f:
    pickle.dump(imitation_data, f)

In [None]:
# Build the array data set used by the hyper-parameter search.

with open('data/imitation_data.pkl', 'rb') as f:
    imitation_data = pickle.load(f)

sample_count = len(imitation_data['state'])

# Four passes over the training samples; every pass re-encodes each state with
# random=True and augments it, so one sample can yield several variants.
state, action, length = [], [], []
for _ in range(4):
    for x in range(sample_count):
        encoded = binary(imitation_data['state'][x], shrink=True, random=True, size=(13, 13))[0]
        s, a = augment(encoded, imitation_data['action'][x])[:2]
        state.append(s)
        action.append(a)
        length.append(imitation_data['length'][x])

state, action, length = np.array(state), np.array(action), np.array(length)


# make train samples unique
total = len(action)
state, indices = np.unique(state, axis=0, return_index=True)
print('unique ratio:', len(state) / total)
action, length = action[indices], length[indices]


print('action distribution:', np.sum(action, axis=0) / len(action))
print('length distribution:', np.unique(length, return_counts=True)[1])
print('length distribution val:', np.unique(imitation_data['length_val'], return_counts=True)[1])


# one .npy file per array
file = 'data/imitation_search_{}.npy'
for name, array in (('state', state), ('action', action), ('length', length)):
    np.save(file.format(name), array)

In [None]:
# plot HpBandster search results

def capacity(network, shape_out, shape_in=None):
    """Count the trainable parameters (weights and biases) of a network description.

    Args:
        network: pair ``(conv_layers, dense_layers)``. Every conv layer is a
            tuple ``(filters, kernel_size)`` or ``(filters, kernel_size, pool)``;
            ``dense_layers`` is a sequence of unit counts.
        shape_out: number of output units of the final layer.
        shape_in: input shape as ``[height, width, channels]``; defaults to
            ``[13, 13, 4]``. The argument is never modified.

    Returns:
        The parameter count as an int.
    """
    # Work on a private copy: the shape is updated layer by layer and a
    # caller-supplied list must not be changed as a side effect.
    shape = [13, 13, 4] if shape_in is None else list(shape_in)

    total = 0
    for layer in network[0]: # conv layer
        filters, kernel = layer[0], layer[1]
        total += ((kernel ** 2) * shape[2] + 1) * filters # kernel weights plus one bias per filter
        shape[2] = filters
        if len(layer) > 2: # max pooling
            shape[0] = math.ceil((shape[0] + 1) / layer[2])
            shape[1] = math.ceil((shape[1] + 1) / layer[2])

    flat = shape[0] * shape[1] * shape[2]
    for units in network[1]: # dense layer
        total += units * flat + units
        flat = units

    total += shape_out * flat + shape_out # output layer
    return total


def _plot_search_results(name, shape_out, with_accuracy):
    """Report one hyper-parameter search: print the best entry, export a CSV and a scatter plot.

    Reads ``data/search_<name>.pkl`` and writes ``data/search_<name>.csv`` and
    ``data/search_<name>.png``.

    Args:
        name: search identifier used in the file names ('action' or 'length').
        shape_out: output size passed to ``capacity`` when ordering the networks.
        with_accuracy: also export and plot ``entry[2]`` as validation accuracy.

    Returns:
        ``(search_results, lowest)``: the entries ordered by network capacity
        and the entry with the lowest ``entry[0]``.
    """
    # NOTE: pickle.load executes arbitrary code from the file; only load trusted files.
    with open('data/search_{}.pkl'.format(name), 'rb') as f:
        search_results = pickle.load(f)

    # lowest loss
    search_results.sort(key=lambda entry: entry[0])
    lowest = search_results[0]
    print('Lowest loss:', lowest)

    # order by network capacity (entry[3] holds the network description);
    # the sort is stable, so equally large networks stay ordered by loss
    search_results.sort(key=lambda entry: capacity(entry[3], shape_out))

    with open('data/search_{}.csv'.format(name), 'w') as c:
        c.write('iteration loss acc\n' if with_accuracy else 'iteration loss\n')
        for i, entry in enumerate(search_results):
            if with_accuracy: c.write('{} {} {}\n'.format(i, entry[1], entry[2]))
            else: c.write('{} {}\n'.format(i, entry[1]))

    x = np.arange(len(search_results))
    # grey marker: position of the lowest-loss network within the capacity ordering
    plt.axvline(search_results.index(lowest), color='grey', lw=3, alpha=0.5)
    plt.scatter(x, [entry[1] for entry in search_results], color='red', marker='.', label='val loss')
    if with_accuracy:
        plt.scatter(x, [entry[2] for entry in search_results], color='green', marker='.', label='val acc')
    plt.xlabel('network')
    plt.legend()
    plt.savefig('data/search_{}.png'.format(name), bbox_inches='tight', dpi=200)
    plt.show()

    return search_results, lowest


# action network search: loss and accuracy
search_results, lowest = _plot_search_results('action', shape_out=4, with_accuracy=True)

# length network search: loss only
search_results, lowest = _plot_search_results('length', shape_out=1, with_accuracy=False)

In [None]:
# Build the data generators for training and pick out the validation arrays.

with open('data/imitation_data.pkl', 'rb') as f:
    imitation_data = pickle.load(f)

# Both generators iterate over the same training states and differ only in the target.
train_states = imitation_data['state']
train_action_generator = Generator(train_states, imitation_data['action'])
train_length_generator = Generator(train_states, imitation_data['length'])

# Validation data is handed to Keras directly as arrays.
state_val, action_val, length_val = (
    imitation_data[key] for key in ('state_val', 'action_val', 'length_val')
)

In [None]:
# Train the action network on the imitation data.

# tensorboard: one time-stamped log directory per run
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join(logs_base_dir, "il-{}".format(run_stamp))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir)

# candidate architectures as (conv layers, dense layers); the first one is trained
models= (([(29, 6), (24, 4), (26, 8), (16, 5), (22, 4, 2), (23, 7)], [14]),
         ([(16, 6), (15, 6), (10, 4), (6, 9), (7, 7, 2), (8, 8), (11, 8), (10, 9)], []),
         ([(16, 5), (15, 7), (6, 6), (16, 4), (7, 4), (6, 8), (6, 9), (16, 5)], [83]),
         ([(20, 6), (14, 4), (32, 6), (15, 5), (31, 7, 2), (9, 4)], [55]),
         ([(15, 3), (15, 7, 2), (11, 7, 2), (8, 7, 2), (9, 5)], []),
         ([(15, 8), (6, 7), (13, 9), (14, 7, 2), (12, 5), (4, 8)], [219, 59]))
model, _ = create(model=models[0])
plot(model, 'models/il_action.png')

training = model.fit_generator(generator=train_action_generator,
                               validation_data=(state_val, action_val),
                               use_multiprocessing=True, workers=2, epochs=30,
                               callbacks=[tensorboard_callback])
history = training.history
model.save_weights('models/il_action.h5')


# save and plot history
with open('models/il_action_loss.pkl', 'wb') as f:
    pickle.dump(history, f)

epochs = len(history['loss'])
csv_columns = ('categorical_accuracy', 'val_categorical_accuracy', 'loss', 'val_loss')
with open('models/il_action_loss.csv', 'w') as f:
    f.write('epoch acc acc_val loss loss_val\n')
    for i in range(epochs):
        f.write('{} {} {} {} {}\n'.format(i, *(history[key][i] for key in csv_columns)))

x = np.arange(epochs)
# accuracy in green, loss in red; validation curves are dashed
curves = (
    ('categorical_accuracy', dict(color='green', label='acc')),
    ('val_categorical_accuracy', dict(color='green', linestyle='dashed', label='val acc')),
    ('loss', dict(color='red', label='loss')),
    ('val_loss', dict(color='red', linestyle='dashed', label='val loss')),
)
for key, line_style in curves:
    plt.plot(x, history[key], **line_style)
plt.xlabel('epoch')
plt.legend()
plt.savefig('models/il_action_loss.png', bbox_inches='tight', dpi=200)
plt.show()

In [None]:
# Train the length network, which predicts the remaining plan length of a state.

# tensorboard: one time-stamped log directory per run
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join(logs_base_dir, "il-{}".format(run_stamp))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir)

# candidate architectures as (conv layers, dense layers); the second one is trained
models = (([(28, 5), (28, 5), (9, 6), (32, 9, 2), (10, 3, 2), (31, 5, 2), (10, 5, 2), (10, 8, 2)], []),
          ([(26, 3), (23, 6), (20, 7), (29, 8, 2), (30, 5, 2), (31, 7), (10, 5)], []),
          ([(14, 3), (9, 7), (14, 6), (13, 8), (5, 4), (16, 9), (4, 3), (15, 7)], [49]),
          ([(8, 3), (16, 9), (10, 7), (10, 3)], [217]),
          ([(9, 3), (11, 7), (14, 8), (9, 4), (15, 8), (12, 9), (13, 8)], [42]),
          ([(14, 9), (8, 8), (7, 4), (11, 8), (13, 6), (12, 8), (3, 5)], [226]),
          ([(11, 7), (11, 9), (5, 4), (9, 7), (8, 8), (8, 8), (1, 9), (4, 7)], [216, 248, 45]))
# single linear output unit for the regression target
model, _ = create(model=models[1], shape_out=1, activation_out='linear', loss='custom')
plot(model, 'models/il_length.png')

training = model.fit_generator(generator=train_length_generator,
                               validation_data=(state_val, length_val),
                               use_multiprocessing=True, workers=2, epochs=30,
                               callbacks=[tensorboard_callback])
history = training.history
model.save_weights('models/il_length.h5')


# save and plot history
with open('models/il_length_loss.pkl', 'wb') as f:
    pickle.dump(history, f)

epochs = len(history['loss'])
with open('models/il_length_loss.csv', 'w') as f:
    f.write('epoch loss loss_val\n')
    for i in range(epochs):
        f.write('{} {} {}\n'.format(i, *(history[key][i] for key in ('loss', 'val_loss'))))

x = np.arange(epochs)
# training loss solid, validation loss dashed
for key, line_style in (('loss', dict(color='red', label='loss')),
                        ('val_loss', dict(color='red', linestyle='dashed', label='val loss'))):
    plt.plot(x, history[key], **line_style)
plt.xlabel('epoch')
plt.legend()
plt.savefig('models/il_length_loss.png', bbox_inches='tight', dpi=200)
plt.show()

<h3>Imitation Learning with exploration</h3>

In [None]:
# Imitation learning with exploration: roll out the current policy, label the
# visited states with the better of the policy's own solution and the ff
# solver's plan, and train the action and length networks on a replay buffer.

# True: only states visited by the policy are added to the replay buffer.
# False: additionally every state along the ff plan from each visited state.
DAgger = False

model_best = -np.inf
model, _ = create()
model.load_weights('models/il_action_exploration.h5')

model_length, _ = create(shape_out=1, activation_out='linear', loss='mean_absolute_error')
model_length.load_weights('models/il_length_exploration.h5')

stats = []
replay = Replay()

# only training environments (index >= split) are used for exploration
indizes = np.arange(split, len(sokoban_train.all_envs))
for _ in range(4):
    np.random.shuffle(indizes)
    for r in range(len(indizes)):
        # evaluate policy
        if r % 200 == 0:
            stats.append(evaluate(model, sokoban_train, split))
            print(r, stats[-1])
            if np.average(stats[-1][1]) >= model_best: # save better model
                model.save_weights('models/il_action_exploration.h5')
                model_length.save_weights('models/il_length_exploration.h5')
                model_best = np.average(stats[-1][1])
                print('saved model', model_best)

            with open('models/il_action_exploration.pkl', 'wb') as f: # save stats
                pickle.dump(stats, f)


        # explore
        trace_state = []
        trace_action = []
        total_reward = 0
        done = False # stays False when the step budget below is zero
        env, plan, i = sokoban_train.get(indizes[r])
        for _ in range(int(len(plan) * 1.5)): # step budget: 1.5 times the stored plan length
            trace_state.append(copy.deepcopy(env)) # snapshot of the state BEFORE the action
            state = binary(strip(env))[0]

            probs = model.predict(np.expand_dims(state, axis=0))[0]
            # Sample the action INDEX. Sampling a probability value and looking
            # it up again would always return the first of several equally
            # likely actions.
            action = np.random.choice(len(probs), p=probs) # sample action
            trace_action.append(action)

            _, reward, done, info = env.step(action_translator[action], 'tiny_rgb_array')
            total_reward += Sokoban.reward(reward, info)
            if done: break


        # create learning data, walking the trace backwards from its last state
        trace_state.reverse()
        trace_action.reverse()
        # steps the policy needed from the current trace state to the goal;
        # unknown (None) until a solvable state is found if the episode failed
        plan_length = 0 if done else None
        for e in range(len(trace_state)):
            plan = solve(trace_state[e])
            if done or plan: # solvable state
                if plan_length is None: plan_length = len(plan)
                else: plan_length += 1

                # `not plan`: the policy solved the episode although ff found
                # no plan from here, so the policy's action is the only label
                if done and (not plan or plan_length < len(plan)): # policy solution is better
                    replay.add(strip(trace_state[e]), np.eye(4)[trace_action[e]], reward=plan_length)
                else: # add ff solution
                    env = trace_state[e]
                    solved = False # the DAgger branch leaves the loop before the first step
                    for step in range(len(plan)):
                        replay.add(strip(env), np.eye(4)[plan[step]], reward=len(plan) - step)
                        if DAgger: break # only add states from trace
                        solved = env.step(action_translator[plan[step]], 'tiny_rgb_array')[2]
                    if not DAgger and not solved: print('ERROR ff plan')


        # improve policy once the replay buffer reports more than 1/4 from `full()`
        if replay.full() > 1/4:
            for _ in range(100):
                data = replay.sample()
                model.train_on_batch(data[0], data[1])
                model_length.train_on_batch(data[0], data[3])

plot_stats(stats, 'models/il_action_exploration.png')

<h3>Pre-train q-values</h3>

In [None]:
# Pre-generate DQN learning data: for every state on a stored solution path
# (except the last one) estimate one q-value per action from ff solver plans.

imitation_qvalues = {
    'state': [],
    'qvalue': [],
    'state_val': [],
    'qvalue_val': [],
}

for i in tqdm(range(len(sokoban_train.all_envs))):
    env, plan, _ = sokoban_train.get(i)

    for action in plan[:-1]:
        q_values = [None] * 4
        for a in range(len(q_values)):
            # try action `a` on a copy so that `env` itself stays on the stored path
            env_a = copy.deepcopy(env)
            _, reward, done, info = env_a.step(action_translator[a], 'tiny_rgb_array')
            q_values[a] = Sokoban.reward(reward, info)

            if done: continue # nothing left to add after the final step

            # add the discounted rewards collected along the ff plan from the successor state
            solution = solve(env_a)
            if solution:
                for x in range(len(solution)):
                    _, reward, done, info = env_a.step(action_translator[solution[x]], 'tiny_rgb_array')
                    q_values[a] += (Config.discount**(x + 1)) * Sokoban.reward(reward, info)
            else: q_values[a] -= 10 # unsolvable punishment


        if i < split: # validation data
            imitation_qvalues['state_val'].append(binary(strip(env))[0])
            imitation_qvalues['qvalue_val'].append(q_values)
        else:
            imitation_qvalues['state'].append(strip(env))
            imitation_qvalues['qvalue'].append(q_values)

        # advance along the stored plan; same observation mode as every other
        # step call in this notebook (the returned observation is not used)
        env.step(action_translator[action], 'tiny_rgb_array')


for i in ['state_val', 'qvalue_val']:
    imitation_qvalues[i] = np.array(imitation_qvalues[i])

with open('data/imitation_qvalues.pkl', 'wb') as f:
    pickle.dump(imitation_qvalues, f)

In [None]:
# Pre-train the q-value network on the generated q-value data.

# load data
with open('data/imitation_qvalues.pkl', 'rb') as f:
    imitation_qvalues = pickle.load(f)

train_qvalue_generator = Generator(imitation_qvalues['state'], imitation_qvalues['qvalue'])
state_val, qvalue_val = imitation_qvalues['state_val'], imitation_qvalues['qvalue_val']


# tensorboard: one time-stamped log directory per run
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
logdir = os.path.join(logs_base_dir, "il-{}".format(run_stamp))
tensorboard_callback = tf.keras.callbacks.TensorBoard(logdir)

# train network (linear outputs, one q-value per action)
model, _ = create(activation_out='linear')

training = model.fit_generator(generator=train_qvalue_generator,
                               validation_data=(state_val, qvalue_val),
                               use_multiprocessing=True, workers=2, epochs=30,
                               callbacks=[tensorboard_callback])
history = training.history
model.save_weights('models/il_qaction.h5')


# save and plot history
with open('models/il_qaction_loss.pkl', 'wb') as f:
    pickle.dump(history, f)

epochs = len(history['loss'])
with open('models/il_qaction_loss.csv', 'w') as f:
    f.write('epoch loss loss_val\n')
    for i in range(epochs):
        f.write('{} {} {}\n'.format(i, *(history[key][i] for key in ('loss', 'val_loss'))))

x = np.arange(epochs)
# training loss solid, validation loss dashed
for key, line_style in (('loss', dict(color='red', label='loss')),
                        ('val_loss', dict(color='red', linestyle='dashed', label='val loss'))):
    plt.plot(x, history[key], **line_style)
plt.xlabel('epoch')
plt.legend()
plt.savefig('models/il_qaction_loss.png', bbox_inches='tight', dpi=200)
plt.show()

<h2>Reinforcement Learning</h2>

In [None]:
from dqn import *
from ppo import *
from exploration import *

<h3>DQN</h3>

In [None]:
# DQN training: alternate between periodic evaluation and exploration episodes
# on the training environments.

# best evaluation score so far; -inf makes the first evaluation always save the agent
agent_best = -np.inf
agent = DQNAgent()
agent.load('models/rl_qaction_dqn.h5')

stats = []
replay = Replay()

# only training environments (index >= split) are used for exploration
indizes = np.arange(split, len(sokoban_train.all_envs))
for _ in range(4): # four passes over the training environments
    np.random.shuffle(indizes)
    for r in range(len(indizes)):
        # evaluate policy every 200 environments
        if r % 200 == 0:
            stats.append(evaluate(agent.Q, sokoban_train, split))
            print(r, stats[-1])
            # NOTE(review): stats[-1][0] is summed here, whereas the other training
            # cells average stats[-1][1] — presumably per-environment rewards; confirm in evaluate().
            if sum(stats[-1][0]) >= agent_best: # save better model
                agent.save('models/rl_qaction_dqn.h5')
                agent_best = sum(stats[-1][0])
                print('saved agent', agent_best)

            with open('models/rl_qaction_dqn.pkl', 'wb') as f: # save stats
                pickle.dump(stats, f)


        # explore
        # training is enabled once replay.full() exceeds 1/4
        # (presumably the fill ratio of the buffer — TODO confirm in Replay)
        train = replay.full() > 1/4
        env, plan, i = sokoban_train.get(indizes[r])
        for j in range(5): # up to five attempts per environment
            # every attempt starts from a fresh copy and gets a budget of 1.5 times the stored plan length
            total_reward, done, steps, _ = dqn_episode(copy.deepcopy(env), agent, replay,
                                                       max_steps=int(len(plan) * 1.5), train=train)

            # add ff transitions
            # with j in range(5), `j % 5 == 0` holds only for the first attempt, so the
            # stored plan is replayed at most once per environment
            if not done and j % 5 == 0: dqn_episode(copy.deepcopy(env), agent, replay, path=plan, train=train)

            if done: break

plot_stats(stats, 'models/rl_qaction_dqn.png')

<h3>PPO</h3>

In [None]:
# PPO training: alternate between periodic evaluation, exploration episodes and
# policy updates on the training environments.

# best evaluation score so far; -inf makes the first evaluation always save the agent
agent_best = -np.inf
agent = PPOAgent()
# the agent is initialised from the imitation-learning weights but saved under its own file name below
agent.load('models/il_action_exploration.h5')

stats = []
replay = Replay(augment=False)

# only training environments (index >= split) are used for exploration
indizes = np.arange(split, len(sokoban_train.all_envs))
for _ in range(8): # eight passes over the training environments
    np.random.shuffle(indizes)
    for r in range(len(indizes)):
        # evaluate policy every 500 environments
        if r % 500 == 0:
            stats.append(evaluate(agent.actor, sokoban_train, split))
            print(r, stats[-1])
            if np.average(stats[-1][1]) >= agent_best: # save better model
                agent.save('models/rl_action_ppo.h5')
                agent_best = np.average(stats[-1][1])
                print('saved agent', agent_best)

            with open('models/rl_action_ppo.pkl', 'wb') as f: # save stats
                pickle.dump(stats, f)


        # explore
        env, plan, i = sokoban_train.get(indizes[r])
        for j in range(5): # up to five attempts per environment
            # every attempt starts from a fresh copy and gets a budget of 1.5 times the stored plan length
            total_reward, done, steps = ppo_episode(copy.deepcopy(env), agent, replay, max_steps=int(len(plan) * 1.5))

            # NOTE(review): per-episode tuples are appended to the same `stats` list that
            # holds the evaluate() results which are pickled above and passed to
            # plot_stats below — confirm that plot_stats handles this mixed content.
            stats.append((total_reward, done, steps, env.num_boxes, i))
            print(r, stats[-1])

            if done: break


        # update policy
        # every fifth environment: ten updates on batches of 256 samples, then the buffer is reset
        if r > 0 and r % 5 == 0:
            for _ in range(10):
                data = replay.sample(256)
                loss_value, loss_policy = agent.update(data)
            replay.reset()

plot_stats(stats, 'models/rl_action_ppo.png')