In [1]:
import keras
import numpy as np
import gym

from RubiksEnv import RubiksEnv

from numba import jit, uint8, float32

from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy, LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.agents import DQNAgent

Using TensorFlow backend.


In [2]:
@jit(float32(uint8[:, :, :, :]),nopython=True)
def rubiks_hamilton_dist(cube):
    x, y, z = cube.shape[:-1]
    distance_corner = float(0.0)
    distance_edge = float(0.0)
    
    distance_rot = float(0.0)
    
    
    for i in range(x):
        for j in range(y):
            for k in range(z):
                cubie = cube[i,j,k]
                dist_rot = float(0.0)
                
                if np.any(cubie == 5):
                    i_true = 0
                    dist_rot += cubie[0] != 5
                elif np.any(cubie == 3):
                    i_true = 2
                    dist_rot += cubie[0] != 3
                else:
                    i_true = 1
                    
                if np.any(cubie == 4):
                    j_true = 0
                    dist_rot += cubie[1] != 4
                elif np.any(cubie == 2):
                    j_true = 2
                    dist_rot += cubie[1] != 2
                else:
                    j_true = 1
                    
                if np.any(cubie == 1):
                    k_true = 0
                    dist_rot += cubie[2] != 1
                elif np.any(cubie == 6):
                    k_true = 2
                    dist_rot += cubie[2] != 6
                else:
                    k_true = 1
                
                distance = abs(i - i_true) + abs(j - j_true) + abs(k - k_true)
                
                num_zero = (cubie == 0).sum()
                if (num_zero == 0):
                    distance_corner += distance
                elif (num_zero == 1):
                    distance_edge += distance
                    
                distance_rot += dist_rot / float(3.0)
                    
    return 1 / (max(distance_corner, distance_edge) + distance_rot + 1)

In [3]:
env = RubiksEnv(rubiks_hamilton_dist, scrambles=2, max_step=2)
nb_actions = env.action_space.n

In [4]:
INPUT_SHAPE = env.observation_space.shape
WINDOW_LENGTH = 1

In [10]:
model = keras.models.Sequential()

model.add(keras.layers.Reshape(INPUT_SHAPE, input_shape=(WINDOW_LENGTH,) + INPUT_SHAPE))
model.add(keras.layers.Convolution3D(filters=32, kernel_size=(2, 2, 2), strides=(1, 1, 1), activation='elu'))
model.add(keras.layers.Flatten())

model.add(keras.layers.Dense(32, activation='elu'))
model.add(keras.layers.Dense(16, activation='elu'))
model.add(keras.layers.Dense(nb_actions, activation='linear'))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_2 (Reshape)          (None, 3, 3, 3, 3)        0         
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 2, 2, 2, 32)       800       
_________________________________________________________________
flatten_2 (Flatten)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 32)                8224      
_________________________________________________________________
dense_5 (Dense)              (None, 16)                528       
_________________________________________________________________
dense_6 (Dense)              (None, 12)                204       
Total params: 9,756
Trainable params: 9,756
Non-trainable params: 0
_________________________________________________________________


In [11]:
memory = SequentialMemory(limit=20000, window_length=WINDOW_LENGTH)
processor = None

In [12]:
policy = LinearAnnealedPolicy(EpsGreedyQPolicy(), attr='eps', value_max=0.8, value_min=.05, value_test=.05,
                              nb_steps=40000)

In [13]:
agent = DQNAgent(model=model, nb_actions=nb_actions, policy=policy, memory=memory, nb_steps_warmup=4000, gamma=.99, target_model_update=0.8, train_interval=4)

In [14]:
agent.compile(keras.optimizers.Adam())

In [15]:
agent.fit(env, nb_steps=100000)

Training for 100000 steps ...
Interval 1 (0 steps performed)
2546 episodes - episode_reward: 0.186 [0.090, 1.213] - loss: 0.044 - mean_q: 0.675 - mean_q: 0.675 - mean_eps: 0.669

Interval 2 (10000 steps performed)
 1124/10000 [==>...........................] - ETA: 83s - reward: 0.0939done, took 81.662 seconds


<keras.callbacks.History at 0x7f4934265a90>

In [12]:
agent.test(env, nb_episodes=20)

Testing for 20 episodes ...
Episode 1: reward: 0.236, steps: 4
Episode 2: reward: 1.077, steps: 2
Episode 3: reward: 1.077, steps: 2
Episode 4: reward: 0.156, steps: 4
Episode 5: reward: 1.077, steps: 2
Episode 6: reward: 1.158, steps: 4
Episode 7: reward: 1.213, steps: 4
Episode 8: reward: 1.213, steps: 4
Episode 9: reward: 1.077, steps: 2
Episode 10: reward: 1.077, steps: 2
Episode 11: reward: 1.158, steps: 4
Episode 12: reward: 0.203, steps: 4
Episode 13: reward: 1.077, steps: 2
Episode 14: reward: 1.151, steps: 4
Episode 15: reward: 1.077, steps: 2
Episode 16: reward: 1.077, steps: 2
Episode 17: reward: 0.199, steps: 4
Episode 18: reward: 1.077, steps: 2
Episode 19: reward: 1.077, steps: 2
Episode 20: reward: 1.077, steps: 2


<keras.callbacks.History at 0x7f45e0ebab00>

In [18]:
agent.model.predict(np.reshape(cube, (1,1,3,3,3,3))).argmax()

7

In [19]:
env._rotations[7](cube)

In [20]:
cube == env._base_cube

array([[[[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]]],


       [[[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]]],


       [[[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]],

        [[ True,  True,  True],
         [ True,  True,  True],
         [ True,  True,  True]]]], dtype=bool)

In [73]:
rubiks_hamilton_dist(cube)

-7.5