In [1]:
import keras
from keras import layers, models
from keras import backend as K
import numpy as np
import gym

from RubiksEnv import RubiksEnv

from numba import jit, uint8, float32

from rl.memory import SequentialMemory
from rl.policy import BoltzmannQPolicy, LinearAnnealedPolicy, EpsGreedyQPolicy
from rl.agents import DQNAgent

Using TensorFlow backend.


In [2]:
@jit(float32(uint8[:, :, :, :]), nopython=True)
def rubiks_hamilton_dist(cube):
    """Score a cube state; the result lies in (0, 1] and is 1 when all distances are zero.

    ``cube[i, j, k]`` is read as the value vector of the cubie stored at grid
    index ``(i, j, k)``. Slots 0, 1 and 2 of that vector are compared against
    specific values (presumably colour codes, with 0 meaning "no sticker" --
    TODO confirm against RubiksEnv).

    For every cubie:
      * a home index is derived per axis from the values it contains
        (axis 0: 5 -> 0, 3 -> 2; axis 1: 4 -> 0, 2 -> 2; axis 2: 1 -> 0,
        6 -> 2; otherwise 1);
      * the position offset is the sum of absolute differences between the
        current and home indices;
      * one orientation point is added for each axis whose matched value is
        not sitting in that axis' slot.

    Position offsets are accumulated separately for cubies with no zero
    entries and for cubies with exactly one zero entry; other cubies add no
    position offset. Orientation points are divided by 3 per cubie and summed
    over all cubies.

    Returns ``1 / (max(no_zero_total, one_zero_total) + orientation_total + 1)``.
    """
    n_i, n_j, n_k = cube.shape[0], cube.shape[1], cube.shape[2]

    corner_total = 0.0       # position offsets of cubies without any zero entry
    edge_total = 0.0         # position offsets of cubies with exactly one zero entry
    twist_total = 0.0        # summed per-cubie orientation penalties

    for i in range(n_i):
        for j in range(n_j):
            for k in range(n_k):
                stickers = cube[i, j, k]
                twist = 0.0

                # Axis 0: value 5 belongs at index 0, value 3 at index 2.
                home_i = 1
                if np.any(stickers == 5):
                    home_i = 0
                    if stickers[0] != 5:
                        twist += 1.0
                elif np.any(stickers == 3):
                    home_i = 2
                    if stickers[0] != 3:
                        twist += 1.0

                # Axis 1: value 4 belongs at index 0, value 2 at index 2.
                home_j = 1
                if np.any(stickers == 4):
                    home_j = 0
                    if stickers[1] != 4:
                        twist += 1.0
                elif np.any(stickers == 2):
                    home_j = 2
                    if stickers[1] != 2:
                        twist += 1.0

                # Axis 2: value 1 belongs at index 0, value 6 at index 2.
                home_k = 1
                if np.any(stickers == 1):
                    home_k = 0
                    if stickers[2] != 1:
                        twist += 1.0
                elif np.any(stickers == 6):
                    home_k = 2
                    if stickers[2] != 6:
                        twist += 1.0

                offset = abs(i - home_i) + abs(j - home_j) + abs(k - home_k)

                # The number of zero entries selects which running total
                # receives this cubie's position offset.
                blanks = (stickers == 0).sum()
                if blanks == 0:
                    corner_total += offset
                elif blanks == 1:
                    edge_total += offset

                twist_total += twist / 3.0

    # The +1 keeps the denominator at least 1, so the score never exceeds 1.
    return 1 / (max(corner_total, edge_total) + twist_total + 1)

In [3]:
# Build an initial environment so the action count (and, in the next cell, the
# observation shape) can be read from it. The first positional argument is the
# scoring function defined above (presumably used as the reward -- confirm in RubiksEnv).
DIFFICULTY = 4  # passed as both `scrambles` and `max_step`
env = RubiksEnv(rubiks_hamilton_dist, scrambles=DIFFICULTY, max_step=DIFFICULTY)
nb_actions = env.action_space.n  # 12 in the recorded run (see the dense_2 output shape in the model summary)

In [4]:
# Observation shape reported by the environment; the recorded model summary shows (3, 3, 3, 3).
INPUT_SHAPE = env.observation_space.shape
# Window length shared by the model's input shape and the replay memory.
WINDOW_LENGTH = 1

In [5]:
# Earlier "multi conv" variant (disabled, kept for reference). It was written
# as a sequence of model.add(...) calls following the Reshape layer:
# model.add(keras.layers.Convolution3D(filters=8, kernel_size=(1, 1, 1), strides=(1, 1, 1), activation='elu'))
# model.add(keras.layers.Convolution3D(filters=16, kernel_size=(1, 1, 3), strides=(1, 1, 1), padding='same', activation='elu'))
# model.add(keras.layers.Convolution3D(filters=16, kernel_size=(1, 3, 1), strides=(1, 1, 1), padding='same', activation='elu'))
# model.add(keras.layers.Convolution3D(filters=16, kernel_size=(3, 1, 1), strides=(1, 1, 1), padding='same', activation='elu'))
# model.add(keras.layers.Convolution3D(filters=16, kernel_size=(1, 1, 1), strides=(1, 1, 1), activation='elu'))
# model.add(keras.layers.Flatten())
# model.add(keras.layers.Dense(32, activation='elu'))
# model.add(keras.layers.Dense(16, activation='elu'))
# model.add(keras.layers.Dense(nb_actions, activation='linear'))

# Active Q-network: a 1x1x1 convolution, one 3x3x3 convolution, then a small
# dense head producing one linear output per action.
model = models.Sequential([
    # Reshape (WINDOW_LENGTH,) + INPUT_SHAPE to INPUT_SHAPE, i.e. drop the
    # leading window axis (WINDOW_LENGTH is 1 in this notebook).
    layers.Reshape(INPUT_SHAPE, input_shape=(WINDOW_LENGTH,) + INPUT_SHAPE),
    layers.Conv3D(8, kernel_size=(1, 1, 1), strides=(1, 1, 1), activation='elu'),
    layers.Conv3D(64, kernel_size=(3, 3, 3), strides=(1, 1, 1), activation='elu'),
    layers.Flatten(),
    layers.Dense(64, activation='elu'),
    layers.Dense(nb_actions, activation='linear'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
reshape_1 (Reshape)          (None, 3, 3, 3, 3)        0         
_________________________________________________________________
conv3d_1 (Conv3D)            (None, 3, 3, 3, 8)        32        
_________________________________________________________________
conv3d_2 (Conv3D)            (None, 1, 1, 1, 64)       13888     
_________________________________________________________________
flatten_1 (Flatten)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                4160      
_________________________________________________________________
dense_2 (Dense)              (None, 12)                780       
Total params: 18,860
Trainable params: 18,860
Non-trainable params: 0
_________________________________________________________________


In [6]:
# Replay memory capped at 50000 entries, using the same window length as the model input.
memory = SequentialMemory(limit=50000, window_length=WINDOW_LENGTH)
# NOTE(review): `processor` is set to None here but is not passed to the
# DQNAgent constructor in the cell that builds the agent -- confirm it is intentionally unused.
processor = None

In [7]:
# Epsilon-greedy exploration whose `eps` attribute is annealed linearly from
# value_max down to value_min over nb_steps; value_test is the setting
# supplied for test mode.
policy = LinearAnnealedPolicy(
    EpsGreedyQPolicy(),
    attr='eps',
    value_max=0.9,
    value_min=0.05,
    value_test=0.05,
    nb_steps=500000,
)

In [8]:
# Assemble the DQN agent from the network, exploration policy and replay memory.
# In the recorded run, loss / mean_q first appear at the 50000-step interval,
# which lines up with nb_steps_warmup.
# NOTE(review): in keras-rl a target_model_update below 1 is treated as a
# soft-update rate; 0.8 is unusually large for that -- confirm it is intended.
agent = DQNAgent(
    model=model,
    nb_actions=nb_actions,
    policy=policy,
    memory=memory,
    nb_steps_warmup=50000,
    gamma=0.99,
    target_model_update=0.8,
    train_interval=4,
    enable_dueling_network=True,
)

In [9]:
# Compile the agent with an Adam optimizer built with no arguments (library defaults).
agent.compile(keras.optimizers.Adam())

In [10]:
# Training stage with DIFFICULTY = 2. This overwrites the `DIFFICULTY` and `env`
# globals that were created earlier to size the model.
DIFFICULTY = 2
env = RubiksEnv(rubiks_hamilton_dist, scrambles=DIFFICULTY, max_step=DIFFICULTY)

# nb_steps equals the nb_steps given to the LinearAnnealedPolicy.
agent.fit(env, nb_steps=500000)

Training for 500000 steps ...
Interval 1 (0 steps performed)
2516 episodes - episode_reward: 0.150 [0.091, 1.199]

Interval 2 (10000 steps performed)
2513 episodes - episode_reward: 0.149 [0.088, 1.199]

Interval 3 (20000 steps performed)
2516 episodes - episode_reward: 0.152 [0.091, 1.213]

Interval 4 (30000 steps performed)
2515 episodes - episode_reward: 0.152 [0.090, 1.199]

Interval 5 (40000 steps performed)
2518 episodes - episode_reward: 0.152 [0.085, 1.199]

Interval 6 (50000 steps performed)
2576 episodes - episode_reward: 0.212 [0.091, 1.213] - loss: 0.189 - mean_q: 1.604 - mean_eps: 0.807

Interval 7 (60000 steps performed)
2570 episodes - episode_reward: 0.216 [0.093, 1.213] - loss: 0.186 - mean_q: 1.622 - mean_eps: 0.790

Interval 8 (70000 steps performed)
2542 episodes - episode_reward: 0.195 [0.090, 1.213] - loss: 0.122 - mean_q: 1.318 - mean_eps: 0.773

Interval 9 (80000 steps performed)
2544 episodes - episode_reward: 0.197 [0.085, 1.213] - loss: 0.078 - mean_q: 1.072 

3520 episodes - episode_reward: 0.861 [0.095, 1.213] - loss: 0.031 - mean_q: 0.965 - mean_eps: 0.229

Interval 41 (400000 steps performed)
3451 episodes - episode_reward: 0.849 [0.096, 1.213] - loss: 0.030 - mean_q: 0.976 - mean_eps: 0.212

Interval 42 (410000 steps performed)
3543 episodes - episode_reward: 0.877 [0.096, 1.213] - loss: 0.029 - mean_q: 0.982 - mean_eps: 0.195

Interval 43 (420000 steps performed)
3578 episodes - episode_reward: 0.877 [0.093, 1.213] - loss: 0.029 - mean_q: 0.986 - mean_eps: 0.178

Interval 44 (430000 steps performed)
3523 episodes - episode_reward: 0.884 [0.103, 1.213] - loss: 0.030 - mean_q: 0.995 - mean_eps: 0.161

Interval 45 (440000 steps performed)
3611 episodes - episode_reward: 0.916 [0.100, 1.213] - loss: 0.028 - mean_q: 0.994 - mean_eps: 0.144

Interval 46 (450000 steps performed)
3748 episodes - episode_reward: 0.950 [0.100, 1.213] - loss: 0.027 - mean_q: 0.997 - mean_eps: 0.127

Interval 47 (460000 steps performed)
3760 episodes - episode_rew

<keras.callbacks.History at 0x7f095df575c0>

In [12]:
# Training stage with DIFFICULTY = 3, continuing with the same agent.
DIFFICULTY = 3
env = RubiksEnv(rubiks_hamilton_dist, scrambles=DIFFICULTY, max_step=DIFFICULTY)
# Lower the policy's value_max before fitting; the recorded run for this cell
# starts at mean_eps 0.592. This mutates the shared policy object, so the
# change persists into any later fit call.
agent.policy.value_max = 0.6

agent.fit(env, nb_steps=500000)

Training for 500000 steps ...
Interval 1 (0 steps performed)
2212 episodes - episode_reward: 0.363 [0.106, 1.257] - loss: 0.016 - mean_q: 0.995 - mean_eps: 0.592

Interval 2 (10000 steps performed)
2216 episodes - episode_reward: 0.378 [0.098, 1.257] - loss: 0.017 - mean_q: 0.972 - mean_eps: 0.584

Interval 3 (20000 steps performed)
2236 episodes - episode_reward: 0.390 [0.096, 1.271] - loss: 0.020 - mean_q: 0.944 - mean_eps: 0.573

Interval 4 (30000 steps performed)
2210 episodes - episode_reward: 0.400 [0.103, 1.257] - loss: 0.023 - mean_q: 0.917 - mean_eps: 0.562

Interval 5 (40000 steps performed)
2228 episodes - episode_reward: 0.391 [0.095, 1.257] - loss: 0.026 - mean_q: 0.894 - mean_eps: 0.551

Interval 6 (50000 steps performed)
2261 episodes - episode_reward: 0.410 [0.108, 1.257] - loss: 0.029 - mean_q: 0.867 - mean_eps: 0.540

Interval 7 (60000 steps performed)
2234 episodes - episode_reward: 0.409 [0.102, 1.257] - loss: 0.033 - mean_q: 0.845 - mean_eps: 0.529

Interval 8 (700

2679 episodes - episode_reward: 0.773 [0.109, 1.271] - loss: 0.034 - mean_q: 0.950 - mean_eps: 0.177

Interval 40 (390000 steps performed)
2742 episodes - episode_reward: 0.807 [0.111, 1.271] - loss: 0.034 - mean_q: 0.948 - mean_eps: 0.166

Interval 41 (400000 steps performed)
2782 episodes - episode_reward: 0.818 [0.114, 1.271] - loss: 0.034 - mean_q: 0.949 - mean_eps: 0.155

Interval 42 (410000 steps performed)
2791 episodes - episode_reward: 0.854 [0.111, 1.271] - loss: 0.034 - mean_q: 0.953 - mean_eps: 0.144

Interval 43 (420000 steps performed)
2841 episodes - episode_reward: 0.856 [0.100, 1.271] - loss: 0.033 - mean_q: 0.960 - mean_eps: 0.133

Interval 44 (430000 steps performed)
2753 episodes - episode_reward: 0.801 [0.108, 1.257] - loss: 0.033 - mean_q: 0.965 - mean_eps: 0.122

Interval 45 (440000 steps performed)
2866 episodes - episode_reward: 0.858 [0.113, 1.257] - loss: 0.033 - mean_q: 0.967 - mean_eps: 0.111

Interval 46 (450000 steps performed)
2836 episodes - episode_rew

<keras.callbacks.History at 0x7f78906e11d0>

In [13]:
# Training stage with DIFFICULTY = 4, continuing with the same agent.
DIFFICULTY = 4
env = RubiksEnv(rubiks_hamilton_dist, scrambles=DIFFICULTY, max_step=DIFFICULTY)
# Set the policy's value_max explicitly instead of inheriting whatever an
# earlier cell last assigned. 0.6 is the value in effect for the recorded run
# of this cell (it starts at mean_eps 0.592), so in-order execution is
# unchanged, while re-running this cell after a later stage no longer silently
# picks up that stage's setting.
agent.policy.value_max = 0.6

agent.fit(env, nb_steps=500000)

Training for 500000 steps ...
Interval 1 (0 steps performed)
1727 episodes - episode_reward: 0.275 [0.117, 1.331] - loss: 0.033 - mean_q: 0.961 - mean_eps: 0.592

Interval 2 (10000 steps performed)
1760 episodes - episode_reward: 0.308 [0.118, 1.348] - loss: 0.032 - mean_q: 0.939 - mean_eps: 0.584

Interval 3 (20000 steps performed)
1742 episodes - episode_reward: 0.294 [0.118, 1.330] - loss: 0.032 - mean_q: 0.915 - mean_eps: 0.573

Interval 4 (30000 steps performed)
1749 episodes - episode_reward: 0.313 [0.120, 1.320] - loss: 0.032 - mean_q: 0.887 - mean_eps: 0.562

Interval 5 (40000 steps performed)
1753 episodes - episode_reward: 0.322 [0.123, 1.348] - loss: 0.033 - mean_q: 0.858 - mean_eps: 0.551

Interval 6 (50000 steps performed)
1767 episodes - episode_reward: 0.343 [0.124, 1.320] - loss: 0.032 - mean_q: 0.825 - mean_eps: 0.540

Interval 7 (60000 steps performed)
1769 episodes - episode_reward: 0.340 [0.124, 1.334] - loss: 0.032 - mean_q: 0.804 - mean_eps: 0.529

Interval 8 (700

2102 episodes - episode_reward: 0.738 [0.122, 1.334] - loss: 0.031 - mean_q: 0.869 - mean_eps: 0.177

Interval 40 (390000 steps performed)
2094 episodes - episode_reward: 0.736 [0.125, 1.334] - loss: 0.030 - mean_q: 0.871 - mean_eps: 0.166

Interval 41 (400000 steps performed)
2027 episodes - episode_reward: 0.674 [0.131, 1.348] - loss: 0.031 - mean_q: 0.873 - mean_eps: 0.155

Interval 42 (410000 steps performed)
2076 episodes - episode_reward: 0.718 [0.118, 1.331] - loss: 0.031 - mean_q: 0.877 - mean_eps: 0.144

Interval 43 (420000 steps performed)
2117 episodes - episode_reward: 0.747 [0.132, 1.334] - loss: 0.030 - mean_q: 0.884 - mean_eps: 0.133

Interval 44 (430000 steps performed)
2133 episodes - episode_reward: 0.749 [0.125, 1.330] - loss: 0.030 - mean_q: 0.887 - mean_eps: 0.122

Interval 45 (440000 steps performed)
2145 episodes - episode_reward: 0.768 [0.123, 1.334] - loss: 0.030 - mean_q: 0.884 - mean_eps: 0.111

Interval 46 (450000 steps performed)
2151 episodes - episode_rew

<keras.callbacks.History at 0x7f78906e10b8>

In [16]:
# Training stage with DIFFICULTY = 5, continuing with the same agent.
DIFFICULTY = 5
env = RubiksEnv(rubiks_hamilton_dist, scrambles=DIFFICULTY, max_step=DIFFICULTY)
# Lower the policy's value_max again; the recorded run for this cell starts at mean_eps 0.296.
agent.policy.value_max = 0.3
agent.fit(env, nb_steps=500000)

Training for 500000 steps ...
Interval 1 (0 steps performed)
1599 episodes - episode_reward: 0.492 [0.140, 1.375] - loss: 0.029 - mean_q: 0.810 - mean_eps: 0.296

Interval 2 (10000 steps performed)
1569 episodes - episode_reward: 0.450 [0.138, 1.375] - loss: 0.029 - mean_q: 0.824 - mean_eps: 0.293

Interval 3 (20000 steps performed)
1573 episodes - episode_reward: 0.457 [0.136, 1.375] - loss: 0.029 - mean_q: 0.812 - mean_eps: 0.288

Interval 4 (30000 steps performed)
1571 episodes - episode_reward: 0.438 [0.139, 1.374] - loss: 0.029 - mean_q: 0.811 - mean_eps: 0.283

Interval 5 (40000 steps performed)
1592 episodes - episode_reward: 0.493 [0.139, 1.375] - loss: 0.029 - mean_q: 0.790 - mean_eps: 0.278

Interval 6 (50000 steps performed)
1598 episodes - episode_reward: 0.483 [0.136, 1.393] - loss: 0.029 - mean_q: 0.780 - mean_eps: 0.273

Interval 7 (60000 steps performed)
1569 episodes - episode_reward: 0.442 [0.137, 1.393] - loss: 0.029 - mean_q: 0.789 - mean_eps: 0.268

Interval 8 (700

<keras.callbacks.History at 0x7f78906e18d0>

In [17]:
# Save the trained agent's weights under the path "Model_Multi_Conv".
# NOTE(review): the model built in this notebook is the single 3x3x3 conv
# variant, while the "multi conv" layers are commented out -- confirm the
# file name still describes the saved network.
agent.save_weights("Model_Multi_Conv")