In [12]:
pip install tensorflow==2.0

Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install gym

Note: you may need to restart the kernel to use updated packages.


In [14]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [15]:
pip install pyglet==1.4.9

Note: you may need to restart the kernel to use updated packages.


In [16]:
import gym
from gym import spaces
import numpy as np
import random
import keras
from collections import deque
from keras import backend as back
back.image_data_format()
from keras.models import Model, Sequential, load_model
from keras.optimizers import RMSprop
from keras.layers import Dense, merge, Lambda, Input, Add, Conv2D, LeakyReLU, Flatten, Multiply, Reshape

In [22]:
class WaterSortPuzzleEnv(gym.Env):
    """Simplified water-sort puzzle with one colour slot per tube.

    The state is a 1-D integer array of length ``num_tubes``. A value of ``0``
    marks an empty slot; values ``1..num_colors`` are colour ids. An action is
    a ``(source_tube, target_tube)`` pair of tube indices.

    Rewards: ``+1`` when the puzzle is solved, ``-1`` when it is judged
    unsolvable, ``0`` otherwise.
    """

    def __init__(self):
        # 10 tube environment
        self.num_tubes = 10
        self.num_colors = 8
        self.max_colors_tube = 4

        # Every slot holds 0 (empty) or a colour id in 1..num_colors, so each
        # dimension has num_colors + 1 categories.
        self.observation_space = spaces.MultiDiscrete(
            [self.num_colors + 1] * self.num_tubes
        )
        self.action_space = spaces.Tuple((
            spaces.Discrete(self.num_tubes),  # Source tube
            spaces.Discrete(self.num_tubes)   # Target tube
        ))
        # Keep the generated state so the environment is usable (and reset()
        # can inspect it) before the first explicit reset.
        self.state = self.initial_state()
        self.done = False

    def _outcome_reward(self):
        """Return +1 if the current state is solved, -1 if unsolvable, else 0."""
        if self.is_puzzle_solved():
            return 1
        if self.is_puzzle_unsolvable():
            return -1
        return 0

    def reset(self):
        """Start a new episode.

        Returns:
            A ``(state, reward, done, info)`` tuple, where ``reward`` scores
            the state the previous episode ended in. This is not the classic
            gym contract (observation only); the tuple shape is kept so
            existing callers keep working.
        """
        # Score the outgoing state before it is replaced.
        reward = self._outcome_reward()
        self.state = self.initial_state()
        self.done = False

        return self.state.copy(), reward, self.done, {}

    def step(self, action):
        """Apply one ``(source_tube, target_tube)`` action.

        Returns:
            ``(state, reward, done, info)``. The state is a copy, so callers
            may store it without it being mutated by later steps.
        """
        source_tube, target_tube = action

        # Pouring a tube into itself is a no-op.
        if source_tube == target_tube:
            return self.state.copy(), 0, self.done, {}

        self.pour(source_tube, target_tube)

        # The episode ends on success (+1) or on a dead end (-1).
        reward = self._outcome_reward()
        self.done = reward != 0

        return self.state.copy(), reward, self.done, {}

    def render(self, mode='human'):
        """Print the raw state array."""
        print(self.state)

    def initial_state(self):
        """Build a fresh state: the first half of the tubes is coloured, the rest is empty.

        ``max_colors_tube`` distinct colour ids are drawn from
        ``1..num_colors`` (0 is reserved for "empty") and assigned cyclically.
        """
        state = np.zeros(self.num_tubes, dtype=int)
        color_pool = np.random.choice(
            np.arange(1, self.num_colors + 1), self.max_colors_tube, replace=False
        )

        for i in range(self.num_tubes // 2):
            state[i] = color_pool[i % self.max_colors_tube]

        return state

    def is_puzzle_solved(self):
        """Return True when every colour present occupies exactly ``max_colors_tube`` slots."""
        for color in range(1, self.num_colors + 1):
            occupied = np.count_nonzero(self.state == color)

            if occupied > 0 and occupied != self.max_colors_tube:
                return False

        return True

    def is_puzzle_unsolvable(self):
        """Return True when some colour cannot reach ``max_colors_tube`` slots.

        A colour is considered stuck when the number of empty slots is smaller
        than the number of slots it still needs.

        NOTE(review): colours that are absent from the state count as needing
        ``max_colors_tube`` slots, unlike ``is_puzzle_solved`` which ignores
        absent colours — confirm this is intended.
        """
        # The number of empty slots does not depend on the colour being checked.
        empty_count = np.count_nonzero(self.state == 0)

        for color in range(1, self.num_colors + 1):
            occupied = np.count_nonzero(self.state == color)

            if occupied < self.max_colors_tube:
                if empty_count < (self.max_colors_tube - occupied):
                    return True
        return False

    def pour(self, source_tube, target_tube):
        """Spread the source tube's colour into every empty slot, then empty the source.

        NOTE(review): ``target_tube`` is currently not used — the colour goes to
        all empty slots rather than to the chosen target. Confirm the intended
        game rule before changing the dynamics.
        """
        source_color = self.state[source_tube]

        self.state[self.state == 0] = source_color
        self.state[source_tube] = 0

    def is_tube_empty(self, tube):
        """Return True when the given tube holds the empty marker (0)."""
        return bool(self.state[tube] == 0)

    def is_tube_full(self, tube):
        """Return True when the tube's value appears in exactly ``max_colors_tube`` slots.

        NOTE(review): for an empty tube this counts empty slots — confirm
        whether empty tubes should always report False.
        """
        return np.count_nonzero(self.state == self.state[tube]) == self.max_colors_tube
    

In [23]:
class WaterPuzzleDQN:
    """DQN agent: convolutional Q-network, target network and replay buffer.

    Args:
        sizeOfState: shape of a single state as ``(height, width, channels)``;
            used as the network input shape and for reshaping stored states.
        sizeOfAction: number of discrete actions (size of the Q-value output).

    NOTE(review): the network expects image-like states while
    ``WaterSortPuzzleEnv`` emits a flat vector and tuple actions — confirm how
    the two are meant to be connected.
    """

    # Environment instance created once, when the class body is executed.
    cp_env = WaterSortPuzzleEnv()

    def __init__(self, sizeOfState, sizeOfAction):

        self.sizeOfState = sizeOfState
        self.sizeOfAction = sizeOfAction

        self.gamma = .99          # discount factor
        self.minEpsilon = .1      # floor of the exploration rate
        self.maxEpsilon = 1       # current exploration rate (decays over time)
        self.decay = .99          # multiplicative epsilon decay
        self.batchSize = 32
        self.training = 50000     # buffer size after which epsilon starts decaying
        self.mem = deque(maxlen=1000000)
        self.model = self.ddqnModel()
        self.target = self.ddqnModel()

    def ddqnModel(self):
        """Build and compile the Q-network: three conv layers, one hidden dense layer, linear Q output."""
        model = Sequential()
        model.add(Conv2D(32, (8, 8), strides=(4, 4), activation='relu', input_shape=self.sizeOfState))
        model.add(Conv2D(64, (4, 4), strides=(2, 2), activation='relu'))
        model.add(Conv2D(64, (3, 3), strides=(1, 1), activation='relu'))
        model.add(Flatten())
        model.add(Dense(512, activation='relu'))
        model.add(Dense(self.sizeOfAction))
        # NOTE(review): `lr` is the legacy keyword; newer Keras releases name it
        # `learning_rate` — adjust if the installed version rejects it.
        model.compile(loss="mse", optimizer=RMSprop(lr=0.00025, epsilon=0.01))
        model.summary()
        return model

    def _batched(self, s):
        """Reshape a state to ``(-1, *sizeOfState)`` so it can be fed to the network."""
        return np.reshape(s, (-1,) + tuple(self.sizeOfState))

    # update target model
    def updateModel(self):
        """Copy the online network's weights into the target network."""
        self.target.set_weights(self.model.get_weights())
        return

    # get action - either explore or exploit
    def actions(self, s):
        """Pick an action epsilon-greedily for state ``s``.

        Returns a random action index with probability ``maxEpsilon``,
        otherwise the argmax of the online network's Q-values.
        """
        s = self._batched(s)
        # Exploration vs Exploitation
        if np.random.random() <= self.maxEpsilon:
            return random.randrange(self.sizeOfAction)
        else:
            return np.argmax(self.model.predict(s))

    # store experience in memory
    def memReplay(self, s, a, r, ns, done):
        """Store one transition and decay epsilon once the buffer is warm.

        Args:
            s: state before the action.
            a: index of the action taken.
            r: reward received.
            ns: state after the action.
            done: whether the episode ended with this transition.
        """
        s = self._batched(s)
        ns = self._batched(ns)
        self.mem.append((s, a, r, ns, done))

        if len(self.mem) > self.training:
            if self.maxEpsilon > self.minEpsilon:
                # Clamp so the exploration rate never drops below its floor.
                self.maxEpsilon = max(self.minEpsilon, self.maxEpsilon * self.decay)

    # use that experience to train
    def repSample(self):
        """Train the online network on one random minibatch from the replay buffer.

        Does nothing while the buffer holds fewer than ``batchSize``
        transitions. Bootstrapped targets use the target network.
        """
        if len(self.mem) < self.batchSize:
            return

        # get random sample for minibatch
        miniBatch = random.sample(self.mem, self.batchSize)

        batch_shape = (self.batchSize,) + tuple(self.sizeOfState)
        current = np.zeros(batch_shape)
        ns = np.zeros(batch_shape)
        a, r, done = [], [], []

        for x, (state, action, reward, next_state, finished) in enumerate(miniBatch):
            current[x] = state
            a.append(action)
            r.append(reward)
            ns[x] = next_state
            done.append(finished)

        # Q-values of the current states (online network) and of the next
        # states (target network, kept fixed between updateModel() calls).
        tar = self.model.predict(current)
        nt = self.target.predict(ns)

        for x in range(len(miniBatch)):
            if done[x]:
                # Terminal transition: no future value to bootstrap from.
                tar[x][a[x]] = r[x]
            else:
                # Bellman target: r + gamma * max_a' Q_target(s', a')
                tar[x][a[x]] = r[x] + self.gamma * (np.amax(nt[x]))

        self.model.fit(current, tar, batch_size=self.batchSize, verbose=0)