In [2]:
import random
import numpy as np
import pandas as pd
class MinesweeperEnv(object):
    """Simplified Minesweeper environment for reinforcement learning.

    Based on https://github.com/jakejhansen/minesweeper_solver.

    Three parallel views of the game are kept:

    * ``grid``  -- object array, ``'B'`` on mines and ``0`` elsewhere.
    * ``board`` -- copy of ``grid`` where every non-mine tile holds the
      0/1 hint produced by :meth:`possibility_bombs`.
    * ``state`` / ``state_im`` -- what the player has uncovered so far, as a
      list of ``{'coord', 'value'}`` dicts and as a numeric image.

    Note that ``width`` is used as the number of rows and ``height`` as the
    number of columns.
    """

    # Value of a still-hidden tile in ``state_im`` ('U' -> -1, then / 8).
    _UNSOLVED = -0.125

    # Colours used by ``color_state`` for the de-normalised tile values.
    _TILE_COLORS = {
        -1: 'white',
        0: 'slategrey',
        1: 'blue',
        2: 'green',
        3: 'red',
        4: 'midnightblue',
        5: 'brown',
        6: 'aquamarine',
        7: 'black',
        8: 'silver',
    }

    def __init__(self, width, height, n_mines,
        # previous reward scheme, kept for reference:
        #rewards={'win':1, 'lose':-1, 'progress':0.3, 'guess':-0.3, 'no_progress' : -0.3}):
        rewards={'win':1, 'lose':0, 'bomb':-0.5, 'progress':0.3, 'guess':-0.3, 'no_progress' : -0.3}):
        """Create a new game.

        Args:
            width: number of rows of the board.
            height: number of columns of the board.
            n_mines: number of mines to place.
            rewards: mapping with the keys 'win', 'lose', 'bomb', 'progress',
                'guess' and 'no_progress' read by :meth:`step`.

        Raises:
            ValueError: if ``n_mines`` exceeds the number of tiles (mine
                placement could never finish).
        """
        self.nrows, self.ncols = width, height
        self.ntiles = self.nrows * self.ncols
        if n_mines > self.ntiles:
            raise ValueError(
                'n_mines ({}) exceeds the number of tiles ({})'.format(n_mines, self.ntiles))
        self.n_mines = n_mines

        # Copy so that editing one environment's rewards never changes the
        # shared default dict (and therefore every other environment).
        self.rewards = dict(rewards)

        self.grid = self.init_grid()
        self.board = self.get_board()
        self.state, self.state_im = self.init_state()
        self.n_clicks = 0
        self.n_progress = 0
        self.n_wins = 0

    def _neighbor_coords(self, coord):
        """Return the in-bounds (row, col) positions surrounding ``coord``."""
        x, y = coord[0], coord[1]
        return [(row, col)
                for col in range(y-1, y+2)
                for row in range(x-1, x+2)
                if (x != row or y != col)
                and 0 <= col < self.ncols
                and 0 <= row < self.nrows]

    def init_grid(self):
        """Return an object array with ``n_mines`` tiles set to ``'B'``."""
        board = np.zeros((self.nrows, self.ncols), dtype='object')
        mines = self.n_mines

        # Rejection sampling: retry whenever the drawn tile already has a mine.
        while mines > 0:
            row, col = random.randint(0, self.nrows-1), random.randint(0, self.ncols-1)
            if board[row][col] != 'B':
                board[row][col] = 'B'
                mines -= 1

        return board

    def get_neighbors(self, coord):
        """Return the ``grid`` values of the tiles surrounding ``coord``.

        The result is always an object array so that ``neighbors == 'B'``
        is an elementwise comparison even when no neighbour is a mine.
        """
        values = [self.grid[rc] for rc in self._neighbor_coords(coord)]
        return np.array(values, dtype=object)

    def is_guess(self, state, coord):
        """Return True when every neighbour of ``coord`` is still hidden.

        ``state`` is a state image as produced by :meth:`get_state_im`.
        """
        neighbors = [state[row, col] for row, col in self._neighbor_coords(coord)]
        return bool(np.all(np.asarray(neighbors) == self._UNSOLVED))

    def count_bombs(self, coord):
        """Return the number of mines adjacent to ``coord``."""
        neighbors = self.get_neighbors(coord)
        return int(np.sum(neighbors == 'B'))

    def possibility_bombs(self, coord):
        """Return the 0/1 hint for a non-mine tile.

        1 when at least one neighbour is a mine; otherwise 0, except that
        with probability 0.1 a mine-free neighbourhood is reported as 1.
        """
        rand = np.random.random() # random value b/w 0 & 1
        if self.count_bombs(coord) > 0:
            return 1
        # no mine around: report a false positive 10% of the time
        return 1 if rand < 0.1 else 0

    def get_board(self):
        '''
        This board is revised.
        Number 1 means there is a mine around or more.
        Number 0 means there are no mines around.
        Mine tiles keep the value 'B'.
        '''
        board = self.grid.copy()

        for x in range(self.nrows):
            for y in range(self.ncols):
                if self.grid[x, y] != 'B':
                    #board[x, y] = self.count_bombs((x, y))
                    board[x, y] = self.possibility_bombs((x, y))

        return board

    def get_state_im(self, state):
        '''
        Gets the numeric image representation state of the board.
        This is what will be the input for the DQN.

        Meanings: 'U': Unknown i.e. not opened  (encoded as -1/8)
                  'B': Mine                     (encoded as -2/8)

        Returns a float16 array of shape (nrows, ncols, 1).
        '''

        state_im = [t['value'] for t in state]
        state_im = np.reshape(state_im, (self.nrows, self.ncols, 1)).astype(object)

        state_im[state_im=='U'] = -1
        state_im[state_im=='B'] = -2

        state_im = state_im.astype(np.int8) / 8
        state_im = state_im.astype(np.float16)

        return state_im

    def init_state(self):
        """Return a fully hidden ``(state, state_im)`` pair.

        ``state`` lists the tiles in row-major order, so the tile at
        ``(row, col)`` is ``state[row * ncols + col]``.
        """
        unsolved_array = np.full((self.nrows, self.ncols), 'U', dtype='object')

        state = []
        for (x, y), value in np.ndenumerate(unsolved_array):
            state.append({'coord': (x, y), 'value':value})

        state_im = self.get_state_im(state)

        return state, state_im

    def color_state(self, value):
        """Return the CSS colour rule used to render one tile value."""
        color = self._TILE_COLORS.get(value, 'magenta')
        return f'color: {color}'

    def draw_state(self, state_im):
        """Render a state image as a coloured table (needs IPython's ``display``)."""
        state = state_im * 8.0
        state_df = pd.DataFrame(state.reshape((self.nrows, self.ncols)), dtype=np.int8)

        display(state_df.style.applymap(self.color_state))

    def click(self, action_index):
        """Uncover the tile at ``action_index`` (index into ``self.state``).

        Two behaviours of the base project are intentionally disabled here:
        the first click is NOT guaranteed to be mine-free, and clicking a 0
        does NOT cascade through :meth:`reveal_neighbors`.
        """
        coord = self.state[action_index]['coord']
        value = self.board[coord]

        # make state equal to board at given coordinates
        self.state[action_index]['value'] = value

        self.n_clicks += 1

    def reveal_neighbors(self, coord, clicked_tiles):
        """Flood-fill reveal around ``coord``.

        Every neighbour of ``coord`` is uncovered; neighbours whose board
        value is 0 are expanded in turn.

        Args:
            coord: (row, col) of the tile to expand.
            clicked_tiles: list of already processed coordinates; newly
                revealed coordinates are appended to it in place.
        """
        processed = clicked_tiles
        seen = set(processed)  # O(1) membership test next to the caller's list
        pending = [(coord[0], coord[1])]

        # Explicit stack instead of recursion: no recursion-depth limit.
        while pending:
            current = pending.pop()
            for row, col in self._neighbor_coords(current):
                if (row, col) in seen:
                    continue

                # prevent redundancy for adjacent zeros
                seen.add((row, col))
                processed.append((row, col))

                # state is row-major (see init_state), so the list index
                # follows directly from the coordinates.
                self.state[row * self.ncols + col]['value'] = self.board[row, col]

                # keep expanding in case neighbors are also 0
                if self.board[row, col] == 0.0:
                    pending.append((row, col))

    def reset(self):
        """Start a new game with a fresh mine layout (``n_wins`` is kept)."""
        self.n_clicks = 0
        self.n_progress = 0
        self.grid = self.init_grid()
        self.board = self.get_board()
        self.state, self.state_im = self.init_state()

    def step(self, action_index):
        """Click one tile and score the move.

        Args:
            action_index: index into ``self.state`` of the tile to click.

        Returns:
            ``(state_im, reward, done)`` -- the new state image, the reward
            taken from ``self.rewards`` and whether the episode has ended.
        """
        done = False
        coords = self.state[action_index]['coord']

        current_state = self.state_im

        # get neighbors before action
        is_guess_b = self.is_guess(current_state, coords)

        self.click(action_index)

        # update state image
        new_state_im = self.get_state_im(self.state)
        self.state_im = new_state_im

        board_flatten = self.board.flatten()
        unsolved_index = np.where(self.state_im.flatten() == self._UNSOLVED)
        if np.array_equal(np.where(board_flatten == 'B'),
                            unsolved_index): # if win: hidden tiles are exactly the mines
            #elif np.sum(new_state_im==-0.125) == self.n_mines: # if win
            reward = self.rewards['win']
            done = True
            self.n_progress += 1
            self.n_wins += 1

        elif (np.sum(self.state_im == self._UNSOLVED) == 0) or \
                (np.sum(board_flatten[unsolved_index] != 'B') == 0): # if lose
            # nothing left to click, or only mines are left hidden
            reward = self.rewards['lose']
            done = True

        # Does not lose if detect mine
        elif self.state[action_index]['value']=='B': # if find mines
            #reward = self.rewards['lose']
            reward = self.rewards['bomb']
            #done = True

        elif np.sum(self.state_im == self._UNSOLVED) == np.sum(current_state == self._UNSOLVED):
            # the clicked tile was already uncovered
            reward = self.rewards['no_progress']

        else: # if progress
            if is_guess_b: # if guess (all neighbors are unsolved)
                reward = self.rewards['guess']

            else:
                reward = self.rewards['progress']
                self.n_progress += 1 # track n of non-isolated clicks

        return self.state_im, reward, done

In [3]:
#Base code was written by Jonas Busk - Modified to suit project by Jacob Jon Hansen
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.nn import relu, softmax
import gym
import pickle
from sklearn.preprocessing import normalize
import pandas as pd
import random
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import sys
import os
sys.path.append('../')

Instructions for updating:
non-resource variables are not supported in the long term


In [9]:
def stateConverter(OUT, state):
    """Convert a 2d board state into the 3d array fed to the network.

    Args:
        OUT: encoding to produce, one of "FULL", "CONDENSED" or "IMAGE".
        state: 2d object array (rows x cols) holding tile values; the
            markers compared against are 'U', 'B' and 'E'.

    Returns:
        "FULL":      int array (rows x cols x 10); channels 0-7 are one-hot
                     for the values 1-8, channel 8 marks 'U', channel 9 'E'.
        "CONDENSED": float array (rows x cols x 2); channel 0 is value / 4
                     for every tile that is neither 'U' nor 'B', channel 1
                     is 1 where the tile is 'U'.
        "IMAGE":     float array (rows x cols x 1); -1 for 'U', 0 for 'E',
                     value / 8 otherwise.

    Raises:
        ValueError: if ``OUT`` is not one of the supported encodings
            (previously ``None`` was returned silently).
    """
    rows, cols = state.shape
    if OUT == "FULL":
        res = np.zeros((rows, cols, 10), dtype=int)
        for i in range(0, 8):
            res[:, :, i] = state == i + 1  # values 1-8
        res[:, :, 8] = state == 'U'
        res[:, :, 9] = state == 'E'
        return res

    elif OUT == "CONDENSED":
        res = np.zeros((rows, cols, 2))
        filtr = ~np.logical_or(state == "U", state == "B")  # neither U nor B
        res[filtr, 0] = state[filtr] / 4
        res[state == "U", 1] = 1
        return res

    elif OUT == "IMAGE":
        res = np.zeros((rows, cols, 1))
        res[state == "U", 0] = -1
        res[state == "E", 0] = 0
        filtr = ~np.logical_or(state == "U", state == "E")  # neither U nor E
        # NOTE(review): a 'B' tile passes this filter and cannot be divided;
        # this encoding appears to expect states without 'B' -- confirm.
        res[filtr, 0] = state[filtr] / 8
        return res

    raise ValueError(
        'unknown OUT {!r}; expected "FULL", "CONDENSED" or "IMAGE"'.format(OUT))

In [10]:
import math
def get_state(state):
  """Arrange a list of tile dicts into a 2d object array.

  Args:
    state: iterable of ``{'coord': (x, y), 'value': v}`` dicts.

  Returns:
    Object array where element ``[x][y]`` is the tile's value. The shape is
    taken from the largest coordinates present, so rectangular boards work
    too (the previous version assumed a square board). Positions that no
    tile refers to stay 0. An empty ``state`` gives a (0, 0) array.
  """
  tiles = list(state)
  if not tiles:
    return np.zeros((0, 0), dtype=object)

  n_rows = max(t['coord'][0] for t in tiles) + 1
  n_cols = max(t['coord'][1] for t in tiles) + 1
  new_state = np.zeros((n_rows, n_cols), dtype=object)
  for t in tiles:
    x, y = t['coord']
    new_state[x][y] = t['value']
  return new_state

In [22]:
#Base code was written by Jonas Busk - Modified to suit project by Jacob Jon Hansen
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.nn import relu, softmax
import gym
import pickle
from sklearn.preprocessing import normalize
import pandas as pd
import random
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

import sys
import os
sys.path.append('../')
# from minesweeper_tk import Minesweeper


model = "condensed_6x6_CNN"  # directory / checkpoint base name of the saved model

# training settings
epochs = 500000 # number of training batches
batch_size = 200 # number of timesteps in a batch
rollout_limit = 50 # max rollout length
discount_factor = 0 # reward discount factor (gamma), 1.0 = no discount; 0 = only the immediate reward counts
learning_rate = 0.000002  # Adam step size; earlier runs used 0.001,
                          #   5600: 78% win --> LR: 0.0001
                          #   6801: 87% win --> LR: 0.00002
early_stop_loss = 0 # stop training if loss < early_stop_loss, 0 or False to disable

# Settings of earlier experiments, kept for reference.
#
# condensed:
#   epochs = 100000, batch_size = 400, rollout_limit = 50,
#   discount_factor = 0, learning_rate = 0.00004 (before: 0.0005),
#   early_stop_loss = 0
#
# 261 epochs to learn 2 specific boards (overfit):
#   epochs = 10000, batch_size = 200, rollout_limit = 130,
#   discount_factor = 0, learning_rate = 0.001, early_stop_loss = 0


# setup policy network
n = 6                    # board side length
n_inputs = n * n * 2     # flattened CONDENSED state (2 channels per tile)
n_hidden = n * n * 8
n_hidden2 = 220
n_hidden3 = 220
n_hidden4 = 220          # not used by the graph below
n_outputs = n * n        # one action per tile

dropout = 0.25           # not used by the graph below

tf.reset_default_graph()

states_pl = tf.placeholder(tf.float32, [None, n_inputs], name='states_pl')
actions_pl = tf.placeholder(tf.int32, [None, 2], name='actions_pl')
advantages_pl = tf.placeholder(tf.float32, [None], name='advantages_pl')
learning_rate_pl = tf.placeholder(tf.float32, name='learning_rate_pl')

# two conv layers over the (n x n x 2) board followed by three dense layers
input_layer = tf.reshape(states_pl, [-1, n, n, 2])
conv1 = tf.layers.conv2d(inputs=input_layer,filters=18,kernel_size=[5, 5],padding="same", activation=tf.nn.relu)
conv2 = tf.layers.conv2d(inputs=conv1,filters=36,kernel_size=[3, 3],padding="same", activation=tf.nn.relu)
conv2_flat = tf.layers.flatten(conv2)
l_hidden = tf.layers.dense(inputs=conv2_flat, units=n_hidden, activation=relu, name='l_hidden')
l_hidden2 = tf.layers.dense(inputs=l_hidden, units=n_hidden2, activation=relu, name='l_hidden2')
l_hidden3 = tf.layers.dense(inputs=l_hidden2, units=n_hidden3, activation=relu, name='l_hidden3')
l_out = tf.layers.dense(inputs=l_hidden3, units=n_outputs, activation=softmax, name='l_out')

# print network
print('states_pl:', states_pl.get_shape())
print('actions_pl:', actions_pl.get_shape())
print('advantages_pl:', advantages_pl.get_shape())
print('l_hidden:', l_hidden.get_shape())
print('l_hidden2:', l_hidden2.get_shape())
print('l_hidden3:', l_hidden3.get_shape())
print('l_out:', l_out.get_shape())

# define loss and optimizer
# Policy-gradient loss: -mean(log pi(a|s) * advantage). The selected
# probability is clipped away from 0 so that a vanishing softmax output
# cannot turn the loss into inf/NaN through log(0).
selected_prob = tf.clip_by_value(tf.gather_nd(l_out, actions_pl), 1e-10, 1.0)
loss_f = -tf.reduce_mean(tf.multiply(tf.log(selected_prob), advantages_pl))

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate_pl, beta1=0.8, beta2=0.92)
train_f = optimizer.minimize(loss_f)

saver = tf.train.Saver() # we use this later to save the model

# test forward pass
env = MinesweeperEnv(n, n, 6)  # n x n board with 6 mines
OUT = "CONDENSED"

state = stateConverter(OUT, get_state(env.state)).flatten()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    action_probabilities = sess.run(fetches=l_out, feed_dict={states_pl: [state]})
print(action_probabilities)

# helper functions

def get_rollout(sess, env, rollout_limit=None, stochastic=False, seed=None):
    """Generate rollout by iteratively evaluating the current policy on the environment.

    Args:
        sess: TensorFlow session holding the policy network.
        env: environment offering ``reset()``, ``step(action)`` and ``state``.
        rollout_limit: maximum number of steps; ``None`` means no cap, in
            which case the rollout only ends when the environment reports
            ``done`` (a deterministic policy may then never terminate).
        stochastic: sample actions instead of taking the most probable one.
        seed: when given, seeds both ``random`` and ``np.random`` before the
            environment is reset so that the rollout is reproducible.

    Returns:
        ``(states, actions, rewards, n_steps)`` where the three lists have
        ``n_steps`` entries; ``n_steps`` is 0 when ``rollout_limit`` is 0.
    """
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)

    env.reset()
    s = stateConverter(OUT, get_state(env.state)).flatten()
    states, actions, rewards = [], [], []
    n_steps = 0
    while rollout_limit is None or n_steps < rollout_limit:
        a = get_action(sess, s, stochastic)
        # The state image returned by step() is not used: the network input
        # is rebuilt from env.state in the OUT encoding instead.
        _, r, done = env.step(a)
        states.append(s)
        actions.append(a)
        rewards.append(r)
        n_steps += 1
        s = stateConverter(OUT, get_state(env.state)).flatten()
        if done:
            break
    return states, actions, rewards, n_steps

def get_action(sess, state, stochastic=False):
    """Choose an action, given a state, with the current policy network.

    Args:
        sess: TensorFlow session used to evaluate ``l_out``.
        state: flattened network input for a single board.
        stochastic: if True, sample the action from the predicted
            distribution; otherwise return the most probable action.

    Returns:
        Index of the chosen action.
    """
    a_prob = sess.run(fetches=l_out, feed_dict={states_pl: np.atleast_2d(state)})

    if stochastic:
        # Inverse-CDF sampling. The draw is scaled by the actual total of the
        # probabilities and the index is clamped, so rounding in the softmax
        # output can no longer make the search fall off the end (which used
        # to select action 0 regardless of its probability).
        cumulative = np.cumsum(np.asarray(a_prob))
        threshold = np.random.rand() * cumulative[-1]
        index = np.searchsorted(cumulative, threshold, side='right')
        return int(min(index, cumulative.size - 1))

    # select action with highest probability
    return a_prob.argmax()

def get_advantages(rewards, rollout_limit, discount_factor, eps=1e-12):
    """Turn per-rollout rewards into standardized advantages.

    The padded return matrix from ``get_returns`` is standardized column by
    column (per timestep, across rollouts); ``eps`` keeps the division
    finite. Each row is then cut back to the length of its rollout.

    Returns:
        List with one 1d array of advantages per rollout.
    """
    padded_returns = get_returns(rewards, rollout_limit, discount_factor)

    # standardize every timestep column
    column_mean = np.mean(padded_returns, axis=0)
    column_std = np.std(padded_returns, axis=0)
    standardized = (padded_returns - column_mean) / (column_std + eps)

    # drop the padding again
    trimmed = []
    for rollout_rewards, row in zip(rewards, standardized):
        trimmed.append(row[:len(rollout_rewards)])
    return trimmed

def get_returns(rewards, rollout_limit, discount_factor):
    """Compute the cumulative discounted rewards, a.k.a. returns.

    Args:
        rewards: sequence of per-rollout reward sequences.
        rollout_limit: number of columns of the result; every rollout must
            be at most this long.
        discount_factor: gamma applied to the return of the following step.

    Returns:
        Array of shape ``(len(rewards), rollout_limit)``; row ``i`` holds the
        returns of rollout ``i`` followed by zero padding. A rollout without
        any reward yields an all-zero row (it used to raise IndexError).
    """
    returns = np.zeros((len(rewards), rollout_limit))
    for i, r in enumerate(rewards):
        if len(r) == 0:
            continue
        # the last step has no successor: its return is its reward
        returns[i, len(r) - 1] = r[-1]
        for j in reversed(range(len(r)-1)):
            returns[i,j] = r[j] + discount_factor * returns[i,j+1]
    return returns

def get_winrate(sess, env, n_games=1000, max_moves=100):
    """Play games with the current policy and return the fraction won.

    The policy acts greedily, except directly after a negative reward, where
    the next action is sampled so that it does not repeat the same move.

    Args:
        sess: TensorFlow session holding the policy network.
        env: environment offering ``reset()``, ``step(action)``, ``state``
            and a ``rewards`` mapping with a 'win' entry.
        n_games: number of games to play.
        max_moves: a game still unfinished after this many moves counts as
            not won.

    Returns:
        ``won_games / n_games`` (0.0 when ``n_games`` is not positive).
    """
    if n_games <= 0:
        return 0.0

    win_reward = env.rewards['win']
    won_games = 0

    # Begin on a fresh board instead of whatever state env was left in.
    env.reset()
    for _ in range(n_games):
        r = 0  # a new game starts greedy, whatever happened in the last one
        for _ in range(max_moves):
            s = stateConverter(OUT, get_state(env.state)).flatten()
            a = get_action(sess, s, stochastic=(r < 0))
            _, r, done = env.step(a)
            if done:
                if r == win_reward:
                    won_games += 1
                break
        env.reset()

    return won_games / n_games

def smooth(y,factor):
    """Return the rolling mean of ``y`` over a window of ``factor`` samples.

    The result is a pandas Series of the same length as ``y``; the first
    ``factor - 1`` entries are NaN.
    """
    values = y if type(y) == list else list(y)
    series = pd.Series(values)
    return series.rolling(window=factor).mean()
# train policy network

states_pl: (?, 72)
actions_pl: (?, 2)
advantages_pl: (?,)
l_hidden: (?, 288)
l_hidden2: (?, 220)
l_hidden3: (?, 220)
l_out: (?, 36)
[[0.02701008 0.02707972 0.02682858 0.02921804 0.02788166 0.02839976
  0.02749893 0.02766426 0.02806639 0.02761758 0.02899935 0.02750686
  0.02920808 0.02873861 0.0275104  0.02811758 0.02777183 0.02884548
  0.02626789 0.0289324  0.02742492 0.02603118 0.02705365 0.02793705
  0.02830545 0.02873754 0.02436297 0.02656362 0.02752407 0.02791508
  0.03058466 0.02737801 0.02673968 0.02865472 0.02759228 0.02803159]]


  if np.sum(neighbors=='B') == 0:


In [23]:
# Restore the trained policy from disk and measure its win rate.
with tf.Session() as sess:
    # SECURITY: pickle.load can execute arbitrary code -- only load stats
    # files that were written by this project.
    # The context manager closes the file handle once the stats are read.
    with open("{}/stats.p".format(model), "rb") as stats_file:
        stats = pickle.load(stats_file)
    saver.restore(sess, "{}/{}.ckpt".format(model,model))
    
    win_rate = get_winrate(sess, env)
    print(win_rate)

INFO:tensorflow:Restoring parameters from condensed_6x6_CNN/condensed_6x6_CNN.ckpt


  if np.sum(neighbors=='B') == 0:


0.0
