## Minesweeper solver using DQN
environment from [github.com/sdlee94/Minesweeper-AI-Reinforcement-Learning](https://github.com/sdlee94/Minesweeper-AI-Reinforcement-Learning)

목표: 우선 학습이 수렴하는지 확인한다. (Goal for now: check that training converges.)

In [None]:
import sys, os

if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    sys.path.append('/content/drive/My Drive/RL_Project')
    %cd /content/drive/My Drive/2022-1 RL/Project

%tensorflow_version 1.x

Mounted at /content/drive
[Errno 2] No such file or directory: '/content/drive/My Drive/2022-1 RL/Project'
/content
TensorFlow 1.x selected.


In [None]:
# Pin h5py to version 2.10.0 for this runtime.
# NOTE(review): presumably needed so the Keras/TF 1.x stack can read and write
# the .h5 model files used below — confirm before changing the pin.
!pip install h5py==2.10.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting h5py==2.10.0
  Downloading h5py-2.10.0-cp37-cp37m-manylinux1_x86_64.whl (2.9 MB)
[K     |████████████████████████████████| 2.9 MB 5.0 MB/s 
Installing collected packages: h5py
  Attempting uninstall: h5py
    Found existing installation: h5py 3.1.0
    Uninstalling h5py-3.1.0:
      Successfully uninstalled h5py-3.1.0
Successfully installed h5py-2.10.0


In [None]:
import random
import pickle
from tqdm import tqdm
import warnings
from collections import deque

import numpy as np
import pandas as pd

import tensorflow as tf
from keras.models import Sequential
from keras.models import load_model
from keras.layers import Conv2D, Dense, Flatten
from keras.optimizers import Adam
from keras.callbacks import TensorBoard

from minesweeper_env import MinesweeperEnv

Using TensorFlow backend.


In [None]:
# Suppress Python warnings and lower TensorFlow's C++ logging verbosity
warnings.filterwarnings('ignore')
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

# Replay-memory settings
MEM_SIZE = 50_000 # maximum number of transitions kept in the replay buffer (deque maxlen)
MEM_SIZE_MIN = 1_000 # training is skipped until the buffer holds at least this many transitions

# Learning settings
BATCH_SIZE = 64 # number of transitions sampled from the replay buffer per training step
learn_rate = 0.01 # initial learning rate (copied into the agent on construction)
LEARN_DECAY = 0.99975 # multiplicative decay applied to the agent's learn_rate per training step
LEARN_MIN = 0.001 # lower bound for the decayed learn_rate
DISCOUNT = 0.1 # gamma: discount factor applied to the best future Q-value

# Exploration settings
epsilon = 0.95 # initial probability of taking a random (exploratory) move
EPSILON_DECAY = 0.99975 # multiplicative decay applied to epsilon per training step
EPSILON_MIN = 0.01 # lower bound for the decayed epsilon

# DQN settings
CONV_UNITS = 256 # number of filters in each conv layer (value was changed by the notebook author)
DENSE_UNITS = 512 # number of units in each fully connected hidden layer
UPDATE_TARGET_EVERY = 5 # threshold for the agent's target_update_counter before the target network is re-synced

AGG_STATS_EVERY = 100 # calculate stats every 100 games for tensorboard
SAVE_MODEL_EVERY = 10_000 # save model and replay every 10,000 episodes

In [None]:
# Board presets as (width x height, mines):
#   Beginner (9x9, 10) / Intermediate (16x16, 40) / Expert (16x30, 99)
width = 9
height = 9
n_mines = 10

# rewards = {'win': 1, 'lose': -1, 'progress': 0.3, 'guess': -0.3, 'no_progress': -0.3}
env = MinesweeperEnv(width, height, n_mines)

# Per-episode statistics collected by the training loop
progress_list = []
wins_list = []
ep_rewards = []

n_clicks = 0  # total number of moves played across all episodes
episodes = 100_000  # number of games to train on

In [None]:
class ModifiedTensorBoard(TensorBoard):
    """TensorBoard callback that writes all ``.fit()`` calls to one log.

    The stock callback restarts its step counter on every ``.fit()`` call.
    This variant keeps a single ``FileWriter`` and a caller-controlled
    ``step`` so that repeated one-batch fits share one timeline.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, **kwargs):
        """Create the callback; ``kwargs`` are forwarded to ``TensorBoard``."""
        super().__init__(**kwargs)
        # Step under which the next stats are written; the caller updates it.
        self.step = 1
        # Single writer shared by every .fit() call.
        self.writer = tf.summary.FileWriter(self.log_dir)

    def set_model(self, model):
        """Do nothing, so the base class does not create its default writer."""
        pass

    def on_epoch_end(self, epoch, logs=None):
        """Write the epoch's logs under ``self.step`` instead of ``epoch``.

        ``logs`` defaults to ``None``; treat that as "no metrics" rather than
        failing on ``**None``.
        """
        self.update_stats(**(logs or {}))

    def on_batch_end(self, batch, logs=None):
        """Do nothing: each fit is a single batch, nothing to log per batch."""
        pass

    def on_train_end(self, _):
        """Do nothing, so the shared writer stays open between fits."""
        pass

    def update_stats(self, **stats):
        """Write the given named metrics to the log at the current ``step``."""
        self._write_logs(stats, self.step)

In [None]:
def dqn(learn_rate, input_dims, n_actions, conv_units, dense_units):
    """Build and compile the convolutional Q-network.

    Args:
        learn_rate: learning rate passed to the Adam optimizer.
        input_dims: accepted for interface compatibility but not used; the
            first layer is created without a fixed input shape.
        n_actions: size of the linear output layer (one Q-value per action).
        conv_units: number of filters in each of the four conv layers.
        dense_units: number of units in each of the two hidden dense layers.

    Returns:
        The compiled ``Sequential`` model (MSE loss, Adam optimizer).
    """
    network = Sequential()

    # Four identical 3x3 conv layers with 'same' padding.
    for _ in range(4):
        network.add(Conv2D(conv_units, (3, 3), activation='relu', padding='same'))

    network.add(Flatten())

    # Two hidden fully connected layers followed by the linear Q-value head.
    for _ in range(2):
        network.add(Dense(dense_units, activation='relu'))
    network.add(Dense(n_actions, activation='linear'))

    optimizer = Adam(lr=learn_rate, epsilon=1e-4)
    network.compile(optimizer=optimizer, loss='mse')
    return network

In [None]:
class DQNAgent(object):
    """DQN agent with an online network, a target network and replay memory.

    Args:
        env: environment object; this class reads ``state_im``, ``ntiles``,
            ``nrows`` and ``ncols`` from it.
        conv_units: number of filters per conv layer of the Q-network.
        dense_units: number of units per hidden dense layer of the Q-network.
    """

    def __init__(self, env, conv_units=64, dense_units=256):
        self.env = env

        # Deep Q-learning parameters, initialised from the notebook settings
        self.discount = DISCOUNT
        self.learn_rate = learn_rate
        self.epsilon = epsilon
        self.model = dqn(
            self.learn_rate, self.env.state_im.shape, self.env.ntiles, conv_units, dense_units)

        # target model - this is what we predict against every step
        self.target_model = dqn(
            self.learn_rate, self.env.state_im.shape, self.env.ntiles, conv_units, dense_units)
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen=MEM_SIZE)
        self.target_update_counter = 0

        self.tensorboard = ModifiedTensorBoard(log_dir='logs/DQN')

    def get_action(self, state):
        """Choose the tile to click for ``state`` (epsilon-greedy).

        Tiles whose value equals -0.125 are treated as not yet solved; only
        those are eligible, both for random and for greedy moves.

        Args:
            state: board array that can be reshaped to ``(1, env.ntiles)``.

        Returns:
            Flat index of the chosen tile.
        """
        board = state.reshape(1, self.env.ntiles)
        unsolved = [i for i, x in enumerate(board[0]) if x == -0.125]

        if np.random.random() < self.epsilon:  # random move (explore)
            move = np.random.choice(unsolved)
        else:  # greedy move (exploit)
            moves = self.model.predict(
                np.reshape(state, (1, self.env.nrows, self.env.ncols, 1)))
            # Mask already-clicked tiles with -inf. Masking with the minimum
            # Q-value would tie with an unsolved tile that holds that minimum,
            # and argmax could then return a clicked tile.
            moves[board != -0.125] = -np.inf
            move = np.argmax(moves)

        return move

    def update_replay_memory(self, transition):
        """Append one ``(state, action, reward, new_state, done)`` transition."""
        self.replay_memory.append(transition)

    def train(self, done):
        """Run one training step on a random minibatch from replay memory.

        Does nothing until the replay memory holds ``MEM_SIZE_MIN`` entries.

        Args:
            done: whether the current episode just ended. It decides whether
                the TensorBoard callback is attached to this fit and whether
                the target-sync counter is advanced.
        """
        if len(self.replay_memory) < MEM_SIZE_MIN:
            return

        batch = random.sample(self.replay_memory, BATCH_SIZE)

        current_states = np.array([transition[0] for transition in batch])
        current_qs_list = self.model.predict(current_states)

        new_current_states = np.array([transition[3] for transition in batch])
        future_qs_list = self.target_model.predict(new_current_states)

        X, y = [], []

        # The per-transition flag is named `terminal` so that it does not
        # overwrite the `done` argument, which is still needed after the loop.
        for i, (current_state, action, reward, _new_state, terminal) in enumerate(batch):
            if terminal:
                new_q = reward
            else:
                new_q = reward + self.discount * np.max(future_qs_list[i])

            # Only the taken action's Q-value is replaced by the new target.
            current_qs = current_qs_list[i]
            current_qs[action] = new_q

            X.append(current_state)
            y.append(current_qs)

        self.model.fit(np.array(X),
                       np.array(y),
                       batch_size=BATCH_SIZE,
                       shuffle=False,
                       verbose=0,
                       callbacks=[self.tensorboard] if done else None)

        # Count finished episodes to decide when to sync the target model
        if done:
            self.target_update_counter += 1

        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

        # decay learn_rate
        # NOTE(review): this only updates the attribute; nothing in this class
        # pushes the decayed value into the compiled optimizer — confirm intent.
        self.learn_rate = max(LEARN_MIN, self.learn_rate*LEARN_DECAY)

        # decay epsilon
        self.epsilon = max(EPSILON_MIN, self.epsilon*EPSILON_DECAY)

In [None]:
# Build a fresh agent, then replace its online network with a previously saved model.
# NOTE(review): only agent.model is replaced here; agent.target_model keeps whatever
# weights it got in DQNAgent.__init__ until the next sync inside train() — confirm
# that this is intended.
agent = DQNAgent(env, conv_units=CONV_UNITS, dense_units=DENSE_UNITS)
agent.model = tf.keras.models.load_model(f'/content/drive/My Drive/RL_Project/models/dqn.h5')

Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [None]:
# Inspect the model architecture
agent.model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            multiple                  1280      
_________________________________________________________________
conv2d_2 (Conv2D)            multiple                  147584    
_________________________________________________________________
conv2d_3 (Conv2D)            multiple                  147584    
_________________________________________________________________
conv2d_4 (Conv2D)            multiple                  147584    
_________________________________________________________________
flatten_1 (Flatten)          multiple                  0         
_________________________________________________________________
dense_1 (Dense)              multiple                  5308928   
_________________________________________________________________
dense_2 (Dense)              multiple                 

In [None]:
# Blocks until a line is entered.
# NOTE(review): presumably a manual pause before the long training cell — confirm or remove.
input()

In [None]:
# Main training loop: play `episodes` games, training after every move.
for episode in tqdm(range(1, episodes+1), unit='episode'):
    # Log this episode's fit under the episode number
    agent.tensorboard.step = episode

    env.reset()
    episode_reward = 0
    past_n_wins = env.n_wins

    done = False
    while not done:
        current_state = env.state_im

        action = agent.get_action(current_state)

        new_state, reward, done = env.step(action)

        episode_reward += reward

        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done)

        n_clicks += 1

    progress_list.append(env.n_progress) # n of non-guess moves
    ep_rewards.append(episode_reward)

    # The env's win counter grew during this episode <=> the game was won
    wins_list.append(1 if env.n_wins > past_n_wins else 0)

    # No stats or checkpoints until training has actually started
    if len(agent.replay_memory) < MEM_SIZE_MIN:
        continue

    if not episode % AGG_STATS_EVERY:
        med_progress = round(np.median(progress_list[-AGG_STATS_EVERY:]), 2)
        # Computed over 100 games anyway, so digits beyond the second decimal are not needed.
        win_rate = round(np.sum(wins_list[-AGG_STATS_EVERY:]) / AGG_STATS_EVERY, 2)
        med_reward = round(np.median(ep_rewards[-AGG_STATS_EVERY:]), 2)

        agent.tensorboard.update_stats(
            progress_med = med_progress,
            winrate = win_rate,
            reward_med = med_reward,
            learn_rate = agent.learn_rate,
            epsilon = agent.epsilon)

        print(f'Episode: {episode}, Median progress: {med_progress}, Median reward: {med_reward}, Win rate : {win_rate}')

    if not episode % SAVE_MODEL_EVERY:
        # Checkpoint every SAVE_MODEL_EVERY episodes: replay memory, model, win statistics
        with open('/content/drive/My Drive/RL_Project/replay/dqn.pkl', 'wb') as output:
            pickle.dump(agent.replay_memory, output)

        agent.model.save('/content/drive/My Drive/RL_Project/models/dqn.h5')

        # `with` closes the file even if a write fails
        with open("/content/drive/My Drive/RL_Project/win.txt", 'w') as f:
            f.write(str(env.n_wins))
            f.write(str(wins_list))

  0%|          | 2/100000 [00:06<84:46:19,  3.05s/episode]
ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-22e6bd5c9ef8>", line 12, in <module>
    action = agent.get_action(current_state)
  File "<ipython-input-8-c88f21a34635>", line 31, in get_action
    moves = self.model.predict(np.reshape(state, (1, self.env.nrows, self.env.ncols, 1)))
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/keras/engine/training.py", line 908, in predict
    use_multiprocessing=use_multiprocessing)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/keras/engine/training_arrays.py", line 723, in predict
    callbacks=callbacks)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/keras/engine/training_arrays.py", line 394, in model_iteration
    batch_outs = f(ins_batch)
  File "/tensorflow-1.15.2/python3.7/tensorflow_core/python/keras/backend.py", line 3476, in __cal

KeyboardInterrupt: ignored

바뀐 환경은 나중에 풉시다

In [None]:
# Board presets as (width x height, mines):
#   Beginner (9x9, 10) / Intermediate (16x16, 40) / Expert (16x30, 99)
width = 9
height = 9
n_mines = 10

# rewards = {'win': 1, 'lose': -1, 'progress': 0.3, 'guess': -0.3, 'no_progress': -0.3}
env = MinesweeperEnv(width, height, n_mines)
agent = DQNAgent(env, conv_units=CONV_UNITS, dense_units=DENSE_UNITS)

# Per-episode statistics collected by the training loop
progress_list = []
wins_list = []
ep_rewards = []

n_clicks = 0  # total number of moves played across all episodes
episodes = 100_000  # number of games to train on

In [None]:
# Training loop with per-move debug output; checkpoints go to the *_custom files.
for episode in tqdm(range(1, episodes+1), unit='episode'):
    # Log this episode's fit under the episode number
    agent.tensorboard.step = episode

    env.reset()
    episode_reward = 0
    past_n_wins = env.n_wins

    done = False
    while not done:
        current_state = env.state_im

        # Query the policy once, so the printed action is the one actually
        # played and no extra random draw / prediction is spent on debugging.
        action = agent.get_action(current_state)

        print(current_state)
        print(action)

        new_state, reward, done = env.step(action)

        episode_reward += reward

        agent.update_replay_memory((current_state, action, reward, new_state, done))
        agent.train(done)

        n_clicks += 1

    progress_list.append(env.n_progress) # n of non-guess moves
    ep_rewards.append(episode_reward)

    # The env's win counter grew during this episode <=> the game was won
    wins_list.append(1 if env.n_wins > past_n_wins else 0)

    # No stats or checkpoints until training has actually started
    if len(agent.replay_memory) < MEM_SIZE_MIN:
        continue

    if not episode % AGG_STATS_EVERY:
        med_progress = round(np.median(progress_list[-AGG_STATS_EVERY:]), 2)
        # Computed over 100 games anyway, so digits beyond the second decimal are not needed.
        win_rate = round(np.sum(wins_list[-AGG_STATS_EVERY:]) / AGG_STATS_EVERY, 2)
        med_reward = round(np.median(ep_rewards[-AGG_STATS_EVERY:]), 2)

        agent.tensorboard.update_stats(
            progress_med = med_progress,
            winrate = win_rate,
            reward_med = med_reward,
            learn_rate = agent.learn_rate,
            epsilon = agent.epsilon)

        print(f'Episode: {episode}, Median progress: {med_progress}, Median reward: {med_reward}, Win rate : {win_rate}')

    if not episode % SAVE_MODEL_EVERY:
        # Checkpoint every SAVE_MODEL_EVERY episodes: replay memory, model, win statistics
        with open('/content/drive/My Drive/RL_Project/replay/dqn_custom.pkl', 'wb') as output:
            pickle.dump(agent.replay_memory, output)

        agent.model.save('/content/drive/My Drive/RL_Project/models/dqn_custom.h5')

        # `with` closes the file even if a write fails
        with open("/content/drive/My Drive/RL_Project/custom_win.txt", 'w') as f:
            f.write(str(env.n_wins))
            f.write(str(wins_list))

In [None]:
# Save the current online network to the same path the training loop checkpoints to
agent.model.save(f'/content/drive/My Drive/RL_Project/models/dqn_custom.h5')

In [None]:
# Write the final win statistics: the env's total win count followed directly
# by the per-episode win list. `with` closes the file even if a write fails.
with open("/content/drive/My Drive/RL_Project/win_final_custom.txt", 'w') as f:
    f.write(str(env.n_wins))
    f.write(str(wins_list))

In [None]:
# Load the TensorBoard notebook extension
%load_ext tensorboard
# Show the logs written by the agent's TensorBoard callback (log_dir 'logs/DQN')
%tensorboard --logdir logs/DQN/

이거 1분 이상 걸림

In [None]:
# Load the replay memory that the training loop pickled.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("/content/drive/My Drive/RL_Project/replay/dqn_custom.pkl", "rb") as fr:
    data = pickle.load(fr)
# Uncomment to dump the loaded transitions:
#print(data)

In [None]:
# Print the layer-by-layer structure of the agent's online network
agent.model.summary()

In [None]:
class DQNAgent_custom(object):
    """DQN agent variant that tolerates boards with no unsolved tiles.

    Same structure as a standard DQN agent (online network, target network,
    replay memory), but ``get_action`` falls back to tile 0 instead of
    failing when no tile is marked as unsolved.

    Args:
        env: environment object; this class reads ``state_im``, ``ntiles``,
            ``nrows`` and ``ncols`` from it.
        conv_units: number of filters per conv layer of the Q-network.
        dense_units: number of units per hidden dense layer of the Q-network.
    """

    def __init__(self, env, conv_units, dense_units):
        self.env = env

        # Deep Q-learning parameters, initialised from the notebook settings
        self.discount = DISCOUNT
        self.learn_rate = learn_rate
        self.epsilon = epsilon
        self.model = dqn(
            self.learn_rate, self.env.state_im.shape, self.env.ntiles, conv_units, dense_units)

        # target model - this is what we predict against every step
        self.target_model = dqn(
            self.learn_rate, self.env.state_im.shape, self.env.ntiles, conv_units, dense_units)
        self.target_model.set_weights(self.model.get_weights())

        self.replay_memory = deque(maxlen=MEM_SIZE)
        self.target_update_counter = 0

        self.tensorboard = ModifiedTensorBoard(log_dir='logs/DQN')

    def get_action(self, state):
        """Choose the tile to click for ``state`` (epsilon-greedy).

        Tiles whose value equals -0.125 are treated as not yet solved; only
        those are eligible. If no such tile exists, tile 0 is returned.

        Args:
            state: board array that can be reshaped to ``(1, env.ntiles)``.

        Returns:
            Flat index of the chosen tile.
        """
        board = state.reshape(1, self.env.ntiles)
        unsolved = [i for i, x in enumerate(board[0]) if x == -0.125]

        if np.random.random() < self.epsilon:  # random move (explore)
            if unsolved:
                move = np.random.choice(unsolved)
            else:
                # TODO: the author notes this case should probably pick a
                # random tile from the whole board instead of always tile 0.
                move = 0
        else:  # greedy move (exploit)
            moves = self.model.predict(
                np.reshape(state, (1, self.env.nrows, self.env.ncols, 1)))
            # Mask already-clicked tiles with -inf. Masking with the minimum
            # Q-value would tie with an unsolved tile that holds that minimum,
            # and argmax could then return a clicked tile. If every tile is
            # masked, argmax yields 0, matching the explore fallback.
            moves[board != -0.125] = -np.inf
            move = np.argmax(moves)

        return move

    def update_replay_memory(self, transition):
        """Append one ``(state, action, reward, new_state, done)`` transition."""
        self.replay_memory.append(transition)

    def train(self, done):
        """Run one training step on a random minibatch from replay memory.

        Does nothing until the replay memory holds ``MEM_SIZE_MIN`` entries.

        Args:
            done: whether the current episode just ended. It decides whether
                the TensorBoard callback is attached to this fit and whether
                the target-sync counter is advanced.
        """
        if len(self.replay_memory) < MEM_SIZE_MIN:
            return

        batch = random.sample(self.replay_memory, BATCH_SIZE)

        current_states = np.array([transition[0] for transition in batch])
        current_qs_list = self.model.predict(current_states)

        new_current_states = np.array([transition[3] for transition in batch])
        future_qs_list = self.target_model.predict(new_current_states)

        X, y = [], []

        # The per-transition flag is named `terminal` so that it does not
        # overwrite the `done` argument, which is still needed after the loop.
        for i, (current_state, action, reward, _new_state, terminal) in enumerate(batch):
            if terminal:
                new_q = reward
            else:
                new_q = reward + self.discount * np.max(future_qs_list[i])

            # Only the taken action's Q-value is replaced by the new target.
            current_qs = current_qs_list[i]
            current_qs[action] = new_q

            X.append(current_state)
            y.append(current_qs)

        self.model.fit(np.array(X),
                       np.array(y),
                       batch_size=BATCH_SIZE,
                       shuffle=False,
                       verbose=0,
                       callbacks=[self.tensorboard] if done else None)

        # Count finished episodes to decide when to sync the target model
        if done:
            self.target_update_counter += 1

        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

        # decay learn_rate
        # NOTE(review): this only updates the attribute; nothing in this class
        # pushes the decayed value into the compiled optimizer — confirm intent.
        self.learn_rate = max(LEARN_MIN, self.learn_rate*LEARN_DECAY)

        # decay epsilon
        self.epsilon = max(EPSILON_MIN, self.epsilon*EPSILON_DECAY)