# Minesweeper solver with Stable Baselines3 DQN

References
> 1. [(medium) article for stable baselines](https://towardsdatascience.com/stable-baselines-a-fork-of-openai-baselines-reinforcement-learning-made-easy-df87c4b2fc82)
> 1. [(colab) example of medium article](https://colab.research.google.com/github/Stable-Baselines-Team/rl-colab-notebooks/blob/master/saving_loading_dqn.ipynb)
> 1. [(github) minesweeper gym environment](https://github.com/aylint/gym-minesweeper)

Helps
> 1. [(github) stable-baselines3](https://github.com/DLR-RM/stable-baselines3)
> 1. [(github) stable-baselines3-contrib](https://github.com/Stable-Baselines-Team/stable-baselines3-contrib)
> 1. [(github) stable-baselines](https://github.com/hill-a/stable-baselines)
> 1. [(doc) stable-baselines](https://stable-baselines.readthedocs.io/en/master/)
> 1. [(doc) stable-baselines3](https://stable-baselines3.readthedocs.io/en/master/index.html)
> 1. [(doc) stable-baselines3-contrib](https://sb3-contrib.readthedocs.io/en/master/index.html)

In [1]:
import sys, os

os.environ['CUDA_VISIBLE_DEVICES'] = '7'

In [None]:
import gym
import numpy as np
import torch
import torch.nn as nn

from stable_baselines3 import DQN
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.logger import TensorBoardOutputFormat, configure
from stable_baselines3.common.env_checker import check_env
from stable_baselines3.common.torch_layers import BaseFeaturesExtractor
#from stable_baselines3.her.her_replay_buffer import HerReplayBuffer
from typing import Callable

from minesweeper_gym_env import MinesweeperEnv
from MinesweeperModifiedEnv import MinesweeperModifiedEnv

In [None]:
class CustomCNN(BaseFeaturesExtractor):
    """
    :param observation_space: (gym.Space)
    :param features_dim: (int) Number of features extracted.
        This corresponds to the number of unit for the last layer.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 512):
        super(CustomCNN, self).__init__(observation_space, features_dim)
        # We assume CxHxW images (channels first)
        # Re-ordering will be done by pre-preprocessing or wrapper
        n_input_channels = observation_space.sample()[None].shape[0]
        self.cnn = nn.Sequential(
            nn.Conv2d(n_input_channels, 128, kernel_size=3, stride=1, padding='same', bias=True),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same', bias=True),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same', bias=True),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, stride=1, padding='same', bias=True),
            nn.ReLU(),
            nn.Flatten(),
        )

        # Compute shape by doing one forward pass
        with torch.no_grad():
            n_flatten = self.cnn(
                torch.as_tensor(observation_space.sample()[None]).float()
            ).shape[1]

        self.linear = nn.Sequential(
            nn.Linear(n_flatten, features_dim, bias=True),
            nn.ReLU(),
            nn.Linear(features_dim, features_dim, bias=True),
            nn.ReLU()
        )


    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        return self.linear(self.cnn(observations))

policy_kwargs = dict(
    features_extractor_class=CustomCNN,
    features_extractor_kwargs=dict(features_dim=512),
)

In [None]:
def linear_schedule(initial_value: float) -> Callable[[float], float]:
    """
    Linear learning rate schedule.

    :param initial_value: Initial learning rate.
    :return: schedule that computes
      current learning rate depending on remaining progress
    """
    #lr0 = initial_value
    def func(progress_remaining: float) -> float:
        """
        Progress will decrease from 1 (beginning) to 0.

        :param progress_remaining: = 1.0 - (num_timesteps / total_timesteps)
        :return: current learning rate
        """
        if progress_remaining > 0.8:
            return initial_value
        else:
            return progress_remaining * initial_value * 1.25
        #return progress_remaining * initial_value
        #nonlocal lr0
        #lr0 = max(0.001, lr0 * 0.99975) # 0.99975
        #return lr0

    return func

In [None]:
def evaluate(model, env, num_episodes=10000):
    """
    Evaluate a RL agent
    :param model: (BaseRLModel object) the RL Agent
    :param num_steps: (int) number of timesteps to evaluate it
    :return: (float) Mean reward for the last 100 episodes
    """
    episode_rewards = [0.0]
    episode_wins = []
    for i in range(num_episodes):
        obs = env.reset()
        episode_rewards.append(0.0)
        #if i % 100 == 1:
        #    print('Playing episode {}'.format(i))
        while True:
            action, _states = model.predict(obs)
            obs, reward, done, info = env.step(action)
            
            episode_rewards[-1] += reward
            if done:
                episode_wins.append(info.get('is_success'))
                break
            elif info.get('num_actions') > 200:
                #print('Episode {}. Over action in obs, action: \n{}, {}'.format(i, obs, action))
                episode_wins.append(False)
                break
    """
    # Compute mean reward for the last 100 episodes
    mean_100ep_reward = round(np.mean(episode_rewards[-100:]), 1)
    print("Mean reward:", mean_100ep_reward, "Num episodes:", len(episode_rewards))
    """
    win_rate = round(np.mean(episode_wins), 2)
    print("Win rates:", win_rate, "Num episodes:", len(episode_wins))
    
    return episode_rewards, episode_wins

## DQN for Oirginal Minesweeper

In [13]:
env = MinesweeperModifiedEnv(4, 1)
model = DQN('CnnPolicy', env, 
            learning_rate=linear_schedule(0.001), 
            policy_kwargs=policy_kwargs,
            batch_size=64, 
            gamma=0.1, 
            train_freq=(1, 'episode'), 
            learning_starts=1,
            exploration_fraction=0.16, 
            exploration_initial_eps=0.95, 
            exploration_final_eps=0.01,
            tensorboard_log="./dqn_tensorboard/", verbose=0
           )

In [14]:
model.learn(total_timesteps=int(1e5), 
            log_interval=10,
            tb_log_name='s4m1',
            reset_num_timesteps=True)
model.save("dqn_minesweeper_test_env")
del model  # delete trained model to demonstrate loading

In [217]:
model = DQN.load("dqn_minesweeper_s4m1")
episode_rewards, episode_wins = evaluate(model, env=env, num_episodes=1000)
mean_reward = round(np.mean(episode_rewards), 2)
print('mean_reward: {}'.format(mean_reward))

Win rates: 0.88 Num episodes: 1000
mean_reward: 1.34


In [220]:
model = DQN.load("dqn_minesweeper_s4m2")
episode_rewards, episode_wins = evaluate(model,
                                         env=MinesweeperDiscreetEnv(),
                                         num_episodes=1000)
mean_reward = round(np.mean(episode_rewards), 2)
print('mean_reward: {}'.format(mean_reward))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  shape=(1, self.board_size, self.board_size), dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.valid_actions = np.ones((self.board_size * self.board_size), dtype=np.bool)


Win rates: 0.57 Num episodes: 1000
mean_reward: -4.82


In [222]:
model = DQN.load("dqn_minesweeper_s5m3")
episode_rewards, episode_wins = evaluate(model,
                                         env=MinesweeperDiscreetEnv(),
                                         num_episodes=1000)
mean_reward = round(np.mean(episode_rewards), 2)
print('mean_reward: {}'.format(mean_reward))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  shape=(1, self.board_size, self.board_size), dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.valid_actions = np.ones((self.board_size * self.board_size), dtype=np.bool)


Win rates: 0.54 Num episodes: 1000
mean_reward: -0.04


In [223]:
model = DQN.load("dqn_minesweeper_s5m3_wr0.29")
episode_rewards, episode_wins = evaluate(model,
                                         env=MinesweeperDiscreetEnv(),
                                         num_episodes=1000)
mean_reward = round(np.mean(episode_rewards), 2)
print('mean_reward: {}'.format(mean_reward))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  shape=(1, self.board_size, self.board_size), dtype=np.int)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  self.valid_actions = np.ones((self.board_size * self.board_size), dtype=np.bool)


Win rates: 0.27 Num episodes: 1000
mean_reward: -10.81


## DQN for Modified Minesweeper

In [None]:
env = MinesweeperModifiedEnv(4, 1)
model = DQN('CnnPolicy', env, 
            learning_rate=linear_schedule(0.001), 
            policy_kwargs=policy_kwargs,
            batch_size=64, 
            gamma=0.1, 
            train_freq=(1, 'episode'), 
            learning_starts=1,
            exploration_fraction=0.16, 
            exploration_initial_eps=0.95, 
            exploration_final_eps=0.02,
            tensorboard_log="./custom_dqn_tensorboard/", verbose=0
           )
model.learn(total_timesteps=int(5e5), 
            log_interval=10,
            tb_log_name='s4m1',
            #eval_log_path='eval_test',
            reset_num_timesteps=True)
model.save("custom_dqn_minesweeper_s4m1")
del model  # delete trained model to demonstrate loading

In [None]:
model = DQN.load("custom_dqn_minesweeper_s4m1w8")
episode_rewards, episode_wins = evaluate(model, env=MinesweeperModifiedEnv(4, 1), num_episodes=1000)
mean_reward = round(np.mean(episode_rewards), 2)
print('mean_reward: {}'.format(mean_reward))