In [10]:
import numpy as np
import gymnasium as gym
from gymnasium import spaces

class NumberGuessEnv(gym.Env):
    metadata = {"render_modes": ["human"]}

    def __init__(self, N=10, max_steps=None):
        super().__init__()
        self.N = N
        self.max_steps = max_steps or N
        self.action_space = spaces.Discrete(self.N)
        self.observation_space = spaces.Dict({
            "low": spaces.Discrete(self.N + 1),
            "high": spaces.Discrete(self.N + 1),
        })
        self._reset_internal()

    def _reset_internal(self):
        self.target = int(self.np_random.integers(1, self.N + 1))
        self.steps = 0
        self.low = 1
        self.high = self.N

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self._reset_internal()
        obs = {"low": self.low, "high": self.high}
        return obs, {}

    def step(self, action):
        self.steps += 1
        guess = int(action) + 1

        if guess == self.target:
            reward = 1.0
            terminated = True
        else:
            reward = -0.01
            terminated = False
            if guess < self.target:
                self.low = max(self.low, guess + 1)
            else:
                self.high = min(self.high, guess - 1)

        truncated = (self.steps >= self.max_steps) and not terminated
        obs = {"low": self.low, "high": self.high}
        return obs, reward, terminated, truncated, {}

    def render(self, mode="human"):
        print(f"Step {self.steps}: bounds = [{self.low}, {self.high}]")


In [11]:
def obs_to_state(obs: dict, N: int) -> int:
    low = obs["low"]
    high = obs["high"]
    return low * (N + 1) + high


In [12]:
N = 100
env = NumberGuessEnv(N=N, max_steps=N)

alpha = 0.1
gamma = 0.99
epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
n_episodes = 200000

state_size = (N + 1) * (N + 1)
action_size = env.action_space.n
Q = np.zeros((state_size, action_size), dtype=np.float32)


In [13]:
n_test = 1000
wins = 0
cnt = 0
for _ in range(n_test):
    obs, _ = env.reset()
    state = obs_to_state(obs, N)
    done = False
    x = 0
    while not done:
        x += 1
        action = int(np.argmax(Q[state]))
        next_obs, reward, terminated, truncated, _ = env.step(action)
        state = obs_to_state(next_obs, N)
        done = terminated or truncated
    wins += (reward == 1.0)
    cnt += x
print("average count of attemps to win: ", cnt / n_test)

print(f"win rate: {wins/n_test:.3f}")

average count of attemps to win:  99.01
win rate: 0.010


In [14]:
for ep in range(1, n_episodes + 1):
    obs, _ = env.reset()
    state = obs_to_state(obs, N)
    done = False

    while not done:
        if np.random.rand() < epsilon:
            action = env.action_space.sample()
        else:
            action = int(np.argmax(Q[state]))

        next_obs, reward, terminated, truncated, _ = env.step(action)
        next_state = obs_to_state(next_obs, N)
        done = terminated or truncated

        best_next = np.max(Q[next_state])
        td_target = reward + gamma * best_next
        Q[state, action] += alpha * (td_target - Q[state, action])
        state = next_state

    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if ep % 1000 == 0:
        print(f"Episode {ep}/{n_episodes}, epsilon={epsilon:.3f}")


Episode 1000/200000, epsilon=0.010
Episode 2000/200000, epsilon=0.010
Episode 3000/200000, epsilon=0.010
Episode 4000/200000, epsilon=0.010
Episode 5000/200000, epsilon=0.010
Episode 6000/200000, epsilon=0.010
Episode 7000/200000, epsilon=0.010
Episode 8000/200000, epsilon=0.010
Episode 9000/200000, epsilon=0.010
Episode 10000/200000, epsilon=0.010
Episode 11000/200000, epsilon=0.010
Episode 12000/200000, epsilon=0.010
Episode 13000/200000, epsilon=0.010
Episode 14000/200000, epsilon=0.010
Episode 15000/200000, epsilon=0.010
Episode 16000/200000, epsilon=0.010
Episode 17000/200000, epsilon=0.010
Episode 18000/200000, epsilon=0.010
Episode 19000/200000, epsilon=0.010
Episode 20000/200000, epsilon=0.010
Episode 21000/200000, epsilon=0.010
Episode 22000/200000, epsilon=0.010
Episode 23000/200000, epsilon=0.010
Episode 24000/200000, epsilon=0.010
Episode 25000/200000, epsilon=0.010
Episode 26000/200000, epsilon=0.010
Episode 27000/200000, epsilon=0.010
Episode 28000/200000, epsilon=0.010
E

In [15]:
n_test = 1000
wins = 0
cnt = 0
for _ in range(n_test):
    obs, _ = env.reset()
    state = obs_to_state(obs, N)
    done = False
    x = 0
    while not done:
        x += 1
        action = int(np.argmax(Q[state]))
        next_obs, reward, terminated, truncated, _ = env.step(action)
        state = obs_to_state(next_obs, N)
        done = terminated or truncated
    wins += (reward == 1.0)
    cnt += x
print("average count of attemps to win: ", cnt / n_test)

print(f"win rate: {wins/n_test:.3f}")

average count of attemps to win:  7.314
win rate: 1.000
