In [3]:
import sys, pathlib, importlib, warnings

repo = pathlib.Path().resolve()          # notebook’s directory
if str(repo) not in sys.path:
    sys.path.insert(0, str(repo))

# hot reload edited modules
for m in ["board", "env", "features", "learners", "agent"]:
    if m in sys.modules:
        importlib.reload(sys.modules[m])

from env import Game2048Env
from learners import make_learner
from agent import Agent

In [4]:
import numpy as np

def train(num_episodes=50_000, alpha=0.01, epsilon=0.1, seed=None):
    env     = Game2048Env(seed=seed)
    learner = make_learner(alpha=alpha)
    agent   = Agent(env, learner, eps=epsilon)

    ep_history, score_history = [], []
    for ep in range(1, num_episodes + 1):
        score = agent.run_episode()
        ep_history.append(ep)
        score_history.append(score)
        if ep % 1_000 == 0:
            print(f"{ep:>6}  mean score last 1k = {np.mean(score_history[-1000:]):.0f}  "
                  f"best = {np.max(score_history[-1000:])}")
    return learner, np.array(ep_history), np.array(score_history)

learner, ep, scores = train(num_episodes=50_000, alpha=0.01, epsilon=0.05, seed=42)

  1000  mean score last 1k = 2212  best = 9976
  2000  mean score last 1k = 3086  best = 11400
  3000  mean score last 1k = 3958  best = 12040
  4000  mean score last 1k = 4525  best = 12356
  5000  mean score last 1k = 4831  best = 12516
  6000  mean score last 1k = 5065  best = 14312
  7000  mean score last 1k = 5347  best = 12772
  8000  mean score last 1k = 5504  best = 14720
  9000  mean score last 1k = 5497  best = 13888
 10000  mean score last 1k = 5497  best = 14068
 11000  mean score last 1k = 5779  best = 13744
 12000  mean score last 1k = 5823  best = 15436
 13000  mean score last 1k = 5844  best = 14476
 14000  mean score last 1k = 5865  best = 14748
 15000  mean score last 1k = 5739  best = 14504
 16000  mean score last 1k = 5987  best = 15336
 17000  mean score last 1k = 5812  best = 15080


KeyboardInterrupt: 