In [1]:
# ─── 2. add project root to sys.path (only if needed) ──────────────────
import sys, pathlib
from tqdm.auto import tqdm
repo = pathlib.Path().resolve()
if str(repo) not in sys.path:
    sys.path.insert(0, str(repo))

# ─── 3. regular imports — they'll update automatically after edits ─────
from env      import Game2048Env
from learners import FeatureTD0Learner
from agent    import RLAgent
from features import pattern, info

In [6]:
import random, numpy as np, time
random.seed(42)                        # reproducibility

# ----- build env, learner, agent -----
env = Game2048Env(ascii_render=False, gui=False)

learner = FeatureTD0Learner(alpha=0.005, gamma=0.99)
learner.add_feature(pattern([0,1,2,3,4,5], iso=8))   # 8-tuple over first two rows

agent = RLAgent(env, learner, epsilon=0.010, decay=1, eps_min=0.002)

Registered feature: 6-tuple pattern 012345 (size = 16777216, 64 MB)


In [7]:
EPISODES      = 20_000
STATS_EVERY   = 500

pbar = tqdm(range(1, EPISODES + 1), desc="Training", unit="ep")

for ep in pbar:
    score = agent.run_episode()                         # play one game
    learner.record_episode(env.b, score)                # store outcome

    # print aggregated stats every STATS_EVERY episodes
    if ep % STATS_EVERY == 0:
        learner.flush_stats(ep, unit=STATS_EVERY)       # prints via info()

Training:   0%|          | 0/20000 [00:00<?, ?ep/s]

500	avg = 2634.0	max = 10404.0
	32	100.0%	(0.8%)
	64	99.2%	(6.6%)
	128	92.6%	(37.4%)
	256	55.2%	(46.6%)
	512	8.6%	(8.2%)
	1024	0.4%	(0.4%)
1000	avg = 2856.5	max = 10216.0
	32	100.0%	(0.2%)
	64	99.8%	(6.2%)
	128	93.6%	(33.6%)
	256	60.0%	(46.6%)
	512	13.4%	(13.2%)
	1024	0.2%	(0.2%)
1500	avg = 3035.2	max = 11688.0
	32	100.0%	(0.2%)
	64	99.8%	(4.8%)
	128	95.0%	(28.8%)
	256	66.2%	(52.6%)
	512	13.6%	(13.4%)
	1024	0.2%	(0.2%)
2000	avg = 3183.7	max = 12972.0
	64	100.0%	(3.8%)
	128	96.2%	(28.4%)
	256	67.8%	(51.8%)
	512	16.0%	(15.4%)
	1024	0.6%	(0.6%)
2500	avg = 3452.6	max = 8120.0
	64	100.0%	(2.6%)
	128	97.4%	(23.8%)
	256	73.6%	(50.8%)
	512	22.8%	(22.8%)
3000	avg = 3516.6	max = 13328.0
	32	100.0%	(0.2%)
	64	99.8%	(1.4%)
	128	98.4%	(23.6%)
	256	74.8%	(51.8%)
	512	23.0%	(22.6%)
	1024	0.4%	(0.4%)
3500	avg = 3578.0	max = 13736.0
	64	100.0%	(2.6%)
	128	97.4%	(24.2%)
	256	73.2%	(49.2%)
	512	24.0%	(23.2%)
	1024	0.8%	(0.8%)
4000	avg = 3691.3	max = 12552.0
	64	100.0%	(2.4%)
	128	97.6%	(21.6%)
	256	76.0%

In [8]:
learner.save("20000_episodes_eps=0.01_alpha=0.05.pkl") 

[FeatureTD0] saved feature list → 20000_episodes_eps=0.01_alpha=0.05.pkl
