Commit e11e369

remove doom memory

Drops the separate `doomMemory` buffer and its dedicated training stage: replays collected while exploring after a movement loop are now spliced into the main episode replays and stored in the single prioritized memory. The maze wrapper's loop penalty is also removed and the success threshold raised from 0.95 to 0.99.
1 parent 9977f7f commit e11e369

File tree: 6 files changed (+37, −83 lines)


Core/MazeRLWrapper.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -42,11 +42,11 @@ def apply(self, actionIndex):
     if self._env.dead: # unreachable due to actions masking
       return nextState, -10, True, prevState
 
-    if 0.95 <= self._env.score:
+    if 0.99 <= self._env.score:
       return nextState, 0, True, prevState
 
     if self._movingLoop():
-      return nextState, -5, True, prevState
+      return nextState, 0, True, prevState
 
     self._done = False
     reward = 0.3 if isNewCell else 0 # small reward for visiting new cell
```
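After this change, the wrapper's per-step scheme is: -10 on death, 0 (terminal) once the score reaches 0.99, 0 (terminal) on a detected movement loop (previously -5), and a small +0.3 bonus for visiting a new cell. A minimal self-contained sketch of that scheme; the flag names mirror the diff, but this standalone function is an illustration, not the project's `MazeRLWrapper`:

```python
# Sketch of the post-commit reward rules; not the project's verbatim code.
def step_outcome(dead: bool, score: float, in_moving_loop: bool, is_new_cell: bool):
  """Return (reward, done) for one step, following the rules in the diff above."""
  if dead:                 # unreachable in practice due to action masking
    return -10, True
  if 0.99 <= score:        # maze solved
    return 0, True
  if in_moving_loop:       # loop detected: episode ends, but no penalty anymore
    return 0, True
  return (0.3 if is_new_cell else 0), False  # small reward for a new cell

print(step_outcome(dead=False, score=0.5, in_moving_loop=False, is_new_cell=True))    # (0.3, False)
print(step_outcome(dead=False, score=0.995, in_moving_loop=False, is_new_cell=False)) # (0, True)
```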

README.md

Lines changed: 0 additions & 1 deletion
```diff
@@ -64,7 +64,6 @@
 
 # Ideas and experiments
 
-- [ ] Replace the split of memory/episodes into a main set and a post-loop set.
 - [ ] Implement distillation of several policies, using extra rewards or other methods.
 - [ ] Compare an agent trained without a teacher to one trained with a teacher. (500 epochs)
 - [ ] Train an agent that receives no information about its own movements (only data about its surroundings).
```

distillation.py

Lines changed: 17 additions & 38 deletions
```diff
@@ -106,10 +106,6 @@ def learn_environment(teacher, model, params):
   ]
 
   memory = CebPrioritized(maxSize=5000, sampleWeight='abs')
-  doomMemory = CebLinear(
-    maxSize=params.get('max steps after loop', 16) * 10000,
-    sampleWeight='abs'
-  )
   trainableModel, teacherPower = wrapStudentModel(model)
   ######################################################
   def withTeacherPredictions(replay):
@@ -119,24 +115,20 @@ def withTeacherPredictions(replay):
 
   def testModel(EXPLORE_RATE):
     for e in environments: e.reset()
-    replays = Utils.emulateBatch(
-      environments,
-      DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
-      maxSteps=params.get('max test steps')
-    )
-    for replay, _ in replays:
-      if params.get('clip replay', False):
-        replay = Utils.clipReplay(replay, loopLimit=LOOP_LIMIT)
-      if BOOTSTRAPPED_STEPS < len(replay):
-        memory.addEpisode(withTeacherPredictions(replay), terminated=True)
+    replays = [replay for replay, _ in Utils.emulateBatch(
+        environments,
+        DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
+        maxSteps=params.get('max test steps')
+      )
+    ]
 
-    scores = [x.score for x in environments]
     ################
-    # collect bad experience
-    envs = [e for e in environments if e.hitTheLoop]
-    if envs:
+    # explore if hit the loop
+    envsIndexes = [i for i, e in enumerate(environments) if e.hitTheLoop]
+    if envsIndexes:
+      envs = [environments[i] for i in envsIndexes]
       for e in envs: e.Continue()
-      replays = Utils.emulateBatch(
+      exploreReplays = Utils.emulateBatch(
         envs,
         DQNAgent(
           model,
@@ -145,11 +137,13 @@ def testModel(EXPLORE_RATE):
         ),
         maxSteps=params.get('max steps after loop', 16)
       )
-      for replay, _ in replays:
-        if BOOTSTRAPPED_STEPS < len(replay):
-          doomMemory.addEpisode(withTeacherPredictions(replay), terminated=True)
+      for ind, (replay, _) in zip(envsIndexes, exploreReplays):
+        replays[ind] += replay[1:]
     ################
-    return scores
+    for replay in replays:
+      if BOOTSTRAPPED_STEPS < len(replay):
+        memory.addEpisode(withTeacherPredictions(replay), terminated=True)
+    return [x.score for x in environments]
   ######################################################
   # collect some experience
   for _ in range(2):
@@ -181,19 +175,6 @@ def testModel(EXPLORE_RATE):
       }
     )
     print('Avg. train loss: %.4f' % trainLoss)
-
-    if BATCH_SIZE < len(doomMemory):
-      trainLoss = train(
-        model, trainableModel, doomMemory,
-        {
-          'gamma': GAMMA,
-          'batchSize': BATCH_SIZE,
-          'steps': BOOTSTRAPPED_STEPS,
-          'episodes': params['train doom episodes'](epoch),
-          'alpha': params.get('doom alpha', lambda _: alpha)(epoch)
-        }
-      )
-      print('Avg. train doom loss: %.4f' % trainLoss)
     ##################
     # test
     print('Testing...')
@@ -249,13 +230,11 @@ def testModel(EXPLORE_RATE):
   'warm up epochs': 0,
   'test episodes': 128,
   'train episodes': lambda _: 128,
-  'train doom episodes': lambda _: 32,
 
   'alpha': lambda _: 1,
   'explore rate': lambda _: 0,
 
   'agent noise': 0.01,
-  'clip replay': True,
 
   'explore rate after loop': 0.2,
   'agent noise after loop': 0.1,
```
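The structural change in both training scripts is the same: instead of routing post-loop exploration into a separate `doomMemory`, each exploration replay is spliced back into the replay of the environment it came from (via `zip(envsIndexes, exploreReplays)`), and only the merged replays go into the single prioritized memory. A small self-contained illustration of that splice, with made-up placeholder transitions; dropping the first element via `replay[1:]` is presumably to avoid duplicating the entry at the point where the episode was continued:

```python
# Toy illustration of the splice used above; strings stand in for transitions.
replays = [
  ['a0', 'a1', 'a2'],        # env 0: finished normally
  ['b0', 'b1'],              # env 1: hit a movement loop
  ['c0', 'c1', 'c2', 'c3'],  # env 2: hit a movement loop
]
hitTheLoop = [False, True, True]

# indexes of environments that get continued with a noisier, more exploratory agent
envsIndexes = [i for i, hit in enumerate(hitTheLoop) if hit]
# their continuation replays; the first entry overlaps the point of continuation
exploreReplays = [
  ['b1', 'b2', 'b3'],
  ['c3', 'c4'],
]

for ind, replay in zip(envsIndexes, exploreReplays):
  replays[ind] += replay[1:]  # skip the overlapping first entry, append the rest

print(replays)
# [['a0', 'a1', 'a2'], ['b0', 'b1', 'b2', 'b3'], ['c0', 'c1', 'c2', 'c3', 'c4']]
```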

learn_environment.py

Lines changed: 17 additions & 39 deletions
```diff
@@ -5,47 +5,36 @@
 import Utils
 import fit_stage
 import os
-from Utils.ExperienceBuffers.CebLinear import CebLinear
 
 def learn_environment(model, params):
   NAME = params['name']
   BATCH_SIZE = params['batch size']
   GAMMA = params['gamma']
   BOOTSTRAPPED_STEPS = params['bootstrapped steps']
-  LOOP_LIMIT = params['maze']['loop limit']
   metrics = {}
 
   environments = [
     MazeRLWrapper(params['maze']) for _ in range(params['test episodes'])
   ]
 
   memory = CebPrioritized(maxSize=5000, sampleWeight='abs')
-  doomMemory = CebLinear(
-    maxSize=params.get('max steps after loop', 16) * 1000,
-    sampleWeight='abs'
-  )
-
   ######################################################
   def testModel(EXPLORE_RATE):
     for e in environments: e.reset()
-    replays = Utils.emulateBatch(
-      environments,
-      DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
-      maxSteps=params.get('max test steps')
-    )
-    for replay, _ in replays:
-      if params.get('clip replay', False):
-        replay = Utils.clipReplay(replay, loopLimit=LOOP_LIMIT)
-      if BOOTSTRAPPED_STEPS < len(replay):
-        memory.addEpisode(replay, terminated=True)
+    replays = [replay for replay, _ in Utils.emulateBatch(
+        environments,
+        DQNAgent(model, exploreRate=EXPLORE_RATE, noise=params.get('agent noise', 0)),
+        maxSteps=params.get('max test steps')
+      )
+    ]
 
-    scores = [x.score for x in environments]
     ################
-    # collect bad experience
-    envs = [e for e in environments if e.hitTheLoop]
-    if envs:
+    # explore if hit the loop
+    envsIndexes = [i for i, e in enumerate(environments) if e.hitTheLoop]
+    if envsIndexes:
+      envs = [environments[i] for i in envsIndexes]
       for e in envs: e.Continue()
-      replays = Utils.emulateBatch(
+      exploreReplays = Utils.emulateBatch(
         envs,
         DQNAgent(
           model,
@@ -54,11 +43,13 @@ def testModel(EXPLORE_RATE):
         ),
         maxSteps=params.get('max steps after loop', 16)
       )
-      for replay, _ in replays:
-        if BOOTSTRAPPED_STEPS < len(replay):
-          doomMemory.addEpisode(replay, terminated=True)
+      for ind, (replay, _) in zip(envsIndexes, exploreReplays):
+        replays[ind] += replay[1:]
     ################
-    return scores
+    for replay in replays:
+      if BOOTSTRAPPED_STEPS < len(replay):
+        memory.addEpisode(replay, terminated=True)
+    return [x.score for x in environments]
   ######################################################
   # collect some experience
   for _ in range(2):
@@ -86,19 +77,6 @@ def testModel(EXPLORE_RATE):
       }
     )
     print('Avg. train loss: %.4f' % trainLoss)
-
-    if BATCH_SIZE < len(doomMemory):
-      trainLoss = fit_stage.train(
-        model, doomMemory,
-        {
-          'gamma': GAMMA,
-          'batchSize': BATCH_SIZE,
-          'steps': BOOTSTRAPPED_STEPS,
-          'episodes': params['train doom episodes'](epoch),
-          'alpha': params.get('doom alpha', lambda _: alpha)(epoch)
-        }
-      )
-      print('Avg. train doom loss: %.4f' % trainLoss)
     ##################
     # test
     print('Testing...')
```
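One detail both scripts keep is the `BOOTSTRAPPED_STEPS < len(replay)` guard before `memory.addEpisode(...)`: a replay has to be longer than the bootstrap horizon to yield even one n-step target. A toy, self-contained sketch of an n-step bootstrapped return; the numbers and the helper below are illustrative assumptions, not the project's `fit_stage` code:

```python
# Illustrative n-step bootstrapped return; not the project's actual target computation.
BOOTSTRAPPED_STEPS = 3
GAMMA = 0.95

def n_step_return(rewards, n, gamma, bootstrap_value):
  """Discounted sum of the first n rewards plus a discounted bootstrap value."""
  return sum(gamma ** k * r for k, r in enumerate(rewards[:n])) + gamma ** n * bootstrap_value

rewards = [0.3, 0.0, 0.3, 0.0, 0.3]   # made-up per-step rewards from one replay
if BOOTSTRAPPED_STEPS < len(rewards): # same guard shape as in the diff above
  print(n_step_return(rewards, BOOTSTRAPPED_STEPS, GAMMA, bootstrap_value=1.0))
  # 0.3 + 0.95*0.0 + 0.95**2*0.3 + 0.95**3*1.0 ≈ 1.4281
```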

train.py

Lines changed: 0 additions & 2 deletions
```diff
@@ -52,13 +52,11 @@ def getModel(shape):
   'warm up epochs': 0,
   'test episodes': 128,
   'train episodes': lambda _: 128,
-  'train doom episodes': lambda _: 32,
 
   'alpha': lambda _: 1,
   'explore rate': lambda _: 0,
 
   'agent noise': 0.01,
-  'clip replay': True,
 
   'explore rate after loop': 0.2,
   'agent noise after loop': 0.1
```

view_maze.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -20,7 +20,7 @@
 from model import createModel
 
 def createMaze():
-  sz = 16 * 4
+  sz = 64
   maze = (0.8 < np.random.rand(sz, sz)).astype(np.float32)
   res = CMazeEnvironment(
     maze=maze,
```
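This change only folds `16 * 4` into the literal `64`. For reference, the surrounding context lines generate the maze grid by thresholding uniform noise, so roughly 20% of cells end up set to 1.0 (a quick check, assuming NumPy imported as `np` as in the file):

```python
import numpy as np

sz = 64
maze = (0.8 < np.random.rand(sz, sz)).astype(np.float32)
print(maze.shape)   # (64, 64)
print(maze.mean())  # close to 0.2: about a fifth of the cells are 1.0
```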
