Commit

Merge pull request #10 from jerryz123/imitation-learning
Imitation learning
jerryz123 committed Nov 16, 2017
2 parents b4b8411 + a1f4e60 commit ddb3946
Showing 16 changed files with 154 additions and 265 deletions.
45 changes: 30 additions & 15 deletions examples/collect_data.py
@@ -10,10 +10,14 @@
import pygame
from copy import deepcopy
from random import random

from gym_urbandriving.agents import AccelAgent, KeyboardAgent, NullAgent
from gym_urbandriving.agents import AccelAgent, KeyboardAgent, NullAgent, TreeSearchAgent
from gym_urbandriving import Car

def vectorize_state(state):
res = []
for obj in state.dynamic_objects:
res.extend([obj.x, obj.y, obj.vel, obj.angle])
return res

def early_stop_actions(actions):
"""
@@ -30,7 +34,7 @@ def early_stop_actions(actions):
True if approximately all the cars have gone through the intersection and are back up to speed.
"""
return actions == [(0, 1), (0, 1), (0, 1), (0, 1)]
return actions[0] == None

def run_and_collect():
"""
@@ -47,24 +51,25 @@ def run_and_collect():
saved_actions = []

vis = uds.PyGameVisualizer((800, 800))
init_state = uds.state.SimpleIntersectionState(ncars=4, nped=0)
init_state = uds.state.SimpleIntersectionState(ncars=2, nped=0)

env = uds.UrbanDrivingEnv(init_state=init_state,
visualizer=vis,
agent_mappings={Car:NullAgent},
max_time=100,
agent_mappings={Car:AccelAgent},
max_time=200,

randomize=True,
nthreads=4)
use_ray=True)

env._render()
state = init_state
agent = AccelAgent()
state = env.current_state
agent = TreeSearchAgent()
reset_counter = 0
action = None

while(True):
action = agent.eval_policy(state)
saved_states.append(state.vectorize_state())
action = agent.eval_policy(deepcopy(state))
saved_states.append(vectorize_state(state))
start_time = time.time()
state, reward, done, info_dict = env._step(action)
saved_actions.append(info_dict["saved_actions"])
@@ -74,14 +79,24 @@ def run_and_collect():
reset_counter+=1
else:
reset_counter = 0
if done or reset_counter >50:

env._render(waypoints = agent.waypoints)
if done or reset_counter >5:
# Time to save our current run and reset our env and our saved data
reset_counter = 0
print("done")
time.sleep(1)
env._reset()
state = env.current_state


# reset agent state
agent.waypoints = None
agent.actions = None

pickle.dump((saved_states, saved_actions),open("data/"+str(np.random.random())+"dump.data", "wb+"))

saved_states = []
saved_actions = []

if __name__ == "__main__":
run_and_collect()
cProfile.run('run_and_collect()', 'temp/stats')
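
Each run of run_and_collect() pickles a (saved_states, saved_actions) tuple to data/<random>dump.data: states are flattened by vectorize_state into [x, y, vel, angle] entries per dynamic object, and actions come from info_dict["saved_actions"]. A minimal sketch for inspecting one collected dump, assuming run_and_collect() has already written files under data/:

import glob
import pickle

# Load the first demonstration file and report how many timesteps it holds.
for path in glob.glob("data/*dump.data"):
    with open(path, "rb") as f:
        saved_states, saved_actions = pickle.load(f)
    print(path, len(saved_states), "timesteps")
    break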

56 changes: 19 additions & 37 deletions examples/learn_model.py
@@ -1,18 +1,9 @@
import glob
import pickle
import numpy as np
from sklearn import svm
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from copy import deepcopy

# Parameter Grids for sci-kit learn
#param_grid = [{'C': [.01, 1, 100], 'gamma': [1, 10, 100], 'kernel': ['rbf', 'linear']}]
#param_grid = [{'C': [10], 'gamma': [1], 'kernel': ['rbf']}]
#param_grid = [{'alpha': [0, .01, .1, 1, 10, 100]}, {'learning_rate_init':[.01, .1, 1, 10, 100]}]
param_grid = [{'alpha': [0]}, {'learning_rate_init':[1]}]

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree

def process_files(list_of_paths):
"""
@@ -39,21 +30,20 @@ def process_files(list_of_paths):
states = data[0]
actions = data[1]
assert(len(states) == len(actions))
if(len(states)<20): # bad demos that ended early
continue
for t in range(len(states)): # iterate through time
for o in range(len(actions[t])): # iterate through objects
if not actions[t][o] is None: # TODO: fix this, accel agents shouldn't return None?
print(states[t])
if not actions[t][0] is None: # TODO: fix this, accel agents shouldn't return None?
X.append(np.array(states[t])) # TODO: fix this after state vectorization works
y.append(actions[t][o][1])
y.append(actions[t][0])

X = np.matrix(X)
y = np.array(y)
# TODO: add feature normalization
print(X.shape)
print(y.shape)
print(X)
print(y)
return X,y
print(y.shape)

return X,y


def learn():
@@ -69,32 +59,24 @@ def learn():

all_data = glob.glob("data/*dump.data")
train_data = all_data[0:len(all_data)*8//10]
validation_data = all_data[len(all_data)*8//10: len(all_data)] # TODO: fix when there are <5 files
validation_data = all_data[len(all_data)*8//10: len(all_data)]
train_X, train_y = process_files(train_data)
valid_X, valid_y = process_files(validation_data)
#all_X, all_y = process_files(all_data)
svc = svm.SVC(kernel='rbf')
lr = linear_model.Lasso()
mlp = MLPRegressor(hidden_layer_sizes=(100, 100, 100), learning_rate='adaptive', max_iter = 10000, tol=1e-5)
model = GridSearchCV(estimator=mlp, param_grid=param_grid)
print(train_X.shape)

model = RandomForestClassifier(n_estimators=10, criterion='gini', max_features=None, max_depth=15)

model.fit(train_X, train_y)
train_yp = model.predict(train_X)
valid_yp = model.predict(valid_X)
"""
train_error = np.mean( train_y != train_yp)
valid_error = np.mean( valid_y != valid_yp)
sanity_error = np.mean( train_yp != 1)
sanity_error2 = np.mean( train_y != 1)
"""
train_error = np.mean(np.square(train_y - train_yp))
valid_error = np.mean(np.square(valid_y - valid_yp))
sanity_error = np.mean(train_yp < .5)
sanity_error2 = np.mean(train_y < .5)

train_error = np.mean(train_y != train_yp)
valid_error = np.mean(valid_y != valid_yp)


# TODO: more informative printout
print(len(train_data), len(validation_data))
print(sanity_error, sanity_error2, train_error, valid_error)
print(train_error, valid_error)

pickle.dump(model, open("model.model", "wb"))

if __name__ == "__main__":
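
Since the model is now a RandomForestClassifier over discrete actions rather than an MLPRegressor, learn() reports a misclassification rate instead of a squared error. A small illustration of the new metric, using hypothetical label arrays:

import numpy as np

# Hypothetical true and predicted action labels for four samples.
valid_y = np.array([1, 0, 1, 1])
valid_yp = np.array([1, 1, 1, 0])

# Fraction of samples where the prediction disagrees with the label.
valid_error = np.mean(valid_y != valid_yp)
print(valid_error)  # 0.5
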
3 changes: 2 additions & 1 deletion examples/test.py
@@ -2,11 +2,11 @@
import gym_urbandriving as uds
import cProfile
import time
import numpy as np

from gym_urbandriving.agents import KeyboardAgent, AccelAgent, NullAgent, TrafficLightAgent
from gym_urbandriving.assets import Car, TrafficLight

import numpy as np

"""
Test File, to demonstrate general functionality of environment
@@ -51,6 +51,7 @@ def f():
# Simulate the state
state, reward, done, info_dict = env._step(action)
env._render()
# keep simulator running in spite of collisions or timing out
done = False
# If we crash, sleep for a moment, then reset
if done:
16 changes: 8 additions & 8 deletions examples/test_model.py
@@ -5,7 +5,7 @@
import numpy as np
import pickle

from gym_urbandriving.agents import ModelAgent
from gym_urbandriving.agents import ModelAgent, AccelAgent
from gym_urbandriving.assets import Car

def test_model():
@@ -24,14 +24,15 @@ def test_model():
start_time = time.time()

vis = uds.PyGameVisualizer((800, 800))
init_state = uds.state.SimpleIntersectionState()
init_state = uds.state.SimpleIntersectionState(ncars=2, nped=0)

env = uds.UrbanDrivingEnv(init_state=init_state,
visualizer=vis,
agent_mappings={Car:ModelAgent},
max_time=250,
agent_mappings={Car:AccelAgent},
max_time=200,
randomize=True,
nthreads=4)
use_ray=True)

env._render()
state = init_state
agent = ModelAgent()
@@ -57,10 +58,9 @@ def test_model():

print("done")
print((time.time()-start_time)/totalticks, totalticks)
print(info_dict["dynamic_collisions"])

accs += info_dict["predict_accuracy"]
print(accs/totalticks)
#accs += info_dict["predict_accuracy"]
#print(accs/totalticks)

env._reset()
state = env.current_state
24 changes: 14 additions & 10 deletions examples/tree_search_train.py
@@ -3,7 +3,7 @@
import cProfile
import time

from gym_urbandriving.agents import NullAgent, TreeSearchAgent, SimplePathAgent, AccelAgent
from gym_urbandriving.agents import NullAgent, TreeSearchAgent, AccelAgent

import numpy as np
import pygame
@@ -19,24 +19,28 @@ def run():
Examples
--------
python3 examples/test_path.py
python3 examples/tree_search_train.py
"""

vis = uds.PyGameVisualizer((800, 800))
init_state = uds.state.SimpleIntersectionState(ncars=2, nped=0)
init_state = uds.state.SimpleIntersectionState(ncars=3, nped=0)


env = uds.UrbanDrivingEnv(init_state=None,
visualizer=vis,
agent_mappings={Car:NullAgent},
agent_mappings={Car:AccelAgent},
max_time=-1,
randomize=False,
)
randomize=True,
use_ray=True)


env._reset()
state = env.current_state
agent = TreeSearchAgent()

# To see the training in action
agent = TreeSearchAgent(vis = vis)

action = None

while(True):
@@ -48,12 +52,12 @@ def run():
if done:
print("done")
time.sleep(1)
print(info_dict["dynamic_collisions"])
env._reset()
state = env.current_state

# reset agent state
agent.waypoints = None
agent.actions = None

cProfile.run('run()', 'temp/stats')

if __name__ == "__main__":
run()
7 changes: 4 additions & 3 deletions gym_urbandriving/agents/accel_agent.py
@@ -16,12 +16,12 @@ class AccelAgent:
planning_env : UrbanDrivingEnv
World simulator used internally to plan
"""
actions = [(0, 1), (2, 1), (-2, 1), (0, 0), (1, -1), (-1, -1)]
#actions = [(0,1),(0,0),(0,-1)]

def __init__(self, agent_num=0):
self.agent_num = agent_num
from gym_urbandriving import UrbanDrivingEnv
self.planning_env = UrbanDrivingEnv(init_state=None)
self.valid_actions = [(0, 1), (2, 1), (-2, 1), (0, 0), (1, -1), (-1, -1)]
return

def eval_policy(self, state, nsteps=8):
Expand All @@ -42,12 +42,13 @@ def eval_policy(self, state, nsteps=8):
action
Best action
"""

self.planning_env._reset(state)
start_pos = state.dynamic_objects[self.agent_num].get_pos()
best_action = None
best_time = 0
best_distance = 0
for action in self.actions:
for action in self.valid_actions:
self.planning_env._reset()
pos = state.dynamic_objects[self.agent_num].get_pos()
dist_to_coll = state.min_dist_to_coll(self.agent_num)
1 change: 1 addition & 0 deletions gym_urbandriving/agents/keyboard_agent.py
@@ -13,6 +13,7 @@ class KeyboardAgent:
def __init__(self, agent_num=0):
self.agent_num = agent_num
return

def eval_policy(self, state):
"""
Returns action based on keyboard input
28 changes: 9 additions & 19 deletions gym_urbandriving/agents/model_agent.py
@@ -10,9 +10,15 @@ def __init__(self, agent_num=0):
self.model = pickle.load(open("model.model", "rb"))
self.score = 0
return

def vectorize_state(self, state):
res = []
for obj in state.dynamic_objects:
res.extend([obj.x, obj.y, obj.vel, obj.angle])
return res



def eval_policy(self, state, nsteps=8):
def eval_policy(self, state):
"""
If we can accelerate, see if we crash in nsteps.
If we crash, decelerate, else accelerate
@@ -45,21 +51,5 @@ def eval_policy(self, state, nsteps=8):
best_action = action
best_time = time
"""

# Our prediction
pred_class = self.model.predict(np.array([state.vectorize_state()]))
our_action = (0,pred_class[0])

# TODO: fix arbitrary quantization
our_action = (0,1)
if pred_class<0:
our_action = (0,-1)
elif pred_class<.5:
our_action = (0,0)
else:
our_action = (0,1)


#self.score += (best_action[1]-pred_class[0])**2

return our_action
return self.model.predict(np.array([self.vectorize_state(state)]))[0]
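
A hedged usage sketch of the simplified ModelAgent, assuming a model.model file trained by examples/learn_model.py is present in the working directory. It mirrors the rollout setup in examples/test_model.py; eval_policy now returns the classifier's predicted action directly instead of quantizing a regression output:

import gym_urbandriving as uds
from gym_urbandriving.agents import ModelAgent, AccelAgent
from gym_urbandriving.assets import Car

# Environment configuration matching examples/test_model.py.
init_state = uds.state.SimpleIntersectionState(ncars=2, nped=0)
env = uds.UrbanDrivingEnv(init_state=init_state,
                          agent_mappings={Car: AccelAgent},
                          max_time=200,
                          randomize=True,
                          use_ray=True)

agent = ModelAgent()
state = env.current_state
action = agent.eval_policy(state)                  # classifier's predicted action
state, reward, done, info_dict = env._step(action)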
