In [None]:
# You will need to run this block twice to make it effective
!apt-get update 
!apt-get install cmake 
!pip install --upgrade setuptools 
!pip install ez_setup 
!pip install gym[atari]
!pip install box2d-py 
!pip install gym[Box_2D] 
!pip install gym[box2d]

!pip install gym pyvirtualdisplay 
!apt-get install -y xvfb python-opengl ffmpeg 

In [None]:
import gym
from gym.wrappers import Monitor
import glob
import io
import base64
from IPython.display import HTML
from pyvirtualdisplay import Display
from IPython import display as ipythondisplay

import torch
import torch.nn as nn
import numpy as np
from torch import nn
import copy
from collections import deque
import random
from tqdm import tqdm
import matplotlib.pyplot as plt
import math

from torch import randint
from time import sleep
import pickle
import statistics as st
from gym.core import RewardWrapper
import gc



# Start a pyvirtualdisplay virtual display that is not shown on screen (visible=0).
# NOTE(review): presumably required so env.render() / video recording work on a
# headless machine such as Colab — confirm.
display = Display(visible=0, size=(1400, 900))
display.start()

"""
Utility functions to enable video recording of gym environment 
and displaying it.
To enable video, just do "env = wrap_env(env)"
"""



def show_video():
  """Display a recorded episode video inline in the notebook.

  Looks for ``*.mp4`` files in the ``video/`` directory (the directory
  ``wrap_env`` passes to the Monitor wrapper), base64-encodes the first match
  returned by ``glob`` and renders it through an HTML ``<video>`` tag.
  Prints "Could not find video" when no file matches.
  """
  mp4list = glob.glob('video/*.mp4')
  if len(mp4list) > 0:
    mp4 = mp4list[0]
    # Read-only binary mode inside a context manager: the file is only read,
    # and the handle is closed deterministically instead of being leaked.
    with io.open(mp4, 'rb') as video_file:
      video = video_file.read()
    encoded = base64.b64encode(video)
    ipythondisplay.display(HTML(data='''<video alt="test" autoplay 
                loop controls style="height: 400px;">
                <source src="data:video/mp4;base64,{0}" type="video/mp4" />
             </video>'''.format(encoded.decode('ascii'))))
  else: 
    print("Could not find video")
    

def wrap_env(env):
  """Return ``env`` wrapped in a gym ``Monitor`` that records into ``./video``.

  ``force=True`` is forwarded to the Monitor.
  NOTE(review): per the gym docs this clears earlier recordings in that
  directory — confirm for the installed gym version.
  """
  # Alternative kept from the original author: RecordVideo(env, './video')
  return Monitor(env, './video', force=True)
  
def get_speed(state):
    """Return a scalar speed estimate read from a processed frame.

    Takes the 12x1 pixel column at rows 84..95, column 13 of ``state``,
    shifts each pixel by -0.495, scales by 10 and sums the result.
    NOTE(review): this region is presumably the speed bar of the dashboard
    strip at the bottom of the frame — confirm.
    """
    speed_column = state[84:96, 13:14]
    return np.sum((speed_column - 0.495) * 10)

def process_image(image):
  """Convert an (H, W, C) frame into a single-channel float image.

  The result is ``(0.55 * red - 0.45 * green + 255 * 0.495) / 255`` per
  pixel (the third channel is ignored), with every value below 0.4 set to 0.
  A new array is returned; ``image`` itself is not modified.
  """
  red_term = image[:, :, 0:1] * 0.55
  green_term = image[:, :, 1:2] * -0.45 + 255 * 0.495
  gray = np.squeeze(red_term + green_term) / 255

  # Suppress everything darker than the 0.4 threshold.
  below_threshold = gray < 0.4
  gray[below_threshold] = 0
  return gray

def plot_image(array):
  """Show ``array`` as a grayscale image with the colour range fixed to [0, 1]."""
  plt.imshow(array, vmin=0, vmax=1, cmap='gray')
  plt.show()



def simulate(agent=None,env=None,epsilon=0,memory=None,render=False):
  """Run one episode with an epsilon-greedy policy and return its statistics.

  Args:
    agent: network providing ``eval()``, ``get_action(state)`` and
      ``convert_action(action, state)``. Required despite the ``None`` default
      (``agent.eval()`` is called unconditionally).
    env: gym environment; it is reset inside this function.
    epsilon: probability of picking a random action index in {0, 1, 2}
      instead of the agent's greedy action.
    memory: optional replay buffer; when given, every completed macro-step is
      stored as ``[state, action, reward, next_state]`` via ``memory.collect``.
    render: when True the env is wrapped with ``wrap_env`` for video
      recording, the result is printed, the env is closed and the video shown.

  Returns:
    ``(reward, ep_len)``: accumulated episode reward and the number of
    environment steps taken after the 70 warm-up steps (the step that
    returned ``done`` is not counted).
  """
  agent.eval()
  with torch.no_grad():
    if(render):
      env = wrap_env(env)
      # NOTE(review): render() is called before reset() here — confirm the
      # Monitor wrapper tolerates that ordering.
      env.render()
    image, done, reward = env.reset(), False, 0

    # skip the first 70 frames by stepping with the all-zero action
    # (these are frames, not episodes; rewards from them are discarded)
    for i in range(0,70):
      image, rew, done, info = env.step([0,0,0])
    state = process_image(image)
    ep_len=0

    # counter for bad experiences: starts at -8, so the first 8 negative-reward
    # macro-steps are tolerated; a non-negative macro-step only clamps the
    # counter to at most 0, it never restores the initial budget
    bad_exp_cnt = -8
    bad_cnt_lim = 2

    # time discretization: each chosen action is repeated for this many env steps
    time_step = 5

    while not done:
        # exploitation(0) vs exploration(1)
        sample = torch.bernoulli(torch.tensor(epsilon).float())
        if(sample==1):
          A = torch.randint(0,3,(1,))
        else:
          A = agent.get_action(state)

        # progress a time step: repeat the action and sum the rewards
        rew = 0
        for i in range(0,time_step):
            next_image, r, done, info = env.step(agent.convert_action(A,state))
            # NOTE(review): on `done` the reward `r` of that final step is not
            # added and the transition is never stored — confirm this is intended.
            if(done):
              break
            ep_len+=1
            rew+=r
        if(done):
          break
        next_state = process_image(next_image)

        # severe negative reward once the bad-step counter reaches its limit
        # NOTE(review): the original comment also mentions overspeeding, but no
        # speed check happens in this function.
        if(rew<0):
          bad_exp_cnt+=1
        else:
          bad_exp_cnt=min(bad_exp_cnt,0)
        if(bad_exp_cnt>=bad_cnt_lim):
          rew+=-100
        reward += rew

        # collect memory (the penalised reward is what gets stored)
        # NOTE(review): `is not None` would be the idiomatic comparison here.
        if(memory!=None):
          memory.collect([state, A, rew, next_state])
        state = next_state

        # stop criteria: step budget exhausted or too many bad macro-steps
        if(ep_len>2000 or bad_exp_cnt>=bad_cnt_lim):
          break

    # re-add 100 to the episode reward to resync the measured reward with the
    # documentation (undo the -100 penalty); the extra
    # bad_cnt_lim*time_step*0.1 term evaluates to 1.0
    # NOTE(review): presumably compensates a per-frame cost over the final bad
    # steps — confirm.
    if(bad_exp_cnt>=bad_cnt_lim):
      reward += 100 + bad_cnt_lim*time_step*0.1
    if(render):
      print("reward",reward,"ep_len",ep_len)
      env.close()
      show_video()
  
  return reward,ep_len


def test_model(agent, env, episodes=1):
  rewards = []
  ep_lens = []
  for i in range(0,episodes):
    rew,ep_len = simulate(agent,env)
    rewards.append(rew)
    ep_lens.append(ep_len)
    print("Test "+str(i+1)+"/"+str(episodes)+": reward =",rew," episode len =",ep_len)
  print("\nAverage Reward = ",sum(rewards)/len(rewards),"Average Ep_len = ",sum(ep_lens)/len(ep_lens),"\n")
  return rewards,ep_lens


class ExperienceReplay(object):
  """Fixed-capacity FIFO buffer of transitions for experience replay.

  One entry is ``[state, action, reward, next_state]``. Once ``length``
  entries are stored, appending a new one drops the oldest.
  """

  def __init__(self, length):
    """Create an empty buffer holding at most ``length`` entries."""
    self.experience_replay = deque(maxlen=length)

  def collect(self, experience):
    """Append one ``[state, action, reward, next_state]`` entry."""
    self.experience_replay.append(experience)

  def sample_from_experience(self, sample_size):
    """Draw a random batch of stored transitions without replacement.

    Args:
      sample_size: requested batch size; capped at the number of stored
        entries.

    Returns:
      ``(state, action, reward, next_state)`` as float32 tensors whose first
      dimension is the batch.
    """
    sample_size = min(sample_size, len(self.experience_replay))
    sample = random.sample(self.experience_replay, sample_size)

    # Convert the list of frames into one ndarray before building the tensor:
    # torch.tensor() on a Python list of ndarrays copies element by element
    # and is the slow path PyTorch warns about.
    state = torch.tensor(np.array([entry[0] for entry in sample])).float()
    action = torch.tensor([entry[1] for entry in sample]).float()
    reward = torch.tensor([entry[2] for entry in sample]).float()
    next_state = torch.tensor(np.array([entry[3] for entry in sample])).float()

    return state, action, reward, next_state



class DQN_Network(nn.Module):
  """Convolutional Q-network producing 3 action values per processed frame.

  The input frame is split into the top 84 rows (fed to the CNN) and a speed
  scalar read from rows 84..95, column 13; the speed is concatenated to the
  flattened CNN features before the fully connected layers.

  ``gamma`` and ``lr`` are only stored as attributes so that training code
  can read them from the network.
  """

  def __init__(self,gamma = None,lr = None):
    super().__init__()
    # Attribute names and creation order are kept unchanged so that saved
    # state_dicts still load and seeded initialisation is unaffected.
    self.LeakyReLU = nn.LeakyReLU()
    self.conv1 = nn.Conv2d(1,8,kernel_size = 7, stride = 4,padding = 0)
    self.conv2 = nn.Conv2d(8,16,kernel_size = 3, stride = 1,padding = 2)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    # For an 84x96 input the conv/pool stack yields 16 x 6 x 6 = 576 features;
    # +1 for the speed scalar appended in forward().
    self.fc1 = nn.Linear(577,256)
    self.fc2 = nn.Linear(256,50)
    self.fc3 = nn.Linear(50,3)
    self.batchnormCNN1 = nn.BatchNorm2d(num_features = 8)
    self.batchnormCNN2 = nn.BatchNorm2d(num_features = 16)
    self.batchnormFC1 = nn.BatchNorm1d(num_features = 256)
    self.flatten = nn.Flatten()
    self.gamma = gamma
    self.lr = lr

  def forward(self,x):
    """Compute Q-values.

    Args:
      x: a single frame of shape (96, 96), a batch of shape (BS, 96, 96), or
        an already channel-expanded (BS, 1, 96, 96) input; numpy array or
        torch tensor.

    Returns:
      Tensor of shape (BS, 3) with one Q-value per discrete action.
    """
    # Tensors are used directly; converting them through numpy would fail for
    # tensors that require grad or do not live on the CPU.
    if isinstance(x, torch.Tensor):
      x = x.float()
    else:
      x = torch.from_numpy(np.ascontiguousarray(x)).float()

    # reformat image (input = BS,96,96, or 96,96) (output = BS,1,96,96)
    if(x.dim()==2):
      x = torch.unsqueeze(x,dim=0)
      x = torch.unsqueeze(x,dim=0)
    elif(x.dim()==3):
      x = torch.unsqueeze(x,dim=1)

    # Same speed formula as get_speed(), evaluated per batch item -> (BS, 1)
    subimage = (x[:,:,84:96,13:14]-0.495)*10
    speed = torch.sum(subimage,dim=(2,3))
    # Only the top 84 rows go through the convolutional stack.
    x = x[:,:,:84,:]

    x = self.batchnormCNN1(self.LeakyReLU(self.conv1(x)))
    x = self.pool(x)
    x = self.batchnormCNN2(self.LeakyReLU(self.conv2(x)))
    x = self.pool(x)
    x = self.flatten(x)
    x = torch.cat((x,speed),dim=1)
    x = self.batchnormFC1(self.LeakyReLU(self.fc1(x)))
    x = self.LeakyReLU(self.fc2(x))
    x = self.fc3(x)
    return x

  def get_action(self,state):
    """Return the index of the highest Q-value for each item, shape (BS,)."""
    qvals = self.forward(state)
    return torch.argmax(qvals,1)

  def convert_action(self,action,state):
    """Translate a discrete action index into an environment action list.

    Args:
      action: one-element tensor holding 0 (left), 1 (straight) or 2 (right).
      state: processed frame, used to read the current speed.

    Returns:
      ``[turning, engine, braking]`` strengths; braking is always 0.

    Raises:
      ValueError: if ``action`` is not 0, 1 or 2 (previously ``None`` was
        returned silently, which only failed later inside ``env.step``).
    """
    # determine if you are going too fast: no throttle above a speed of 2.5
    # (the original had separate >3.5 and >2.5 branches, both giving 0)
    speed = get_speed(state).item()
    accel = 0 if speed > 2.5 else 0.1

    # Discretized action space (left-forward,straight-forward,right-forward)
    steering_by_index = {0: -0.3, 1: 0, 2: 0.3}
    index = action.item()
    if index not in steering_by_index:
      raise ValueError("unknown action index: " + str(index))
    return [steering_by_index[index], accel, 0]

      


def update(agent,optimizer,loss_func,target_agent,memory,batch_size):
  """Perform one gradient step of Q-learning on a sampled replay batch.

  The regression target is ``reward + agent.gamma * max_a Q_target(next_state, a)``.
  Every sampled transition bootstraps from its next state (replay entries
  carry no terminal flag). ``agent`` is switched to train mode and
  ``target_agent`` to eval mode; gradients are cleared after the step.
  """
  agent.train()
  target_agent.eval()

  states, actions, rewards, next_states = memory.sample_from_experience(batch_size)

  # Q-value of the action actually taken in each sampled state
  q_all = agent(states)
  batch_rows = torch.arange(q_all.size(0))
  q_taken = q_all[batch_rows, actions.long()]

  # Greedy value of the next state according to the frozen target network
  with torch.no_grad():
    best_next_q = torch.max(target_agent(next_states), dim=1)[0]

  td_target = rewards + agent.gamma*best_next_q
  loss = loss_func(td_target, q_taken)
  loss.backward(retain_graph = False)
  optimizer.step()
  optimizer.zero_grad()



def load_list(filename):
  """Unpickle and return the object stored in ``filename``.

  NOTE: ``pickle.load`` can execute arbitrary code; only use this on files
  written by this notebook.
  """
  with open(filename, 'rb') as source:
    return pickle.load(source)

def save_list(l,filename):
  """Pickle ``l`` into ``filename`` as a binary stream, overwriting the file."""
  with open(filename, 'wb') as sink:
    pickle.dump(l, sink)

### Training

In [None]:
def load_memory(new,epsilon,exp_replay_size,initial_size=None):
  """Create a replay buffer and pre-fill it by simulating episodes.

  Args:
    new: when False, the acting network is initialised from
      "car-racing-dqn.pth"; when True a freshly initialised network is used.
    epsilon: exploration probability passed to ``simulate``.
    exp_replay_size: capacity of the replay buffer; also the maximum number
      of episodes simulated while filling it.
    initial_size: stop as soon as the buffer holds this many entries.
      Defaults to (and is capped at) ``exp_replay_size``.

  Returns:
    The filled ``ExperienceReplay`` instance.
  """
  if initial_size is None:
    initial_size = exp_replay_size
  # The buffer never holds more than exp_replay_size entries, so a larger
  # target could never be reached and every loop iteration would run.
  initial_size = min(initial_size, exp_replay_size)

  # Create the model
  env = gym.make("CarRacing-v0").unwrapped
  agent = DQN_Network()
  if(not new):
    # NOTE: torch.load unpickles the file; only load trusted checkpoints.
    agent.load_state_dict(torch.load("car-racing-dqn.pth"))
  memory = ExperienceReplay(exp_replay_size)

  # initialize experience replay
  try:
    for _ in range(exp_replay_size):
      # simulate() resets the environment itself, so no reset is needed here.
      simulate(agent,env,epsilon = epsilon, memory = memory)
      if(len(memory.experience_replay)>=initial_size):
        break
      print(len(memory.experience_replay))
  finally:
    # Release the environment even if a simulation step fails.
    env.close()

  return memory




def train(new,epochs,lr,epsilon,gamma,ep_per_epoch,memory):
  """Train the DQN agent and checkpoint it after every epoch.

  Per episode: one epsilon-greedy simulation adds transitions to ``memory``,
  then 30 mini-batch updates (batch size 32) are applied and the target
  network is synchronised with the agent. Per epoch: the agent is evaluated
  for 10 greedy episodes and the model plus history lists are written to
  "car-racing-dqn.pth", "reward_hist.data", "ep_len_hist.data",
  "epsilon_hist.data" and "lr_hist.data".

  Args:
    new: True starts from a fresh network and empty histories; False resumes
      from the files listed above.
    epochs: number of epochs.
    lr: learning rate of the SGD optimizer.
    epsilon: exploration probability used while collecting experience.
    gamma: discount factor.
    ep_per_epoch: training episodes per epoch.
    memory: replay buffer to sample from and append to.
  """
  # set hyperparameters
  agent = DQN_Network(lr=lr, gamma=gamma)

  if(new):
    # start new run
    reward_hist = []
    ep_len_hist = []
    lr_hist = []
    epsilon_hist = []
  else:
    # load previous runs
    # NOTE: torch.load / pickle unpickle these files; only use trusted ones.
    agent.load_state_dict(torch.load("car-racing-dqn.pth"))
    reward_hist = load_list("reward_hist.data")
    ep_len_hist = load_list("ep_len_hist.data")
    epsilon_hist = load_list("epsilon_hist.data")
    lr_hist = load_list("lr_hist.data")

  # initialize models
  # Keyword arguments: the constructor order is (gamma, lr), so the previous
  # positional call DQN_Network(agent.lr, agent.gamma) swapped the two values.
  target_agent = DQN_Network(lr=agent.lr, gamma=agent.gamma)
  target_agent.load_state_dict(agent.state_dict())
  env = gym.make("CarRacing-v0").unwrapped
  optimizer = torch.optim.SGD(agent.parameters(),agent.lr)
  MSELoss = torch.nn.MSELoss()

  # training loop
  try:
    for epoch in range(epochs):
      for j in tqdm(range(ep_per_epoch)):
        # simulate() resets the environment itself; its return values are not
        # needed during training.
        simulate(agent,env,epsilon = epsilon, memory = memory)

        for i in range(0,30):
          update(agent,optimizer,MSELoss,target_agent,memory,batch_size=32)
        target_agent.load_state_dict(agent.state_dict())
      gc.collect(generation=2)

      # save results
      rewards, ep_lens = test_model(agent,env,episodes=10)
      reward_hist.append(rewards)
      ep_len_hist.append(ep_lens)
      epsilon_hist.append(epsilon)
      lr_hist.append(agent.lr)
      torch.save(agent.state_dict(),"car-racing-dqn.pth")
      save_list(reward_hist,"reward_hist.data")
      save_list(ep_len_hist,"ep_len_hist.data")
      save_list(epsilon_hist,"epsilon_hist.data")
      save_list(lr_hist,"lr_hist.data")
  finally:
    # Release the environment even if training is interrupted.
    env.close()

In [None]:
# Reuse the replay memory left over from an earlier run of this cell when it
# exists; otherwise collect it first so the cell also works on a fresh kernel
# (previously `memory` was undefined after Restart & Run All).
if "memory" not in globals():
  memory = load_memory(new=False,epsilon=0.1,exp_replay_size=2000)
train(new=False,epochs=1,lr=0.000007,epsilon=0.1,gamma=0.878,ep_per_epoch=30,memory=memory)

In [None]:
# Load the saved checkpoint into a fresh network and run one episode with
# rendering enabled; epsilon keeps its default of 0 in simulate(), so actions
# are always the agent's greedy choice. simulate() prints the result, closes
# the env and shows the recorded video when render=True.
env = gym.make("CarRacing-v0").unwrapped
agent = DQN_Network()
agent.load_state_dict(torch.load("car-racing-dqn.pth"))

# The returned (reward, ep_len) tuple is displayed as the cell output.
simulate(agent=agent,env=env,render=True)
#test_model(agent,env,episodes=10)



Track generation: 1111..1393 -> 282-tiles track
reward 861.2064056939342 ep_len 1220


(861.2064056939342, 1220)

In [None]:
# Print the stored training history, one line per saved epoch.
# lr and epsilon entries are printed as stored; episode lengths and rewards
# are lists of per-test-episode values and are reduced to their mean.
for hist_name, summarize in (
    ("lr_hist", None),
    ("epsilon_hist", None),
    ("ep_len_hist", st.mean),
    ("reward_hist", st.mean),
):
  # Every header except the first is preceded by a blank line.
  print(hist_name if hist_name == "lr_hist" else "\n" + hist_name)
  hist = load_list(hist_name + ".data")
  for stage in hist:
    print(stage if summarize is None else summarize(stage))