<a href="https://colab.research.google.com/github/ApoorvaNagarajan/Image-Filter-Kernels/blob/master/p2s10/end_game.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installs the legacy PyTorch 0.3.1 release; the cells below use torch.autograd.Variable with the `volatile` flag.
# NOTE(review): torchvision is left unpinned and pip reports a version conflict with torch 0.3.1 (see the cell output).
!pip install -q torch==0.3.1 torchvision

[K     |████████████████████████████████| 496.4MB 35kB/s 
[31mERROR: torchvision 0.6.0+cu101 has requirement torch==1.5.0, but you'll have torch 0.3.1 which is incompatible.[0m
[31mERROR: fastai 1.0.61 has requirement torch>=1.0.0, but you'll have torch 0.3.1 which is incompatible.[0m
[?25h

In [0]:
# Global seeding: every random number generator used below is seeded from one value

# Single seed shared by Python, NumPy and PyTorch
seed=512

import random
import torch
import numpy as np
import os

# Seed each library's generator separately; none of them picks up the others' seed
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it here
# presumably only affects subprocesses - confirm whether this line has any effect
os.environ['PYTHONHASHSEED'] = str(seed)
# Ask cuDNN to use deterministic algorithms
torch.backends.cudnn.deterministic = True

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.autograd as autograd
from torch.autograd import Variable
from torch.utils.data import Dataset
from PIL import Image as PILImage
import PIL
import math
from PIL import Image, ImageDraw
from matplotlib import pyplot as plt 
import cv2
from cv2 import VideoWriter, VideoWriter_fourcc, imread, resize

from IPython.display import clear_output

In [0]:
class ReplayBuffer(object):
  """Fixed-capacity store of transitions used for off-policy training.

  Each transition is the 7-tuple
  ``(state_X1, state_X2, next_state_X1, next_state_X2, action, reward, done)``.
  Once ``max_size`` transitions are held, each new one overwrites the oldest
  remaining entry (ring buffer).
  """

  def __init__(self, max_size=1e6):
    """Create an empty buffer.

    Args:
      max_size: maximum number of transitions kept. Floats such as ``1e6``
        are accepted and truncated to an integer.
    """
    self.storage = []
    # Keep capacity (and therefore the write pointer) integral even when a float is passed
    self.max_size = int(max_size)
    self.ptr = 0

  def add(self, transition):
    """Append ``transition``; once full, overwrite the slot at the write pointer."""
    if len(self.storage) >= self.max_size:
      self.storage[self.ptr] = transition
      self.ptr = (self.ptr + 1) % self.max_size
    else:
      self.storage.append(transition)

  def sample(self, batch_size):
    """Draw ``batch_size`` transitions uniformly at random, with replacement.

    Returns:
      Seven NumPy arrays, in the order
      ``(X1, X2, next_X1, next_X2, actions, rewards, dones)``. Rewards and
      dones are reshaped to column vectors of shape ``(batch_size, 1)``.

    Raises:
      ValueError: if the buffer is empty (raised by ``np.random.randint``).
    """
    ind = np.random.randint(0, len(self.storage), size=batch_size)
    bs_X1, bs_X2, next_bs_X1, next_bs_X2, batch_actions, batch_rewards, batch_dones = [], [], [], [], [], [], []
    for i in ind:
      state_X1, state_X2, next_state_X1, next_state_X2, action, reward, done = self.storage[i]
      # np.asarray avoids a copy when possible and, unlike np.array(..., copy=False),
      # does not raise on NumPy >= 2 when a copy is unavoidable (lists, Python scalars)
      bs_X1.append(np.asarray(state_X1))
      bs_X2.append(np.asarray(state_X2))
      next_bs_X1.append(np.asarray(next_state_X1))
      next_bs_X2.append(np.asarray(next_state_X2))
      batch_actions.append(np.asarray(action))
      batch_rewards.append(np.asarray(reward))
      batch_dones.append(np.asarray(done))
    return (np.array(bs_X1), np.array(bs_X2), np.array(next_bs_X1), np.array(next_bs_X2),
            np.array(batch_actions), np.array(batch_rewards).reshape(-1, 1), np.array(batch_dones).reshape(-1, 1))


In [0]:
class Actor(nn.Module):
  """Policy network mapping (image, feature vector) to a bounded action.

  A four-layer convolutional stack followed by global average pooling reduces
  the single-channel image ``x1`` to 10 features. These are concatenated with
  the ``state_dim`` scalar features in ``x2`` and passed through three fully
  connected layers. The output is squashed with ``tanh`` and scaled, so every
  action component lies in ``[-max_action, max_action]``.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """
    Args:
      state_dim: number of scalar features in ``x2``.
      action_dim: number of action components produced.
      max_action: absolute bound applied to each action component.
    """
    super(Actor, self).__init__()
    self.conv1 = nn.Conv2d(1, 16, 3, stride=1, padding=(1,1))
    self.conv2 = nn.Conv2d(16, 32, 3, stride=1, padding=(1,1))
    self.conv3 = nn.Conv2d(32, 16, 3, stride=2, padding=(1,1))
    self.conv4 = nn.Conv2d(16, 10, 3, stride=2, padding=(1,1))
    self.fc1 = nn.Linear(state_dim + 10, 400)
    self.fc2 = nn.Linear(400, 300)
    self.fc3 = nn.Linear(300, action_dim)
    self.max_action = max_action

  def forward(self, x1, x2):
    """Return the action for image batch ``x1`` (N, 1, H, W) and feature batch ``x2`` (N, state_dim)."""
    # Follow the module's own placement: move inputs to the GPU only when the weights
    # live there, so the network also runs on CPU (e.g. after ``.cpu()``)
    if self.conv1.weight.is_cuda:
      x1 = x1.cuda()
      x2 = x2.cuda()
    # Shape comments assume a 32x32 input image
    h = F.relu(self.conv1(x1))              # 32x32x16
    h = F.relu(self.conv2(h))               # 32x32x32
    h = F.relu(self.conv3(h))               # 16x16x16
    h = F.relu(self.conv4(h))               # 8x8x10
    h = F.avg_pool2d(h, h.size()[2:])       # global average pool -> 10 features
    h = h.view(-1, 10)
    h = torch.cat([h, x2], dim=1)
    h = F.relu(self.fc1(h))
    h = F.relu(self.fc2(h))
    h = self.max_action * torch.tanh(self.fc3(h))
    return h

class Critic(nn.Module):
  """Pair of independent Q-networks sharing one module (twin critics).

  Each head has its own four-layer convolutional stack (global-average-pooled
  to 10 features) and its own three fully connected layers. The pooled image
  features are concatenated with the scalar features ``x2`` and the action
  ``u`` before the fully connected layers.
  """

  def __init__(self, state_dim, action_dim):
    """
    Args:
      state_dim: number of scalar features in ``x2``.
      action_dim: number of action components in ``u``.
    """
    super(Critic, self).__init__()
    # Defining the first Critic neural network
    self.conv1 = nn.Conv2d(1, 16, 3, stride=1, padding=(1,1))
    self.conv2 = nn.Conv2d(16, 32, 3, stride=1, padding=(1,1))
    self.conv3 = nn.Conv2d(32, 16, 3, stride=2, padding=(1,1))
    self.conv4 = nn.Conv2d(16, 10, 3, stride=2, padding=(1,1))
    self.fc1 = nn.Linear(state_dim + 10 + action_dim, 400)
    self.fc2 = nn.Linear(400, 300)
    self.fc3 = nn.Linear(300, 1)
    # Defining the second Critic neural network
    self.conv5 = nn.Conv2d(1, 16, 3, stride=1, padding=(1,1))
    self.conv6 = nn.Conv2d(16, 32, 3, stride=1, padding=(1,1))
    self.conv7 = nn.Conv2d(32, 16, 3, stride=2, padding=(1,1))
    self.conv8 = nn.Conv2d(16, 10, 3, stride=2, padding=(1,1))
    self.fc4 = nn.Linear(state_dim + 10 + action_dim, 400)
    self.fc5 = nn.Linear(400, 300)
    self.fc6 = nn.Linear(300, 1)

  def _to_module_device(self, x1, x2, u):
    """Move the inputs to the GPU only when this module's weights live there."""
    if self.conv1.weight.is_cuda:
      return x1.cuda(), x2.cuda(), u.cuda()
    return x1, x2, u

  def _q_head(self, convs, fcs, x1, x2, u):
    """Evaluate one Q-head: conv stack -> global average pool -> concat(x2, u) -> fully connected layers."""
    h = x1
    for conv in convs:
      h = F.relu(conv(h))
    h = F.avg_pool2d(h, h.size()[2:])       # global average pool -> 10 features
    h = h.view(-1, 10)
    h = torch.cat([h, x2, u], dim=1)
    hidden1, hidden2, out = fcs
    h = F.relu(hidden1(h))
    h = F.relu(hidden2(h))
    return out(h)

  def forward(self, x1, x2, u):
    """Return ``(Q1, Q2)``, each of shape (N, 1), for state ``(x1, x2)`` and action ``u``."""
    x1, x2, u = self._to_module_device(x1, x2, u)
    q1 = self._q_head((self.conv1, self.conv2, self.conv3, self.conv4),
                      (self.fc1, self.fc2, self.fc3), x1, x2, u)
    q2 = self._q_head((self.conv5, self.conv6, self.conv7, self.conv8),
                      (self.fc4, self.fc5, self.fc6), x1, x2, u)
    return q1, q2

  def Q1(self, x1, x2, u):
    """Return only the first head's Q-value (used for the actor's gradient)."""
    x1, x2, u = self._to_module_device(x1, x2, u)
    return self._q_head((self.conv1, self.conv2, self.conv3, self.conv4),
                        (self.fc1, self.fc2, self.fc3), x1, x2, u)

In [0]:
class TD3(object):
  """Twin-critic actor-critic agent with target networks, a replay buffer and episode bookkeeping.

  Besides the networks, the object tracks the running episode (reward,
  length, index) and triggers a training round whenever an episode ends.

  Reads two module-level names defined elsewhere in the notebook:
  ``start_timesteps`` (in ``select_action``) and ``min_episode_reward``
  (in ``add_replay_buff``). All networks are placed on the GPU.
  """

  def __init__(self, state_dim, action_dim, max_action):
    """
    Args:
      state_dim: number of scalar state features fed next to the image.
      action_dim: number of action components.
      max_action: absolute bound of each action component.
    """
    # Online/target pairs start with identical weights
    self.actor = Actor(state_dim, action_dim, max_action)
    self.actor.cuda()
    self.actor_target = Actor(state_dim, action_dim, max_action)
    self.actor_target.load_state_dict(self.actor.state_dict())
    self.actor_target.cuda()
    self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
    self.critic = Critic(state_dim, action_dim)
    self.critic.cuda()
    self.critic_target = Critic(state_dim, action_dim)
    self.critic_target.load_state_dict(self.critic.state_dict())
    self.critic_target.cuda()
    self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
    self.max_action = max_action
    self.replay_buffer = ReplayBuffer()
    self.total_timesteps = 0
    self.episode_reward = 0
    self.episode_num = 0
    self.episode_timesteps = 0

  def select_action(self, X1, X2):
    """Return a 1-D NumPy action for image ``X1`` and scalar features ``X2``.

    While fewer than ``start_timesteps`` episodes have finished, the action
    is drawn uniformly from [-1, 1); afterwards the actor network is used.
    """
    # NOTE: the warm-up threshold is compared against the episode counter, not a timestep counter
    if self.episode_num < start_timesteps:
      action = np.zeros(1)
      action[0] = np.random.uniform(-1, 1)
      return action
    X1 = torch.Tensor(X1.reshape(1, 1, 32, 32))
    X2 = torch.Tensor(np.asarray(X2).reshape(1, -1))
    return self.actor(Variable(X1, volatile = True), Variable(X2, volatile = True)).cpu().data.numpy().flatten()

  def train(self, iterations, batch_size=100, discount=0.99, tau=0.005, policy_noise=0.1, noise_clip=0.5, policy_freq=2):
    """Run ``iterations`` gradient steps on batches sampled from the replay buffer.

    Args:
      iterations: number of critic updates to perform.
      batch_size: transitions sampled per update.
      discount: discount factor gamma.
      tau: polyak averaging rate for the target networks.
      policy_noise: std of the Gaussian noise added to the target action.
      noise_clip: absolute clip applied to that noise.
      policy_freq: the actor and both target networks are updated once every
        ``policy_freq`` critic updates.

    Prints the mean critic loss over the round and the critic learning rate(s).
    """
    # Plain Python float: accumulating the loss Variable itself would keep every
    # iteration's autograd graph referenced until the end of the round
    avg_loss = 0.0

    for it in range(iterations):

      # Step 4: We sample a batch of transitions (s, s', a, r) from the memory
      bs_X1, bs_X2, next_bs_X1, next_bs_X2, batch_actions, batch_rewards, batch_dones = self.replay_buffer.sample(batch_size)
      X1 = Variable(torch.Tensor(bs_X1), volatile = False)
      X2 = Variable(torch.Tensor(bs_X2), volatile = False)
      next_X1 = Variable(torch.Tensor(next_bs_X1), volatile = False)
      next_X2 = Variable(torch.Tensor(next_bs_X2), volatile = False)
      action = Variable(torch.Tensor(batch_actions), volatile = False)
      reward = Variable(torch.Tensor(batch_rewards), volatile = True)
      done = Variable(torch.Tensor(batch_dones), volatile = True)

      # Step 5: From the next state s', the Actor target plays the next action a'
      next_action = self.actor_target(next_X1, next_X2)

      # Step 6: We add Gaussian noise to this next action a' and we clamp it in a range of values supported by the environment
      noise = Variable(torch.Tensor(batch_actions), volatile = True).data.normal_(0, policy_noise)
      noise = Variable(noise.clamp(-noise_clip, noise_clip), volatile = True)
      next_action = (next_action + noise.cuda()).clamp(-self.max_action, self.max_action)

      # Step 7: The two Critic targets take each the couple (s', a') as input and return two Q-values Qt1(s',a') and Qt2(s',a') as outputs
      target_Q1, target_Q2 = self.critic_target(next_X1, next_X2, next_action)

      # Step 8: We keep the minimum of these two Q-values: min(Qt1, Qt2)
      target_Q = torch.min(target_Q1, target_Q2).cuda()

      # Step 9: We get the final target of the two Critic models, which is: Qt = r + gamma * min(Qt1, Qt2), where gamma is the discount factor
      target_Q = reward.cuda() + ((1 - done.cuda()) * discount * target_Q).detach()

      # Step 10: The two Critic models take each the couple (s, a) as input and return two Q-values Q1(s,a) and Q2(s,a) as outputs
      current_Q1, current_Q2 = self.critic(X1, X2, action)

      # Step 11: We compute the loss coming from the two Critic models: Critic Loss = MSE_Loss(Q1(s,a), Qt) + MSE_Loss(Q2(s,a), Qt)
      critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q)

      # Log a detached scalar only (``.data.sum()`` yields a number on old and new PyTorch alike)
      avg_loss += float(critic_loss.data.sum())

      # Step 12: We backpropagate this Critic loss and update the parameters of the two Critic models with the Adam optimizer
      self.critic_optimizer.zero_grad()
      critic_loss.backward()
      self.critic_optimizer.step()

      # Step 13: Once every policy_freq iterations, we update our Actor model by performing gradient ascent on the output of the first Critic model
      if it % policy_freq == 0:
        actor_loss = -self.critic.Q1(X1, X2, self.actor(X1, X2)).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Step 14: On the same schedule, we update the weights of the Actor target by polyak averaging
        for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

        # Step 15: On the same schedule, we update the weights of the Critic target by polyak averaging
        for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
          target_param.data.copy_(tau * param.data + (1 - tau) * target_param.data)

    # max(..., 1) keeps the report well-defined if a round with zero iterations is requested
    print("AVG LOSSSSSSSSSSSSSSSSSS ", avg_loss/max(iterations, 1))
    for param_group in self.critic_optimizer.param_groups:
      print(param_group['lr'])

  def add_replay_buff(self, X1, X2, new_X1, new_X2, action, reward, done_flag):
    """Record one transition and, if the episode ended, train and reset the episode counters.

    The episode is also ended here when the accumulated episode reward drops
    below the module-level ``min_episode_reward``.

    Returns:
      The (possibly updated) ``done_flag``: 1 if the episode ended on this step.
    """
    self.episode_reward += reward
    # if reward is lesser than min reward, end the episode
    if self.episode_reward < min_episode_reward:
      done_flag = 1
    self.replay_buffer.add((X1, X2, new_X1, new_X2, action, reward, done_flag))
    self.total_timesteps += 1
    self.episode_timesteps += 1
    # If episode is done, train the model for as many iterations as the episode had steps
    if done_flag == 1:
      print(self.episode_num, " : EPISODE REWARD ", self.episode_reward, " timesteps ", self.total_timesteps)
      self.train(self.episode_timesteps)
      self.episode_reward = 0
      self.episode_num += 1
      self.episode_timesteps = 0
    return done_flag

  # Making a save method to save a trained model
  def save(self):
    """Write the actor and critic weights to the fixed Google Drive paths below."""
    torch.save(self.actor.state_dict(), '/content/drive/My Drive/models/last_actor.pth')
    torch.save(self.critic.state_dict(), '/content/drive/My Drive/models/last_critic.pth')

  # Making a load method to load a pre-trained model
  def load(self, filename, directory):
    """Load ``last_actor.pth`` / ``last_critic.pth`` from the working directory when present.

    Args:
      filename: accepted for interface compatibility; currently ignored.
      directory: accepted for interface compatibility; currently ignored.

    NOTE(review): ``save`` writes under the Google Drive folder whereas this
    method reads from the current working directory - confirm which location
    is intended.
    """
    if os.path.isfile('last_actor.pth'):
      self.actor.load_state_dict(torch.load('last_actor.pth'))
      # Keep the target in step with the restored weights, as at construction
      self.actor_target.load_state_dict(self.actor.state_dict())
    if os.path.isfile('last_critic.pth'):
      self.critic.load_state_dict(torch.load('last_critic.pth' ))
      self.critic_target.load_state_dict(self.critic.state_dict())

In [0]:
def init():
    """(Re)initialise the module-level environment state.

    Sets the map dimensions, loads the sand mask from ``MASK1.png`` as a
    grayscale image scaled to [0, 1] into the global ``sand``, selects the
    first goal and clears the episode flags and counters.

    Raises:
        FileNotFoundError: if ``MASK1.png`` cannot be read.
    """
    global sand
    global img
    global goal_x
    global goal_y
    global first_update
    global map_width
    global map_height
    global swap
    global done_flag
    global total_timesteps

    map_width = 1429
    map_height = 660

    # Read the mask image (flag 0 = grayscale)
    img = cv2.imread("MASK1.png",0)
    # cv2.imread signals failure by returning None rather than raising
    if img is None:
        raise FileNotFoundError("MASK1.png could not be read from the working directory")
    sand = img/255

    goal_x = 1197
    goal_y = 512
    first_update = False
    swap = 0
    done_flag = 0
    total_timesteps = 0

In [0]:
class Car(object):
    """Kinematic state of the car: heading, last steering input, velocity and position."""

    def __init__(self):
        # Everything starts at the origin, at rest, with zero heading
        self.angle = self.rotation = 0.0
        self.velocity_x = self.velocity_y = 0.0
        self.pos_x = self.pos_y = 0.0

    def move(self, rotation):
        """Advance the position by the current velocity, then turn the heading by ``rotation``."""
        self.pos_x += self.velocity_x
        self.pos_y += self.velocity_y
        self.rotation = float(rotation)
        self.angle += self.rotation

    def reset(self):
        """Re-place the car at a random position at least 80 units inside the map edges.

        Reads the module-level ``map_width`` and ``map_height``; heading and
        velocity are left unchanged.
        """
        print("RESETTING")
        margin = 80
        # x is drawn before y so the random stream is consumed in a fixed order
        self.pos_x = np.random.randint(margin, map_width-margin, size=1)[0]
        self.pos_y = np.random.randint(margin, map_height-margin, size=1)[0]
        print("pos_x ", self.pos_x, "pos_y ", self.pos_y)

In [0]:
def rotate(vector_x, vector_y, angle):
  """Rotate the 2-D vector ``(vector_x, vector_y)`` counter-clockwise by ``angle`` degrees.

  Returns:
    The rotated ``(x, y)`` pair.
  """
  theta = math.radians(angle)
  cos_t = math.cos(theta)
  sin_t = math.sin(theta)
  rotated_x = (vector_x * cos_t) - (vector_y * sin_t)
  rotated_y = (vector_y * cos_t) + (vector_x * sin_t)
  return rotated_x, rotated_y

In [0]:
def get_angle(vector1_x, vector1_y, vector2_x, vector2_y):
  """Return the signed angle in degrees between vector 1 and vector 2.

  The sign is negated relative to ``atan2(cross, dot)``: a counter-clockwise
  turn from vector 1 to vector 2 yields a negative angle.
  """
  cross = vector1_x * vector2_y - vector1_y * vector2_x
  dot = vector1_x * vector2_x + vector1_y * vector2_y
  degrees_per_radian = 180 / math.pi
  return -degrees_per_radian * math.atan2(cross, dot)

In [0]:
class Game(object):
    """Driving environment: owns the car, renders its observation and advances the simulation.

    Relies on module-level state defined elsewhere in the notebook: ``sand``,
    ``map_width``, ``map_height``, ``goal_x``, ``goal_y``, ``swap``,
    ``done_flag``, ``total_timesteps`` (set by ``init()``) as well as
    ``crop_size``, ``border_size``, ``brain``, ``last_distance`` and
    ``last_reward``.
    """

    def __init__(self):
        self.car = Car()
        self.goal_x = 0
        self.goal_y = 0
        init()
        # Latest observation image; reused as the "current state" of the next step
        self.surr = self.get_surroundings()

    def serve_car(self):
        """Place the car at the map centre, moving along +x."""
        self.car.velocity_x = 6
        self.car.velocity_y = 0
        self.car.pos_x = map_width/2
        self.car.pos_y = map_height/2

    def get_surroundings(self):
        """Render the observation: the map around the car with the car drawn on top.

        A ``2*crop_size`` square window centred on the car is cut from
        ``sand``; parts of the window outside the map are filled with sand
        (1.0). The car outline is drawn at the window centre with value 0.5,
        and the result is resized to 32x32.

        Returns:
            Array of shape ``(1, 32, 32)``.
        """
        window = 2*crop_size
        # Image rows run top-down while pos_y runs bottom-up, hence the flip
        row_start = map_height-1-int(self.car.pos_y)-crop_size
        col_start = int(self.car.pos_x)-crop_size
        row_end = row_start + window
        col_end = col_start + window

        # Amount by which the window overshoots the map on each side.
        # A negative start index must be clamped before slicing: NumPy would
        # otherwise interpret it as counting from the end and return a wrong (empty) crop.
        top = max(0, -row_start)
        left = max(0, -col_start)
        bottom = max(0, row_end - sand.shape[0])
        right = max(0, col_end - sand.shape[1])

        crop_img = sand[max(row_start, 0):row_end, max(col_start, 0):col_end].copy()

        # if at frame boundary, pad the cropped image with sand (1's)
        if top or bottom or left or right:
            crop_img = cv2.copyMakeBorder(crop_img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=1 )

        # Car outline (body plus a protruding marker) in car coordinates, rotated to the current heading
        outline = ((0, 10), (10, 10), (30, 0), (10, -10), (7, -10), (7, -30), (3, -30), (3, -10), (0, -10))
        triangle_cnt = np.array([rotate(px, py, -self.car.angle) for px, py in outline])
        # Shift the outline so the car sits at the centre of the window
        triangle_cnt += crop_size
        ctr = triangle_cnt.reshape((-1,9,2)).astype(np.int32)
        cv2.fillPoly(crop_img, pts =ctr, color=0.5)

        rsz_img = cv2.resize(crop_img, (32,32), interpolation = cv2.INTER_AREA)
        rsz_img = rsz_img.reshape(1, 32, 32)

        return rsz_img

    def update(self):
        """Advance the simulation by one step.

        Builds the current state, asks ``brain`` for a steering action, moves
        the car, computes the reward and next state, stores the transition via
        ``brain.add_replay_buff`` and resets the car when the episode ended.
        """
        global last_reward
        global last_distance
        global goal_x
        global goal_y
        global swap
        global done_flag
        global total_timesteps

        xx = goal_x - self.car.pos_x
        yy = goal_y - self.car.pos_y
        distance = np.sqrt((self.car.pos_x - goal_x)**2 + (self.car.pos_y - goal_y)**2)
        # Angle between the velocity and the direction to the goal, scaled to [-1, 1]
        orientation = get_angle(self.car.velocity_x, self.car.velocity_y, xx, yy)/180.

        # states :
        # X1: 32x32 cropped image with car overlay
        # X2: orientation, -orientation, distance to the goal divided by 1574
        #     (1574 ~ sqrt(1429^2 + 660^2), the diagonal of the map set up in init())
        X1 = self.surr
        X2 = [orientation, -orientation, distance/1574]

        # actions:
        # angle theta of rotation
        action = brain.select_action(X1, X2)

        if(1.0 != action[0]):
          print(action[0])

        self.car.move(action[0])
        on_road = 0

        # Leaving the map: clamp to the border, penalise and end the episode
        if self.car.pos_x < border_size:
            self.car.pos_x = border_size
            last_reward = -30
            print("LEFT BORDERRRRRRRRRRRRRRRRRRRRR")
            done_flag = 1
        if self.car.pos_x > map_width - border_size:
            self.car.pos_x = map_width - border_size
            last_reward = -30
            print("RIGHT BORDERRRRRRRRRRRRRRRRRRRRR")
            done_flag = 1
        if self.car.pos_y < border_size:
            self.car.pos_y = border_size
            last_reward = -30
            print("TOP BORDERRRRRRRRRRRRRRRRRRRRR")
            done_flag = 1
        if self.car.pos_y > map_height - border_size:
            self.car.pos_y = map_height - border_size
            last_reward = -30
            print("BOTTOM BORDERRRRRRRRRRRRRRRRRRRRR")
            done_flag = 1

        if(0 == done_flag):

            # velocity: half speed on sand, full speed on road
            if sand[map_height-1-int(self.car.pos_y), int(self.car.pos_x)] > 0:
                self.car.velocity_x, self.car.velocity_y = rotate(0.5, 0, self.car.angle)
                on_road = 0
            else: # otherwise
                self.car.velocity_x, self.car.velocity_y = rotate(1, 0, self.car.angle)
                on_road = 1

            new_xx = goal_x - self.car.pos_x
            new_yy = goal_y - self.car.pos_y
            new_orient = get_angle(self.car.velocity_x, self.car.velocity_y, new_xx, new_yy)/180.
            new_X1 = self.get_surroundings()
            distance = np.sqrt((self.car.pos_x - goal_x)**2 + (self.car.pos_y - goal_y)**2)
            new_X2 = [new_orient, -new_orient, distance/1574]
            self.surr = new_X1

            # Rewards: best when on the road and getting closer to the goal.
            # NOTE(review): when distance == last_distance no branch matches and the
            # previous step's last_reward is reused - confirm this is intended.
            if((on_road == 1) and (distance < last_distance)):
                last_reward = 1
            elif((on_road == 0) and (distance < last_distance)):
                last_reward = -15
            elif((on_road == 1) and (distance > last_distance)):
                last_reward = -10
            elif((on_road == 0) and (distance > last_distance)):
                last_reward = -25
        else:

            # Episode ended at a border: the next state is recorded as the current one
            distance = np.sqrt((self.car.pos_x - goal_x)**2 + (self.car.pos_y - goal_y)**2)
            new_X1 = X1
            new_X2 = X2

        # Goal reached: end the episode and alternate between the two goal locations
        if distance < 25:
            print("GOALLLLLLLLLLLLLLLLLLLLLLLLLLLLLLLL REACHEDDDDDDDDDDDDDDDDDD")
            if swap == 1:
                goal_x = 1197
                goal_y = 512
                swap = 0
            else:
                goal_x = 361
                goal_y = 311
                swap = 1
            done_flag = 1

        last_distance = distance

        done_flag = brain.add_replay_buff(X1, X2, new_X1, new_X2, action, last_reward, done_flag)

        if(done_flag == 1):
            self.car.reset()
            self.surr = self.get_surroundings()
            done_flag = 0

        total_timesteps += 1

In [0]:
# --- Hyperparameters -------------------------------------------------------
# NOTE: TD3.add_replay_buff starts training with `self.train(self.episode_timesteps)`,
# so TD3.train's own keyword defaults apply; batch_size, discount, tau, policy_noise,
# noise_clip and policy_freq below are not passed to it.

# Warm-up length: TD3.select_action picks random actions while its episode counter
# is below this value, and uses the policy network afterwards (previously 1e4)
start_timesteps = 50

# How often the evaluation step is performed (after how many timesteps)
eval_freq = 5e3

# Total number of iterations/timesteps
max_timesteps = 5e5

# Boolean checker whether or not to save the pre-trained model
save_models = True

# Exploration noise - STD value of exploration Gaussian noise
expl_noise = 0.1

# Size of the batch
batch_size = 100

# Discount factor gamma, used in the calculation of the total discounted reward
discount = 0.99

# Target network update rate
tau = 0.005

# STD of Gaussian noise added to the actions for the exploration purposes
policy_noise = 0.2

# Maximum value of the Gaussian noise added to the actions (policy)
noise_clip = 0.5

# Number of iterations to wait before the policy network (Actor model) is updated
policy_freq = 2

# An episode is ended early once its accumulated reward falls below this value
min_episode_reward = -100000

In [0]:
# Module-level state read by Game and TD3
first_update = True
# Half-width, in map pixels, of the square window cropped around the car
crop_size = 80
# Distance from the map edge at which the car is clamped and the episode ends
border_size = 5
# Distance to the goal at the previous step (used for the reward)
last_distance = 0
# Agent: 3 scalar state features, 1 action component, actions bounded to [-5, 5]
brain = TD3(state_dim=3, action_dim=1, max_action=5)

In [0]:
# Mount Google Drive (Colab only); TD3.save() writes its checkpoints under /content/drive/My Drive/models/
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [18]:

# Build the environment and place the car at the map centre
init()
game = Game()
game.serve_car()

# Step the simulation; training happens inside game.update() whenever an episode ends
max_num_timesteps = 500000
timesteps = 0
while timesteps < max_num_timesteps:
  game.update()
  timesteps += 1




[1;30;43mStreaming output truncated to the last 5000 lines.[0m
0.9102697
0.55630535
-1.3718623
-1.0028901
-1.0815449
-0.8802345
0.8600836
-0.20496882
0.12394644
0.08298287
-2.6460564
-0.3029983
-1.5042245
-2.3149304
-0.7260592
0.25805238
-1.4186108
-3.9320283
-3.4196372
-2.5718474
-1.7213078
-1.2094297
-0.57815427
-0.7673476
1.6459296
0.1889559
1.7405257
1.4459982
0.8332413
2.5074296
1.9978666
3.144464
3.1975536
2.3032794
1.2997879
1.4445784
-2.1255817
-2.1668606
-1.205157
-1.5278589
-2.3649344
-1.3106153
0.49233067
0.38583967
2.0899255
2.9894934
4.492484
4.3663077
4.187146
2.2086704
1.7302408
-2.2287166
1.3896666
3.0377345
0.37080002
-2.0220313
-1.2055112
0.75543976
-1.4186022
0.90345854
0.5260437
2.654211
2.743204
-2.9426944
3.6390424
3.9280508
3.3369243
3.1990275
-1.5381955
-4.9414225
-4.969054
-4.838917
-4.69837
-3.8765275
-2.125892
-4.607982
-1.9371957
0.26419118
-2.6245494
-3.4055228
-2.6880622
-0.21789113
1.897878
0.799695
0.31671023
0.15365483
-0.20288485
0.41290504
0.1724866

KeyboardInterrupt: ignored

In [0]:
# Persist the current actor/critic weights to the Google Drive paths hard-coded in TD3.save()
brain.save()

In [0]:
# Move each network to the CPU, then write its weights to the working directory
for network, checkpoint_path in ((brain.actor, 'last_actor.pth'), (brain.critic, 'last_critic.pth')):
  network.cpu()
  torch.save(network.state_dict(), checkpoint_path)