In [1]:
import gym
from gym import spaces
import copy
import math
import random
import time
import numpy as np
from random import sample
from itertools import count
from IPython import display
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T
import matplotlib.pyplot as plt

import logging

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline


In [2]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    logging.info(f'Running on {torch.cuda.get_device_name(0)}')
else:
    device = torch.device('cpu')

In [3]:
class MyEnv(gym.Env):
    metadata = {'render.modes': ['human']}
    actions = {
        "idle" : 0,
        "left" : 1,
        "right" : 2,
        "up" : 3,
        "down" : 4
    }

    def __init__(self, size, num_obstacles, num_enemies):        
        super(MyEnv, self).__init__()
        self.size = size
        self.num_obstacles = num_obstacles
        self.num_enemies = num_enemies

        self.action_space = spaces.Discrete(5)
        self.hidden = 0
        self.passable = 1
        self.obstacle = 2        
        self.enemy = 3
        self.exit = 4
        self.agent = 5        

        self.grid = None
        self.mask = None
        self.fov_count = None
        self.agent_position = None
        self.previous_positions = set()


    def reset_mask(self):
        self.mask = np.zeros((self.size+2, self.size+2))

    def get_mask(self, reset = False):
        if self.mask is None or reset:
            self.reset_mask()
        x = np.arange(0, 14)
        y = np.arange(0, 14)
        agent_y, agent_x  = self.agent_position[0], self.agent_position[1]
        fov = (x[np.newaxis,:]-agent_x)**2 + (y[:,np.newaxis]-agent_y)**2 < 2**2
        self.mask[fov] = 1

        return self.mask  

    def get_fov_count(self):
        self.fov_count = np.unique(self.mask, return_counts=True)[1][0]

        return self.fov_count

    def reset(self):
        #generates grid
        self.grid = np.ones((self.size+2, self.size+2))*2
        self.grid[1:-1, 1:-1] = self.passable
        #generates obstacles
        for i in range(self.num_obstacles):
            self.grid[(random.randint(1,self.size), random.randint(1,self.size))] = self.obstacle
        #generates enemies
        for i in range(self.num_enemies):
            self.grid[(random.randint(1,self.size), random.randint(1,self.size))] = self.enemy
        #generates exit
        self.grid[(random.randint(1,self.size), random.randint(1,self.size))] = self.exit
        #generates agent position
        self.previous_positions = set()
        self.agent_position = (random.randint(1,self.size), random.randint(1,self.size))        
        self.grid[self.agent_position] = self.agent
        self.previous_positions.add(self.agent_position)
        
        self.mask = self.get_mask(reset = True)
        self.fov_count = self.get_fov_count()        

        return self.grid * self.mask

    def step(self, action):
        reward = 0
        done = False
        fov_count = self.fov_count
        
        agent_y, agent_x = self.agent_position[0], self.agent_position[1]               

        if action == self.actions["idle"]:
            reward -= 0.01
            pass
        elif action == self.actions["left"]:
            agent_y, agent_x = agent_y, agent_x-1            
        elif action == self.actions["right"]:
            agent_y, agent_x = agent_y, agent_x+1                     
        elif action == self.actions["up"]:
            agent_y, agent_x = agent_y-1, agent_x                       
        elif action == self.actions["down"]:
            agent_y, agent_x = agent_y+1, agent_x         

        agent_position_new = (agent_y, agent_x)

        if self.grid[agent_position_new] == self.obstacle:
            reward -= 0.02
            agent_position_new = self.agent_position
        elif self.grid[agent_position_new] == self.enemy:
            agent_position_new = self.agent_position            
            reward -= 1
            done = True
        elif self.grid[agent_position_new] == self.exit:
            reward += 1
            done = True
        elif self.grid[agent_position_new] == self.passable:
            reward += 0.01
            self.grid[self.agent_position] = self.passable
            self.grid[agent_position_new] = self.agent
        
        self.agent_position = agent_position_new        
        if self.agent_position in self.previous_positions:
            reward -= 0.03

        self.previous_positions.add(self.agent_position)
        self.mask = self.get_mask() 
        self.fov_count = self.get_fov_count()
        reward += (0.01 * (fov_count - self.fov_count))

        return self.grid * self.mask, reward, done, {}   

    def render(self, mode='human'):
        return self.grid * self.mask


In [4]:
env = MyEnv(size=12, num_obstacles=16, num_enemies=2)
env.reset()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 2., 5., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [359]:
from time import sleep

sample_state = env.reset()
sample_img = plt.imshow(env.render(mode='rgb_array'))
for j in range(200):    
    sleep(5)
    sample_action = env.action_space.sample()    
    state, reward, done, _ = env.step(sample_action)
    print(f"Sample action: {sample_action}, {list(env.actions.keys())[list(env.actions.values()).index(sample_action)]}, Reward: {reward}, Done: {done}", end = "")
    sample_img.set_data(env.render(mode='rbg_array'))
    plt.axis('off')
    display.display(plt.gcf())
    display.clear_output(wait=True)
    if done:
        break

KeyboardInterrupt: 

In [5]:
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, seed):
        super(QNetwork,self).__init__()
        self.seed = torch.manual_seed(seed)

        self.conv1 = nn.Conv2d(1, 32, 8, 1)
        self.conv2 = nn.Conv2d(32, 16, 4, 1)
        self.conv3 = nn.Conv2d(16, 8, 2, 1)
        self.flatten = nn.Flatten(start_dim=1)
        self.fc1= nn.Linear(72, 36)        
        self.fc2= nn.Linear(36, 18)
        self.fc3= nn.Linear(18, 8)         
        self.fc4 = nn.Linear(8, action_size)

    def mish(self, x):
        return x*(torch.tanh(F.softplus(x)))
    
    def forward(self, x):
        x = x.unsqueeze(1)    
        x = self.mish(self.conv1(x))    
        x = self.mish(self.conv2(x))
        x = self.mish(self.conv3(x))
        #print(x.size()) 
        x = self.flatten(x)
        x = self.mish(self.fc1(x))        
        x = self.mish(self.fc2(x))
        x = torch.tanh(self.fc3(x))
        
        return self.fc4(x)

In [362]:
# class QNetwork(nn.Module):
#     def __init__(self, state_size, action_size, seed):
#         super(QNetwork,self).__init__()
#         self.seed = torch.manual_seed(seed)

#         self.flatten = nn.Flatten(start_dim=1)
#         self.fc1= nn.Linear(state_size, 128)        
#         self.fc2= nn.Linear(128, action_size)

#     def mish(self, x):
#         return x*(torch.tanh(F.softplus(x)))
    
#     def forward(self, x):
#         x = x.unsqueeze(1)
#         #print(x.size()) 
#         x = self.flatten(x)
#         x = self.mish(self.fc1(x))
        
#         return self.fc2(x)

In [6]:
from collections import namedtuple, deque 

buffer_size = 10000
batch_size = 1024
gamma = 0.99
tau = 0.0001
learning_rate = 0.0001
update_rate = 4

class Agent():    
    def __init__(self, state_size, action_size, seed):       
        
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)        
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr = learning_rate)
        self.memory = deque(maxlen=buffer_size)
        self.experiences = namedtuple("Experience", field_names=["state",
                                                               "action",
                                                               "reward",
                                                               "next_state",
                                                               "done"])
        self.t_step = 0
        self.total_loss = []
        
    def step(self, state, action, reward, next_state, done):
        self.memory.append(self.experiences(state, action, reward, next_state, done))
        self.t_step = (self.t_step+1) % update_rate
        if self.t_step == 0:
            if len(self.memory) > batch_size:
                #experience = random.sample(self.memory, batch_size)
                experience = [self.memory.popleft() for item in range(batch_size)]
                self.learn(experience, gamma)

    def act(self, state, eps = 0):
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()
        
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))
            
    def learn(self, experience, gamma):

        states = torch.from_numpy(np.stack([e.state for e in experience if e is not None])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experience if e is not None])).long().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experience if e is not None])).float().to(device)
        next_states = torch.from_numpy(np.stack([e.next_state for e in experience if e is not None])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experience if e is not None]).astype(np.uint8)).float().to(device)

        #criterion = torch.nn.SmoothL1Loss()
        criterion = torch.nn.MSELoss()
        self.qnetwork_local.train()
        self.qnetwork_target.eval()
               
        predicted_targets = self.qnetwork_local(states).gather(1,actions)
    
        with torch.no_grad():
            labels_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1)
        labels = rewards + (gamma*labels_next*(1-dones))
        
        loss = criterion(predicted_targets, labels).to(device)
        self.total_loss.append(loss.item())

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        self.soft_update(self.qnetwork_local,self.qnetwork_target, tau)
            
    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                           local_model.parameters()):
            target_param.data.copy_(tau*local_param.data + (1-tau)*target_param.data)


In [7]:
agent = Agent(state_size=196, action_size=5, seed=55555)

def training_loop(n_episodes = 5000000, max_t = 100, eps_start = 1, eps_end = 0.01,
       eps_decay = 0.995):
    scores = []
    scores_window = deque(maxlen=100)
    eps = eps_start
    for i_episode in range(1, n_episodes+1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)            
            state = next_state
            score += reward
            
            scores_window.append(score)
            scores.append(score)
            if done:
                break            
            
        eps = max(eps*eps_decay, eps_end)
        print(f"\rEpisode {i_episode}\tEpisode Score {score}\tLast iters score {np.mean(scores_window)}", end="")        
        if i_episode % 1000==0:
            print(f"\rEpisode {i_episode}\tAverage Score {np.mean(scores)}\nAverage loss: {np.mean(agent.total_loss)}\tRandom action sample probability: {eps}\n")
            agent.total_loss = []
            
        if np.mean(scores_window) >= 1:
            print(f"\nEnvironment solved in {i_episode-100} epsiodes.\tAverage score: {np.mean(scores_window)}")
            torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')
            break        

    return scores

In [368]:
scores = training_loop()

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(np.arange(len(scores)), scores)
plt.ylabel('Score')
plt.xlabel('Epsiode #')
plt.show()

Episode 100	Average Score -0.7987949478025521
Average loss: 0.028834895629967962	Random action sample probability: 0.6057704364907278

Episode 200	Average Score -0.9023618696811893
Average loss: 0.017730641993694007	Random action sample probability: 0.3669578217261671

Episode 300	Average Score -0.9828841597738894
Average loss: 0.012997246347367764	Random action sample probability: 0.22229219984074702

Episode 400	Average Score -1.0861027414176343
Average loss: 0.010586126940324903	Random action sample probability: 0.1346580429260134

Episode 500	Average Score -1.164526120571305
Average loss: 0.006029776918391387	Random action sample probability: 0.08157186144027828

Episode 600	Average Score -1.2541280379371664
Average loss: 0.005134648203642832	Random action sample probability: 0.0494138221100385

Episode 700	Average Score -1.307130503144654
Average loss: 0.004549811076786783	Random action sample probability: 0.029933432588273214

Episode 800	Average Score -1.3719859633890459
Average

KeyboardInterrupt: 

In [396]:
torch.save(agent.qnetwork_local.state_dict(), 'checkpoint.pth')

In [139]:
agent = Agent(state_size = 196, action_size = 5, seed=555555)
agent.qnetwork_local.load_state_dict(torch.load('checkpoint.pth'))

for i in range(3):
    state = env.reset()
    img = plt.imshow(env.render(mode='rgb_array'))
    for j in range(200):
        sleep(0.2)
        action = agent.act(state)
        img.set_data(env.render(mode='rbg_array'))
        plt.axis('off')
        display.display(plt.gcf())
        display.clear_output(wait=True)
        state, reward, done, _ = env.step(action)
        if done:
            break

env.close()

KeyboardInterrupt: 