In [2]:
import argparse
import math

import gymnasium as gym
import pyglet
from pyglet.window import key

import miniworld
from tqdm import tqdm

import numpy as np
from itertools import count
from collections import deque
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
from miniworld.wrappers import PyTorchObsWrapper,GreyscaleWrapper

  from .autonotebook import tqdm as notebook_tqdm


cuda


In [27]:
miniworld.envs.env_ids

['MiniWorld-CollectHealth-v0',
 'MiniWorld-FourRooms-v0',
 'MiniWorld-Hallway-v0',
 'MiniWorld-Maze-v0',
 'MiniWorld-MazeS2-v0',
 'MiniWorld-MazeS3-v0',
 'MiniWorld-MazeS3Fast-v0',
 'MiniWorld-OneRoom-v0',
 'MiniWorld-OneRoomS6-v0',
 'MiniWorld-OneRoomS6Fast-v0',
 'MiniWorld-PickupObjects-v0',
 'MiniWorld-PutNext-v0',
 'MiniWorld-RoomObjects-v0',
 'MiniWorld-Sidewalk-v0',
 'MiniWorld-Sign-v0',
 'MiniWorld-TMaze-v0',
 'MiniWorld-TMazeLeft-v0',
 'MiniWorld-TMazeRight-v0',
 'MiniWorld-ThreeRooms-v0',
 'MiniWorld-WallGap-v0',
 'MiniWorld-YMaze-v0',
 'MiniWorld-YMazeLeft-v0',
 'MiniWorld-YMazeRight-v0']

In [3]:
env = gym.make('MiniWorld-OneRoom-v0', view="agent", render_mode=None)
env = PyTorchObsWrapper(env)

In [10]:

class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()

        self.hidden_size = 32
        self.hidden_state = torch.zeros(1, 1, self.hidden_size).to(device)

        self.conv1 = nn.Conv2d(3, 16, 3, stride=2).to(device)
        self.conv2 = nn.Conv2d(16, 32, 3, stride=2).to(device)
        self.maxpool = nn.MaxPool2d(2).to(device)

        self.outconvsize = 2016
        self.affine1 = nn.Linear( self.outconvsize , self.hidden_size).to(device) 
        # self.rnn = nn.RNN(self.hidden_size, self.hidden_size).to(device)

        
        self.action_head = nn.Sequential(
                                    nn.Linear(self.hidden_size, self.hidden_size).to(device),
                                    nn.ReLU(),
                                    nn.Linear(self.hidden_size, 3).to(device)
                                ).to(device)

        self.value_head = nn.Sequential(
                                    nn.Linear(self.hidden_size, self.hidden_size).to(device),
                                    nn.ReLU(),
                                    nn.Linear(self.hidden_size, 1).to(device)
                                ).to(device)

        self.saved_log_probs = []
        self.rewards = []
        self.batch_loss = []

        self.relu = torch.nn.ReLU()
        self.tanh = torch.nn.Tanh()

    def reset_hidden_state(self):
        self.hidden_state = torch.zeros(1, 1, self.hidden_size).to(device)

    def forward(self, x):
        if torch.any(torch.isnan(x)):
            print("NAN in input")
        x = self.relu(self.conv1(x))
        if torch.any(torch.isnan(x)):
            print("NAN in issnput")
        x = self.relu(self.conv2(x))
        x = self.maxpool(x)
        x = x.reshape(-1,self.outconvsize)
        x = self.relu(self.affine1(x))
        #h = self.rnn(x.unsqueeze(0), self.hidden_state)[1]
        #self.hidden_state = h
        h = x
        action_prob = F.softmax(self.action_head(h), dim=-1)

        state_value = self.value_head(h)

        return action_prob, state_value


policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-3)
eps = np.finfo(np.float32).eps.item()

def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0).to(device)
    probs,state_value = policy(state)
    m = Categorical(probs)
    action = m.sample()

    policy.saved_log_probs.append( (m.log_prob(action), state_value) )

    return action.item(),probs
    
gamma = 1

def finish_episode():
    R = 0
    policy_loss = []
    value_loss = []
    returns = []
    for r in policy.rewards[::-1]:
        # calculate the discounted value
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns).to(device)
    # returns = (returns - returns.mean()) / (returns.std() + eps) # this create error ...

    for (log_prob,value), R in zip(policy.saved_log_probs, returns):
        advantage = R - value.item()
        policy_loss.append(-log_prob * advantage)
        value_loss.append(F.smooth_l1_loss(value, torch.tensor([R]).to(device)))
    loss = torch.stack(policy_loss).sum() + torch.stack(value_loss).sum()
    policy.batch_loss.append(loss)
    del policy.rewards[:]
    del policy.saved_log_probs[:]
    policy.reset_hidden_state()


def finish_batch():
    optimizer.zero_grad()
    loss = torch.stack(policy.batch_loss).sum()
    loss.backward()
    optimizer.step()
    del policy.batch_loss[:]
    return loss.item()


In [11]:
def count_parameters(net):
    return sum(p.numel() for p in net.parameters() if p.requires_grad)
count_parameters(policy)

71876

In [12]:
def main():
    running_reward = 0
    for i_episode in range(1000):
        state, _ = env.reset()
        ep_reward = 0
        for t in range(1, 100):  # Don't infinite loop while learning
            action,show_prob = select_action(state)
            state, reward, done, _, _ = env.step(action)
            #if args.render:
            #    env.render()
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                break
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % 5 == 0:
            loss = finish_batch()
            print(loss,show_prob.cpu().detach().numpy()[0])
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))


In [20]:
main() 

  value_loss.append(F.smooth_l1_loss(value, torch.tensor([R]).to(device)))


-4.733391761779785 [0.3303635  0.34034482 0.32929167]
Episode 0	Last reward: 0.00	Average reward: 0.00
45.933387756347656 [0.33055696 0.3401594  0.32928365]
Episode 5	Last reward: 0.00	Average reward: 0.04
-24.261735916137695 [0.3307736  0.34005666 0.32916972]
Episode 10	Last reward: 0.00	Average reward: 0.03
125.9843521118164 [0.30684444 0.3918668  0.30128875]
Episode 15	Last reward: 0.93	Average reward: 0.15
43.91679763793945 [0.21132502 0.2564801  0.53219485]
Episode 20	Last reward: 0.92	Average reward: 0.21
30.391714096069336 [0.30844256 0.32374692 0.3678106 ]
Episode 25	Last reward: 0.00	Average reward: 0.20
-46.39183807373047 [0.31099382 0.3416136  0.3473926 ]
Episode 30	Last reward: 0.00	Average reward: 0.15
-6.866427898406982 [0.07187061 0.07359064 0.85453874]
Episode 35	Last reward: 0.00	Average reward: 0.20
141.3742218017578 [0.18315165 0.17984113 0.63700724]
Episode 40	Last reward: 0.91	Average reward: 0.28
-2.453948497772217 [0.3308327  0.3398943  0.32927305]
Episode 45	Las

KeyboardInterrupt: 

In [21]:
torch.save(policy.state_dict(), 'miniworld_001.pt')

In [9]:
policy = Policy().to(device)
policy.load_state_dict(torch.load('miniworld_001.pt'))

<All keys matched successfully>

In [29]:
env = gym.make('MiniWorld-OneRoom-v0', view="agent", render_mode="human")
# env = GreyscaleWrapper(env)
env = PyTorchObsWrapper(env)
observation, info = env.reset()

# Create the display window
env.render()

for _ in range(100):
    action,probs = select_action(observation) # agent policy that uses the observation and info
    print(probs)
    observation, reward, terminated, truncated, info = env.step(action)
    env.render()
    if terminated or truncated:
        observation, info = env.reset()
env.close()

tensor([[0.0831, 0.6644, 0.2525]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0477, 0.8542, 0.0981]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0311, 0.8346, 0.1343]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[2.8363e-04, 4.9172e-04, 9.9922e-01]], device='cuda:0',
       grad_fn=<SoftmaxBackward0>)
tensor([[0.0012, 0.0018, 0.9970]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0052, 0.0081, 0.9868]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0357, 0.1140, 0.8503]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0017, 0.0010, 0.9972]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0032, 0.0025, 0.9943]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0042, 0.0150, 0.9808]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0025, 0.0058, 0.9918]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0012, 0.0030, 0.9958]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
tensor([[0.0016, 0.00

In [48]:
env.close()