In [3]:
import argparse
import math

import gymnasium as gym
import pyglet
from pyglet.window import key

import miniworld
from tqdm import tqdm

import numpy as np
from itertools import count
from collections import deque
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical

device = 'cpu' if torch.cuda.is_available() else 'cpu'
print(device)

  from .autonotebook import tqdm as notebook_tqdm


cpu


In [4]:
env = gym.make('MiniWorld-OneRoom-v0', view="agent", render_mode=None)

In [5]:

class Policy(nn.Module):
    def __init__(self):
        super(Policy, self).__init__()

        self.hidden_size = 64
        self.hidden_state = torch.zeros(1, 1, self.hidden_size).to(device)

        self.conv1 = nn.Conv2d(3, 8, 6, stride=2).to(device)
        self.conv2 = nn.Conv2d(8, 16, 6, stride=2).to(device)
        self.maxpool = nn.MaxPool2d(4, stride=2).to(device)
        
        # self.conv1 = nn.Conv2d(3, 32, 4, stride=2).to(device)
        # self.conv2 = nn.Conv2d(32, 64, 4, stride=2).to(device)
        # self.conv3 = nn.Conv2d(64, 128, 4, stride=2).to(device)
        # self.conv4 = nn.Conv2d(128, 256, 4, stride=2).to(device)

        self.outconvsize = 16*5*7 # 3*256
        
        self.affine1 = nn.Linear( self.outconvsize , self.hidden_size).to(device) 
        
        self.rnn = nn.RNN(self.hidden_size, self.hidden_size).to(device)
        
        self.affine2 = nn.Linear(self.hidden_size, 3).to(device)

        self.saved_log_probs = []
        self.rewards = []

        self.relu = torch.nn.ReLU()
        self.tanh = torch.nn.Tanh()

    def reset_hidden_state(self):
        self.hidden_state = torch.zeros(1, 1, self.hidden_size).to(device)

    def forward(self, x):
        x = self.relu(self.conv1(x))
        x = self.conv2(x)
        x = self.relu(self.maxpool(x))
        # x = F.relu(self.conv3(x))
        # x = F.relu(self.conv4(x))
        x = x.reshape(-1,self.outconvsize )
        x = self.relu(self.affine1(x))
        h = self.rnn(x.unsqueeze(0), self.hidden_state)[1]
        self.hidden_state = h
        action_scores = self.tanh(self.affine2(h))
        return F.softmax(action_scores, dim=1)


policy = Policy().to(device)
optimizer = optim.Adam(policy.parameters(), lr=1e-3)
eps = np.finfo(np.float32).eps.item()

def select_action(state):
    state = torch.from_numpy(state).float().unsqueeze(0).permute(0,3,1,2).to(device)
    probs = policy(state)
    m = Categorical(probs)
    action = m.sample()
    policy.saved_log_probs.append(m.log_prob(action))
    return action.item(),probs
    
gamma = 1

def finish_episode():
    R = 0
    policy_loss = []
    returns = deque()
    for r in policy.rewards[::-1]:
        R = r + gamma * R
        returns.appendleft(R)
    returns = torch.tensor(returns)
    # returns = (returns - returns.mean()) / (returns.std() + eps) # this create error ...
    for log_prob, R in zip(policy.saved_log_probs, returns):
        policy_loss.append(-log_prob * R)
    optimizer.zero_grad()
    policy_loss = torch.cat(policy_loss).sum()
    policy_loss.backward()
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_log_probs[:]
    policy.reset_hidden_state()


In [6]:
def count_parameters(net):
    return sum(p.numel() for p in net.parameters() if p.requires_grad)
count_parameters(policy)

50370

In [7]:
def main():
    running_reward = 0
    for i_episode in range(1000):
        state, _ = env.reset()
        ep_reward = 0
        for t in range(1, 100):  # Don't infinite loop while learning
            action,probs = select_action(state)
            state, reward, done, _, _ = env.step(action)
            #if args.render:
            #    env.render()
            policy.rewards.append(reward)
            ep_reward += reward
            if done:
                break
        running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
        finish_episode()
        if i_episode % 10 == 0:

            
            print('Episode {}\tLast reward: {:.2f}\tAverage reward: {:.2f}'.format(
                  i_episode, ep_reward, running_reward))
            print(probs)
        

In [8]:
main() 

Episode 0	Last reward: 0.00	Average reward: 0.00
tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],
       grad_fn=<SoftmaxBackward0>)
Episode 10	Last reward: 0.00	Average reward: 0.00
tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],
       grad_fn=<SoftmaxBackward0>)
Episode 20	Last reward: 0.00	Average reward: 0.00
tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],
       grad_fn=<SoftmaxBackward0>)
Episode 30	Last reward: 0.00	Average reward: 0.03
tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],
       grad_fn=<SoftmaxBackward0>)
Episode 40	Last reward: 0.00	Average reward: 0.06
tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],
       grad_fn=<SoftmaxBackward0>)
Episode 50	Last reward: 0.00	Average reward: 0.13
tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],
       grad_fn=<SoftmaxBackward0>)
Episode 60	Last reward: 0.00	Average reward: 0.08
tensor([[[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]],
       grad_fn=<SoftmaxBackward0>)
Episode 70	Last reward: 0.00	Averag

KeyboardInterrupt: 

In [9]:
env = gym.make('MiniWorld-OneRoomS6-v0', view="agent", render_mode="human")
observation, info = env.reset()

# Create the display window
env.render()

for _ in range(100):
    action,_ = select_action(observation) # agent policy that uses the observation and info
    print(action)
    observation, reward, terminated, truncated, info = env.step(action)
    env.render()
    if terminated or truncated:
        observation, info = env.reset()
env.close()

2
0
5
6
5
0
8
8
6
2
4
1
1
3
1
6
7
5
0
8
8
3
2
4
7
1
4
0
2
5
2
9
0
1
2
9
6
1
0
0
5
0
2
0
8
2
4
0
3
2
0
4
6
5
2
1
4
6
0
6
9
1
0
4
3
1
7
0
0
1
1
0
5
3
8
2
9
9
5
3
8
9
4
2
8
7
6
5
6
9
6
3
0
4
8
7
6
1
6
3
