In [1]:
import gym
import numpy as np
from IPython.display import clear_output
from time import sleep
import torch

In [2]:
def set_seed(env, seed=23):
    """Seed every random number generator the notebook draws from, then reset ``env``.

    Besides the environment and its spaces, the global NumPy and torch
    generators are seeded as well: ``np.random`` drives the epsilon-greedy
    choice during Q-learning, and torch drives weight initialisation,
    dropout and data shuffling. Without this the notebook is not
    reproducible across runs.

    Args:
        env: Environment exposing ``seed``, ``action_space``,
            ``observation_space`` and ``reset``.
        seed: Integer seed shared by all generators.
    """
    # Environment transitions and space sampling.
    env.seed(seed=seed)
    env.action_space.seed(seed=seed)
    env.observation_space.seed(seed=seed)

    # Library-level global generators used elsewhere in the notebook.
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Start from a fresh episode after seeding.
    env.reset()

    

# Custom 12x12 grid for FrozenLake, one string per row.
# Per the FrozenLake convention: S = start, F = frozen (safe) tile,
# H = hole, G = goal.
my_map = ["SFFFHFFHFFFH",
          "FFFFFFFFFFFF",
          "FFFFFFFFFFFF",
          "FFFFFFHFFFFH",
          "HFFFFFFFFFFF",
          "FHFFFFHFFFFF",
          "FFFFFFFFFFFF",
          "HFHFFHFFFFFH",
          "FFFFFFFHFFFF",
          "FFFFFFFFFFFF",
          "HFFFFFFFFFFH",
          "FFFFHFFFFFFG"]


# Build the environment on the custom map.
env = gym.make("FrozenLake-v0", desc=my_map)
# NOTE(review): presumably read by gym's time-limit wrapper so that episodes
# end after at most 300 steps -- confirm against the installed gym version.
env._max_episode_steps = 300
# Seed with the helper's default seed (23) and reset to the start state.
set_seed(env)

In [3]:
# Draw the grid; the highlighted cell marks the agent's current position.
env.render()


[41mS[0mFFFHFFHFFFH
FFFFFFFFFFFF
FFFFFFFFFFFF
FFFFFFHFFFFH
HFFFFFFFFFFF
FHFFFFHFFFFF
FFFFFFFFFFFF
HFHFFHFFFFFH
FFFFFFFHFFFF
FFFFFFFFFFFF
HFFFFFFFFFFH
FFFFHFFFFFFG


In [4]:
# Sizes of the discrete action and state spaces of the environment.
n_actions, n_states = env.action_space.n, env.observation_space.n
print(f"Nr. state: {n_states}, Nr. actions: {n_actions}")

Nr. state: 144, Nr. actions: 4


In [5]:
# Model dynamics of state 0: a dict mapping each action id to a list of
# (probability, next_state, reward, done) transitions. Each action has
# three equally likely outcomes, i.e. the lake is slippery.
print(env.P[0])

{0: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 12, 0.0, False)], 1: [(0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 12, 0.0, False), (0.3333333333333333, 1, 0.0, False)], 2: [(0.3333333333333333, 12, 0.0, False), (0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False)], 3: [(0.3333333333333333, 1, 0.0, False), (0.3333333333333333, 0, 0.0, False), (0.3333333333333333, 0, 0.0, False)]}


In [6]:
class RandomAgent():
    """Baseline agent that ignores the state and samples from the action space."""

    def __init__(self, env):
        # Keep the environment so its action space can be sampled later.
        self.env = env

    def take_action(self, state):
        """Return an action sampled from ``env.action_space``; ``state`` is unused."""
        action_space = self.env.action_space
        return action_space.sample()

In [7]:
def run_agent(env, agent, episodes=3):
    """Roll out ``agent`` in ``env`` and record every step for later replay.

    Args:
        env: Environment with ``reset()``, ``step(action)`` returning
            ``(state, reward, done, info)`` and ``render(mode='ansi')``.
        agent: Object with ``take_action(state)``.
        episodes: Number of episodes to run.

    Returns:
        Tuple ``(n_steps, n_fails, return_val, captured_frames)``.
        ``n_steps``, ``n_fails`` and ``return_val`` describe the *last*
        episode only (all zero when ``episodes`` is 0), whereas
        ``captured_frames`` holds one dict per step across *all* episodes
        with the keys ``'frame'``, ``'state'``, ``'action'`` and ``'reward'``.
    """
    captured_frames = []
    # Bound before the loop so that ``episodes=0`` returns zeros instead of
    # raising UnboundLocalError at the return statement.
    n_steps, n_fails, return_val = 0, 0, 0

    for _ in range(episodes):
        state = env.reset()
        n_steps, n_fails, return_val = 0, 0, 0
        done = False

        # One episode.
        while not done:
            action = agent.take_action(state)
            state, reward, done, _ = env.step(action)

            return_val += reward

            # An episode that ends without reward counts as a failure
            # (e.g. falling in a hole).
            if done and reward == 0:
                n_fails += 1

            n_steps += 1

            captured_frames.append({
                'frame': env.render(mode='ansi'),
                'state': state,
                'action': action,
                'reward': reward
            })

    return n_steps, n_fails, return_val, captured_frames

In [8]:
# Roll out the random baseline and print the returned step count,
# failure count and return, one value per line.
rnd_agent = RandomAgent(env)
n_steps, n_fails, G, captured_frames = run_agent(env, rnd_agent)
print(n_steps, n_fails, G, sep="\n")

19
1
0.0


In [9]:
def animate_agent(frames, verbose, delay):
    """Replay recorded frames in the cell output, one frame at a time.

    Args:
        frames: Sequence of dicts with the keys ``'frame'``, ``'state'``,
            ``'action'`` and ``'reward'``.
        verbose: When truthy, print the 1-based step number, state id,
            action id and reward below each frame.
        delay: Seconds to sleep after each frame.
    """
    for idx, data in enumerate(frames):
        # Replace the previous frame rather than appending below it.
        clear_output(wait=True)

        lines = [data['frame']]
        if verbose:
            lines += [
                f"Step No.: {idx+1}",
                f"State ID: {data['state']}",
                f"Action ID: {data['action']}",
                f"Reward: {data['reward']}",
            ]
        print(*lines, sep="\n")
        sleep(delay)


In [10]:
# Replay every recorded step of the random agent; the output below is the final frame.
animate_agent(frames=captured_frames, verbose=True, delay=0.001)

  (Left)
SFFFHFFHFFFH
FFFFFFFFFFFF
FFFFFFFFFFFF
FFFFFFHFFFFH
[41mH[0mFFFFFFFFFFF
FHFFFFHFFFFF
FFFFFFFFFFFF
HFHFFHFFFFFH
FFFFFFFHFFFF
FFFFFFFFFFFF
HFFFFFFFFFFH
FFFFHFFFFFFG

Step No.: 71
State ID: 48
Action ID: 0
Reward: 0.0


In [11]:
class QAgent():
    """Tabular agent that acts greedily with respect to its Q-table.

    Attributes:
        env: The environment the table was sized for.
        Q_table: Array of shape ``(n_states, n_actions)`` initialised to
            zero; rows are states, columns are actions.
    """

    def __init__(self, env):
        # Use the constructor argument rather than a same-named global, so
        # the class also works outside this notebook's namespace.
        self.env = env
        # Rows are states, columns are actions.
        self.Q_table = np.zeros([env.observation_space.n,
                                 env.action_space.n])

    def take_action(self, state):
        """Return the action with the highest Q-value in ``state``.

        Ties resolve to the lowest action index (``np.argmax`` semantics).
        """
        action = np.argmax(self.Q_table[state])
        return action

In [12]:
def q_learning(env, agent, episodes=20000, alpha=0.1, eps=0.1, gamma=0.99):
    """Train ``agent.Q_table`` in place with epsilon-greedy Q-learning.

    Args:
        env: Environment with ``reset()``, ``step(action)`` and
            ``action_space.sample()``.
        agent: Object with a ``Q_table`` array indexed ``[state, action]``
            and a ``take_action(state)`` method.
        episodes: Number of training episodes.
        alpha: Learning rate.
        eps: Probability threshold for taking an exploratory action.
        gamma: Discount factor.

    Returns:
        The same ``agent`` object, with its Q-table updated.
    """
    for _ in range(episodes):
        state = env.reset()
        done = False

        # One episode.
        while not done:
            # Exploration vs. exploitation: explore when the draw is <= eps.
            explore = not (np.random.random() > eps)
            if explore:
                action = env.action_space.sample()
            else:
                action = agent.take_action(state)

            q_before = agent.Q_table[state, action]
            next_state, reward, done, _ = env.step(action)

            # Reshape the terminal reward: -1 for ending without reward
            # (falling in a hole), 100 for ending with reward.
            # NOTE(review): the -1 branch fires for *any* zero-reward
            # termination, so if the environment also ends episodes on a
            # step limit those are penalised too -- confirm this is intended.
            if done == True:
                reward = -1 if reward == 0 else 100

            best_next = np.max(agent.Q_table[next_state])
            agent.Q_table[state, action] = q_before + alpha * (reward + gamma * best_next - q_before)
            state = next_state

    return agent

In [13]:
# Create a tabular agent (Q-table initialised to zeros) and train it with
# Q-learning using the default hyper-parameters; q_learning returns the
# same agent object it was given.
q_agent = QAgent(env)
q_agent = q_learning(env, q_agent)

In [14]:
# Roll out the trained Q-agent (greedy policy) for two episodes and record the frames.
n_steps, n_fails, G, captured_frames = run_agent(env, q_agent, episodes=2)

In [15]:
# Replay the Q-agent's recorded steps; the output below is the final frame.
animate_agent(frames=captured_frames, verbose=True, delay=0.1)

  (Up)
SFFFHFFHFFFH
FFFFFFFFFFFF
FFFFFFFFFFFF
FFFFFFHFFFFH
HFFFFFFFFFFF
FHFFFFHFFFFF
FFFFFFFFFFFF
HFHFFHFFFFFH
FFFFFFFHFFFF
FFFFFFFFFFFF
HFFFFFFFFFFH
FFFFHFFFFFF[41mG[0m

Step No.: 129
State ID: 143
Action ID: 3
Reward: 1.0


In [16]:
# Input and output dimensions for the behaviour-cloning network:
# number of discrete states and number of discrete actions.
n_actions, n_states = env.action_space.n, env.observation_space.n
print(f"Nr. state: {n_states}, Nr. actions: {n_actions}")

Nr. state: 144, Nr. actions: 4


In [17]:
class BCAgent(torch.nn.Module):
    """Behaviour-cloning agent: an MLP mapping a one-hot state to action logits.

    Attributes:
        env: The environment whose space sizes define the network shape.
        policy: ``torch.nn.Sequential`` with two hidden layers of 512 units,
            SELU activations and dropout.
    """

    def __init__(self, env):
        super().__init__()
        # Use the constructor argument rather than a same-named global, so
        # the class also works outside this notebook's namespace.
        self.env = env
        # Use a neural network as the policy.
        self.policy = torch.nn.Sequential(
            torch.nn.Linear(self.env.observation_space.n, 512),
            torch.nn.SELU(),
            torch.nn.Dropout(p=0.5),
            torch.nn.Linear(512, 512),
            torch.nn.SELU(),
            torch.nn.Dropout(p=0.55),
            torch.nn.Linear(512, self.env.action_space.n),
        )

    def forward_(self, x):
        """Return the raw action logits of the policy network for ``x``."""
        return self.policy(x)

    @torch.no_grad()
    def take_action(self, state):
        """Return the greedy action (as a Python int) for a discrete ``state``.

        Switches the policy to eval mode (disabling dropout) and leaves it
        in that mode.
        """
        self.policy.eval()
        # One-hot encode the discrete state index.
        x = torch.zeros(self.env.observation_space.n)
        x[state] = 1
        x = self.forward_(x)
        action = torch.argmax(x).item()
        return action

    def forward(self, x):
        """Standard module forward pass; delegates to ``forward_``."""
        x = self.forward_(x)
        return x

In [18]:
@torch.enable_grad()
def update(agent, dataloader, loss, opt):
    """Run one training pass over ``dataloader`` and return the batch losses.

    Args:
        agent: Object with a ``policy`` module and a ``forward(x)`` method.
        dataloader: Iterable of ``(x, y)`` batches.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.
        opt: Optimizer over the agent's parameters.

    Returns:
        List with one Python float per batch (the loss of that batch).
    """
    agent.policy.train()
    errs = []
    for x, y in dataloader:
        y_hat = agent.forward(x)
        err = loss(y_hat, y)

        opt.zero_grad()
        err.backward()
        opt.step()

        # Store a plain float, not the tensor: keeping the tensor would hold
        # on to autograd state for every batch and make the averaged value
        # print as ``tensor(..., grad_fn=...)`` instead of a number.
        errs.append(err.item())

    return errs


@torch.no_grad()
def evaluate(agent, dataloader, loss):
    """Compute the batch losses over ``dataloader`` without updating the model.

    Args:
        agent: Object with a ``policy`` module and a ``forward(x)`` method.
        dataloader: Iterable of ``(x, y)`` batches.
        loss: Callable ``loss(y_hat, y)`` returning a scalar tensor.

    Returns:
        List with one Python float per batch (the loss of that batch).
    """
    # Eval mode disables dropout so the losses are deterministic.
    agent.policy.eval()
    errs = []
    for x, y in dataloader:
        y_hat = agent.forward(x)
        err = loss(y_hat, y)
        # Plain floats so the averaged value prints as a number.
        errs.append(err.item())

    return errs

In [19]:
def sample_trajectory(env, expert):
    """Run ``expert`` for one episode and return the visited states and chosen actions.

    Args:
        env: Environment with ``reset()`` and ``step(action)``.
        expert: Object with ``take_action(state)``.

    Returns:
        Tuple ``(states, actions)`` of equal-length lists; ``actions[i]`` is
        the action the expert chose in ``states[i]``. The state reached by
        the final step is not included.
    """
    visited = []
    state = env.reset()

    # At least one step is always taken, so ``visited`` is never empty.
    while True:
        action = expert.take_action(state)
        visited.append((state, action))
        state, _, done, _ = env.step(action)
        if done:
            break

    states, actions = (list(column) for column in zip(*visited))
    return states, actions


def sample_trajectories(env, expert, nr):
    """Collect ``nr`` expert episodes into one flat list of ``(state, action)`` pairs.

    Args:
        env: Environment passed through to ``sample_trajectory``.
        expert: Expert policy passed through to ``sample_trajectory``.
        nr: Number of episodes to sample.

    Returns:
        List of ``(state, action)`` tuples, in the order they were visited.
    """
    dataset = []
    for _ in range(nr):
        # Pair up the parallel state/action lists of one episode.
        dataset.extend(zip(*sample_trajectory(env, expert)))
    return dataset


class ExpertDataset(torch.utils.data.Dataset):
    """Dataset of expert demonstrations as (one-hot state, action) pairs.

    The demonstrations are sampled once, at construction time, by rolling
    out ``expert`` in ``env`` for ``nr`` episodes.
    """

    def __init__(self, env, expert, nr):
        self.env, self.expert, self.nr = env, expert, nr
        # Flat list of (state, action) pairs from all sampled episodes.
        self.data = sample_trajectories(env, expert, nr)

    def __len__(self):
        """Number of recorded (state, action) pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``idx``-th pair as (one-hot state tensor, action tensor)."""
        state, action = self.data[idx]
        one_hot = torch.zeros(self.env.observation_space.n)
        one_hot[state] = 1.
        return one_hot, torch.tensor(action)

In [20]:
# Untrained network: the parameters have not been learned yet, so the
# agent's behaviour at this point is effectively arbitrary.
bc_agent = BCAgent(env)

In [21]:
# Baseline: roll out the untrained behaviour-cloning agent for a single episode.
n_steps, n_fails, G, captured_frames = run_agent(env, bc_agent, episodes=1)

In [22]:
# Replay the untrained agent's episode; the output below is the final frame.
animate_agent(frames=captured_frames, verbose=True, delay=0.1)

  (Right)
SFFFHFFHFFFH
FFFFFFFFFFFF
FFFFFFFFFFFF
FFFFFFHFFFFH
HFFFFFFFFFFF
F[41mH[0mFFFFHFFFFF
FFFFFFFFFFFF
HFHFFHFFFFFH
FFFFFFFHFFFF
FFFFFFFFFFFF
HFFFFFFFFFFH
FFFFHFFFFFFG

Step No.: 14
State ID: 61
Action ID: 2
Reward: 0.0


In [23]:
# Sample 1000 episodes from the trained Q-agent (the expert) as demonstrations.
train_data = ExpertDataset(env, q_agent, 1000)
# 80/20 split of the (state, action) pairs into training and validation sets.
train_size = int(0.8 * len(train_data))
val_size = len(train_data) - train_size
print(f"Size train set: {train_size}, Size train val: {val_size}")
# The split is random; pairs from the same episode can land in both subsets.
train_set, val_set = torch.utils.data.random_split(train_data, [train_size, val_size])

Size train set: 97974, Size train val: 24494


In [24]:
# Mini-batch loaders: the training set is reshuffled on every pass, the
# validation set is iterated in a fixed order.
batch_size = 32
train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_set, batch_size=batch_size)

In [25]:
# Behaviour cloning as supervised classification: the network's logits are
# trained with cross-entropy against the expert's action in each state.
opt = torch.optim.Adam(bc_agent.policy.parameters(), lr=1e-3, weight_decay=1e-7)
loss = torch.nn.CrossEntropyLoss()
n_epochs = 5
for e in range(n_epochs):
    # One optimisation pass over the training set; `err` holds one loss per batch.
    err = update(bc_agent, train_loader, loss, opt)
    print(f"Epoch: {e + 1}")
    # Mean of the per-batch losses.
    print(f"Train Err: {sum(err) / len(err)}")
    
    # Same metric on the held-out split, without gradient updates.
    err = evaluate(bc_agent, val_loader, loss)
    print(f"Val Err: {sum(err) / len(err)}")

Epoch: 1
Train Err: 0.013957426883280277
Val Err: 2.3617990336788353e-06
Epoch: 2
Train Err: 4.369762336864369e-06
Val Err: 2.4191066927414795e-07
Epoch: 3
Train Err: 9.62570538831642e-07
Val Err: 6.919035655528205e-08
Epoch: 4
Train Err: 5.858806275682582e-07
Val Err: 3.423486560905076e-08
Epoch: 5
Train Err: 4.213874262859463e-07
Val Err: 4.820430277163723e-08


In [26]:
# Roll out the trained behaviour-cloning agent for three episodes and record the frames.
n_steps, n_fails, G, captured_frames = run_agent(env, bc_agent, episodes=3)

In [27]:
# Replay the trained agent's recorded steps; the output below is the final frame.
animate_agent(frames=captured_frames, verbose=True, delay=0.1)

  (Up)
SFFFHFFHFFFH
FFFFFFFFFFFF
FFFFFFFFFFFF
FFFFFFHFFFFH
HFFFFFFFFFFF
FHFFFFHFFFFF
FFFFFFFFFFFF
HFHFFHFFFFFH
FFFFFFFHFFFF
FFFFFFFFFFFF
HFFFFFFFFFFH
FFFFHFFFFFF[41mG[0m

Step No.: 322
State ID: 143
Action ID: 3
Reward: 1.0
