In [50]:
import gym
import minihack
import matplotlib.pyplot as plt
import os
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np
from collections import deque
import time
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
import random


# TODO
1. Improve the model/algorithm (perhaps just add a baseline, or switch to actor-critic)
1. Hyperparameter Tuning

One Colab notebook per experiment, each with a folder dedicated to that experiment. Each folder holds five of each of the following:
1. saved policy model
1. saved gifs (two per run: the best gif and the last gif)
1. saved scores

# Useful URLs

https://minihack.readthedocs.io/en/latest/index.html

https://minihack.readthedocs.io/en/latest/getting-started/observation_spaces.html#options

https://discuss.pytorch.org/t/pytorch-multiple-input-and-output/140020/2

https://goodboychan.github.io/python/reinforcement_learning/pytorch/udacity/2021/05/12/REINFORCE-CartPole.html

# Utility Functions


In [2]:
def get_shapes(observation):
    '''
    Collect the array shapes of every non-pixel entry of an observation.

    Args: 
        observation: (dict) An environment observation, mapping keys to numpy arrays
    Ret:
        shapes: (list) The `.shape` of each value in dict order, skipping the "pixel" key
    '''
    return [value.shape for name, value in observation.items() if name != "pixel"]

def save_gif(gif,path=None):
    '''
    Write a sequence of frames to disk as an endlessly looping animated gif.

    Args:
        gif: a non-empty list of image objects; the first one's `save` method is
            called and the remaining ones are passed to it as `append_images`
        path: the path to save the gif to, defaults to video<current datetime>.gif
            inside the hard-coded repo directory
    Raises:
        ValueError: if `gif` contains no frames
    '''
    if len(gif) == 0:
        raise ValueError("save_gif needs at least one frame, got an empty sequence")
    if path is None:
        # The timestamp includes microseconds (%f) so that two gifs saved within the
        # same second get different default names instead of overwriting each other.
        path = '/home/evan/RL_Assignment/repo/video'+datetime.now().strftime("%d-%m-%Y_%H:%M:%S.%f") + '.gif'
    first_frame, remaining_frames = gif[0], gif[1:]
    first_frame.save(path, save_all=True,optimize=False, append_images=remaining_frames, loop=0)
    print("Saved Video")

def save_policy_model(policy_model,PATH):
    '''
    Persist the weights of a policy network.

    Args:
        policy_model: a pytorch module; only its state_dict is written
        PATH: destination handed to torch.save
    '''
    weights = policy_model.state_dict()
    torch.save(weights, PATH)

def load_policy_model(model,PATH,map_location="cpu"):
    '''
    Restore saved weights into an already constructed policy network.

    Args:
        model: a pytorch module with the same architecture as the one that was saved
        PATH: location of a state_dict previously written with torch.save
        map_location: forwarded to torch.load. The default "cpu" lets a checkpoint
            written on a GPU be read on a machine without CUDA.
    Ret:
        model: the same module object, with the loaded weights
    '''
    state_dict = torch.load(PATH, map_location=map_location)
    # load_state_dict copies the values into the model's existing parameters,
    # so the model stays on whatever device it was already on.
    model.load_state_dict(state_dict)
    return model

def save_scores(scores,PATH):
    '''
    Write the episode scores to a text file.

    Args:
        scores: sequence of numeric scores
        PATH: prefix of the output file; "scores.txt" is appended to it verbatim
            (no path separator is inserted)
    '''
    np.savetxt(PATH+"scores.txt", np.array(scores))


# Init Env

In [48]:
# Device used when this cell runs: the CPU.
device = torch.device("cpu")
# Alternative, larger set of observation keys (disabled):
# obs_keys = ('glyphs', 'chars','screen_descriptions', 'glyphs_crop', 'chars_crop', 'screen_descriptions_crop','blstats', 'message', 'inv_strs', 'pixel')
# Observation entries requested from the environment. 'pixel' supplies the frames
# recorded for the gifs; the glyph entries are what the policies read.
obs_keys = ('glyphs','glyphs_crop','inv_glyphs', 'pixel')

# Single module-level environment, read as a global by the training functions.
env = gym.make(
    "MiniHack-Quest-Hard-v0",
    observation_keys=obs_keys
)
    

# Policy

In [4]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [42]:
class CropPolicy(nn.Module):
    '''
    Two-layer fully connected policy over the cropped glyph view and the inventory.

    The `glyphs_crop` and `inv_glyphs` observation arrays are flattened,
    concatenated, passed through a tanh hidden layer and mapped to one
    probability per action.
    '''
    def __init__(self, env, hidden_size=100):
        '''
        Args:
            env: environment providing `reset()` (returning an observation dict)
                and `action_space.n`
            hidden_size: (int) width of the hidden layer
        '''
        super(CropPolicy, self).__init__()

        # One reset provides a sample observation to size the input layer from
        state = env.reset()
        action_size = env.action_space.n

        self.obs_keys = obs_keys

        # Only the two arrays consumed by forward() contribute to the input width
        # (this value used to be hard-coded as 136).
        input_size = self.get_input_shape(
            [state['glyphs_crop'].shape, state['inv_glyphs'].shape]
        )
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, action_size)
    
    def get_input_shape(self,shapes):
        '''
        Args:
            shapes: iterable of array shapes (tuples of ints)
        Ret:
            (int) total number of elements of arrays with those shapes, i.e. the
            length of their flattened concatenation
        '''
        # Element count of one array is the PRODUCT of its dimensions
        return sum(int(np.prod(shape)) for shape in shapes)

    def forward(self, state):
        '''
        Args: state, an observation from env.step(). 
            state is a dictionary, with keys "glyphs_crop" and "inv_glyphs"
        Ret: action probabilites

        '''
        # Follow the module's own parameters rather than a global device variable,
        # so the inputs always land where the weights are.
        module_device = self.fc1.weight.device

        glyph_features = torch.from_numpy(state['glyphs_crop']).float().to(module_device)
        glyph_features = torch.flatten(glyph_features)
        inv_features = torch.from_numpy(state['inv_glyphs']).float().to(module_device)
        inv_features = torch.flatten(inv_features)
        x = torch.cat((glyph_features, inv_features))
        
        x = torch.tanh(self.fc1(x))
        # NOTE(review): tanh bounds the pre-softmax scores to [-1, 1], which caps how
        # peaked the action distribution can become - confirm this is intended.
        action_scores = torch.tanh(self.fc2(x)).float()
        # the input is a single unbatched vector, so softmax runs over dim 0
        return F.softmax(action_scores,dim=0)
    
    def act(self, state):
        '''
        Args: state, an observation from env.step(). 
            state is a dictionary, with keys "glyphs_crop" and "inv_glyphs"
        Ret: action, log-probability of that action under the current policy

        '''
        probs = self.forward(state).cpu()
        model = Categorical(probs)
        action = model.sample()
        return action.item(), model.log_prob(action)

In [49]:
class FullViewPolicy(nn.Module):
    '''
    Convolutional policy over a stack of the three most recent full glyph maps
    plus the inventory.

    The glyph stack goes through one convolution and one max-pool; the flattened
    result is concatenated with `inv_glyphs` and fed to three linear layers that
    produce one probability per action.
    '''
    def __init__(self, env, hidden_size=100):
        '''
        Args:
            env: environment providing `reset()` (returning an observation dict)
                and `action_space.n`
            hidden_size: (int) width of the two hidden layers
        '''
        super(FullViewPolicy, self).__init__()

        # One reset provides a sample observation; the frame stack starts as three
        # copies of it (index 0 is the most recent frame).
        state = env.reset()
        self.glyphState = np.array([
            state['glyphs'],
            state['glyphs'],
            state['glyphs']
        ])
        action_size = env.action_space.n

        self.obs_keys = obs_keys

        self.glyphConv2D1 = nn.Conv2d(3,1,(3,5),stride=(2,3))
        self.glyphMaxPool1 = nn.MaxPool2d(2, stride=2)

        # Size the first linear layer from a dry run of the conv/pool stack instead
        # of a hard-coded width (this value used to be hard-coded as 115).
        with torch.no_grad():
            dummy_stack = torch.zeros((1,) + tuple(self.glyphState.shape))
            conv_feature_count = self.glyphMaxPool1(self.glyphConv2D1(dummy_stack)).numel()
        input_size = conv_feature_count + int(np.prod(state['inv_glyphs'].shape))

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, action_size)
        
    def forward(self, state):
        '''
        Args: state, an observation from env.step(). 
            state is a dictionary, with keys "glyphs" and "inv_glyphs"
        Ret: action probabilites

        '''
        # Push the new glyph map to the front of the stack and drop the oldest one.
        # NOTE(review): the stack is only initialised in __init__, so frames from a
        # previous episode stay in it after an env.reset() until they are pushed out.
        self.glyphState = np.insert(self.glyphState[:-1,:,:],0,state['glyphs'],axis=0)

        # Follow the module's own parameters rather than a global device variable,
        # so the inputs always land where the weights are.
        module_device = self.fc1.weight.device

        # unsqueeze(0) adds the batch dimension Conv2d expects
        glyph_features = torch.from_numpy(self.glyphState).float().unsqueeze(0).to(module_device)
        glyph_features = F.relu(self.glyphConv2D1(glyph_features))
        glyph_features = torch.flatten(self.glyphMaxPool1(glyph_features))
        inv_features = torch.from_numpy(state['inv_glyphs']).float().to(module_device)
        inv_features = torch.flatten(inv_features)
        x = torch.cat((glyph_features, inv_features))
        
        x = F.leaky_relu(self.fc1(x))
        x = F.leaky_relu(self.fc2(x))
        # NOTE(review): tanh bounds the pre-softmax scores to [-1, 1], which caps how
        # peaked the action distribution can become - confirm this is intended.
        action_scores = torch.tanh(self.fc3(x))
        return F.softmax(action_scores, dim=-1)
    
    def act(self, state):
        '''
        Args: state, an observation from env.step(). 
            state is a dictionary, with keys "glyphs" and "inv_glyphs"
        Ret: action, log-probability of that action under the current policy

        '''
        probs = self.forward(state).cpu()
        model = Categorical(probs)
        action = model.sample()
        return action.item(), model.log_prob(action)

# REINFORCE

In [43]:

def compute_returns(rewards, gamma):
    '''
    Discounted return of a whole episode, measured from its first step.

    Args:
        rewards: sequence of per-step rewards
        gamma: (float) discount factor
    Ret:
        scalar: sum over t of gamma**t * rewards[t] (0.0 for an empty sequence)
    '''
    discounts = np.power(float(gamma), np.arange(len(rewards)))
    return np.dot(rewards, discounts)


def compute_returns_naive_baseline(rewards, gamma):
    '''
    Discount each reward by its time step and report statistics of the raw rewards.

    Args:
        rewards: sequence of per-step rewards
        gamma: (float) discount factor
    Ret:
        returns: array whose entry t is gamma**t * rewards[t] (not summed)
        mean: mean of the undiscounted rewards
        std: standard deviation of the undiscounted rewards
    '''
    n_steps = len(rewards)
    # gamma**0, gamma**1, ... built from a float array filled with gamma
    discount_factors = np.power(np.full(n_steps, gamma, dtype=float), np.arange(n_steps))
    discounted_rewards = np.multiply(rewards, discount_factors)
    return discounted_rewards, np.mean(rewards), np.std(rewards)


def reinforce(policy, optimizer, seed, n_episodes=1000, max_t=1000, gamma=1.0, print_every=100, solved_score=195.0):
    '''
    Train a policy with the REINFORCE policy-gradient update on the module-level `env`.

    Every log-probability of an episode is weighted by that episode's total
    discounted return (measured from its first step), and one optimizer step is
    taken per episode.

    Args:
        policy: a pytorch model whose `act(state)` returns (action, log_prob)
        optimizer: a pytorch optimizer
        seed: (int) seed applied to torch, numpy, `random` and `env`
        n_episodes: (int) number of episodes to train for
        max_t: (int) max time steps per episode
        gamma: (float, [0,1]) discount factor
        print_every: (int) number of episodes between print of update
        solved_score: (float or None) training stops early once the mean score of
            the last 100 episodes reaches this value; None disables the check
    
    Ret:
        scores: list of total rewards per episode
        gif: a list of images corresponding to the best scoring episode
        last_gif: a list of images corresponding to the last episode played
    '''
    print("Starting Reinforce")

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    best_frames = None
    latest_frames = None
    best_score = None
    scores_deque = deque(maxlen=100)
    scores = []
    # Episodes are numbered from 1; the upper bound is inclusive so that exactly
    # n_episodes episodes are played.
    for e in range(1, n_episodes + 1):
        frames = []
        saved_log_probs = []
        rewards = []
        state = env.reset()
        # Collect trajectory
        for t in range(max_t):
            # Sample the action from current policy
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            # Frames come from the observations returned by env.step
            frames.append(state["pixel"])
            rewards.append(reward)
            if done:
                break

        # Undiscounted episode score
        score = sum(rewards)
        scores_deque.append(score)
        scores.append(score)

        # Keep track of the frames of the best scoring and the latest episode
        latest_frames = frames
        if best_score is None or score > best_score:
            best_score = score
            best_frames = frames

        # Nothing to learn from if the episode had no steps (max_t == 0)
        if saved_log_probs:
            # Total discounted return of the episode
            R = sum(gamma ** i * reward for i, reward in enumerate(rewards))

            # Optimizers minimise, so the loss is the negated REINFORCE objective.
            # The tensor is kept on the left of the product so the scalar type of
            # R does not matter.
            policy_loss = torch.stack(saved_log_probs).sum() * (-float(R))

            # Backpropagation
            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if e % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
        if solved_score is not None and np.mean(scores_deque) >= solved_score:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(e - 100, np.mean(scores_deque)))
            break

    # best_frames / latest_frames are still None when no episode was played
    gif = [Image.fromarray(image, "RGB") for image in (best_frames or [])]
    last_gif = [Image.fromarray(image, "RGB") for image in (latest_frames or [])]
    print("Done!")

    return scores,gif,last_gif


def reinforce_naive_baseline(policy, optimizer, seed,n_episodes=1000, max_t=1000, gamma=1.0, print_every=100, solved_score=195.0):
    '''
    Train a policy with REINFORCE using reward-to-go weights and a naive baseline,
    on the module-level `env`.

    The log-probability of step i is weighted by
    (sum of discounted rewards from step i onwards - mean reward) / max(std of rewards, 1e-2),
    using the values returned by `compute_returns_naive_baseline`. One optimizer
    step is taken per episode.

    Args:
        policy: a pytorch model whose `act(state)` returns (action, log_prob)
        optimizer: a pytorch optimizer
        seed: (int) seed applied to torch, numpy, `random` and `env`
        n_episodes: (int) number of episodes to train for
        max_t: (int) max time steps per episode
        gamma: (float, [0,1]) discount factor
        print_every: (int) number of episodes between print of update
        solved_score: (float or None) training stops early once the mean score of
            the last 100 episodes reaches this value; None disables the check
    
    Ret:
        scores: list of total rewards per episode
        gif: a list of images corresponding to the best scoring episode
        last_gif: a list of images corresponding to the last episode played
    '''
    print("Starting Reinforce")

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    env.seed(seed)

    best_frames = None
    latest_frames = None
    best_score = None
    scores_deque = deque(maxlen=100)
    scores = []
    # Episodes are numbered from 1; the upper bound is inclusive so that exactly
    # n_episodes episodes are played.
    for e in range(1, n_episodes + 1):
        frames = []
        saved_log_probs = []
        rewards = []
        state = env.reset()
        # Collect trajectory
        for t in range(max_t):
            # Sample the action from current policy
            action, log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state, reward, done, _ = env.step(action)
            # Frames come from the observations returned by env.step
            frames.append(state["pixel"])
            rewards.append(reward)
            if done:
                break

        # Undiscounted episode score
        score = sum(rewards)
        scores_deque.append(score)
        scores.append(score)

        # Keep track of the frames of the best scoring and the latest episode
        latest_frames = frames
        if best_score is None or score > best_score:
            best_score = score
            best_frames = frames

        # Nothing to learn from if the episode had no steps (max_t == 0)
        if saved_log_probs:
            G,G_mean,G_std = compute_returns_naive_baseline(rewards, gamma)

            # Reward-to-go for every step in one pass: a reversed cumulative sum
            # gives sum(G[i:]) for all i without re-summing the tail at each step.
            rewards_to_go = np.cumsum(np.asarray(G, dtype=float)[::-1])[::-1]
            # Floor the divisor so near-constant rewards don't blow up the weights
            denominator = max(G_std,1e-2)
            advantages = np.ascontiguousarray((rewards_to_go - G_mean) / denominator)

            # One row per step so the per-step weight broadcasts over that row
            log_probs = torch.stack(saved_log_probs).reshape(len(saved_log_probs), -1)
            weights = torch.as_tensor(advantages, dtype=log_probs.dtype, device=log_probs.device)
            # Optimizers minimise, so the loss is the negated policy-gradient objective
            policy_loss = -(log_probs * weights.unsqueeze(1)).sum()

            # Backpropagation
            optimizer.zero_grad()
            policy_loss.backward()
            optimizer.step()

        if e % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(e, np.mean(scores_deque)))
        if solved_score is not None and np.mean(scores_deque) >= solved_score:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(e - 100, np.mean(scores_deque)))
            break

    # best_frames / latest_frames are still None when no episode was played
    gif = [Image.fromarray(image, "RGB") for image in (best_frames or [])]
    last_gif = [Image.fromarray(image, "RGB") for image in (latest_frames or [])]
    print("Done!")

    return scores,gif,last_gif

# Run

In [None]:

# Experiment: CropPolicy trained with plain REINFORCE, one run per randomly drawn seed.
seeds = np.random.randint(1000, size=5)
for iteration, seed in enumerate(seeds):
    policy = CropPolicy(env).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    scores,gif,last_gif = reinforce(policy, optimizer,int(seed), n_episodes=1000,max_t=10000,print_every=5)
    # Experiment-specific prefix (with the seed recorded in it) so that the other
    # experiments' runs do not write to the same file names.
    run_prefix = 'reinforce_crop_{}_seed{}'.format(iteration, int(seed))
    PATH = run_prefix + '.pth'
    save_policy_model(policy,PATH)
    # Give each gif its own explicit file name rather than relying on save_gif's
    # timestamped default.
    save_gif(gif, run_prefix + '_best.gif')
    save_gif(last_gif, run_prefix + '_last.gif')
    # save_scores appends "scores.txt" to the prefix it is given
    save_scores(scores, run_prefix + '_')

In [None]:

# Experiment: CropPolicy trained with REINFORCE + naive baseline, one run per randomly drawn seed.
seeds = np.random.randint(1000, size=5)
for iteration, seed in enumerate(seeds):
    policy = CropPolicy(env).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    scores,gif,last_gif = reinforce_naive_baseline(policy, optimizer,int(seed), n_episodes=1000,max_t=10000,print_every=5)
    # Experiment-specific prefix (with the seed recorded in it) so that the other
    # experiments' runs do not write to the same file names.
    run_prefix = 'reinforce_baseline_crop_{}_seed{}'.format(iteration, int(seed))
    PATH = run_prefix + '.pth'
    save_policy_model(policy,PATH)
    # Give each gif its own explicit file name rather than relying on save_gif's
    # timestamped default.
    save_gif(gif, run_prefix + '_best.gif')
    save_gif(last_gif, run_prefix + '_last.gif')
    # save_scores appends "scores.txt" to the prefix it is given
    save_scores(scores, run_prefix + '_')

In [None]:

# Experiment: FullViewPolicy trained with REINFORCE + naive baseline, one run per randomly drawn seed.
seeds = np.random.randint(1000, size=5)
for iteration, seed in enumerate(seeds):
    policy = FullViewPolicy(env).to(device)
    optimizer = optim.Adam(policy.parameters(), lr=1e-2)
    scores,gif,last_gif = reinforce_naive_baseline(policy, optimizer,int(seed), n_episodes=1000,max_t=10000,print_every=5)
    # Experiment-specific prefix (with the seed recorded in it) so that the other
    # experiments' runs do not write to the same file names.
    run_prefix = 'reinforce_baseline_fullview_{}_seed{}'.format(iteration, int(seed))
    PATH = run_prefix + '.pth'
    save_policy_model(policy,PATH)
    # Give each gif its own explicit file name rather than relying on save_gif's
    # timestamped default.
    save_gif(gif, run_prefix + '_best.gif')
    save_gif(last_gif, run_prefix + '_last.gif')
    # save_scores appends "scores.txt" to the prefix it is given
    save_scores(scores, run_prefix + '_')

In [46]:
# Re-save the gifs of the most recent run. Each gets its own explicit file name
# rather than relying on save_gif's timestamped default.
save_gif(gif, 'best_episode.gif')
save_gif(last_gif, 'last_episode.gif')

Saved Video
Saved Video


In [44]:
import pathlib
# Display the absolute path of the current working directory.
pathlib.Path().resolve()

PosixPath('/home/evan/RL_Assignment/repo')