# Demo of the MAGICAL benchmark suite for robust IL

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/qxcv/magical/blob/pyglet1.5/demo-notebook.ipynb)

This self-contained Colab notebook shows how to train a simple imitation learning agent on MAGICAL using behavioural cloning (BC).

## Setup code

This does a few things:

- Installs `xvfb` so that MAGICAL has access to an X server.
- Installs all the Python dependencies for MAGICAL, as well as a copy of the [imitation](https://github.com/HumanCompatibleAI/imitation) library.
- Downloads demonstrations for MAGICAL.

These setup steps will take a few minutes to complete.

In [None]:
# Install MAGICAL, Xvfb, and a prerelease version of the 'imitation' library (https://github.com/HumanCompatibleAI/imitation)
!sudo DEBIAN_FRONTEND=noninteractive apt-get install -yq xvfb
# The pip install commands can give errors of the form "package W requires version X of package Y, but you'll have
# version Z which is incompatible". You can safely ignore those errors; I suspect they are conflicts in the default
# Colab environment.
!pip uninstall -qy torch torchvision
!pip install -q torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html
!pip install -q 'magical-il' 'scikit-video~=1.1.11' 'xvfbwrapper~=0.2.9' 'git+git://github.com/HumanCompatibleAI/imitation@556f5d8384d99fa5ab8bc54a9828887a2db8c669#egg=imitation'
if 'vdisplay' not in globals():
    # start a virtual X display for MAGICAL rendering
    import xvfbwrapper
    vdisplay = xvfbwrapper.Xvfb()
    vdisplay.start()

In [None]:
import base64
import glob
import logging
import os
import tempfile

import gym
from imitation.algorithms.bc import BC
import imitation.augment as il_augment
from imitation.data import rollout
import imitation.data.types as il_types
from imitation.util.util import make_vec_env
from IPython import display
import numpy as np
import skvideo.io as vidio
import stable_baselines3.common.policies as sb3_pols
import torch
from torch import nn
import torch.utils.data as th_data

import magical
from magical.evaluation import EvaluationProtocol

# Register MAGICAL's environments (presumably with Gym, so that the gym.make() call further down can find them).
magical.register_envs()
# Show INFO-level log messages in the notebook output.
logging.basicConfig(level=logging.INFO)
# Download the demonstration trajectories into ./demos; the glob patterns used later read from this directory.
magical.try_download_demos(dest="demos")

In [None]:
class MAGICALNet(nn.Module):
    """Custom CNN feature extractor for MAGICAL policies.

    The network is a stack of five Conv2d -> ReLU -> BatchNorm2d blocks followed by a flatten, a single
    fully-connected layer and a ReLU. It maps a batch of channels-first image observations to a batch of
    `out_chans`-dimensional feature vectors.

    Args:
        observation_space: object with a `.shape` attribute giving the (channels, height, width) shape of one
            observation; `shape[0]` is used as the number of input channels.
        out_chans: size of the output feature vector (also exposed as `self.features_dim`).
        width: multiplier applied to the number of channels in every conv layer.
    """
    def __init__(self, observation_space, out_chans=256, width=2):
        super().__init__()
        w = width

        def conv_block(i, o, k, s, p, b=False):
            """Return the layers of one Conv2d -> ReLU -> BatchNorm2d block."""
            return [
                # batch norm has its own bias, so don't add one to conv layers by default
                nn.Conv2d(i, o, kernel_size=k, stride=s, padding=p, bias=b,
                          padding_mode='zeros'),
                nn.ReLU(),
                nn.BatchNorm2d(o)
            ]
        conv_layers = [
            *conv_block(i=observation_space.shape[0], o=32*w, k=5, s=1, p=2, b=True),
            *conv_block(i=32*w, o=64*w, k=3, s=2, p=1),
            *conv_block(i=64*w, o=64*w, k=3, s=2, p=1),
            *conv_block(i=64*w, o=64*w, k=3, s=2, p=1),
            *conv_block(i=64*w, o=64*w, k=3, s=2, p=1),
        ]
        # Work out the flattened size of the conv output by pushing a dummy observation through the conv stack.
        # This is done in eval mode and without autograd so that the dummy (all-zeros) batch does not leak into the
        # BatchNorm running statistics or build a throwaway autograd graph. The temporary Sequential is a local
        # only, so it does not add any entries to this module's state dict.
        shape_probe = nn.Sequential(*conv_layers)
        shape_probe.eval()
        with torch.no_grad():
            probe_out = shape_probe(torch.zeros((1,) + tuple(observation_space.shape)))
        # restore the default training mode on the layers before they are used for real
        shape_probe.train()
        # plain Python int (rather than a NumPy integer) for nn.Linear; the batch dimension is 1 and is skipped
        fc_in_size = int(np.prod(probe_out.shape[1:]))
        # final FC layer to make feature maps the right size
        reduction_layers = [
            nn.Flatten(),
            nn.Linear(fc_in_size, out_chans),
            # Stable Baselines will add extra affine layer on top of this reLU
            nn.ReLU(),
        ]
        self.features_dim = out_chans
        all_layers = [*conv_layers, *reduction_layers]
        self.feature_generator = nn.Sequential(*all_layers)

    def forward(self, x, traj_info=None):
        """Return features for the observation batch `x`; `traj_info` is accepted but ignored."""
        return self.feature_generator(x)

class ImitationEvaluationProtocol(EvaluationProtocol):
    """Evaluation protocol that samples its trajectories with the `imitation` library.

    EvaluationProtocol is an abstract base class which is able to evaluate a MAGICAL policy on a set of test
    environments and calculate appropriate confidence intervals & other statistics for the mean score in each
    environment. Concrete instances of EvaluationProtocol must provide their own method for generating trajectories,
    and also provide a name for the resulting evaluation data (which will be written into the Pandas dataframe used to
    compute statistics).

    This subclass of EvaluationProtocol uses the `imitation` library to generate the required trajectories.

    Args:
        policy: the policy to roll out in each environment.
        run_description: string returned by `run_id` to label the results of this run.
        **kwargs: forwarded unchanged to `EvaluationProtocol.__init__`.
    """
    def __init__(self, policy, run_description, **kwargs):
        super().__init__(**kwargs)
        self.policy = policy
        self.run_description = run_description

    @property
    def run_id(self):
        """Simple string describing this run."""
        return self.run_description

    def obtain_scores(self, env_name):
        """Roll the policy out on `env_name` and return one MAGICAL score per sampled trajectory."""
        print(f"Sampling {self.n_rollouts} trajectories on {env_name}")
        vec_env = make_vec_env(env_name=env_name, n_envs=min(25, self.n_rollouts))  # sample in parallel
        try:
            trajectories = rollout.generate_trajectories(self.policy,
                                                         vec_env,
                                                         sample_until=rollout.min_episodes(self.n_rollouts),
                                                         deterministic_policy=False)
        finally:
            # always release the vectorised environments, even if sampling fails; otherwise one set of
            # environments is left open for every evaluated environment name
            vec_env.close()
        # the MAGICAL score is passed through the final info dict in each trajectory
        scores = [traj.infos[-1]['eval_score'] for traj in trajectories]
        return scores

def create_policy_video(policy, demo_env_name, traj_per_env=1, fps=24):
    """Create a video showing policy performance on the demo environment and all test environments.

    Args:
        policy: the policy to roll out.
        demo_env_name: name of the demo environment; must be a key of `magical.DEMO_ENVS_TO_TEST_ENVS_MAP`, which
            supplies the matching test environments.
        traj_per_env: minimum number of trajectories to sample (and append to the video) per environment.
        fps: frame rate of the encoded video.

    Returns:
        An `IPython.display.HTML` object holding a <video> tag with the MP4 embedded as a base64 data URI.

    Raises:
        ValueError: if the rollouts produced no frames at all, so there is nothing to show.
    """
    with tempfile.NamedTemporaryFile(suffix=".mp4") as fp:
        writer = vidio.FFmpegWriter(fp.name, outputdict={'-r': str(fps), '-vcodec': 'libx264', '-pix_fmt': 'yuv420p'})
        # frame dimensions; filled in from the frames as they are written
        vid_h = vid_w = None
        try:
            # for both demo environment + test environments, we append `traj_per_env` demos to the video
            env_name_list = [demo_env_name, *magical.DEMO_ENVS_TO_TEST_ENVS_MAP[demo_env_name]]
            for env_name in env_name_list:
                vec_env = make_vec_env(env_name=env_name, n_envs=min(traj_per_env, 25))
                try:
                    trajectories = rollout.generate_trajectories(policy, vec_env,
                                                                 sample_until=rollout.min_episodes(traj_per_env))
                finally:
                    # release the environments even if sampling fails
                    vec_env.close()
                for traj in trajectories:
                    for obs in traj.obs:
                        # each observation is a frame stack; we write only the last (RGB) frame, transposed to be
                        # channels-last
                        rgb_frame = np.transpose(obs[-3:], (1, 2, 0))
                        vid_h, vid_w = rgb_frame.shape[:2]
                        writer.writeFrame(rgb_frame)
        finally:
            # finish writing video (also on error, so that the writer is not left open)
            writer.close()

        if vid_h is None:
            raise ValueError(f"no frames were generated for '{demo_env_name}' (traj_per_env={traj_per_env})")

        # now convert video to base64 so we can generate a <video> tag that works with the notebook
        vid_base64 = base64.b64encode(fp.read()).decode('utf-8')
        print('Video size (MB):', len(vid_base64) / 1e6)
        html_string = f"""<video width="{vid_w}" height="{vid_h}" muted controls loop autoplay>
            <source src="data:video/mp4;base64,{vid_base64}" type="video/mp4">
            No &lt;video&gt; tag support :(
        </video>"""
        return display.HTML(data=html_string)

## Running MAGICAL

In [None]:
# Name of the MAGICAL task to train on; it is used to look up demonstration files and to build the Gym env name.
env_ident = 'MoveToCorner'
# Name of the observation preprocessor; it is inserted into the Gym env name and is also passed to the demo
# preprocessing step, so that demonstrations and environment observations are processed the same way.
preproc_name = 'LoResCHW4E'

In [None]:
# Demonstration files for each supported task. glob returns paths in arbitrary order, so the results are sorted:
# that way any later slice of `demo_paths` selects the same demonstrations on every run.
demo_paths_by_env = {
    'MoveToCorner': sorted(glob.glob('demos/move-to-corner/demo-*.pkl.gz')),
}
demo_paths = demo_paths_by_env[env_ident]
if not demo_paths:
    # fail early with a clear message instead of an obscure error further down when no demos are on disk
    raise FileNotFoundError(f"no demonstration files found for '{env_ident}'; did the demo download succeed?")
# Gym env name with preprocessor
env_name = f'{env_ident}-Demo-{preproc_name}-v0'

In [None]:
# Instantiate the environment; its observation and action spaces are used when building the BC trainer.
env = gym.make(env_name)
# Only the first 10 demonstration files are loaded.
demo_dicts = magical.load_demos(demo_paths[:10])
demo_trajs = []
orig_env_name = None  # we will read this from the demos dicts
for demo_dict in demo_dicts:
    # each demo dict has keys ['trajectory', 'score', 'env_name']
    # (trajectory contains the actual data, and score is generally 1.0 for demonstrations)
    # The name from the last demo wins; this presumably relies on all demos coming from the same environment.
    orig_env_name = demo_dict['env_name']
    demo_trajs.append(demo_dict['trajectory'])
# Apply the preprocessor named by `preproc_name` to the raw demonstration trajectories.
demo_trajs_preproc = magical.preprocess_demos_with_wrapper(demo_trajs, orig_env_name, preproc_name=preproc_name)

In [None]:
# Build dataset in the format required by imitation. Note that traj.obs contains the final observation after the last
# action, so we drop the last observation when concatenating trajectories.
all_obs = np.concatenate([traj.obs[:-1] for traj in demo_trajs_preproc], axis=0)
all_acts = np.concatenate([traj.acts for traj in demo_trajs_preproc], axis=0)
# `infos` is filled with placeholders: one empty dict per observation (the list repeats the same dict object).
dataset = il_types.TransitionsMinimal(obs=all_obs, acts=all_acts, infos=[{}] * len(all_obs))
# Shuffled minibatches of 32 transitions, collated with imitation's own collate function.
data_loader = th_data.DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=il_types.transitions_collate_fn)
# Augmentations to use during BC training, selected by the comma-separated spec string.
augmenter = il_augment.StandardAugmentations.from_string_spec(
       'rotate,translate,noise', stack_color_space=il_augment.ColorSpace.RGB)
# Behavioural cloning trainer: an actor-critic CNN policy that uses MAGICALNet as its feature extractor.
bc_trainer = BC(
    observation_space=env.observation_space,
    action_space=env.action_space,
    policy_class=sb3_pols.ActorCriticCnnPolicy,
    policy_kwargs=dict(features_extractor_class=MAGICALNet),
    expert_data=data_loader,
    augmentation_fn=augmenter,
    # use the GPU when one is available, otherwise fall back to the CPU
    device='cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Train the BC policy for 500 batches, which keeps the demo short.
# Try training for longer (e.g. 15,000 batches) to get better performance.
bc_trainer.train(n_batches=500)

## Evaluating the policy and rendering a video

In [None]:
# Evaluate the trained policy using the ImitationEvaluationProtocol class defined earlier in this notebook.
eval_protocol = ImitationEvaluationProtocol(
    policy=bc_trainer.policy,
    run_description=f"notebook-demo-{env_name}",
    demo_env_name=env_name,
    # number of rollouts per environment
    # (small so rollouts are fast)
    n_rollouts=15)
eval_result = eval_protocol.do_eval(verbose=True)
# leaving the result as the cell's last expression makes the notebook display it
eval_result

In [None]:
# Render rollouts of the trained policy (one trajectory per environment by default) and embed the video below.
video = create_policy_video(bc_trainer.policy, env_name)
display.display(video)