### Installation for Colab
To run in Colab, uncomment and run the following cell.

In [None]:
# !pip install mujoco

In [2]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np 
import time 
import numpy as np
import torch
import torch.nn as nn
import time
import gym 
import pickle  
import argparse
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions.transformed_distribution import TransformedDistribution
from torch.distributions.transforms import TanhTransform
from torch.optim import Adam
import tqdm
import imageio
from torch.utils.data import DataLoader 

In [3]:
# Prefer the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): in the cells visible here neither the model nor any tensor is
# ever moved to `device` — confirm whether training on the CPU is intended.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
# Gym environment id; play_an_episode passes it to gym.make() for evaluation rollouts.
env_name='Ant-v3' 

In [5]:
# Alternative expert dataset, kept for reference:
# data_path = "expert_data/Ant-v3_10_3765.pkl"  
data_path = "expert_data/Ant-v3_10_5061.pkl"

# SECURITY: pickle.load can execute arbitrary code embedded in the file;
# only load expert data that comes from a trusted source.
with open(data_path, "rb") as f:
    # Each entry is later unpacked as a 3-tuple `s, a, r`
    # (presumably states, actions, rewards of one trajectory — TODO confirm).
    data_good = pickle.load(f)
print('expert data loaded')

# Uncomment to keep only the first 10 entries:
# data_good = data_good[:10]

expert data loaded


In [6]:
# Every entry of data_good must be exactly a 3-tuple; the third element is
# ignored here. Per-trajectory arrays are concatenated along the first axis,
# giving one row per expert sample.
good_obs = np.vstack([traj_obs for traj_obs, _, _ in data_good])
good_acts = np.vstack([traj_acts for _, traj_acts, _ in data_good])

In [7]:
good_obs.shape, good_acts.shape

((9731, 111), (9731, 8))

In [8]:
# Pair every expert observation with its action; each pair is one training sample.
bc_samples = list(zip(good_obs, good_acts))
data_loader = DataLoader(dataset=bc_samples, shuffle=True, batch_size=64)

# Pull a single mini-batch to sanity-check the collated tensor shapes.
batch = next(iter(data_loader))
states, actions = batch
(states.shape, actions.shape)

(torch.Size([64, 111]), torch.Size([64, 8]))

In [9]:
# Feature widths are read off the sample batch: axis 1 of each tensor.
state_dim, action_dim = states.size(1), actions.size(1)
print(state_dim, action_dim)

111 8


In [10]:
class MLP(nn.Module):
    """Feature extractor: Linear -> ReLU -> Linear -> ReLU.

    Args:
        input_dim: Width of the input feature vector.
        size: Width of both hidden layers (and of the returned features).
    """

    def __init__(self, input_dim, size=32):
        super().__init__()
        # Layers are created in forward order and stored under `self.net`,
        # so parameter names stay `net.0.*` and `net.2.*`.
        hidden_layers = [
            nn.Linear(input_dim, size),
            nn.ReLU(),
            nn.Linear(size, size),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*hidden_layers)

    def forward(self, x):
        """Return the hidden features produced by `self.net` for input `x`."""
        return self.net(x)
    
class RegNet(MLP):
    """Regression network: the MLP feature extractor followed by a linear head.

    Args:
        input_dim: Width of the input feature vector.
        size: Hidden width of the inherited MLP trunk.
        action_dim: Width of the regression output.
    """

    def __init__(self, input_dim , size, action_dim):
        super().__init__(input_dim, size)
        # Linear read-out from the hidden features to the output vector.
        self.decoder = nn.Linear(size, action_dim)

    def forward(self, x):
        """Map input `x` through the trunk, then the linear decoder."""
        hidden = self.net(x)
        return self.decoder(hidden)

In [11]:
# Optimisation hyper-parameter for the behaviour-cloning regressor.
learning_rate = 1e-4

# Policy network: state vector in, action vector out, hidden width 64.
bc = RegNet(input_dim=state_dim, size=64, action_dim=action_dim)
# Mean-squared error between predicted and expert actions.
criterion = nn.MSELoss()
optimizer = Adam(params=bc.parameters(), lr=learning_rate)

In [12]:
loss_list = []   # loss of every mini-batch, appended after each optimizer step
test_loss = []   # not populated in this cell
n_epoch = 1_000

# Log roughly 20 times over the run. The max() guard keeps the modulo below
# from dividing by zero when n_epoch < 20.
log_every = max(1, n_epoch // 20)

# The range is inclusive of n_epoch so that the final epoch is also logged.
for itr in range(0, n_epoch+1):
    total_loss = 0
    b = 0
    for batch_states, batch_actions in data_loader:
        y_pred = bc(batch_states.float())
        # Cast the targets the same way as the inputs so that prediction and
        # target always share a dtype, whatever dtype the stored actions have.
        loss = criterion(y_pred, batch_actions.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        total_loss += batch_loss
        loss_list.append(batch_loss)
        b += 1

    if itr % log_every == 0:
        # Report the mean mini-batch loss of this epoch.
        print(f'Epoch {itr} Loss: {total_loss/b:.4f}')

Epoch 0 Loss: 0.2223
Epoch 50 Loss: 0.0422
Epoch 100 Loss: 0.0334
Epoch 150 Loss: 0.0286
Epoch 200 Loss: 0.0256
Epoch 250 Loss: 0.0238
Epoch 300 Loss: 0.0222
Epoch 350 Loss: 0.0210
Epoch 400 Loss: 0.0200
Epoch 450 Loss: 0.0192
Epoch 500 Loss: 0.0185
Epoch 550 Loss: 0.0179
Epoch 600 Loss: 0.0173
Epoch 650 Loss: 0.0169
Epoch 700 Loss: 0.0165
Epoch 750 Loss: 0.0163
Epoch 800 Loss: 0.0158
Epoch 850 Loss: 0.0155
Epoch 900 Loss: 0.0152
Epoch 950 Loss: 0.0151
Epoch 1000 Loss: 0.0147


In [15]:
def play_an_episode(env_name, policy, video_path=None, max_steps=1000):
    """Roll out `policy` for one episode of `env_name` and return summary stats.

    Args:
        env_name: Environment id passed to `gym.make`.
        policy: Callable that receives the current observation as a float
            tensor with a leading batch axis of size 1 and returns an action
            tensor; the action is flattened before being given to `env.step`.
        video_path: If not None, the environment is created with
            `render_mode='rgb_array'` and every rendered frame is written to
            this path through imageio at 20 fps.
        max_steps: Upper bound on the number of environment steps.

    Returns:
        dict with keys
            'reward': sum of the per-step rewards collected in the episode;
            'step':   number of environment steps taken, minus one.
    """
    record_video = video_path is not None
    if record_video:
        print(f'Saving video to {video_path}')
        env = gym.make(env_name, render_mode='rgb_array')
    else:
        env = gym.make(env_name)

    video_writer = None
    rewards = 0
    step = 0
    try:
        if record_video:
            video_writer = imageio.get_writer(video_path, fps=20)

        obs, _ = env.reset()
        for _ in range(max_steps):
            step += 1
            obs = torch.Tensor(obs[None])  # add the batch axis expected by the policy
            # Pure inference: no autograd graph is needed for a rollout.
            with torch.no_grad():
                ac = policy(obs)
            action = ac.cpu().detach().numpy()

            obs, reward, done, trunc, info = env.step(action.ravel())

            if video_writer is not None:
                video_writer.append_data(env.render())

            rewards += reward
            if done or trunc:
                break
    finally:
        # Always release the writer and the simulator, even if the rollout
        # raised part-way through; otherwise repeated calls leak environments.
        if video_writer is not None:
            video_writer.close()
        env.close()

    return {'reward': rewards, 'step': step-1}

In [16]:
# Evaluate the cloned policy over several independent rollouts and report
# the mean and standard deviation of the episode returns.
n_trajectory = 20
scores = []
for i in range(n_trajectory):
    stats = play_an_episode(env_name, bc)
    rewards = stats['reward']
    scores.append(rewards)
    print(f'episode #{i} reward: {rewards:0.2f}')

print(f'\n score: {np.mean(scores):0.2f} +- {np.std(scores):0.2f}')

episode #0 reward: 4791.97
episode #1 reward: 4694.51
episode #2 reward: 5023.59
episode #3 reward: 3389.28
episode #4 reward: 5069.82
episode #5 reward: -147.55
episode #6 reward: 5018.38
episode #7 reward: 4931.14
episode #8 reward: 5141.59
episode #9 reward: 5136.07
episode #10 reward: 4430.07
episode #11 reward: 5182.24
episode #12 reward: 657.75
episode #13 reward: 4979.11
episode #14 reward: 990.90
episode #15 reward: 4859.99
episode #16 reward: -4251.33
episode #17 reward: 5356.54
episode #18 reward: 4914.20
episode #19 reward: 5000.95

 score: 3758.46 +- 2444.14


### Let's render and save a video using the learned policy.

In [17]:
stats=play_an_episode(env_name, bc, video_path='bc_ant.mp4')
stats 

Saving video to bc_ant.mp4
Found 3 GPUs for rendering. Using device 0.


{'reward': 5268.903145980556, 'step': 999}

In [18]:
from IPython.display import HTML
from base64 import b64encode

video_path = 'bc_ant.mp4'

# Read the file inside a context manager so the handle is closed right away
# instead of being left open for the garbage collector.
with open(video_path, "rb") as video_file:
    mp4 = video_file.read()

# Embed the video inline as a base64 data URL so it plays inside the notebook.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML(f"""
<video width=400 controls>
      <source src="{data_url}" type="video/mp4">
</video>
""")