In [1]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import tqdm

In [2]:
# Pick the GPU when PyTorch reports one, otherwise fall back to the CPU.
# NOTE(review): the visible cells never move the policy or the batches to
# `device`, so everything shown here effectively runs on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

### Behavioral cloning (BC) policy trained with a negative log-likelihood loss

In [3]:
class MLP(nn.Module):
    """Two-stage ReLU feature extractor.

    Inputs of width ``input_dim`` pass through two ``Linear -> ReLU`` stages,
    each ``size`` units wide (32 by default); the activated output of the
    second stage is returned.
    """

    def __init__(self, input_dim, size=32):
        super().__init__()
        # Everything lives in a single Sequential called ``net`` so that the
        # parameter names stay net.0.* and net.2.*.
        stages = [
            nn.Linear(input_dim, size),
            nn.ReLU(),
            nn.Linear(size, size),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Return the hidden features ``self.net`` produces for ``x``."""
        return self.net(x)
 
class GaussianPolicy(MLP):
    """Diagonal Gaussian policy built on the ``MLP`` trunk.

    The inherited ``self.net`` turns a state into hidden features; two linear
    heads map those features to the mean and the log standard deviation of
    the action distribution.

    Args:
        input_dim: Width of the state vector.
        output_dim: Number of action dimensions.
        hidden_size: Width of the trunk and of the inputs to both heads
            (default 64).
    """

    # Bounds applied to the predicted log-std before exponentiation. Without
    # them exp() can underflow to 0 or overflow to inf, which makes
    # Normal(mean, std) invalid and log_prob non-finite.
    LOG_STD_MIN = -20.0
    LOG_STD_MAX = 2.0

    def __init__(self, input_dim, output_dim, hidden_size=64):
        super().__init__(input_dim, hidden_size)
        self.mean = nn.Linear(hidden_size, output_dim)
        self.log_std_layer = nn.Linear(hidden_size, output_dim)

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``.

        ``std`` is ``exp(log_std)`` with ``log_std`` clamped to
        ``[LOG_STD_MIN, LOG_STD_MAX]``, so it is always finite and strictly
        positive.
        """
        x = self.net(state)
        mean = self.mean(x)
        log_std = self.log_std_layer(x)  # Predict log_std using a linear layer
        # Keep the exponent in a numerically safe range.
        log_std = torch.clamp(log_std, self.LOG_STD_MIN, self.LOG_STD_MAX)
        std = torch.exp(log_std)

        return mean, std

In [4]:
# Problem dimensions and training hyperparameters.
input_dim = 3          # width of one state vector
output_dim = 1         # number of action dimensions
hidden_size = 64
batch_size = 64
num_epochs = 1_000
learning_rate = 1e-4
seed = 0

# Seed the random number generators before any weights are created so that
# parameter initialisation and DataLoader shuffling are reproducible after
# Restart & Run All.
np.random.seed(seed)
torch.manual_seed(seed)

# Initialize policy and optimizer
policy = GaussianPolicy(input_dim, output_dim, hidden_size)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

In [5]:
# Gym environment id, and the pickled expert demonstrations to clone from.
# The path is relative to the notebook's working directory; the previous
# machine-specific absolute path was overwritten immediately and is removed.
env_name = 'Pendulum-v1'
data_path = "expert_data/Pendulum-v1_50_-149.pkl"

In [6]:
# Read the pickled expert demonstrations from disk.
# NOTE: pickle.load can execute arbitrary code; only open trusted files.
with open(data_path, "rb") as fh:
    data_good = pickle.load(fh)
print('expert data loaded')

# Keep only the first 20 trajectories.
data_good = data_good[:20]

# Every trajectory unpacks into exactly three parts. The first two are kept as
# observations and actions; the third (presumably rewards) is not used here.
good_obs, good_acts = [], []
for obs_seq, act_seq, _unused_seq in data_good:
    good_obs.append(obs_seq)
    good_acts.append(act_seq)

# Stack the per-trajectory arrays row-wise into one dataset.
states = np.vstack(good_obs)
actions = np.vstack(good_acts)
print('X:', states.shape, ' y:', actions.shape)

expert data loaded
X: (4000, 3)  y: (4000, 1)


In [7]:
# Pair every state row with its action row and serve them in shuffled
# mini-batches. The batch size comes from the `batch_size` config value
# instead of a second hard-coded 64.
data_loader = torch.utils.data.DataLoader(
    list(zip(states, actions)), batch_size=batch_size, shuffle=True
)

# Peek at one batch under its own names. Unpacking into `states` / `actions`
# would overwrite the full dataset, and re-running this cell would then build
# the loader from a single batch.
batch = next(iter(data_loader))
batch_states, batch_actions = batch
batch_states.shape, batch_actions.shape

(torch.Size([64, 3]), torch.Size([64, 1]))

In [8]:
# Log roughly 20 times over the run. max(1, ...) keeps the interval valid
# when num_epochs < 20, where num_epochs // 20 would be 0 and the modulo
# below would raise ZeroDivisionError.
log_every = max(1, num_epochs // 20)

for epoch in range(num_epochs):
    total_loss = 0.0
    n_batches = 0
    # Loop variables have their own names so the dataset-level
    # `states` / `actions` are not overwritten by the last mini-batch.
    for batch_states, batch_actions in data_loader:

        # Forward pass: negative log-likelihood of the expert actions under
        # the Gaussian predicted by the policy.
        means, stds = policy(batch_states)
        dist = Normal(means, stds)
        log_probs = dist.log_prob(batch_actions).sum(dim=-1)
        loss = -log_probs.mean()

        total_loss += loss.item()
        n_batches += 1

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % log_every == 0:
        # Report NaN instead of dividing by zero when the loader is empty.
        mean_loss = total_loss / n_batches if n_batches else float('nan')
        print(f'Epoch {epoch} Loss: {mean_loss:.3f}')


Epoch 0 Loss: 1.063
Epoch 50 Loss: 0.451
Epoch 100 Loss: -0.527
Epoch 150 Loss: -0.687
Epoch 200 Loss: -0.792
Epoch 250 Loss: -0.892
Epoch 300 Loss: -0.995
Epoch 350 Loss: -1.104
Epoch 400 Loss: -1.229
Epoch 450 Loss: -1.339
Epoch 500 Loss: -1.485
Epoch 550 Loss: -1.659
Epoch 600 Loss: -1.837
Epoch 650 Loss: -2.045
Epoch 700 Loss: -2.282
Epoch 750 Loss: -2.648
Epoch 800 Loss: -3.045
Epoch 850 Loss: -3.358
Epoch 900 Loss: -3.424
Epoch 950 Loss: -3.642


In [9]:
# Build the environment named by `env_name` and reset it; reset() returns the
# initial observation together with an info object. The observation is
# displayed as the cell output.
env = gym.make(env_name)
obs,info = env.reset()
obs

array([ 0.634646  ,  0.77280295, -0.6687829 ], dtype=float32)

In [10]:
def play(env, policy, is_close=True, is_render=True, max_step=1000):
    """Roll out one episode, acting with the mean predicted by the policy.

    Args:
        env: Environment exposing ``reset() -> (obs, info)``,
            ``step(action) -> (obs, reward, done, trunc, info)`` and
            ``close()``.
        policy: Callable taking a ``(1, obs_dim)`` float tensor and returning
            ``(means, stds)``; only ``means[0][0]`` is used as the action.
        is_close: Close ``env`` once the episode is over.
        is_render: Also causes ``env`` to be closed at the end; this function
            does no rendering itself.
        max_step: Upper bound on the number of environment steps, so the
            rollout ends even if the environment never reports done/truncated.

    Returns:
        dict with ``'reward'`` (sum of step rewards) and ``'step'`` (number of
        environment steps taken, minus one).
    """
    obs, info = env.reset()
    total_r = 0
    step = 0
    while step < max_step:
        step += 1
        states = torch.tensor(obs[None], dtype=torch.float)
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            means, stds = policy(states)
        action = means[0][0].detach().numpy()

        obs, rewards, done, trunc, info = env.step([action])
        total_r += rewards
        if done or trunc:
            break
    if is_close or is_render:
        env.close()
    return {'reward': total_r, 'step': step - 1}

In [11]:
# Run a single evaluation episode in a freshly created environment. With
# is_close=True, play() closes the environment when the episode ends. The
# returned stats dict is displayed as the cell output.
env=gym.make(env_name)
stats=play(env, policy, is_close=True, is_render=False)
stats

{'reward': -128.05186544935088, 'step': 199}

In [12]:
n_trajectory = 20

# Evaluate on a fresh environment that stays open for all episodes. Passing
# is_close=True on every call would close the environment after the first
# episode and then keep stepping a closed environment.
env = gym.make(env_name)
scores = []
try:
    for i in range(n_trajectory):
        stats = play(env, policy, is_close=False, is_render=False)
        rewards = stats['reward']
        print(f'episode #{i} reward: {rewards:0.2f}')
        scores.append(rewards)
finally:
    # Close exactly once, even if an episode raises.
    env.close()

print(f'\n score: {np.mean(scores):0.2f} +- {np.std(scores):0.2f}')

episode #0 reward: -122.40
episode #1 reward: -120.88
episode #2 reward: -1.65
episode #3 reward: -386.62
episode #4 reward: -117.18
episode #5 reward: -120.13
episode #6 reward: -357.32
episode #7 reward: -120.29
episode #8 reward: -120.05
episode #9 reward: -119.19
episode #10 reward: -122.21
episode #11 reward: -398.29
episode #12 reward: -124.01
episode #13 reward: -125.03
episode #14 reward: -128.00
episode #15 reward: -3.19
episode #16 reward: -1.62
episode #17 reward: -245.85
episode #18 reward: -120.76
episode #19 reward: -121.78

 score: -148.82 +- 110.88
