In [1]:
import numpy as np 
import time 
import numpy as np
import torch
import torch.nn as nn
import time
import gym 
import pickle  
import argparse
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions.transformed_distribution import TransformedDistribution
from torch.distributions.transforms import TanhTransform
from torch.optim import Adam
import tqdm

from torch.utils.data import DataLoader 

In [2]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

  return torch._C._cuda_getDeviceCount() > 0


device(type='cpu')

In [3]:
# Gym environment id passed to gym.make() for the evaluation rollouts.
env_name='Ant-v3'
# Pickle file holding the expert demonstrations that the policy is cloned from.
# NOTE(review): absolute, machine-specific path — the notebook will not run
# elsewhere unless this is edited; consider making it configurable.
data_path = "/home/ns/bc_tutorial/mujoco/expert_data/Ant-v3_10_3765.pkl"

In [4]:
# NOTE: pickle.load can execute arbitrary code embedded in the file, so only
# point data_path at demonstration files that come from a trusted source.
with open(data_path, "rb") as pkl_file:
    data_good = pickle.load(pkl_file)
print('expert data loaded')

# Keep at most the first 20 trajectories.
data_good = data_good[:20]

# Every trajectory unpacks into exactly three parts (observations, actions,
# rewards); stack each part across trajectories into one 2-D array.
good_obs = np.vstack([s for s, _, _ in data_good])
good_acts = np.vstack([a for _, a, _ in data_good])
good_rews = np.vstack([r for _, _, r in data_good])

good_obs.shape, good_acts.shape, good_rews.shape

expert data loaded


((10000, 111), (10000, 8), (10000, 1))

In [5]:
# Pair every expert observation with the action taken in it and serve the
# pairs in shuffled mini-batches of 64.
bc_pairs = list(zip(good_obs, good_acts))
data_loader = DataLoader(bc_pairs, shuffle=True, batch_size=64)

# Pull one mini-batch to inspect the tensor shapes the policy will see.
batch = next(iter(data_loader))
states, actions = batch
states.shape, actions.shape

(torch.Size([64, 111]), torch.Size([64, 8]))

In [6]:
# The second axis of the sample batch gives the policy's input / output widths.
state_dim = states.size(1)
action_dim = actions.size(1)
print(state_dim, action_dim)

111 8


In [7]:
class MLP(nn.Module):
    """Two-layer fully connected feature extractor with ReLU activations.

    Args:
        input_dim: Width of the input feature vector.
        size: Width of both hidden layers (and therefore of the output).
    """

    def __init__(self, input_dim, size=32):
        super().__init__()
        # Linear -> ReLU -> Linear -> ReLU; the output is a non-negative
        # feature vector of width `size`.
        hidden_layers = [
            nn.Linear(input_dim, size),
            nn.ReLU(),
            nn.Linear(size, size),
            nn.ReLU(),
        ]
        self.net = nn.Sequential(*hidden_layers)

    def forward(self, x):
        """Return the hidden features computed by ``self.net`` for ``x``."""
        return self.net(x)
 
class GaussianPolicy(MLP):
    """Diagonal-Gaussian policy head on top of the shared MLP trunk.

    The trunk features feed two linear heads: one predicts the per-dimension
    action mean, the other the per-dimension log standard deviation.

    Args:
        input_dim: Width of the state vector.
        output_dim: Number of action dimensions.
        hidden_size: Width of the trunk's hidden layers.
        log_std_min: Lower clamp applied to the predicted log-std.
        log_std_max: Upper clamp applied to the predicted log-std.
    """

    def __init__(self, input_dim, output_dim, hidden_size=64,
                 log_std_min=-20.0, log_std_max=2.0):
        super(GaussianPolicy, self).__init__(input_dim, hidden_size)
        self.mean = nn.Linear(hidden_size, output_dim)
        self.log_std_layer = nn.Linear(hidden_size, output_dim)
        self.log_std_min = log_std_min
        self.log_std_max = log_std_max

    def forward(self, state):
        """Return ``(mean, std)`` of the action distribution for ``state``."""
        x = self.net(state)
        mean = self.mean(x)
        # Bound the log-std before exponentiating: an unbounded value can make
        # exp() overflow to inf or underflow to 0, which turns the Gaussian
        # log-likelihood into inf/NaN and poisons training.
        log_std = torch.clamp(self.log_std_layer(x), self.log_std_min, self.log_std_max)
        std = torch.exp(log_std)

        return mean, std

In [8]:
# Step size for Adam.
learning_rate = 1e-4

# Behaviour-cloning policy (64 hidden units) and the optimizer that updates
# all of its parameters.
policy = GaussianPolicy(input_dim=state_dim, output_dim=action_dim, hidden_size=64)
optimizer = Adam(params=policy.parameters(), lr=learning_rate)

In [9]:
num_epochs = 1_000

# Report roughly 20 times over the run; max(1, ...) keeps the modulo below
# from dividing by zero when num_epochs is smaller than 20.
log_every = max(1, num_epochs // 20)

# Feed batches on whatever device the policy's weights live on, so the loop
# works regardless of whether the policy was moved to a GPU.
policy_device = next(policy.parameters()).device

for epoch in range(num_epochs):
    total_loss = 0
    b = 0  # number of mini-batches processed in this epoch
    for states, actions in data_loader:
        # Use one dtype/device for inputs and targets; otherwise the
        # log-likelihood is silently promoted to the targets' dtype.
        states = states.to(device=policy_device, dtype=torch.float32)
        actions = actions.to(device=policy_device, dtype=torch.float32)

        means, stds = policy(states)
        dist = Normal(means, stds)
        # Joint log-likelihood of the expert action under a diagonal Gaussian:
        # sum over action dimensions, then average over the batch.
        log_probs = dist.log_prob(actions).sum(dim=-1)
        loss = -log_probs.mean()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        b += 1

    if epoch % log_every == 0:
        print(f'Epoch {epoch} Loss: {total_loss/b:.3f}')


Epoch 0 Loss: 5.723
Epoch 50 Loss: -8.211
Epoch 100 Loss: -9.187
Epoch 150 Loss: -9.694
Epoch 200 Loss: -10.009
Epoch 250 Loss: -10.226
Epoch 300 Loss: -10.415
Epoch 350 Loss: -10.564
Epoch 400 Loss: -10.686
Epoch 450 Loss: -10.789
Epoch 500 Loss: -10.881
Epoch 550 Loss: -10.978
Epoch 600 Loss: -11.039
Epoch 650 Loss: -11.108
Epoch 700 Loss: -11.171
Epoch 750 Loss: -11.241
Epoch 800 Loss: -11.302
Epoch 850 Loss: -11.321
Epoch 900 Loss: -11.374
Epoch 950 Loss: -11.428


In [10]:
# Build the environment named by env_name.
env = gym.make(env_name)
# reset() is unpacked as (observation, info); obs is fed to the policy in the
# single-step inference cell that follows.
obs,info = env.reset()

  logger.warn(
  logger.warn(


In [13]:
# Run the policy once on the reset observation as a sanity check.
# The input is placed on the device that actually holds the policy's weights;
# sending it to a different device would raise a device-mismatch error.
policy_device = next(policy.parameters()).device
states = torch.as_tensor(obs[None], dtype=torch.float32, device=policy_device)

# Inference only: skip building the autograd graph.
with torch.no_grad():
    means, stds = policy(states)

# Drop the batch axis and show the mean action as a NumPy vector.
means = means.cpu().numpy()[0]
means

array([ 0.23851806, -0.00702067,  0.14628345, -0.16866887, -0.28914148,
        0.5216701 ,  0.49675786,  0.39905754], dtype=float32)

In [14]:
def play(env, policy, is_close=True, is_render=True, max_step=1000):
    """Roll out one episode using the policy's mean action and report its return.

    Args:
        env: Environment whose ``reset()`` returns ``(obs, info)`` and whose
            ``step(action)`` returns
            ``(obs, reward, terminated, truncated, info)``.
        policy: Callable mapping a ``(1, obs_dim)`` float tensor to a
            ``(mean, std)`` pair; only the mean is used (no sampling).
        is_close: When true, ``env.close()`` is called before returning,
            including when the rollout raises.
        is_render: Currently unused; kept so existing callers keep working.
        max_step: Maximum number of ``env.step`` calls for the episode.

    Returns:
        dict with ``'reward'`` (sum of the step rewards) and ``'step'``
        (number of ``env.step`` calls actually made).
    """
    # Feed observations on the device holding the policy's weights; fall back
    # to CPU for policies that expose no parameters.
    try:
        policy_device = next(policy.parameters()).device
    except (AttributeError, StopIteration):
        policy_device = torch.device("cpu")

    total_r = 0
    step = 0
    try:
        obs, info = env.reset()
        # Inference only: no autograd graph is needed for a rollout.
        with torch.no_grad():
            while step < max_step:
                states = torch.as_tensor(obs[None], dtype=torch.float32, device=policy_device)
                means, stds = policy(states)
                action = means.cpu().numpy()[0]

                obs, rewards, terminated, truncated, info = env.step(action)
                total_r += rewards
                step += 1

                # Stop on either end-of-episode signal; stepping an
                # environment after it reported truncation is not valid.
                if terminated or truncated:
                    break
    finally:
        if is_close:
            env.close()
    return {'reward': total_r, 'step': step}

In [15]:
# One sanity-check rollout on a fresh environment; play() closes the
# environment before returning because is_close is set.
env = gym.make(env_name)
play(env, policy, is_render=False, is_close=True)

  logger.warn(
  logger.warn(


{'reward': 4024.132919545349, 'step': 1000}

In [16]:
# Evaluate the cloned policy over several independent episodes.
n_trajectory = 20
scores = []

# Use a dedicated environment for evaluation instead of reusing one that an
# earlier play(..., is_close=True) call may already have closed, keep it open
# across episodes, and close it exactly once at the end.
env = gym.make(env_name)
try:
    for i in range(n_trajectory):
        stats = play(env, policy, is_close=False, is_render=False)
        rewards = stats['reward']
        print(f'episode #{i} reward: {rewards:0.2f}')
        scores.append(rewards)
finally:
    env.close()

# Mean and standard deviation of the episode returns.
print(f'\n score: {np.mean(scores):0.2f} +- {np.std(scores):0.2f}')

episode #0 reward: 4197.10
episode #1 reward: 3882.40
episode #2 reward: 3827.77
episode #3 reward: 4166.81
episode #4 reward: 4138.98
episode #5 reward: 4141.73
episode #6 reward: 4120.49
episode #7 reward: 4196.64
episode #8 reward: 4170.13
episode #9 reward: 3942.84
episode #10 reward: 4111.22
episode #11 reward: 3999.71
episode #12 reward: 4050.97
episode #13 reward: 4143.10
episode #14 reward: 4035.59
episode #15 reward: 4156.98
episode #16 reward: 4205.78
episode #17 reward: 4093.63
episode #18 reward: 3919.64
episode #19 reward: 4166.09

 score: 4083.38 +- 110.10
