In [1]:
import numpy as np 
import time 
import numpy as np
import torch
import torch.nn as nn
import time
import gym 
import pickle 
from replay_memory import Memory
import argparse
import torch.nn.functional as F
from torch.distributions import Normal
from torch.distributions.transformed_distribution import TransformedDistribution
from torch.distributions.transforms import TanhTransform
from torch.optim import Adam
import tqdm

from torch.utils.data import DataLoader 

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

  return torch._C._cuda_getDeviceCount() > 0


device(type='cpu')

In [3]:
env_name='HalfCheetah-v3'
data_path = "expert_data/HalfCheetah-v3_10_9498.pkl" 

In [4]:
with open(data_path, "rb") as f:
    data_good = pickle.load(f)
print('expert data loaded')

data_good=data_good[:20]

good_obs=[]
good_acts=[]
good_rews=[]

for traj in data_good: 
    s,a,r=traj  

    good_obs.append(s)
    good_acts.append(a)
    good_rews.append(r)

good_obs=np.vstack(good_obs)
good_acts=np.vstack(good_acts)
good_rews=np.vstack(good_rews) 

good_obs.shape, good_acts.shape, good_rews.shape

expert data loaded


((10000, 17), (10000, 6), (10000, 1))

In [5]:
data_loader = DataLoader( list(zip(good_obs, good_acts)), batch_size=64, shuffle=True)

batch=next(iter(data_loader))
states,actions = batch
states.shape,actions.shape

(torch.Size([64, 17]), torch.Size([64, 6]))

In [6]:
action_dim=actions.shape[1]
state_dim=states.shape[1]
print(state_dim, action_dim)

17 6


In [7]:
class MLP(nn.Module):
    def __init__(self, input_dim, size=32):
        super(MLP, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim,size),
            nn.ReLU(), 
            nn.Linear(size,size),
            nn.ReLU(),
            nn.Linear(size,size),
            nn.ReLU() 
        )
    def forward(self,x):
        x = self.net(x)
        return x
 
class GaussianPolicy(MLP):
    def __init__(self, input_dim, output_dim, hidden_size=64):
        super(GaussianPolicy, self).__init__(input_dim, hidden_size) 
        self.mean = nn.Linear(hidden_size, output_dim) 
        self.log_std_layer = nn.Linear(hidden_size, output_dim)

    def forward(self, state):
        x = self.net(state)
        mean = self.mean(x) 
        log_std = self.log_std_layer(x)  # Predict log_std using a linear layer
        std = torch.exp(log_std)

        return mean, std

In [8]:
learning_rate = 1e-4

policy = GaussianPolicy(state_dim, action_dim, 64)
optimizer = Adam(policy.parameters(), lr=learning_rate)

In [9]:
num_epochs=4_000
for epoch in range(num_epochs): 
    total_loss=0
    b=0
    for states, actions in data_loader: 
        means, stds = policy(states.float())
        dist = Normal(means, stds)
        log_probs = dist.log_prob(actions).sum(dim=-1)
        loss = -log_probs.mean()

        total_loss += loss.item() 
        b=b+1

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if epoch % (num_epochs//20)==0:
        print(f'Epoch {epoch} Loss: {total_loss/b:.3f}')


Epoch 0 Loss: 5.906
Epoch 200 Loss: -8.794
Epoch 400 Loss: -10.227
Epoch 600 Loss: -11.098
Epoch 800 Loss: -11.776
Epoch 1000 Loss: -12.245
Epoch 1200 Loss: -12.730
Epoch 1400 Loss: -13.180
Epoch 1600 Loss: -13.477
Epoch 1800 Loss: -13.851
Epoch 2000 Loss: -14.055
Epoch 2200 Loss: -14.189
Epoch 2400 Loss: -14.478
Epoch 2600 Loss: -14.837
Epoch 2800 Loss: -14.890
Epoch 3000 Loss: -15.071
Epoch 3200 Loss: -15.190
Epoch 3400 Loss: -15.395
Epoch 3600 Loss: -15.508
Epoch 3800 Loss: -15.683


In [10]:
env = gym.make(env_name)
obs,info = env.reset()

  logger.warn(
  logger.warn(


In [11]:
def play(env, policy, is_close=True, is_render=True, max_step=1000): 
    obs,info = env.reset()
    dones=False
    total_r=0
    step=0
    while not dones: 
        step+=1
        states=torch.Tensor(obs[None]).to(device)
        means, stds = policy(states.float())
        action = means.detach().cpu().numpy()[0]

        obs, rewards, done, s, info = env.step(action)
        total_r +=rewards  
        if done:
            break
        if step>max_step:
            # print('max step reached')
            break
        # elif s:
        #     print('solved!')
        #     break
    if is_close:
        env.close()
    return {'reward':total_r, 'step':step-1}

In [12]:
env = gym.make(env_name)
play(env, policy, is_close=True, is_render=False)

{'reward': 139.33717097829347, 'step': 1000}

In [13]:
scores=[]
n_trajectory=20
for i in range(n_trajectory):
    stats=play(env, policy, is_close=True, is_render=False)
    rewards=stats['reward']
    print(f'episode #{i} reward: {rewards:0.2f}')
    scores.append(rewards)

print(f'\n score: {np.mean(scores):0.2f} +- {np.std(scores):0.2f}')

episode #0 reward: 737.12
episode #1 reward: 148.10
episode #2 reward: -1145.00
episode #3 reward: 716.50
episode #4 reward: -896.93
episode #5 reward: -1313.66
episode #6 reward: -559.43
episode #7 reward: -1220.03
episode #8 reward: -361.22
episode #9 reward: 341.29
episode #10 reward: -1017.39
episode #11 reward: 599.46
episode #12 reward: -24.44
episode #13 reward: -1100.42
episode #14 reward: -910.69
episode #15 reward: -917.96
episode #16 reward: 610.78
episode #17 reward: -1107.15
episode #18 reward: 132.76
episode #19 reward: -386.84

 score: -383.76 +- 709.47
