In [1]:
import warnings
warnings.filterwarnings('ignore')

import pickle
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal
import tqdm

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:

 
class GaussianPolicy(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_size=64):
        super(GaussianPolicy, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.mean = nn.Linear(hidden_size, output_dim)
        self.log_std = nn.Parameter(-0.5 * torch.ones(output_dim))

    def forward(self, state):
        x = torch.relu(self.fc1(state))
        x = torch.relu(self.fc2(x))
        mean = self.mean(x)
        std = torch.exp(self.log_std)
        return mean, std

In [4]:
input_dim = 3
output_dim = 1  
hidden_size = 64
batch_size = 64
num_epochs = 100
learning_rate = 0.001

# Initialize policy and optimizer
policy = GaussianPolicy(input_dim, output_dim, hidden_size)
optimizer = optim.Adam(policy.parameters(), lr=learning_rate)

In [5]:
env_name='Pendulum-v1'
data_path="/home/ns/bc_tutorial/pendulum/expert_data/Pendulum-v1_10_-130.pkl" 
data_path = "expert_data/Pendulum-v1_50_-149.pkl"

In [6]:
with open(data_path, "rb") as f:
    data_good = pickle.load(f)
print('expert data loaded')

data_good=data_good[:20]

good_obs=[]
good_acts=[] 
for traj in data_good: 
    s,a,r=traj   
    good_obs.append(s)
    good_acts.append(a) 

states=np.vstack(good_obs)
actions=np.vstack(good_acts)
print('X:',states.shape,' y:', actions.shape)

expert data loaded
X: (4000, 3)  y: (4000, 1)


In [7]:
data_loader = torch.utils.data.DataLoader( list(zip(states, actions)), batch_size=64, shuffle=True)

batch=next(iter(data_loader))
states,actions = batch
states.shape,actions.shape

(torch.Size([64, 3]), torch.Size([64, 1]))

In [8]:
for epoch in range(num_epochs): 
    for states, actions in data_loader: 
        
        # Forward pass
        means, stds = policy(states)
        dist = Normal(means, stds)
        log_probs = dist.log_prob(actions).sum(dim=-1)
        loss = -log_probs.mean()

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print loss
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item()}')

# Save trained policy
torch.save(policy.state_dict(), 'pendulum_policy.pth')


Epoch [1/100], Loss: 0.9916788339614868
Epoch [2/100], Loss: 0.8724008798599243
Epoch [3/100], Loss: 0.8098883032798767
Epoch [4/100], Loss: 0.9554415941238403
Epoch [5/100], Loss: 0.5279438495635986
Epoch [6/100], Loss: 0.5528560280799866
Epoch [7/100], Loss: 0.5995151400566101
Epoch [8/100], Loss: 0.8981122970581055
Epoch [9/100], Loss: 0.44481563568115234
Epoch [10/100], Loss: 0.41651296615600586
Epoch [11/100], Loss: 0.42773428559303284
Epoch [12/100], Loss: 0.35740163922309875
Epoch [13/100], Loss: 0.4800478219985962
Epoch [14/100], Loss: 0.30677199363708496
Epoch [15/100], Loss: 0.3473084270954132
Epoch [16/100], Loss: 1.3640999794006348
Epoch [17/100], Loss: 0.37510472536087036
Epoch [18/100], Loss: 0.31631189584732056
Epoch [19/100], Loss: 0.3687788248062134
Epoch [20/100], Loss: 0.24650098383426666
Epoch [21/100], Loss: 0.1555606722831726
Epoch [22/100], Loss: 0.5347149968147278
Epoch [23/100], Loss: 0.25805050134658813
Epoch [24/100], Loss: 0.32518959045410156
Epoch [25/100],

In [9]:
env = gym.make(env_name)
obs,info = env.reset()
obs

array([-0.99960434,  0.02812642, -0.7410221 ], dtype=float32)

In [10]:
states=torch.tensor(obs[None], dtype=torch.float)
states.shape

torch.Size([1, 3])

In [11]:
obs,info = env.reset()
dones=False
total_r=0
step=0
while not dones: 
    step+=1
    states=torch.tensor(obs[None], dtype=torch.float)
    means, stds = policy(states) 
    action=means[0][0].detach().numpy()

    obs, rewards, done, trunc, info = env.step([action] )
    total_r +=rewards  
    if done or trunc:
        break

In [12]:
total_r

-1.1492611795627912

In [13]:
def play(env, policy, is_close=True, is_render=True, max_step=1000): 
    obs,info = env.reset()
    dones=False
    total_r=0
    step=0
    while not dones: 
        step+=1
        states=torch.tensor(obs[None], dtype=torch.float)
        means, stds = policy(states) 
        action=means[0][0].detach().numpy() 

        obs, rewards, done, trunc, info = env.step([action] )
        total_r +=rewards  
        if done or trunc:
            break
    if is_close or is_render:
        env.close()
    return {'reward':total_r, 'step':step-1}

In [14]:
env=gym.make(env_name)
stats=play(env, policy, is_close=True, is_render=False)
stats

{'reward': -232.26774121543895, 'step': 199}

In [15]:
scores=[]
n_trajectory=20
for i in range(n_trajectory):
    stats=play(env, policy, is_close=True, is_render=False)
    rewards=stats['reward']
    print(f'episode #{i} reward: {rewards:0.2f}')
    scores.append(rewards)

print(f'\n score: {np.mean(scores):0.2f} +- {np.std(scores):0.2f}')

episode #0 reward: -1.47
episode #1 reward: -126.49
episode #2 reward: -1.50
episode #3 reward: -1.30
episode #4 reward: -119.41
episode #5 reward: -250.78
episode #6 reward: -224.71
episode #7 reward: -125.62
episode #8 reward: -236.14
episode #9 reward: -124.98
episode #10 reward: -1.68
episode #11 reward: -376.89
episode #12 reward: -1.06
episode #13 reward: -248.21
episode #14 reward: -258.34
episode #15 reward: -234.08
episode #16 reward: -333.04
episode #17 reward: -116.63
episode #18 reward: -119.59
episode #19 reward: -119.33

 score: -151.06 +- 112.40
