# Introduction
This project focuses on implementing and evaluating an offline reinforcement learning (RL) algorithm using the Hopper environment from the MuJoCo suite Gym MuJoCo documentation, which is part of the D4RL library. The Implicit Q-Learning (IQL) algorithm has been implemen ted in PyTorch and tested on the Hopper environment. This project also includes logging and experiment tracking using Weights & Biases (wandb).

# Install Mujuco Physics engine on the kernel

In [3]:
import os
if not os.path.exists('.mujoco_setup_complete'):
  # Get the prereqs
  !apt-get -qq update
  !apt-get -qq install -y libosmesa6-dev libgl1-mesa-glx libglfw3 libgl1-mesa-dev libglew-dev patchelf
  # Get Mujoco
  !mkdir ~/.mujoco
  !wget -q https://mujoco.org/download/mujoco210-linux-x86_64.tar.gz -O mujoco.tar.gz
  !tar -zxf mujoco.tar.gz -C "$HOME/.mujoco"
  !rm mujoco.tar.gz
  # Add it to the actively loaded path and the bashrc path (these only do so much)
  !echo 'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$HOME/.mujoco/mujoco210/bin' >> ~/.bashrc 
  !echo 'export LD_PRELOAD=$LD_PRELOAD:/usr/lib/x86_64-linux-gnu/libGLEW.so' >> ~/.bashrc 
  # THE ANNOYING ONE, FORCE IT INTO LDCONFIG SO WE ACTUALLY GET ACCESS TO IT THIS SESSION
  !echo "/root/.mujoco/mujoco210/bin" > /etc/ld.so.conf.d/mujoco_ld_lib_path.conf
  !ldconfig
  # Install Mujoco-py
  !pip3 install -U 'mujoco-py<2.2,>=2.1'
  # run once
  !touch .mujoco_setup_complete

try:
  if _mujoco_run_once:
    pass
except NameError:
  _mujoco_run_once = False
if not _mujoco_run_once:
  # Add it to the actively loaded path and the bashrc path (these only do so much)
  try:
    os.environ['LD_LIBRARY_PATH']=os.environ['LD_LIBRARY_PATH'] + ':/root/.mujoco/mujoco210/bin'
  except KeyError:
    os.environ['LD_LIBRARY_PATH']='/root/.mujoco/mujoco210/bin'
  try:
    os.environ['LD_PRELOAD']=os.environ['LD_PRELOAD'] + ':/usr/lib/x86_64-linux-gnu/libGLEW.so'
  except KeyError:
    os.environ['LD_PRELOAD']='/usr/lib/x86_64-linux-gnu/libGLEW.so'
  # presetup so we don't see output on first env initialization
  import mujoco_py
  _mujoco_run_once = True
#source of this code block : https://gist.github.com/BuildingAtom/3119ac9c595324c8001a7454f23bf8c8

Then import it :

In [4]:
import mujoco_py

## install D4RL

In [5]:
# intall d4rl github repo
!pip install git+https://github.com/Farama-Foundation/d4rl@master#egg=d4rl

Collecting d4rl
  Cloning https://github.com/Farama-Foundation/d4rl (to revision master) to /tmp/pip-install-fxso1vfe/d4rl_3a4db1e4421347c1990223737cf55404
  Running command git clone --filter=blob:none -q https://github.com/Farama-Foundation/d4rl /tmp/pip-install-fxso1vfe/d4rl_3a4db1e4421347c1990223737cf55404
  Resolved https://github.com/Farama-Foundation/d4rl to commit 71a9549f2091accff93eeff68f1f3ab2c0e0a288
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting mjrl@ git+https://github.com/aravindr93/mjrl@master#egg=mjrl
  Cloning https://github.com/aravindr93/mjrl (to revision master) to /tmp/pip-install-fxso1vfe/mjrl_bdcf91f7a8f14d29890fc33b3d8be13f
  Running command git clone --filter=blob:none -q https://github.com/aravindr93/mjrl /tmp/pip-install-fxso1vfe/mjrl_bdcf91f7a8f14d29890fc33b3d8be13f
  Resolved https://github.com/aravindr93/mjrl to commit 3871d93763d3b49c4741e6daeaebbc605fe140dc
  Preparing metadata (setup.py) ... [?25ldone


## check all the avilable environments

In [7]:
import gym
import d4rl

# List all available environments to see if Maze2D is included
envs = gym.envs.registry.all()
env_names = [env_spec.id for env_spec in envs]
print("Available environments:", env_names)

Available environments: ['CartPole-v0', 'CartPole-v1', 'MountainCar-v0', 'MountainCarContinuous-v0', 'Pendulum-v1', 'Acrobot-v1', 'LunarLander-v2', 'LunarLanderContinuous-v2', 'BipedalWalker-v3', 'BipedalWalkerHardcore-v3', 'CarRacing-v0', 'Blackjack-v1', 'FrozenLake-v1', 'FrozenLake8x8-v1', 'CliffWalking-v0', 'Taxi-v3', 'Reacher-v2', 'Pusher-v2', 'Thrower-v2', 'Striker-v2', 'InvertedPendulum-v2', 'InvertedDoublePendulum-v2', 'HalfCheetah-v2', 'HalfCheetah-v3', 'Hopper-v2', 'Hopper-v3', 'Swimmer-v2', 'Swimmer-v3', 'Walker2d-v2', 'Walker2d-v3', 'Ant-v2', 'Ant-v3', 'Humanoid-v2', 'Humanoid-v3', 'HumanoidStandup-v2', 'FetchSlide-v1', 'FetchPickAndPlace-v1', 'FetchReach-v1', 'FetchPush-v1', 'HandReach-v0', 'HandManipulateBlockRotateZ-v0', 'HandManipulateBlockRotateZTouchSensors-v0', 'HandManipulateBlockRotateZTouchSensors-v1', 'HandManipulateBlockRotateParallel-v0', 'HandManipulateBlockRotateParallelTouchSensors-v0', 'HandManipulateBlockRotateParallelTouchSensors-v1', 'HandManipulateBlockR

## download the dataset for the hopper environment

In [8]:

# Create the environment
env = gym.make('hopper-expert-v2') #gym.make('maze2d-umaze-v1')

print("reset",env.reset())
print("step",env.step(env.action_space.sample()))

#get the dataset for hopper
dataset = env.get_dataset()
dataset.keys()

  "Box bound precision lowered by casting to {}".format(self.dtype)


reset [ 1.25353429e+00 -4.98018573e-03 -2.06140013e-03  7.10537286e-04
 -8.51438047e-04  2.61325936e-03  1.46858319e-03 -4.71780908e-03
 -6.77464173e-05 -3.80985516e-03  6.49444513e-05]
step (array([ 1.25329666e+00, -3.78357411e-03, -5.24973194e-04,  7.48895489e-04,
       -4.18817085e-03,  8.41301716e-03, -6.09106548e-02,  3.02544383e-01,
        3.83899592e-01,  1.10849440e-02, -8.33131008e-01]), 1.0050605256485636, False, {})


load datafile: 100%|██████████| 21/21 [00:01<00:00, 15.31it/s]


dict_keys(['actions', 'infos/action_log_probs', 'infos/qpos', 'infos/qvel', 'metadata/algorithm', 'metadata/iteration', 'metadata/policy/fc0/bias', 'metadata/policy/fc0/weight', 'metadata/policy/fc1/bias', 'metadata/policy/fc1/weight', 'metadata/policy/last_fc/bias', 'metadata/policy/last_fc/weight', 'metadata/policy/last_fc_log_std/bias', 'metadata/policy/last_fc_log_std/weight', 'metadata/policy/nonlinearity', 'metadata/policy/output_distribution', 'next_observations', 'observations', 'rewards', 'terminals', 'timeouts'])

In [9]:
 next_state, reward, done, info=env.step(env.action_space.sample())
print(next_state.shape)


(11,)


## let's check the dataset

In [10]:
print(env.observation_space.shape)
#the observations from the hopper-website
obsers= ["torso_z", "torso_angle", "thigh_angle", "leg_angle", "foot_angle", "velocity_x", "velocity_z", "torso_ang_velocity", 
         "thigh_ang_velocity", "torso_ang_velocity", "foot_ang_velocity"]
print(len(obsers))
#print the dataset attributes
dataset.keys()

(11,)
11


dict_keys(['actions', 'infos/action_log_probs', 'infos/qpos', 'infos/qvel', 'metadata/algorithm', 'metadata/iteration', 'metadata/policy/fc0/bias', 'metadata/policy/fc0/weight', 'metadata/policy/fc1/bias', 'metadata/policy/fc1/weight', 'metadata/policy/last_fc/bias', 'metadata/policy/last_fc/weight', 'metadata/policy/last_fc_log_std/bias', 'metadata/policy/last_fc_log_std/weight', 'metadata/policy/nonlinearity', 'metadata/policy/output_distribution', 'next_observations', 'observations', 'rewards', 'terminals', 'timeouts'])

### attributes of the dataset
* actions: Actions taken by the agent.
* infos/action_log_probs: Log probabilities of the taken actions.
* infos/qpos: Generalized positions of the Hopper's joints and body parts.
* infos/qvel: Generalized velocities of the Hopper's joints and body parts.
* metadata/algorithm: The algorithm used to generate expert data.
* metadata/iteration: Iteration number when the data was collected.
* metadata/policy/fc0/bias: Bias of the first fully connected layer in the policy network.
* metadata/policy/fc0/weight: Weights of the first fully connected layer in the policy network.
* metadata/policy/fc1/bias: Bias of the second fully connected layer in the policy network.
* metadata/policy/fc1/weight: Weights of the second fully connected layer in the policy network.
* metadata/policy/last_fc/bias: Bias of the last fully connected layer in the policy network.
* metadata/policy/last_fc/weight: Weights of the last fully connected layer in the policy network.
* metadata/policy/last_fc_log_std/bias: Bias for the log standard deviation in the last layer.
* metadata/policy/last_fc_log_std/weight: Weights for the log standard deviation in the last layer.
* metadata/policy/nonlinearity: Type of nonlinearity used in the policy network.
* metadata/policy/output_distribution: Distribution type for the policy's output.
* next_observations: Observations after the agent takes an action.
* observations: Observations before the agent takes an action.
* rewards: Rewards received after actions are taken.
* terminals: Indicates if an episode ended (True/False).
* timeouts: Indicates if an episode ended due to a time limit (True/False).


In [11]:
import numpy as np
#unpack the dataset
actions= dataset['actions'] # Actions taken by the agent
observations= dataset['observations'] # The observations or states seen by the agent.
rewards= dataset['rewards'] #The reward received after each action. .reshape(-1, 1)
next_observations= dataset['next_observations'] #next observations after the agent takes an action
terminals= dataset['terminals'] #Indicates if the episode ended in an unhealty way (1 if ended, 0 otherwise) 
timeouts= dataset ['timeouts'] #Indicates if the episode ended due to a timeout.
dones= np.logical_or(terminals, timeouts)

print("shapes of our columns/attributes: actions=", actions.shape, " observations=", observations.shape, " rewards=", rewards.shape)
print("next_observations=", next_observations.shape, " terminals=", terminals.shape," timeouts=", timeouts.shape, "done=", dones.shape )

shapes of our columns/attributes: actions= (1000000, 3)  observations= (1000000, 11)  rewards= (1000000,)
next_observations= (1000000, 11)  terminals= (1000000,)  timeouts= (1000000,) done= (1000000,)


split dataset into train and test 

In [12]:
from sklearn.model_selection import train_test_split
import numpy as np
import torch

train_observations, test_observations, train_actions, test_actions, train_rewards, test_rewards, train_next_observations, test_next_observations,train_dones, test_dones = train_test_split(
    observations, actions, rewards, next_observations, dones, test_size=0.2, random_state=42
)

In [13]:
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    def __init__(self, observations, actions, next_observations, rewards, dones):
        super().__init__()
        # Extract data
        self.observations = torch.tensor(observations, dtype=torch.float32)
        self.actions = torch.tensor(actions, dtype=torch.float32)
        self.rewards = torch.tensor(rewards, dtype=torch.float32)
        self.next_observations = torch.tensor(next_observations, dtype=torch.float32)
        self.dones= torch.tensor(dones,dtype= torch.bool )
        #self.timeouts= torch.tensor(timeouts, dtype= torch.bool)
        

    def __len__(self):
        return len(self.observations)

    def __getitem__(self, idx):
        return self.observations[idx], self.actions[idx], self.next_observations[idx], self.rewards[idx], self.dones[idx]

whole_dataset= CustomDataset( observations, actions, rewards, next_observations,  dones)

train_set = CustomDataset(train_observations, train_actions, train_next_observations, train_rewards, train_dones)
test_set = CustomDataset(test_observations, test_actions, test_next_observations, test_rewards, test_dones)
print(len(train_set),len(test_set) , len(whole_dataset))

800000 200000 1000000


create dataloaders

In [14]:
from torch.utils.data import DataLoader
batch_size = 120
train_loader = DataLoader(train_set, batch_size, shuffle=True)
test_loader = DataLoader(test_set, batch_size, shuffle=False)
data_loader= DataLoader(whole_dataset, batch_size, shuffle= True)

# create the agent

### The actor responsible for choosing an action given a specific state "policy network"

In [15]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal

class Actor(nn.Module):
    def __init__(self, state_size, action_size, hidden_size= 32, standard_deviation_min= -10, standard_deviation_max= 10):
        """
        state_size: (input size).
        action_size: (output size).
        hidden_size: Number of neurons in the hidden layers (default 32).
        standard_deviation_min and standard_deviation_max: values used to clamp the log standard deviation of the policy's action distribution to prevent it from becoming too high or too low.
        """
        super(Actor, self).__init__()
        self.standard_deviation_min= standard_deviation_min
        self.standard_deviation_max= standard_deviation_max
        self.lin1= nn.Linear(state_size, hidden_size)
        self.lin2= nn.Linear(hidden_size, hidden_size)
        #output Linear layers to get the mean (mu) and the log standard deviation (log_sigma) of the action distribution.
        self.mu= nn.Linear(hidden_size, action_size) # mean
        self.log_sigma= nn.Linear(hidden_size, action_size) #standard deviation
    
    def forward(self, state):
        x= F.relu(self.lin2(F.relu(self.lin1(state))))
        
        mu = torch.tanh(self.mu(x)) #The mean is passed through a tanh activation to keep the action within a certain range.
        
        log_sigma= self.log_sigma(x)
        log_sigma = torch.clamp(log_sigma, self.standard_deviation_min, self.standard_deviation_max)
        return mu, log_sigma
    
    def evaluate(self, state):
        mu, log_sigma= self.forward(state) 
        sigma= log_sigma.exp()
        normal_distribution= Normal(mu, sigma) #create normal distriution
        
        action= normal_distribution.sample() #get an action from the distriution
        return action, normal_distribution 
    
    def get_action(self, state): #to be used in the evaluation phase
        mu, log_sigma= self.forward(state) 
        sigma= log_sigma.exp()
        normal_distribution= Normal(mu, sigma) #create normal distriution
        
        action= normal_distribution.sample() #get an action from the distriution
        return action.detach().cpu()
    
    def get_deterministic_action(self, state):
        mu, _= self.forward(state)
        return mu.detach().cpu()
        
        
        
        

### the critic estimates the Q-value, which is the expected return for taking a particular action in a given state.

In [16]:
class Critic(nn.Module):
    def __init__(self, state_size, action_size, hidden_size= 32,seed= 1):
        super(Critic, self).__init__()
        torch.manual_seed(1)
        self.lin1= nn.Linear(state_size+action_size, hidden_size)
        self.lin2 = nn.Linear(hidden_size, hidden_size)
        self.lin3= nn.Linear(hidden_size, 1)
        
    def forward(self, state, action):
        #print("Critic", state.shape, action.shape)
        x= torch.cat((state, action), dim=-1)
        #print(x.shape)
        x= F.relu(self.lin2(F.relu(self.lin1(x))))
        return self.lin3(x)

###  the value estimates the expected return from a state, independent of any specific action.

In [17]:
class Value(nn.Module):
    def __init__(self, state_size, hidden_size= 32):
        super(Value, self).__init__()
        self.lin1= nn.Linear(state_size, hidden_size)
        self.lin2= nn.Linear(hidden_size, hidden_size)
        self.lin3= nn.Linear(hidden_size, 1)
        
    def forward(self, state):
        x= F.relu(self.lin2(F.relu(self.lin1(state))))
        return self.lin3(x)

### create the replay buffer

In [18]:
import numpy as np
import random
import torch
from collections import deque, namedtuple

class ReplayBuffer:
    def __init__(self, max_size, batch_size, device):
        self.device= device
        self.memory= deque(maxlen= max_size)
        self.batch_size= batch_size
        #define expericences tuple
        self.experience= namedtuple("Experience", field_names=["state", "action", "reward", "next_state", "done"])
        
    def add(self, state, action, reward, next_state, done):
        e= self.experience(state, action, reward, next_state, done)
        self.memory.append(e)
        
    def sample(self):
        #sample randome experiences from memory to train the model i=on it
        experiences = random.sample(self.memory, k=self.batch_size)
        #unpacked them into seperate tensors
        states = torch.from_numpy(np.stack([e.state for e in experiences if e is not None])).float().to(self.device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences if e is not None])).float().to(self.device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences if e is not None])).float().to(self.device)
        next_states = torch.from_numpy(np.stack([e.next_state for e in experiences if e is not None])).float().to(self.device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences if e is not None]).astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, terminals,timeouts )

    def __len__(self):
        return len(self.memory)

In [19]:
import torch
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils import clip_grad_norm_

class Implicit_Q_learning(nn.Module):
    def __init__(self, state_size, action_size, lr, hidden_size, tau, temperature, expectile, device):
        super(Implicit_Q_learning, self).__init__()
        self.state_size= state_size
        self.action_size= action_size
        self.lr= lr
        self.hidden_size= hidden_size
        self.tau= tau #update parameter for the target netwrok
        self.device = device
        self.temperature= torch.FloatTensor([temperature]).to(self.device)
        self.expectile= torch.FloatTensor([expectile]).to(self.device)
        self.gamma = torch.FloatTensor([0.99]).to(device) #to compute the Q-values
        self.clip_grad_param = 1

        #____define the neuralnetwroks____
        # Actor_network: outputs actions based on the state.
        self.actor_local= Actor(self.state_size, self.action_size, self.hidden_size).to(self.device)
        self.actor_optimizer= optim.Adam(self.actor_local.parameters(), lr= self.lr)

        # Critic_networks: Two Q-value networks to estimate the value of state-action pairs.
        self.critic1= Critic(self.state_size, self.action_size, self.hidden_size, seed= 2).to(self.device)
        self.critic2= Critic(self.state_size, self.action_size, self.hidden_size, seed= 1).to(self.device)
        assert self.critic1.parameters() != self.critic2.parameters()

        self.critic1_optimizer = optim.Adam(self.critic1.parameters(), lr=self.lr)
        self.critic2_optimizer = optim.Adam(self.critic2.parameters(), lr=self.lr) 

        #Target networks for stability, which slowly track the critic networks.
        self.critic1_target = Critic(self.state_size, self.action_size, self.hidden_size).to(self.device)
        self.critic1_target.load_state_dict(self.critic1.state_dict())

        self.critic2_target = Critic(self.state_size, self.action_size, self.hidden_size).to(self.device)
        self.critic2_target.load_state_dict(self.critic2.state_dict())

        #Value_network: estimates the value of a state independently of the action.
        self.value_net = Value(self.state_size, self.hidden_size).to(self.device)
        self.value_optimizer = optim.Adam(self.value_net.parameters(), lr=self.lr)
        
    def get_action(self, state, eval= False):
        state= torch.from_numpy(state).float().to(self.device)
        with torch.no_grad():
            if eval:
                action = self.actor_local.get_deterministic_action(state)
            else:
                action = self.actor_local.get_action(state)
        return action.numpy()
    
    def calc_policy_loss(self, states, actions):
        with torch.no_grad():
            v = self.value_net(states)
            q1 = self.critic1_target(states, actions)
            q2 = self.critic2_target(states, actions)
            min_Q = torch.min(q1,q2)

        exp_a = torch.exp((min_Q - v) * self.temperature)
        exp_a = torch.min(exp_a, torch.FloatTensor([100.0]).to(states.device))

        _, dist = self.actor_local.evaluate(states)
        log_probs = dist.log_prob(actions)
        actor_loss = -(exp_a * log_probs).mean()

        return actor_loss
    
    def calc_value_loss(self, states, actions):
        with torch.no_grad():
            q1 = self.critic1_target(states, actions)   
            q2 = self.critic2_target(states, actions)
            min_Q = torch.min(q1,q2)
        
        value = self.value_net(states)
        value_loss = loss(min_Q - value, self.expectile).mean()
        return value_loss
    
    def calc_q_loss(self, states, actions, rewards, dones, next_states):
        with torch.no_grad():
            next_v = self.value_net(next_states)
            dones = dones.float()
            q_target = rewards + (self.gamma * (1 - dones) * next_v) 
            #q_target = rewards + (self.gamma * (1 - terminals) * (1 - timeouts) * next_v)


        q1 = self.critic1(states, actions)
        q2 = self.critic2(states, actions)
        critic1_loss = ((q1 - q_target)**2).mean() 
        critic2_loss = ((q2 - q_target)**2).mean()
        return critic1_loss, critic2_loss


    def learn(self, experiences):
        states, actions, rewards, next_states,  dones  = experiences

        self.value_optimizer.zero_grad()
        #print(states, actions)
        #print(states.shape, actions.shape)
        value_loss = self.calc_value_loss(states, actions)
        value_loss.backward()
        self.value_optimizer.step()

        actor_loss = self.calc_policy_loss(states, actions)
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        
        critic1_loss, critic2_loss = self.calc_q_loss(states, actions, rewards, dones, next_states)

        # critic 1
        self.critic1_optimizer.zero_grad()
        critic1_loss.backward()
        clip_grad_norm_(self.critic1.parameters(), self.clip_grad_param)
        self.critic1_optimizer.step()
        # critic 2
        self.critic2_optimizer.zero_grad()
        critic2_loss.backward()
        clip_grad_norm_(self.critic2.parameters(), self.clip_grad_param)
        self.critic2_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic1, self.critic1_target)
        self.soft_update(self.critic2, self.critic2_target)
        
        return actor_loss.item(), critic1_loss.item(), critic2_loss.item(), value_loss.item()

    def soft_update(self, local_model , target_model):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter 
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau*local_param.data + (1.0-self.tau)*target_param.data)


            
            
            
            

In [20]:
#hyperparameters
import torch
import numpy as np
import random


run_name= "IQL"
env_name= "hopper-expert-v2" # environment name
episodes= 100 #number of episodes
seed= 1 
log_video= 0  #"Log agent behaviour to wanbd when set to 1, default: 0"
save_every= 100 #save the network every x epochs
batch_size = 120
hidden_size= 256
learning_rate= 3e-4
temperature= 3
expectile= 0.7
tau= 5e-3
eval_every= 1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

#define the train loader
dataloader= DataLoader(whole_dataset, batch_size, shuffle= True)

#the training loop
#set the seeds
np.random.seed(seed)
random.seed(seed)
torch.manual_seed(seed)
#define an environment
env = gym.make('hopper-expert-v2') 
#set the seed for the action space
env.action_space.seed(seed)

batches = 0
average10 = deque(maxlen=10)


  "Box bound precision lowered by casting to {}".format(self.dtype)


In [22]:
import torch
import os
import wandb
import random

def save(args, save_name, model, ep=None):
    save_dir = './trained_models/' 
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    
    save_path = os.path.join(save_dir, run_name + save_name)
    if ep is not None:
        save_path += str(ep) + ".pth"
    else:
        save_path += ".pth"
    
    torch.save(model.state_dict(), save_path)
    wandb.save(save_path)

def collect_random(env, dataset, num_samples=200):
    state = env.reset()
    for _ in range(num_samples):
        action = env.action_space.sample()
        next_state, reward, termi, _ = env.step(action)
        dataset.add(state, action, reward, next_state, done)
        state = next_state
        if terminals or timeout:
            state = env.reset()

def loss(diff, expectile=0.8):
    weight = torch.where(diff > 0, expectile, (1 - expectile))
    return weight * (diff**2)


'\n\n# Simulate training\nepochs = 10\noffset = random.random() / 5\nfor epoch in range(2, epochs):\n    acc = 1 - 2 ** -epoch - random.random() / epoch - offset\n    loss_value = 2 ** -epoch + random.random() / epoch + offset\n\n    # Log metrics to WandB\n    if wandb:\n        wandb.log({"acc": acc, "loss": loss_value})\n\n# [optional] finish the WandB run, necessary in notebooks\nif wandb:\n    wandb.finish()\n'

In [23]:
def evaluate(env, policy, eval_runs=5): 
    """
    Makes an evaluation run with the current policy
    """
    reward_batch = []
    for i in range(eval_runs):
        state = env.reset()

        rewards = 0
        while True:
            action = policy.get_action(state, eval=True)

            state, reward, done, _ = env.step(action)
            rewards += reward
            if done:
                break
        reward_batch.append(rewards)
    return np.mean(reward_batch)

In [24]:
! pip install protobuf==3.20.0



In [25]:
!pip install wandb



In [1]:
import wandb
wandb.login(key= "512e30c6b6b9ff00a20d3fec64a6cab8320827df")

[34m[1mwandb[0m: W&B API key is configured (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [28]:
"""# Start a W&B Run with wandb.init
run = wandb.init(project="offline-RL", config={"architecture": "IQL","dataset": "hopper-expert-v2"})
if run:
    print("state_size= ", env.observation_space.shape[0])
    print("action_size= ", env.action_space.shape[0])
    agent = Implicit_Q_learning(state_size=env.observation_space.shape[0],
            action_size=env.action_space.shape[0],
            lr=learning_rate,
            hidden_size=hidden_size,
            tau=tau,
            temperature=temperature,
            expectile=expectile,
            device=device)
    run.watch(agent, log="gradients", log_freq=10)
    if log_video:
        env = gym.wrappers.Monitor(env, './video', video_callable=lambda x: x%10==0, force=True)
    eval_reward = evaluate(env, agent)
    run.log({"Test Reward": eval_reward, "Episode": 0, "Batches": batches}, step=batches)
    for i in range(1, episodes+1):
        for batch_idx, experience in enumerate(dataloader):
            states, actions, rewards, next_states, dones = experience
            states = states.to(device)
            actions = actions.to(device)
            rewards = rewards.to(device)
            next_states = next_states.to(device)
            dones = dones.to(device)
            policy_loss, critic1_loss, critic2_loss, value_loss = agent.learn((states, actions, rewards, next_states, dones))
            batches += 1
            
        if i % eval_every == 0:
            eval_reward = evaluate(env, agent)
            run.log({"Test Reward": eval_reward, "Episode": i, "Batches": batches}, step=batches)

            average10.append(eval_reward)
            print("Episode: {} | Reward: {} | Polciy Loss: {} | Batches: {}".format(i, eval_reward, policy_loss, batches))
            
        run.log({
            "Average10": np.mean(average10),
            "Policy Loss": policy_loss,
            "Value Loss": value_loss,
            "Critic 1 Loss": critic1_loss,
            "Critic 2 Loss": critic2_loss,
            "Batches": batches,
            "Episode": i})
        if (i %10 == 0) and log_video:
            mp4list = glob.glob('video/*.mp4')
            if len(mp4list) > 1:
                mp4 = mp4list[-2]
                run.log({"gameplays": run.Video(mp4, caption='episode: '+str(i-10), fps=4, format="gif"), "Episode": i})

  
"""

'# Start a W&B Run with wandb.init\nrun = wandb.init(project="offline-RL", config={"architecture": "IQL","dataset": "hopper-expert-v2"})\nif run:\n    print("state_size= ", env.observation_space.shape[0])\n    print("action_size= ", env.action_space.shape[0])\n    agent = Implicit_Q_learning(state_size=env.observation_space.shape[0],\n            action_size=env.action_space.shape[0],\n            lr=learning_rate,\n            hidden_size=hidden_size,\n            tau=tau,\n            temperature=temperature,\n            expectile=expectile,\n            device=device)\n    run.watch(agent, log="gradients", log_freq=10)\n    if log_video:\n        env = gym.wrappers.Monitor(env, \'./video\', video_callable=lambda x: x%10==0, force=True)\n    eval_reward = evaluate(env, agent)\n    run.log({"Test Reward": eval_reward, "Episode": 0, "Batches": batches}, step=batches)\n    for i in range(1, episodes+1):\n        for batch_idx, experience in enumerate(dataloader):\n            states, a

In [31]:
#add saving the bext models
# Start a W&B Run with wandb.init
run = wandb.init(project="offline-RL", config={"architecture": "IQL", "dataset": "hopper-expert-v2"})
best_eval_reward = -float("inf")
best_model = {}

if run:
    print("state_size= ", env.observation_space.shape[0])
    print("action_size= ", env.action_space.shape[0])
    
    agent = Implicit_Q_learning(state_size=env.observation_space.shape[0],
                                action_size=env.action_space.shape[0],
                                lr=learning_rate,
                                hidden_size=hidden_size,
                                tau=tau,
                                temperature=temperature,
                                expectile=expectile,
                                device=device)
                                
    run.watch(agent, log="gradients", log_freq=10)
    
    if log_video:
        env = gym.wrappers.Monitor(env, './video', video_callable=lambda x: x % 10 == 0, force=True)
    
    eval_reward = evaluate(env, agent)
    run.log({"Test Reward": eval_reward, "Episode": 0, "Batches": batches}, step=batches)
    
    for i in range(1, episodes + 1):
        for batch_idx, experience in enumerate(dataloader):
            states, actions, rewards, next_states, dones = experience
            states = states.to(device)
            actions = actions.to(device)
            rewards = rewards.to(device)
            next_states = next_states.to(device)
            dones = dones.to(device)
            
            policy_loss, critic1_loss, critic2_loss, value_loss = agent.learn((states, actions, rewards, next_states, dones))
            batches += 1
            
        if i % eval_every == 0:
            eval_reward = evaluate(env, agent)
            run.log({"Test Reward": eval_reward, "Episode": i, "Batches": batches}, step=batches)

            average10.append(eval_reward)
            print("Episode: {} | Reward: {} | Policy Loss: {} | Batches: {}".format(i, eval_reward, policy_loss, batches))
            
            # Save the best model
            if eval_reward > best_eval_reward:
                best_eval_reward = eval_reward
                best_model['actor_local'] = agent.actor_local.state_dict()
                best_model['critic1'] = agent.critic1.state_dict()
                best_model['critic2'] = agent.critic2.state_dict()
                best_model['critic1_target'] = agent.critic1_target.state_dict()
                best_model['critic2_target'] = agent.critic2_target.state_dict()
                best_model['value'] = agent.value_net.state_dict()
            
        run.log({
            "Average10": np.mean(average10),
            "Policy Loss": policy_loss,
            "Value Loss": value_loss,
            "Critic 1 Loss": critic1_loss,
            "Critic 2 Loss": critic2_loss,
            "Batches": batches,
            "Episode": i})
        
        if (i % 10 == 0) and log_video:
            mp4list = glob.glob('video/*.mp4')
            if len(mp4list) > 1:
                mp4 = mp4list[-2]
                run.log({"gameplays": run.Video(mp4, caption='episode: ' + str(i - 10), fps=4, format="gif"), "Episode": i})
    
    # Save the best model at the end
    torch.save(best_model, "best_model.pth")
    run.log_artifact("best_model.pth", type="model")

# End the run
run.finish()


VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
Batches,▁█
Episode,▁█
Test Reward,▁█

0,1
Batches,16668.0
Episode,1.0
Test Reward,86.3021


[34m[1mwandb[0m: wandb version 0.17.6 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


state_size=  11
action_size=  3
Episode: 1 | Reward: 81.95221891158778 | Policy Loss: 0.05428459867835045 | Batches: 25002
Episode: 2 | Reward: 209.2821315378973 | Policy Loss: 0.022939754649996758 | Batches: 33336
Episode: 3 | Reward: 59.74656674180674 | Policy Loss: 0.11758668720722198 | Batches: 41670
Episode: 4 | Reward: 449.2906111652854 | Policy Loss: -0.03548737242817879 | Batches: 50004
Episode: 5 | Reward: 221.97566114436904 | Policy Loss: 0.4225108325481415 | Batches: 58338
Episode: 7 | Reward: 505.251053772187 | Policy Loss: -0.41293153166770935 | Batches: 75006
Episode: 8 | Reward: 441.288797620247 | Policy Loss: -0.02684936672449112 | Batches: 83340
Episode: 9 | Reward: 435.57832213471977 | Policy Loss: -0.7155187726020813 | Batches: 91674
Episode: 10 | Reward: 442.3668518670248 | Policy Loss: -0.3005243241786957 | Batches: 100008
Episode: 11 | Reward: 751.697295254217 | Policy Loss: -0.14835263788700104 | Batches: 108342
Episode: 12 | Reward: 448.14029274588245 | Policy L

RuntimeError: max must be larger than min

# **Step 2 :** Install Virtual Display

In [None]:
!apt install -y python-opengl ffmpeg > /dev/null 2>&1

# !apt install -y xvfb
%pip install pyvirtualdisplay

Then import it and define some helper functions:

In [None]:
from pyvirtualdisplay import Display
display = Display(visible=0, size=(400, 300))
display.start()


from matplotlib import pyplot as plt, animation
%matplotlib inline
from IPython import display

def create_anim(frames, dpi, fps):
    plt.figure(figsize=(frames[0].shape[1] / dpi, frames[0].shape[0] / dpi), dpi=dpi)
    patch = plt.imshow(frames[0])
    def setup():
        plt.axis('off')
    def animate(i):
        patch.set_data(frames[i])
    anim = animation.FuncAnimation(plt.gcf(), animate, init_func=setup, frames=len(frames), interval=fps)
    return anim

def display_anim(frames, dpi=10, fps=100):
    anim = create_anim(frames, dpi, fps)
    return anim.to_jshtml()

def save_anim(frames, filename, dpi=60, fps=50):
    anim = create_anim(frames, dpi, fps)
    anim.save(filename)


class trigger:
    def __init__(self):
        self._trigger = True

    def __call__(self, e):
        return self._trigger

    def set(self, t):
        self._trigger = t

# Display Episode Frames In the notebook

Finally render an episode and display it in the notebook :

In [None]:
frames = []
#env = gym.make("Walker2d-v2")
env = gym.make('hopper-expert-v2') 
obs = env.reset()
done = False
episode_reward = 0
while not done:
    frames.append(env.render(mode='rgb_array'))
    obs, rew,done,info = env.step(agent.get_action(obs, eval=True))
    episode_reward += rew
env.close()

print("Episode Reward: " + str(episode_reward))

display.HTML(display_anim(frames))