In [1]:
# Print the version of the locally installed CUDA compiler (nvcc).
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2017 NVIDIA Corporation
Built on Fri_Nov__3_21:07:56_CDT_2017
Cuda compilation tools, release 9.1, V9.1.85


In [2]:
# Ask the NVIDIA driver for the current GPU status (driver/CUDA version, memory, utilisation).
!nvidia-smi

Sat Mar 20 12:03:50 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.39       Driver Version: 460.39       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce GTX 107...  Off  | 00000000:01:00.0  On |                  N/A |
|  0%   45C    P8    11W / 180W |    316MiB /  8116MiB |      3%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [3]:
import numpy as np
import torch
# Check whether PyTorch reports a usable CUDA device.
torch.cuda.is_available()

True

In [5]:
# If the check above returned False, install the proprietary NVIDIA driver by following:
# https://www.cyberciti.biz/faq/ubuntu-linux-install-nvidia-driver-latest-proprietary-driver/
# (suggested in this GitHub issue: https://github.com/pytorch/pytorch/issues/15612)

In [6]:
# CUDA version string reported by the installed PyTorch build.
torch.version.cuda

'9.2'

In [7]:
# Flag that controls whether PyTorch uses cuDNN.
torch.backends.cudnn.enabled

True

In [6]:
import torch.nn as nn
import torch.nn.functional as F

In [34]:
# Fully connected Q-network: maps a (state, action) pair to a single value.

class FCQV(nn.Module):
    """Fully connected state-action value network.

    The state goes through the input layer on its own; the action is
    concatenated to the activations right before the first hidden layer, and
    the network ends in a single output node (the value of the pair).

    Args:
        input_dim: Number of state features.
        output_dim: Number of action features (concatenated before the first
            hidden layer).
        hidden_dims: Sizes of the hidden representations. Note that with a
            single entry there is no hidden layer, so the action is never
            fed into the network.
        activation_fc: Activation applied after the input and hidden layers.
    """

    def __init__(self,
                 input_dim,
                 output_dim,
                 hidden_dims=(32, 32),
                 activation_fc=F.relu):
        super(FCQV, self).__init__()
        self.activation_fc = activation_fc

        # input layer (state only)
        self.input_layer = nn.Linear(input_dim, hidden_dims[0])

        # hidden layers; the first one is widened to make room for the action
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims) - 1):
            in_dim = hidden_dims[i]
            if i == 0:
                in_dim += output_dim
            hidden_layer = nn.Linear(in_dim, hidden_dims[i + 1])
            self.hidden_layers.append(hidden_layer)

        # output is a single node representing the value of the state-action pair
        self.output_layer = nn.Linear(hidden_dims[-1], 1)

        # use a GPU if one is available, otherwise stay on the CPU
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.to(self.device)

    def _format(self, state, action):
        """Convert state and action to float32 tensors on the model's device.

        Inputs that are already tensors are returned untouched; anything else
        is converted and given a leading batch dimension of size 1.
        """
        x, u = state, action
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x,
                             device=self.device,
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        if not isinstance(u, torch.Tensor):
            u = torch.tensor(u,
                             device=self.device,
                             dtype=torch.float32)
            u = u.unsqueeze(0)
        return x, u

    def forward(self, state, action):
        """Return the value estimate for each (state, action) row.

        Args:
            state: Batch of states (tensor) or a single state (array-like).
            action: Batch of actions (tensor) or a single action (array-like).

        Returns:
            Tensor with one value per row (last dimension of size 1).
        """
        x, u = self._format(state, action)
        x = self.activation_fc(self.input_layer(x))
        for i, hidden_layer in enumerate(self.hidden_layers):
            # concatenate the actions to the state features on the first hidden layer
            if i == 0:
                # append each row of u to the corresponding row of x
                x = torch.cat((x, u), dim=1)
            x = self.activation_fc(hidden_layer(x))
        return self.output_layer(x)

    def load(self, experiences):
        """Move a 5-tuple of NumPy arrays onto the model's device as float tensors.

        The conversion is purely positional: the five arrays are returned in
        the same order they were given.
        """
        states, actions, new_states, rewards, is_terminals = experiences
        states = torch.from_numpy(states).float().to(self.device)
        actions = torch.from_numpy(actions).float().to(self.device)
        new_states = torch.from_numpy(new_states).float().to(self.device)
        rewards = torch.from_numpy(rewards).float().to(self.device)
        is_terminals = torch.from_numpy(is_terminals).float().to(self.device)
        return states, actions, new_states, rewards, is_terminals
        
                
    

In [12]:
# Fully connected deterministic policy: maps a state to one bounded action.
class FCDP(nn.Module):
    """Fully connected deterministic policy network.

    The raw network output is squashed by ``out_activation_fc`` and then
    linearly rescaled from the activation's range to the environment's
    action bounds.

    Args:
        input_dim: Number of state features.
        action_bounds: Pair ``(env_min, env_max)`` of equal-length sequences
            giving the lower and upper bound of every action dimension.
        hidden_dims: Sizes of the hidden representations.
        activation_fc: Activation applied after the input and hidden layers.
        out_activation_fc: Bounded activation applied to the output layer
            (tanh by default, i.e. outputs in [-1, 1]).
    """

    def __init__(self,
                 input_dim,
                 action_bounds,
                 hidden_dims=(32, 32),
                 activation_fc=F.relu,
                 out_activation_fc=torch.tanh):  # bound the activation between -1 and 1
        super(FCDP, self).__init__()
        self.activation_fc = activation_fc
        self.out_activation_fc = out_activation_fc

        # set the min and max so that we can rescale the bounds for the return
        self.env_min, self.env_max = action_bounds

        # states in, actions out
        self.input_layer = nn.Linear(input_dim, hidden_dims[0])
        self.hidden_layers = nn.ModuleList()
        for i in range(len(hidden_dims) - 1):
            hidden_layer = nn.Linear(hidden_dims[i], hidden_dims[i + 1])
            self.hidden_layers.append(hidden_layer)
        # built once, outside the loop, so it also exists when there are no hidden layers
        self.output_layer = nn.Linear(hidden_dims[-1], len(self.env_max))

        # use a GPU if one is available, otherwise stay on the CPU
        device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.device = torch.device(device)
        self.to(self.device)

        # environment min and max as tensors
        self.env_min = torch.tensor(self.env_min,
                                    device=self.device,
                                    dtype=torch.float32)

        self.env_max = torch.tensor(self.env_max,
                                    device=self.device,
                                    dtype=torch.float32)

        # range of the output activation, probed at -inf and +inf
        self.nn_min = self.out_activation_fc(
            torch.Tensor([float('-inf')])).to(self.device)
        self.nn_max = self.out_activation_fc(
            torch.Tensor([float('inf')])).to(self.device)

    def rescale_fn(self, x):
        """Linearly map x from [nn_min, nn_max] to [env_min, env_max]."""
        return (x - self.nn_min) * (self.env_max - self.env_min) / \
               (self.nn_max - self.nn_min) + self.env_min

    def _format(self, state):
        """Convert state to a float32 tensor on the model's device.

        A tensor is returned untouched; anything else is converted and given
        a leading batch dimension of size 1.
        """
        x = state
        if not isinstance(x, torch.Tensor):
            x = torch.tensor(x,
                             device=self.device,
                             dtype=torch.float32)
            x = x.unsqueeze(0)
        return x

    def forward(self, state):
        """Return the action for each state, scaled to the environment bounds."""
        x = self._format(state)  # make sure state is a tensor
        x = self.activation_fc(self.input_layer(x))  # input
        for hidden_layer in self.hidden_layers:  # hidden layers
            x = self.activation_fc(hidden_layer(x))
        x = self.output_layer(x)  # output
        x = self.out_activation_fc(x)  # output activation
        return self.rescale_fn(x)  # rescale from the activation range to the action bounds

In [None]:
class ReplayBuffer():
    """Fixed-size circular buffer of experience tuples.

    Each experience is a 5-tuple whose fields are stored in five parallel
    object arrays. Once ``max_size`` experiences have been stored, new ones
    overwrite the oldest.

    Args:
        max_size: Maximum number of experiences kept.
        batch_size: Default number of experiences returned by ``sample``.
    """

    def __init__(self,
                 max_size=10000,
                 batch_size=64
                ):
        # one object array per field of the experience tuple
        self.ss_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.as_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.rs_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.ps_mem = np.empty(shape=(max_size), dtype=np.ndarray)
        self.ds_mem = np.empty(shape=(max_size), dtype=np.ndarray)

        self.max_size = max_size
        self.batch_size = batch_size
        self._idx = 0   # next slot to write
        self.size = 0   # number of valid entries

    def store(self, sample):
        """Store one 5-tuple experience, overwriting the oldest when full."""
        s, a, r, p, d = sample
        self.ss_mem[self._idx] = s
        self.as_mem[self._idx] = a
        self.rs_mem[self._idx] = r
        self.ps_mem[self._idx] = p
        self.ds_mem[self._idx] = d

        # advance the write position, wrapping around at max_size
        self._idx += 1
        self._idx = self._idx % self.max_size

        # the count of valid entries saturates at max_size
        self.size += 1
        self.size = min(self.size, self.max_size)

    def sample(self, batch_size=None):
        """Return a random batch of stored experiences, without replacement.

        Args:
            batch_size: Number of experiences to draw; defaults to the
                ``batch_size`` given at construction.

        Returns:
            Tuple of five arrays, one per stored field and in storage order,
            each stacked row-wise with ``np.vstack``.

        Raises:
            ValueError: If ``batch_size`` exceeds the number of stored
                experiences (raised by ``np.random.choice``).
        """
        if batch_size is None:
            batch_size = self.batch_size
        idxs = np.random.choice(
            self.size, batch_size, replace=False)
        experiences = np.vstack(self.ss_mem[idxs]), \
                      np.vstack(self.as_mem[idxs]), \
                      np.vstack(self.rs_mem[idxs]), \
                      np.vstack(self.ps_mem[idxs]), \
                      np.vstack(self.ds_mem[idxs])
        return experiences

    def __len__(self):
        """Number of experiences currently stored."""
        return self.size
        

In [13]:
class GreedyStrategy():
    """Action selection that always takes the model's output, clipped to bounds.

    Args:
        bounds: Pair ``(low, high)`` giving the lower and upper bound of
            every action dimension.
    """

    def __init__(self, bounds):
        low, high = bounds
        # arrays so that `.shape` is available even when bounds are plain lists
        self.low = np.asarray(low)
        self.high = np.asarray(high)
        self.ratio_noise_injected = 0

    def select_action(self, model, state):
        """Return the model's action for `state`, clipped to [low, high].

        The result is reshaped to the shape of the upper bound.
        """
        with torch.no_grad():
            greedy_action = model(state).cpu().detach().data.numpy().squeeze()

        action = np.clip(greedy_action, self.low, self.high)
        return np.reshape(action, self.high.shape)

    # original (misspelled) method name, kept so existing callers keep working
    selection_action = select_action

In [14]:
class NormalNoisyDecayStategy():
    """Action selection that adds Gaussian noise whose scale can decay over time.

    The noise standard deviation is ``noise_ratio * high``. After every call
    to ``select_action`` the ratio moves linearly from
    ``exploration_noise_ratio`` towards ``min_noise_ratio`` over
    ``decay_steps`` calls and then stays there.

    Args:
        bounds: Pair ``(low, high)`` giving the lower and upper bound of
            every action dimension (sequences of equal length).
        exploration_noise_ratio: Initial noise ratio.
        min_noise_ratio: Final noise ratio. ``None`` (the default) keeps the
            ratio constant at ``exploration_noise_ratio``, i.e. no decay.
        decay_steps: Number of ``select_action`` calls over which the ratio
            is decayed.

    Raises:
        ValueError: If ``decay_steps`` is not positive or ``min_noise_ratio``
            is larger than ``exploration_noise_ratio``.
    """

    def __init__(self, bounds, exploration_noise_ratio=0.1,
                 min_noise_ratio=None, decay_steps=10000):
        if min_noise_ratio is None:
            min_noise_ratio = exploration_noise_ratio
        if decay_steps <= 0:
            raise ValueError("decay_steps must be positive")
        if min_noise_ratio > exploration_noise_ratio:
            raise ValueError(
                "min_noise_ratio must not exceed exploration_noise_ratio")

        low, high = bounds
        # arrays so that scaling by a float works even when bounds are plain lists
        self.low = np.asarray(low, dtype=np.float64)
        self.high = np.asarray(high, dtype=np.float64)

        self.t = 0  # number of ratio updates performed so far
        self.init_noise_ratio = exploration_noise_ratio
        self.min_noise_ratio = min_noise_ratio
        self.decay_steps = decay_steps
        self.noise_ratio = exploration_noise_ratio
        self.ratio_noise_injected = 0

    def _noise_ratio_update(self):
        """Return the ratio for the current step and advance the step counter."""
        remaining = 1 - self.t / self.decay_steps
        span = self.init_noise_ratio - self.min_noise_ratio
        ratio = span * remaining + self.min_noise_ratio
        # past decay_steps the linear formula undershoots; hold at the floor
        ratio = float(np.clip(ratio, self.min_noise_ratio, self.init_noise_ratio))
        self.t += 1
        return ratio

    def select_action(self, model, state, max_exploration=False):
        """Return the model's action plus Gaussian noise, clipped to [low, high].

        Args:
            model: Callable mapping `state` to a tensor of actions.
            state: State passed straight to `model`.
            max_exploration: When True the noise scale is the full upper
                bound instead of ``noise_ratio * high``.
        """
        # to maximize exploration, we set the noise scale to the maximum action
        if max_exploration:
            noise_scale = self.high
        else:
            noise_scale = self.noise_ratio * self.high
        with torch.no_grad():
            greedy_action = model(state).cpu().detach().data
            greedy_action = greedy_action.numpy().squeeze()
        noise = np.random.normal(loc=0,
                                 scale=noise_scale,
                                 size=len(self.high))
        noisy_action = greedy_action + noise
        action = np.clip(noisy_action, self.low, self.high)
        self.noise_ratio = self._noise_ratio_update()
        # share of the action range that the (clipped) noise actually moved the action
        self.ratio_noise_injected = np.mean(
            abs((greedy_action - action) / (self.high - self.low)))
        return action


# correctly spelled alias; the original class name is kept for existing callers
NormalNoisyDecayStrategy = NormalNoisyDecayStategy
        

In [26]:
# Scratch demo for torch.cat: first random 4x6 tensor.
x = torch.randn(4, 6)
x

tensor([[-0.7776, -1.6559,  1.0647,  0.4161,  1.3967,  1.9610],
        [-0.5973, -0.1176, -0.0510, -1.0470, -0.7622, -0.2222],
        [ 0.8981, -0.0025, -1.0022, -0.1056, -0.0086,  0.9865],
        [-0.2384, -0.6869,  0.0637,  0.3867,  0.7549, -1.0014]])

In [29]:
# Second random 4x6 tensor for the torch.cat demo.
y = torch.randn(4, 6)
y

tensor([[ 1.6235, -0.8895, -0.1597, -0.5396,  0.1663, -1.3120],
        [-0.6362,  0.9212,  0.0260, -0.4678, -0.5453,  0.4746],
        [ 1.1242,  0.6878, -1.0395,  1.2315,  0.4138, -2.2657],
        [-0.0937, -1.9000, -0.9943,  1.0722,  0.3126,  0.8512]])

In [30]:
# Concatenate along dim 1 (columns): three 4x6 tensors become one 4x18 tensor.
torch.cat((x, y, x), 1)

tensor([[-0.7776, -1.6559,  1.0647,  0.4161,  1.3967,  1.9610,  1.6235, -0.8895,
         -0.1597, -0.5396,  0.1663, -1.3120, -0.7776, -1.6559,  1.0647,  0.4161,
          1.3967,  1.9610],
        [-0.5973, -0.1176, -0.0510, -1.0470, -0.7622, -0.2222, -0.6362,  0.9212,
          0.0260, -0.4678, -0.5453,  0.4746, -0.5973, -0.1176, -0.0510, -1.0470,
         -0.7622, -0.2222],
        [ 0.8981, -0.0025, -1.0022, -0.1056, -0.0086,  0.9865,  1.1242,  0.6878,
         -1.0395,  1.2315,  0.4138, -2.2657,  0.8981, -0.0025, -1.0022, -0.1056,
         -0.0086,  0.9865],
        [-0.2384, -0.6869,  0.0637,  0.3867,  0.7549, -1.0014, -0.0937, -1.9000,
         -0.9943,  1.0722,  0.3126,  0.8512, -0.2384, -0.6869,  0.0637,  0.3867,
          0.7549, -1.0014]])