In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from video import VideoRecorder
import dmc2gym

In [2]:
env = dmc2gym.make('point_mass', 'easy')

In [121]:
class ReplayBuffer(object):
    def __init__(self, obs_dim, action_dim, device, capacity):
        self.device = device
        self.capacity = capacity

        self.obses = np.empty((capacity, obs_dim), dtype=np.float32)
        self.next_obses = np.empty((capacity, obs_dim), dtype=np.float32)
        self.actions = np.empty((capacity, action_dim), dtype=np.float32)
        self.rewards = np.empty((capacity, 1), dtype=np.float32)
        self.not_dones = np.empty((capacity, 1), dtype=np.float32)

        self.idx = 0
        self.full = False

    def add(self, obs, action, reward, next_obs, done):
        np.copyto(self.obses[self.idx], obs)
        np.copyto(self.actions[self.idx], action)
        np.copyto(self.rewards[self.idx], reward)
        np.copyto(self.next_obses[self.idx], next_obs)
        np.copyto(self.not_dones[self.idx], not done)

        self.idx = (self.idx + 1) % self.capacity
        self.full = self.full or self.idx == 0

    def sample(self, batch_size):
        idxs = np.random.randint(
            0, self.capacity if self.full else self.idx, size=batch_size)

        obses = torch.as_tensor(self.obses[idxs], device=self.device).float()
        actions = torch.as_tensor(self.actions[idxs], device=self.device)
        rewards = torch.as_tensor(self.rewards[idxs], device=self.device)
        next_obses = torch.as_tensor(
            self.next_obses[idxs], device=self.device).float()
        not_dones = torch.as_tensor(self.not_dones[idxs], device=self.device)

        return obses, actions, rewards, next_obses, not_dones


In [169]:
class TransitionModel(nn.Module):
    def __init__(self, state_dim, control_dim):
        super().__init__()
        
        self.F = nn.Parameter(torch.rand(state_dim, state_dim + control_dim))
        self.f = nn.Parameter(torch.zeros(state_dim))
        
    def forward(self, x, u):
        # f(x, u) = F * [x u]^T + f
        xu = torch.cat([x, u], dim=-1)
        return xu.matmul(self.F.t()) + self.f

In [186]:
class CostModel(nn.Module):
    def __init__(self, state_dim, control_dim):
        super().__init__()
        
        self.C = nn.Parameter(torch.eye(state_dim + control_dim))
        self.C.data[4, 4] = 0.0001
        self.C.data[5, 5] = 0.0001
        self.c = nn.Parameter(torch.zeros(state_dim + control_dim))
        
    def forward(self, x, u):
        # c(x, u) = 0.5 * [x u] * C * [x u]^T + [x u] * c
        xu = torch.cat([x, u], dim=-1)
        return 0.5 * ((xu @ self.C.t()) * xu).sum(dim=1) + xu.matmul(self.c.t())

In [187]:
class LQR(object):
    def __init__(self, state_dim, control_dim):
        self.state_dim = state_dim
        self.control_dim = control_dim
        
    def _backward_pass(self, F, f, C, c, T):
        V = torch.zeros(self.state_dim, self.state_dim)
        v = torch.zeros(self.state_dim)
        Ks, ks = [], []
        N = self.state_dim
        
        for t in range(T - 1, 0, -1):
            Q = C + F.t() @ V @ F
            q = c + F.t() @ V @ f + F.t() @ v

            Qxx, Qxu, Qux, Quu = Q[:N, :N], Q[:N, N:], Q[N:, :N], Q[N:, N:]
            qx, qu = q[:N], q[N:]

            Quu_inv = torch.inverse(Quu) 
            K = - Quu_inv @ Qux
            k = - Quu_inv @ qu

            Ks.append(K)
            ks.append(k)

            V = Qxx + Qxu @ K + K.t() @ Qux + K.t() @ Quu @ K
            v = qx + Qxu @ k + K.t() @ qu + K.t() @ Quu @ k
        
        return reversed(Ks), reversed(ks)
    
    def _forward_pass(self, x, Ks, ks, trans_model):
        us = []
        for K, k in zip(Ks, ks):
            u = K @ x + k
            u = u.clamp(-1, 1)
            us.append(u)
            x = trans_model(x, u)
        return us
        
        
    def plan(self, x, T, trans_model, cost_model):
        F, f = trans_model.F.detach(), trans_model.f.detach()
        C, c = cost_model.C.detach(), cost_model.c.detach()
        
        Ks, ks = self._backward_pass(F, f, C, c, T)
        us = self._forward_pass(x, Ks, ks, trans_model)
        
        return us

In [188]:
state_dim = env.observation_space.shape[0]
control_dim = env.action_space.shape[0]

In [189]:
trans_model = TransitionModel(state_dim, control_dim)
cost_model = CostModel(state_dim, control_dim)
lqr = LQR(state_dim, control_dim)

In [190]:
device = torch.device('cpu')

In [191]:
buffer = ReplayBuffer(state_dim, control_dim, device, 100000)

In [192]:
for _ in range(100):
    x = env.reset()
    u = env.action_space.sample()
    next_x, reward, done, _ = env.step(u)
    buffer.add(x, u, reward, next_x, float(done))
    x = next_x

In [193]:
trans_opt = torch.optim.Adam(trans_model.parameters())
cost_opt = torch.optim.Adam(cost_model.parameters())

In [None]:
for it in range(100):
    
    total_trans_loss = 0
    total_cost_loss = 0
    U = 10000
    for _ in range(U):
        x, u, r, nx, _ = buffer.sample(64)
    
        trans_opt.zero_grad()
        trans_loss = F.mse_loss(trans_model(x, u), nx)
        trans_loss.backward()
        trans_opt.step()

        #cost_opt.zero_grad()
        #cost_loss = F.mse_loss(cost_model(x, u), 1. - r)
        #cost_loss.backward()
        #cost_opt.step()
        
        total_trans_loss += trans_loss.item()
        #total_cost_loss += cost_loss.item()
        
    total_cost_loss /= U
    total_trans_loss /= U
        
    
    video = VideoRecorder(env, enabled=True)
    x = env.reset()
    us = lqr.plan(torch.as_tensor(x).float(), 1000, trans_model, cost_model)
    
    total_reward = 0
    for j, u in enumerate(us):
        
        u = u.detach().numpy().clip(-0.999, 0.999)
        if np.isnan(u[0]):
            break
        next_x, reward, done, _ = env.step(u)
        video.record()
        total_reward += reward
        
        buffer.add(x, u, reward, next_x, float(done))
        x = next_x
        
    video.save('video', '%d.mp4' % it)
    
    
    print('episode: %d step: %d reward: %.3f trans loss: %.3f cost loss: %.3f' % (it, j, total_reward, total_trans_loss, total_cost_loss))
    

episode: 0 step: 998 reward: 0.000 trans loss: 0.006 cost loss: 0.000
episode: 1 step: 998 reward: 668.616 trans loss: 0.000 cost loss: 0.000


In [55]:
trans_model.F

Parameter containing:
tensor([[ 0.7701,  0.0489,  0.1073,  0.1030, -0.0519,  0.1217],
        [ 0.1691,  0.4826,  0.1812, -0.5827, -0.0950,  0.3328],
        [-0.0354, -0.0146,  0.5991,  0.1599,  0.0305, -0.0157],
        [ 0.1713, -0.1617,  0.7018, -0.3640,  0.2524, -0.2258]],
       requires_grad=True)

In [14]:
torch.mm(torch.cat([x, u], dim=-1), tran.F)

RuntimeError: size mismatch, m1: [1 x 6], m2: [4 x 6] at /pytorch/aten/src/TH/generic/THTensorMath.cpp:961

In [15]:
xu.shape

NameError: name 'xu' is not defined

In [34]:
c = CostModel(4, 2)

In [41]:
c(x, u).shape

torch.Size([1, 1])

In [39]:
xu.matmul(c.c.t()).shape

torch.Size([1, 1])