## IMPORTANT:
initial trajectory is important for final solution, with good initial trajectory it finds solution really easy. This suggests to use the old trajectory initial input to iLQG algorithm

In [1]:
import numpy as np
from drl.ilqg import ilqg, LearnedDynamics
from drl.env.arm import TwoLinkArm

env = TwoLinkArm(g=0., wp=10., wv=1., wu=0.001)

N = 5 # number of future steps for iLQG
Nf = 2 # number of time-steps ahead and after current time-step for fitting linear model
num_episodes = 25
max_steps = 50

model = LearnedDynamics(max_steps, num_episodes, env.state_dim, env.action_dim, Nf)

In [2]:
def dyn(x, u):
    return model.dynamics_function(x, u)

In [3]:
def cst(x, u):
    return env.cost_func(x, u)

In [4]:
x = x0 = env.reset()
goal = env.goal

# Initialize random control sequence
u = np.random.randn(max_steps, env.action_dim)

# Simulate system once
reward = 0.
for i_step in range(max_steps):
    env.render()
    x_new, r, t, _ = env.step(u[i_step,:]) 
    
    model.add(0, i_step, x, u[i_step,:], x_new)
    
    x = x_new    
    reward += r
print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f' % (0, i_step+1, reward, reward/i_step))

# Only use first N control inputs for iLQG estimator
u = u[:N,:]
    
for i_episode in range(1, num_episodes):
    # Fit models
    model.fit()
    
    x = env.reset(x0, goal)
    terminal = False
    i_step = 0
    reward = 0.
    
    for i_step in range(max_steps):
        env.render()
        
        model.set_cur_step(i_step)

        _, u, L, Vx, Vxx, cost = ilqg(dyn, cst, x, u, {})
                
        # Take step
        x_new, r, t, _ = env.step(u[0, :])

        # Add to data matrices
        model.add(i_episode, i_step, x, u[0, :], x_new)
        
        u = np.concatenate((u[1:,:], np.random.randn(1, env.action_dim))) 
        
        x = x_new
        reward += r
        i_step += 1
        
        if t:
            break
    
    print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f' % (i_episode, i_step, reward, reward/i_step))

Iter 0, Steps 50, Reward: -1124.38, Average reward: -22.95
Iter 1, Steps 50, Reward: -482.25, Average reward: -9.64
Iter 2, Steps 36, Reward: -392.69, Average reward: -10.91
Iter 3, Steps 32, Reward: -366.56, Average reward: -11.45
Iter 4, Steps 31, Reward: -345.51, Average reward: -11.15
Iter 5, Steps 30, Reward: -342.96, Average reward: -11.43
Iter 6, Steps 29, Reward: -320.07, Average reward: -11.04
Iter 7, Steps 27, Reward: -314.40, Average reward: -11.64
Iter 8, Steps 28, Reward: -316.13, Average reward: -11.29
Iter 9, Steps 27, Reward: -320.48, Average reward: -11.87
Iter 10, Steps 27, Reward: -326.11, Average reward: -12.08
Iter 11, Steps 27, Reward: -320.82, Average reward: -11.88


[2017-06-14 12:56:22,542] 
EXIT: Maximum iterations reached.



Iter 12, Steps 27, Reward: -321.24, Average reward: -11.90


[2017-06-14 12:56:26,918] 
EXIT: Maximum iterations reached.



Iter 13, Steps 27, Reward: -322.53, Average reward: -11.95


[2017-06-14 12:56:31,279] 
EXIT: Maximum iterations reached.



Iter 14, Steps 27, Reward: -323.75, Average reward: -11.99


[2017-06-14 12:56:35,679] 
EXIT: Maximum iterations reached.



Iter 15, Steps 27, Reward: -324.51, Average reward: -12.02


[2017-06-14 12:56:39,939] 
EXIT: Maximum iterations reached.



Iter 16, Steps 27, Reward: -325.19, Average reward: -12.04


[2017-06-14 12:56:44,367] 
EXIT: Maximum iterations reached.



Iter 17, Steps 28, Reward: -326.01, Average reward: -11.64


[2017-06-14 12:56:48,976] 
EXIT: Maximum iterations reached.



Iter 18, Steps 28, Reward: -326.51, Average reward: -11.66


[2017-06-14 12:56:53,372] 
EXIT: Maximum iterations reached.



Iter 19, Steps 28, Reward: -326.92, Average reward: -11.68


[2017-06-14 12:56:57,786] 
EXIT: Maximum iterations reached.



Iter 20, Steps 28, Reward: -327.40, Average reward: -11.69


[2017-06-14 12:57:02,363] 
EXIT: Maximum iterations reached.



Iter 21, Steps 28, Reward: -327.45, Average reward: -11.69


[2017-06-14 12:57:06,768] 
EXIT: Maximum iterations reached.



Iter 22, Steps 28, Reward: -327.91, Average reward: -11.71


[2017-06-14 12:57:11,284] 
EXIT: Maximum iterations reached.



Iter 23, Steps 28, Reward: -328.19, Average reward: -11.72


[2017-06-14 12:57:15,893] 
EXIT: Maximum iterations reached.



Iter 24, Steps 28, Reward: -328.46, Average reward: -11.73


In [5]:
env.render(close=True)