## IMPORTANT:
initial trajectory is important for final solution, with good initial trajectory it finds solution really easy. This suggests to use the old trajectory initial input to iLQG algorithm

In [1]:
import numpy as np
from drl.ilqg import ilqg, LearnedDynamics
from drl.env.arm import TwoLinkArm

env = TwoLinkArm(g=0., wp=10., wv=1., wu=0.001)

N = 5 # number of future steps for iLQG
Nf = 2 # number of time-steps ahead and after current time-step for fitting linear model
num_episodes = 25
max_steps = 50

model = LearnedDynamics(max_steps, num_episodes, env.state_dim, env.action_dim, Nf)

In [2]:
def dyn(x, u):
    return model.dynamics_function(x, u)

In [3]:
def cst(x, u):
    return env.cost_func(x, u)

In [4]:
x = x0 = env.reset()
goal = env.goal

# Initialize random control sequence
u = np.random.randn(max_steps, env.action_dim)

# Simulate system once
reward = 0.
for i_step in range(max_steps):
    env.render()
    x_new, r, t, _ = env.step(u[i_step,:]) 
    
    model.add(0, i_step, x, u[i_step,:], x_new)
    
    x = x_new    
    reward += r
print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f' % (0, i_step+1, reward, reward/i_step))

# Only use first N control inputs for iLQG estimator
u = u[:N,:]
    
for i_episode in range(1, num_episodes):
    # Fit models
    model.fit()
    
    x = env.reset(x0, goal)
    terminal = False
    i_step = 0
    reward = 0.
    
    for i_step in range(max_steps):
        env.render()
        
        model.set_cur_step(i_step)

        _, u, L, Vx, Vxx, cost = ilqg(dyn, cst, x, u, {})
                
        # Take step
        x_new, r, t, _ = env.step(u[0, :])

        # Add to data matrices
        model.add(i_episode, i_step, x, u[0, :], x_new)
        
        u = np.concatenate((u[1:,:], np.random.randn(1, env.action_dim))) 
        
        x = x_new
        reward += r
        i_step += 1
        
        if t:
            break
    
    print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f' % (i_episode, i_step, reward, reward/i_step))

Iter 0, Steps 50, Reward: -50.00, Average reward: -1.02
Iter 1, Steps 50, Reward: -50.00, Average reward: -1.00
Iter 2, Steps 19, Reward: 81.00, Average reward: 4.26
Iter 3, Steps 24, Reward: 76.00, Average reward: 3.17
Iter 4, Steps 23, Reward: 77.00, Average reward: 3.35
Iter 5, Steps 23, Reward: 77.00, Average reward: 3.35


KeyboardInterrupt: 

In [None]:
env.render(close=True)