## IMPORTANT:
The initial trajectory is important for the final solution: with a good initial trajectory, iLQG finds a solution very easily. This suggests using the previous trajectory as the initial input to the iLQG algorithm.

In [None]:
import numpy as np
from drl.ilqg import ilqg, LearnedDynamics
from drl.env.arm import TwoLinkArm

# Experiment setup: build the two-link arm environment, choose the horizon and
# episode sizes, and allocate the learned dynamics model that iLQG plans with.
# NOTE(review): g is presumably gravity and wp/wv/wu presumably the position,
# velocity and control cost weights -- confirm against TwoLinkArm.
env = TwoLinkArm(g=0., wp=10., wv=1., wu=0.001)

N = 5 # number of future steps for iLQG
Nf = 2 # number of time-steps ahead and after current time-step for fitting linear model
# Number of episodes to run (episode 0 is the random rollout, the rest use iLQG).
num_episodes = 25
# Upper bound on the number of environment steps per episode.
max_steps = 75

# Forwarded to env.reset() / env.step() as their full_state argument.
full_state = True

# Stores one transition per (episode, step) via model.add() and is refit before
# every iLQG episode via model.fit().
model = LearnedDynamics(max_steps, num_episodes, env.state_dim, env.action_dim, Nf)

In [None]:
# Training script for this cell:
#   1. Episode 0 drives the arm with a random control sequence to collect the
#      first batch of transitions for the dynamics model.
#   2. Every later episode refits the model, restarts from the same initial
#      configuration and goal, and at each step runs iLQG from the current
#      state, applies only the first control of the returned sequence, then
#      shifts the sequence by one and appends a fresh random control so it can
#      seed the next iLQG call.
x = env.reset(full_state=full_state)
x0 = env.q
goal = env.goal

# Initialize random control sequence
u = np.random.randn(max_steps, env.action_dim)

# Simulate system once
reward = 0.
n_steps = 0
for i_step in range(max_steps):
    env.render()

    x_new, r, t, _ = env.step(u[i_step, :], full_state=full_state)

    # Episode index 0 holds the random-rollout data.
    model.add(0, i_step, x, u[i_step, :], x_new)

    x = x_new
    reward += r
    n_steps = i_step + 1

# Average over the number of steps actually taken (not the last loop index,
# which is one less); max() guards against max_steps == 0.
print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f'
      % (0, n_steps, reward, reward / max(n_steps, 1)))

# Only use first N control inputs for iLQG estimator
u = u[:N, :]

for i_episode in range(1, num_episodes):
    # Fit models
    model.fit()

    # Restart from the same initial configuration and goal as episode 0.
    x = env.reset(x0, goal, full_state=full_state)
    reward = 0.
    n_steps = 0

    for i_step in range(max_steps):
        env.render()

        # Tell the model which time-step is current before iLQG queries it.
        model.set_cur_step(i_step)

        _, u, L, Vx, Vxx, cost = ilqg(model.dynamics_func, env.cost_func, x, u, {})

        # Take step
        x_new, r, t, _ = env.step(u[0, :], full_state=full_state)

        # Add to data matrices
        model.add(i_episode, i_step, x, u[0, :], x_new)

        # Drop the control that was just applied and pad the tail with a random
        # control so the sequence keeps its length for the next iLQG call.
        u = np.concatenate((u[1:, :], np.random.randn(1, env.action_dim)))

        x = x_new
        reward += r
        n_steps = i_step + 1

        if t:
            break

    print('Iter %d, Steps %d, Reward: %.2f, Average reward: %.2f'
          % (i_episode, n_steps, reward, reward / max(n_steps, 1)))

Iter 0, Steps 75, Reward: -1414.62, Average reward: -19.12
Iter 1, Steps 75, Reward: -1343.52, Average reward: -17.91
Iter 2, Steps 75, Reward: -1118.22, Average reward: -14.91
Iter 3, Steps 75, Reward: -893.76, Average reward: -11.92
Iter 4, Steps 75, Reward: -870.66, Average reward: -11.61
Iter 5, Steps 75, Reward: -825.87, Average reward: -11.01
Iter 6, Steps 75, Reward: -829.02, Average reward: -11.05
Iter 7, Steps 75, Reward: -860.85, Average reward: -11.48
Iter 8, Steps 75, Reward: -824.78, Average reward: -11.00
Iter 9, Steps 75, Reward: -877.88, Average reward: -11.71
Iter 10, Steps 75, Reward: -849.41, Average reward: -11.33
Iter 11, Steps 75, Reward: -878.88, Average reward: -11.72
Iter 12, Steps 75, Reward: -869.86, Average reward: -11.60
Iter 13, Steps 75, Reward: -846.31, Average reward: -11.28
Iter 14, Steps 75, Reward: -867.16, Average reward: -11.56
Iter 15, Steps 75, Reward: -848.68, Average reward: -11.32
Iter 16, Steps 75, Reward: -844.86, Average reward: -11.26
Iter

In [None]:
# Final render call with close=True once all episodes are done.
# NOTE(review): presumably this closes the viewer window (gym-style API) -- confirm.
env.render(close=True)