In [1]:
import numpy as np
from drl.ilqg import ilqg
from drl.env.arm import TwoLinkArm
from sklearn.linear_model import LinearRegression
from sklearn import preprocessing

env = TwoLinkArm(g=0.)

dyn = lambda x, u: env.dynamics_func(x, u)[0]
cst = lambda x, u: env.cost_func(x, u)
N = 5 # number of future steps for iLQG
num_episodes = 25
max_steps = 50

## iLQG with learned dynamics



In [None]:
models = []
for i in range(max_steps):
#     models.append(GaussianProcessRegressor())
    models.append(LinearRegression())
    
def dynamics_func(model, x, u):
    u[np.isnan(u)] = 0.
    X_in = np.concatenate((x, u)).reshape(1, -1)
    return models[i].predict(X_in)[0]
    
x = x0 = env.reset()
goal = env.goal
traj_rewards = []

# Initialize random control sequence
u = np.random.randn(max_steps, env.action_dim)

# Initialize data matrices
X = np.zeros((max_steps, num_episodes, env.state_dim + env.action_dim))
Y = np.zeros((max_steps, num_episodes, env.state_dim))

# Simulate systems once
reward = 0.
for i_step in range(max_steps):
    env.render()
    X[i_step,0,:] = np.concatenate((x, u[i,:]))
    x, r, t, _ = env.step(u[i,:]) 
    Y[i_step,0,:] = x
    reward += r
traj_rewards.append(reward)
print('Iter %d, Steps %d, Reward: %s' % (0, i_step+1, reward))
    
# Only use first N control inputs for iLQG estimator
u = u[:N,:]
    
for i_episode in range(1, num_episodes):
    # Fit models
    
#     # TODO: Add scaled weights for better fitted models
#     if len(traj_rewards) > 5:
#         sample_weights = preprocessing.scale(traj_rewards)
#         sample_weights -= np.min(sample_weights) - 0.5
#     else:
#         sample_weights = [1.]*len(traj_rewards)
        
    for i in range(max_steps-N):
        x_tmp = X[i:i+N,:,:]
        x_tmp = np.reshape(x_tmp, [x_tmp.shape[0]*x_tmp.shape[1], x_tmp.shape[2]])
        y_tmp = Y[i:i+N,:,:]
        y_tmp = np.reshape(y_tmp, [y_tmp.shape[0]*y_tmp.shape[1], y_tmp.shape[2]])
        models[i].fit(x_tmp, y_tmp)
    
    for i in range(max_steps-N, max_steps):
        x_tmp = X[i:,:,:]
        x_tmp = np.reshape(x_tmp, [x_tmp.shape[0]*x_tmp.shape[1], x_tmp.shape[2]])
        y_tmp = Y[i:,:,:]
        y_tmp = np.reshape(y_tmp, [y_tmp.shape[0]*y_tmp.shape[1], y_tmp.shape[2]])
        models[i].fit(x_tmp, y_tmp)
    
    x = env.reset(x0, goal)
    terminal = False
    i_step = 0
    reward = 0.
    
    for i_step in range(max_steps):
        env.render()
        
        # iLQG estimate with fitted dynamics
        dyn = lambda x, u: dynamics_func(models[i_step], x, u)

        _, u, L, Vx, Vxx, cost = ilqg(dyn, cst, x, u, {})
        
        # Take step
        x_new, r, t, _ = env.step(u[0, :])

        # Add to data matrices
        X[i_step,i_episode,:] = np.concatenate((x, u[0,:]))
        Y[i_step,i_episode,:] = x_new
        
        u = np.concatenate((u[1:,:], np.random.randn(1, env.action_dim))) 
        
        x = x_new
        reward += r
        i_step += 1
        
        if t:
            break
    
    traj_rewards.append(reward)
    print('Iter %d, Steps %d, Reward: %s' % (i_episode, i_step, reward))

Iter 0, Steps 49, Reward: -256.456878756
Iter 1, Steps 50, Reward: -256.844986448
Iter 2, Steps 50, Reward: -252.365384574
Iter 3, Steps 50, Reward: -309.188827728
Iter 4, Steps 50, Reward: -169.592953264
Iter 5, Steps 50, Reward: -123.873524611
Iter 6, Steps 50, Reward: -88.7034499146
Iter 7, Steps 50, Reward: -66.0021048846
Iter 8, Steps 50, Reward: -68.7185666591
Iter 9, Steps 50, Reward: -68.7741018316
Iter 10, Steps 50, Reward: -68.7959687432
Iter 11, Steps 50, Reward: -68.8058209097
Iter 12, Steps 50, Reward: -68.8101041238
Iter 13, Steps 50, Reward: -68.8115437472
Iter 14, Steps 50, Reward: -68.8118392537
Iter 15, Steps 50, Reward: -68.8111520068
Iter 16, Steps 50, Reward: -68.8099096476
Iter 17, Steps 50, Reward: -68.8085317681
Iter 18, Steps 50, Reward: -68.8071201088
Iter 19, Steps 50, Reward: -68.8057478368


In [3]:
env.render(close=True)