# SIT796 Task 3.1D
Brenton Adey
222165064

In [25]:
import numpy as np
from numpy import pi, sin, cos

import random

# Define Dynamics of Acrobot

In [26]:
# Define constants for the Acrobot components
dt = 0.2  # time span integrated per apply_torque call (passed to rk4 as [0, dt])
LINK_LENGTH_1 = 1.0  # [m]
LINK_LENGTH_2 = 1.0  # [m]
LINK_MASS_1 = 1.0  #: [kg] mass of link 1
LINK_MASS_2 = 1.0  #: [kg] mass of link 2
LINK_COM_POS_1 = 0.5  #: [m] position of the center of mass of link 1
LINK_COM_POS_2 = 0.5  #: [m] position of the center of mass of link 2
LINK_MOI = 1.0  #: moments of inertia for both links
MAX_VEL_1 = 4 * pi  # magnitude limit used by apply_torque to bound dtheta1
MAX_VEL_2 = 9 * pi  # magnitude limit used by apply_torque to bound dtheta2

In [27]:
def dsdt(state_augemented):
    """Time derivative of the augmented Acrobot state.

    Args:
        state_augemented: sequence whose last element is the applied torque
            and whose leading four elements are
            ``(theta1, theta2, dtheta1, dtheta2)``.
    Returns:
        Tuple ``(dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0)``; the trailing
        ``0.0`` is the derivative of the torque entry, which is held constant.
    """
    gravity = 9.8
    mass1, mass2 = LINK_MASS_1, LINK_MASS_2
    length1 = LINK_LENGTH_1
    com1, com2 = LINK_COM_POS_1, LINK_COM_POS_2
    inertia1 = inertia2 = LINK_MOI

    # Split the augmented vector into the applied torque and the joint state.
    torque = state_augemented[-1]
    joints = state_augemented[:-1]
    theta1, theta2 = joints[0], joints[1]
    dtheta1, dtheta2 = joints[2], joints[3]

    # Inertia terms.
    d1 = (
        mass1 * com1**2
        + mass2 * (length1**2 + com2**2 + 2 * length1 * com2 * cos(theta2))
        + inertia1
        + inertia2
    )
    d2 = mass2 * (com2**2 + length1 * com2 * cos(theta2)) + inertia2

    # Gravity and velocity-dependent terms.
    phi2 = mass2 * com2 * gravity * cos(theta1 + theta2 - pi / 2.0)
    phi1 = (
        -mass2 * length1 * com2 * dtheta2**2 * sin(theta2)
        - 2 * mass2 * length1 * com2 * dtheta2 * dtheta1 * sin(theta2)
        + (mass1 * com1 + mass2 * length1) * gravity * cos(theta1 - pi / 2)
        + phi2
    )

    # Solve for the second joint's acceleration first; the first joint's
    # acceleration is then expressed in terms of it.
    numerator = torque + d2 / d1 * phi1 - phi2
    denominator = mass2 * com2**2 + inertia2 - d2**2 / d1
    ddtheta2 = numerator / denominator
    ddtheta1 = -(d2 * ddtheta2 + phi1) / d1

    return (dtheta1, dtheta2, ddtheta1, ddtheta2, 0.0)

def rk4(derivs, y0, t):
    """
    Integrate a 1-D or N-D system of ODEs using 4-th order Runge-Kutta and
    return the state at the final sample time.

    Example for 2D system:
        >>> def derivs(x):
        ...     d1 =  x[0] + 2*x[1]
        ...     d2 =  -3*x[0] + 4*x[1]
        ...     return d1, d2
        >>> dt = 0.0005
        >>> t = np.arange(0.0, 2.0, dt)
        >>> y0 = (1,2)
        >>> y_final = rk4(derivs, y0, t)
    Args:
        derivs: the derivative of the system and has the signature ``dy = derivs(yi)``
        y0: initial state vector (or a scalar for a 1-D system)
        t: sample times
    Returns:
        The Runge-Kutta approximation at ``t[-1]`` only. For a vector state
        this is at most the first four components (for the Acrobot this
        drops the trailing action entry); for a scalar state it is the
        scalar value itself.
    """
    # np.float64 is used instead of the np.float_ alias, which was removed
    # in NumPy 2.0.
    try:
        Ny = len(y0)
    except TypeError:
        # Scalar initial condition: one value per sample time.
        yout = np.zeros((len(t),), np.float64)
    else:
        yout = np.zeros((len(t), Ny), np.float64)

    yout[0] = y0

    for i in range(len(t) - 1):
        dt = t[i + 1] - t[i]
        dt2 = dt / 2.0
        y0 = yout[i]

        # Classic RK4 stages: start, two midpoint estimates, end of interval.
        k1 = np.asarray(derivs(y0))
        k2 = np.asarray(derivs(y0 + dt2 * k1))
        k3 = np.asarray(derivs(y0 + dt2 * k2))
        k4 = np.asarray(derivs(y0 + dt * k3))
        yout[i + 1] = y0 + dt / 6.0 * (k1 + 2 * k2 + 2 * k3 + k4)

    if yout.ndim == 1:
        # A scalar result cannot be sliced; return it directly.
        return yout[-1]
    # We only care about the final timestep and we cleave off action value which will be zero
    return yout[-1][:4]

def wrap(x, m, M):
    """Wrap ``x`` into the interval ``m <= x <= M``.

    Unlike ``bound()``, which truncates, ``wrap()`` shifts ``x`` by whole
    multiples of ``M - m`` until it lies inside the interval.
    For example, m = -180, M = 180 (degrees), x = 360 --> returns 0.

    Args:
        x: a scalar
        m: minimum possible value in range
        M: maximum possible value in range
    Returns:
        x: a scalar, wrapped
    Raises:
        ValueError: if ``M - m`` is not positive and ``x`` lies outside the
            interval, since shifting by a non-positive width can never bring
            ``x`` into range.
    """
    diff = M - m
    if diff <= 0 and (x > M or x < m):
        # Without this guard the loops below would never terminate.
        raise ValueError(
            "wrap() needs M > m to wrap an out-of-range value "
            "(got m={!r}, M={!r})".format(m, M)
        )
    while x > M:
        x = x - diff
    while x < m:
        x = x + diff
    return x

def bound(x, m, M=None):
    """Clamp ``x`` to a closed interval.

    Two calling conventions are supported: ``bound(x, m, M)`` with scalar
    limits, or ``bound(x, m)`` where ``m`` is a length-2 sequence holding
    ``(lower, upper)``.

    Args:
        x: scalar
        m: The lower bound, or a ``(lower, upper)`` pair when ``M`` is omitted
        M: The upper bound
    Returns:
        x: scalar, clamped so that lower <= x <= upper
    """
    if M is None:
        lower, upper = m[0], m[1]
    else:
        lower, upper = m, M
    # Raise to the lower limit first, then cap at the upper limit.
    return min(max(x, lower), upper)

In [28]:
def apply_torque(state, torque):
    """Advance the Acrobot by one step of length ``dt`` under a constant torque.

    Args:
        state: ``(theta1, theta2, dtheta1, dtheta2)``
        torque: torque value appended to the state before integration
    Returns:
        The new state, with both angles wrapped into [-pi, pi] and both
        velocities bounded by their respective maximum magnitudes.
    """
    # rk4 integrates the state and the (constant) torque together.
    state_augemented = np.append(state, torque)
    next_state = rk4(dsdt, state_augemented, [0, dt])

    for angle_idx in (0, 1):
        next_state[angle_idx] = wrap(next_state[angle_idx], -pi, pi)

    for velocity_idx, limit in ((2, MAX_VEL_1), (3, MAX_VEL_2)):
        next_state[velocity_idx] = bound(next_state[velocity_idx], -limit, limit)

    return next_state

In [29]:
# Draw every component of the initial state uniformly from [-0.1, 0.1]
theta1, theta2, dtheta1, dtheta2 = (
    random.uniform(-0.1, 0.1) for _ in range(4)
)

init_state = np.array([theta1, theta2, dtheta1, dtheta2])

# Sanity check: apply a torque of 1 for a single step
new_state = apply_torque(init_state, 1)

# Discretise Acrobot State Space

In [30]:
# Δθ = 20˚ = π/9 rad
# Δdθ = 0.5 rad/s
# Number of states = 19*19*51*114 = 2,098,854

def discretise(state, D_angle=(pi/9), D_velocity=0.5):
    """Snap a state onto the discretisation grid.

    The input is left untouched; the snapped values are written to a copy.

    Args:
        state: ``(theta1, theta2, dtheta1, dtheta2)`` as a list or array
        D_angle: grid spacing applied to the two angles
        D_velocity: grid spacing applied to the two velocities
    Returns:
        A new list (for non-array input) or array (for array input) whose
        entries are the nearest multiples of the corresponding spacing.
    """
    # Copy first so the caller's list/array is not modified in place.
    snapped = state.copy() if hasattr(state, "copy") else list(state)

    snapped[0] = round(state[0]/(D_angle))*(D_angle)
    snapped[1] = round(state[1]/(D_angle))*(D_angle)

    snapped[2] = round(state[2]/(D_velocity))*(D_velocity)
    snapped[3] = round(state[3]/(D_velocity))*(D_velocity)

    return snapped

In [31]:
# Example call: discretise an arbitrary state using the default grid spacings
# (pi/9 for the two angles, 0.5 for the two velocities)
discretise([4.5,0.5,5.523,-12.008])

[4.537856055185257, 0.3490658503988659, 5.5, -12.0]

In [32]:
# Define step function (apply torque then discretise)
def step(state, torque):
    """Apply ``torque`` to ``state`` for one step and snap the result to the grid."""
    continuous_next = apply_torque(state, torque)
    return discretise(continuous_next)

In [33]:
# Smoke test: starting from rest, take 1000 steps with a randomly chosen
# torque each time and print every visited state
state = [0, 0, 0, 0]
for _ in range(1000):
    state = step(state, random.choice([-1, 0, 1]))
    print(state)

[ 0.   0.   0.  -0.5]
[ 0.  0.  0. -1.]
[ 0.         -0.34906585  0.         -1.        ]
[ 0.         -0.34906585  0.         -0.5       ]
[ 0.         -0.34906585  0.          0.        ]
[ 0.         -0.34906585  0.          0.5       ]
[ 0.         -0.34906585  0.          1.        ]
[0. 0. 0. 1.]
[0.         0.34906585 0.         1.        ]
[0.         0.34906585 0.         0.5       ]
[0.         0.34906585 0.         0.        ]
[ 0.          0.34906585  0.         -0.5       ]
[ 0.          0.34906585  0.         -1.        ]
[ 0.   0.   0.  -1.5]
[ 0.         -0.34906585  0.         -1.5       ]
[ 0.        -0.6981317  0.        -1.5      ]
[ 0.         -1.04719755  0.         -1.        ]
[ 0.         -1.04719755  0.          0.        ]
[ 0.         -1.04719755  0.          0.5       ]
[ 0.         -1.04719755  0.          1.        ]
[ 0.        -0.6981317  0.         1.5      ]
[ 0.         -0.34906585  0.          2.        ]
[0.  0.  0.  2.5]
[0.         0.34906585 0. 

# Generate Discrete State Space

In [34]:
# Generate variable distributions. Note +0.001 perturbation to include final value
# Each range is snapped to exact multiples of its step (pi/9 for the angles,
# 0.5 for the velocities), the same spacings discretise() uses by default.
# Only the angle ranges carry the +0.001 end-point perturbation; the velocity
# ranges are passed to np.arange without it.
theta1_dist = np.around(np.arange(-pi,pi+0.001,pi/9)/(pi/9))*pi/9
theta2_dist = np.around(np.arange(-pi,pi+0.001,pi/9)/(pi/9))*pi/9
dtheta1_dist = np.around(np.arange(-MAX_VEL_1,MAX_VEL_1,0.5)/0.5)*0.5
dtheta2_dist = np.around(np.arange(-MAX_VEL_2,MAX_VEL_2,0.5)/0.5)*0.5

In [35]:
# Generate all possible combinations of states with the variables
# This code creates the multi-dimensional co-ordinate space, and then combines the co-ordinates
# into a single 2-D array with one row per state and 4 columns
# (theta1, theta2, dtheta1, dtheta2)
# Numpy is used for this for efficiency, at the sacrifice of readability
# NOTE: the row order produced here defines the state indices used further down
state_space = np.array(np.meshgrid(theta1_dist,theta2_dist,dtheta1_dist,dtheta2_dist)).T.reshape(-1,4)

# Generate Transition Probabilities

The goal is to build a nested dictionary of the form:
```python
{
    's0': {
        'a0': {'s0': 0.5, 's2': 0.5},
        'a1': {'s2': 1}
    },
    's1': {
        'a0': {'s0': 0.7, 's1': 0.1, 's2': 0.2},
        'a1': {'s1': 0.95, 's2': 0.05}
    },
    's2': {
        'a0': {'s0': 0.4, 's2': 0.6},
        'a1': {'s0': 0.3, 's1': 0.3, 's2': 0.4}
    }
}
```
Where sX is an index for a state and aX is an index for an action.
Note that the dynamics are deterministic, so each state/action pair has exactly one possible successor state and every probability is 1.

In [36]:
# Define action space: action label -> torque value handed to step()
action_space = dict(a0=-1, a1=0, a2=1)

In [132]:
# Apply each action to every possible state: apply_aN[i] is the successor of
# state_space[i] under action "aN"
apply_a0, apply_a1, apply_a2 = (
    np.array([step(state, action_space[label]) for state in state_space])
    for label in ("a0", "a1", "a2")
)

In [107]:
def transform_states_to_posint(states):
    """Map an array of states to non-negative integer codes, column by column.

    Columns 0 and 1 (angles) are scaled by ``180/pi`` and columns 2 and 3
    (velocities) by ``2``. Each column is then shifted by the absolute value
    of its own minimum and rounded to the nearest integer.

    Note that the shift is derived from the array passed in, so two arrays
    only receive comparable codes when their column minima coincide.

    Args:
        states: 2-D array-like with one state per row
            ``(theta1, theta2, dtheta1, dtheta2)``
    Returns:
        Integer array with the same shape as ``states``.
    """
    states = np.asarray(states, dtype=float)
    states_transformed = np.empty(states.shape)

    for column, factor in enumerate((180/pi, 180/pi, 2, 2)):
        scaled = states[:, column] * factor
        # ndarray.min() instead of the builtin min(): same value, without
        # iterating over the column element by element in Python.
        states_transformed[:, column] = scaled + abs(scaled.min())

    # Round before casting: astype(int) truncates, so a value such as
    # 199.99999999999997 (float error from the radians -> degrees conversion)
    # would otherwise become 199 instead of 200.
    return np.rint(states_transformed).astype(int)

In [119]:
# Now perform index matching along the rows. Due to the size of the data, ravel_multi_index is used to create smaller linear-index unique representations of each 4D state
# See https://stackoverflow.com/questions/38674027/find-the-row-indexes-of-several-values-in-a-numpy-array for more details
def multi_dim_search(X, search_values):
    """Find, for every row of ``search_values``, the index of the equal row in ``X``.

    Args:
        X: 2-D array of reference states, one state per row
        search_values: 2-D array of states to look up, one state per row
    Returns:
        1-D integer array with one entry per row of ``search_values`` giving
        the matching row index in ``X``.

    Rows of ``search_values`` that do not occur in ``X`` are not detected:
    ``np.searchsorted`` then yields an insertion position, so the result is
    either the index of a different row or an ``IndexError``.
    """
    n_reference = len(X)

    # Transform both arrays in a single call. transform_states_to_posint
    # shifts every column by the minimum of the array it is given, so
    # transforming X and search_values separately gives equal states
    # different integer codes whenever their column minima differ.
    combined = transform_states_to_posint(
        np.concatenate((X, search_values), axis=0)
    )
    transformed_X = combined[:n_reference]
    transformed_search_values = combined[n_reference:]

    # Every code in either array must fit inside the raveled index space.
    dims = combined.max(0) + 1
    X1D = np.ravel_multi_index(transformed_X.T, dims)
    searched_valuesID = np.ravel_multi_index(transformed_search_values.T, dims)

    # Binary-search the linear ids via a sort permutation, then map the
    # sorted positions back to the original row indices of X.
    sidx = X1D.argsort()
    out = sidx[np.searchsorted(X1D, searched_valuesID, sorter=sidx)]

    return out

In [121]:
# For every action, look up the state_space row index of each successor state
apply_a0_idx_map, apply_a1_idx_map, apply_a2_idx_map = (
    multi_dim_search(state_space, successors)
    for successors in (apply_a0, apply_a1, apply_a2)
)

In [131]:
# Build {state: {action: {successor: probability}}}; every state/action pair
# has a single successor, so each probability is 1
transition_probs = {
    "s" + str(idx): {
        "a0": {"s" + str(next_a0): 1},
        "a1": {"s" + str(next_a1): 1},
        "a2": {"s" + str(next_a2): 1},
    }
    for idx, (next_a0, next_a1, next_a2) in enumerate(
        zip(apply_a0_idx_map, apply_a1_idx_map, apply_a2_idx_map)
    )
}

In [None]:
# Find state where Acrobot begins stationary (all four components equal to 0)
is_stationary = (state_space == [0, 0, 0, 0]).all(axis=1)
np.where(is_stationary)[0][0]

1058632