# Random Cartpole

Original environment: https://github.com/openai/gym/blob/master/gym/envs/classic_control/cartpole.py

###  Observation:

Type: Box(4)

| Num | Observation | Min | Max |
|:---:|:-----------:|:---:|:---:|
| 0   | Cart Position|-4.8|4.8|
| 1   | Cart Velocity|-Inf|Inf|
| 2   | Pole Angle   | -0.418 rad (-24 deg)|0.418 rad (24 deg)|
| 3   | Pole Angular Velocity|-Inf|Inf|
        
        
### Actions:

Type: Discrete(2)

| Num | Action                  |
|:---:|:-----------------------:|
|  0  |  Push cart to the left  |
|  1  |  Push cart to the right |

Note: The amount by which the velocity is reduced or increased is not
fixed; it depends on the angle at which the pole is pointing. This is
because the center of gravity of the pole increases the amount of energy
needed to move the cart underneath it.
### Reward:
Reward is 1 for every step taken, including the termination step.


In [None]:
%matplotlib inline

In [None]:
# Do you already have these?
#!pip install gym pyglet

In [None]:
import gym
import numpy as np
from sklearn.linear_model import LinearRegression

In [None]:
# Build the CartPole-v1 environment that the cells below sample from and render
env = gym.make("CartPole-v1")

In [None]:
# Reset the environment and display the observation that reset() returns
observation = env.reset()
print(observation)

In [None]:
## This should project s into a higher space, or itself
def phi(s):
    """Map a state to its feature representation.

    Currently the identity map: the state is handed back untouched.
    Swap the body for a richer projection to experiment with other
    feature spaces.
    """
    features = s
    return features

In [None]:
def state_reward(s,env):
    """Return the shaped reward assigned to state ``s``.

    ``s`` unpacks into (cart position, cart velocity, pole angle, pole
    angular velocity). ``env`` supplies ``x_threshold`` and
    ``theta_threshold_radians``, the limits beyond which the state is
    treated as failed.

    Returns:
        -1.0 if the cart position or pole angle lies outside its limit,
         0.0 if the cart is within a tenth of the position limit of the
             centre and the pole's angular speed is below 0.1,
         1.0 for every other state.
    """
    cart_pos, _cart_vel, pole_angle, pole_ang_vel = s
    x_limit = env.x_threshold

    # Failure region: cart off the track or pole tipped too far.
    if (
        cart_pos < -x_limit
        or cart_pos > x_limit
        or pole_angle < -env.theta_threshold_radians
        or pole_angle > env.theta_threshold_radians
    ):
        return -1.0

    centred_and_steady = (
        np.abs(cart_pos) < x_limit/10 and np.abs(pole_ang_vel) < 0.1
    )
    return 0.0 if centred_and_steady else 1.0
    


# Learn the fitted value function

In [None]:
# Fitted value iteration: on every pass draw m random states, back up
# R(s) + gamma * V(s') through the simulator for each action, and refit the
# linear value function V on the best backed-up value of each state.
env.reset()

# gym.make() returns the environment inside wrappers. Assigning `env.state`
# only creates an attribute on the outermost wrapper and never reaches the
# simulator, so states are forced (and stepped) on the unwrapped env.
sim = env.unwrapped

k = 1  # next-state samples per (state, action); cartpole seems deterministic... 1 should work
q = np.zeros(2)  # float buffer: an integer array would truncate the backed-up values
gamma = 0.99  # discount factor
epsilon = 0.0005  # stop once the coefficients move less than this between passes
max_iterations = 100  # hard cap on the number of passes

# Total number of random state samples
m = 15000

# Per-dimension half-ranges for the uniform samples (magic numbers from the env ranges)
state_scale = [4.8, 2, 0.4, 1]

# Sample m states
states = np.random.uniform(-1, 1, size=(m, 4)) * state_scale

yi = np.zeros((m, 1))

# Linear regression used for fitting the value function.
# The former `normalize=True` argument is omitted: scikit-learn removed it in
# 1.2, and for ordinary least squares it does not change the fitted predictions.
reg = LinearRegression()

# Force an initial dummy estimation of the parameters
reg.fit(states, yi)  # With yi=0 then the coef_=0

# Container for the following states
sprime = np.empty((k, 4))

iteration = 0
converged = False
while not converged:
    states = np.random.uniform(-1, 1, size=(m, 4)) * state_scale

    for i in range(m):  # for all initial random states
        s = list(states[i, :])
        r = state_reward(s, sim)  # depends only on s, so shared by both actions

        for a in (0, 1):  # for all actions
            for j in range(k):  # for all samples from the start state
                sim.state = s  # Force the initial state on the simulator
                state_next, _, done, _ = sim.step(a)
                if done:
                    # Start a fresh episode so the next forced step does not
                    # run on an already-finished one.
                    sim.reset()
                sprime[j, :] = np.array(state_next)

            q[a] = r + gamma * np.mean(reg.predict(sprime))

        # Regression target is the best backed-up *value*, not the index of
        # the best action: reg is used as V(s') in the backup above.
        yi[i] = np.max(q)

    lastCoefs = reg.coef_.copy()

    reg.fit(states, yi)

    coefDist = np.linalg.norm(lastCoefs - reg.coef_)
    print(iteration, "Distance: ", coefDist, flush=True)

    iteration += 1
    converged = coefDist < epsilon or iteration >= max_iterations

print("Learning finished")

# Using the learned value function

In [None]:
# Roll out the greedy policy induced by the fitted value function `reg`:
# at every step try both actions on the simulator, score the resulting
# states with reg.predict, and apply the action with the higher score.
max_episodes = 100

average_steps = 0
smoothing = 0.9  # weight of the running average (named to avoid shadowing builtin `filter`)

# State must be read and written on the unwrapped simulator: assigning
# `env.state` would only set an attribute on the gym.make() wrapper.
# Look-ahead steps also run on it so they do not go through the wrappers
# that track the real episode.
sim = env.unwrapped

# Local float buffer for the two action scores
q = np.zeros(2)

for episode in range(1, max_episodes + 1):
    env.reset()

    step = 0
    while True:
        step += 1
        s = sim.state  # Save current state

        for a in (0, 1):  # for all actions
            sim.state = s  # Reset the current state
            state_next, _, done, _ = sim.step(a)
            if done:
                # Start a fresh episode so the next trial step does not run
                # on an already-finished one; the saved state is restored below.
                sim.reset()

            sprime = np.array(state_next).reshape(1, 4)

            # predict returns a one-element array; take its scalar explicitly
            q[a] = reg.predict(sprime).item()

        # Find out which action was predicted best
        if q[0] == q[1]:
            a = np.random.randint(2)
        else:
            a = int(np.argmax(q))

        sim.state = s  # Restore the current state
        env.render()  # Show it

        # And apply the selected action
        state_next, reward, done, info = env.step(a)

        if done:
            average_steps = average_steps*smoothing + (1.0-smoothing)*step
            print("Episode: " + str(episode) +
                  ", step: " + str(step) +
                  ", average steps: " + str(average_steps))

            break
