<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Reinforcement-Learning--Functions" data-toc-modified-id="Reinforcement-Learning--Functions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Reinforcement Learning  Functions</a></span></li><li><span><a href="#CartPole" data-toc-modified-id="CartPole-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>CartPole</a></span></li><li><span><a href="#MountainCar" data-toc-modified-id="MountainCar-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>MountainCar</a></span></li></ul></div>

# Reinforcement Learning - Random and Linear Regression
Made by Carlo di Francescantonio, based on the course *Advanced AI: Deep Reinforcement Learning* from Udemy. In this notebook we study two naive approaches: a random method and a linear regression whose parameters are found by random search.

In [1]:
# Libraries for RL
import gym
from gym import wrappers
import numpy as np

# Libraries for MP4 videos
import io
import base64
from IPython.display import HTML

## Reinforcement Learning  Functions
Definition of basic RL functions, such as playing one episode and playing multiple episodes.

In [2]:
# Play one episode
def play_episode(env, decision_function, param = None):
    """Run one episode on ``env`` and return how many steps it lasted.

    Parameters
    ----------
    env : object
        Environment exposing ``reset()`` (returns the first observation) and
        ``step(action)`` (returns a 4-tuple whose third item is the
        episode-finished flag).
    decision_function : callable
        Called as ``decision_function(env, observation, param)``; its return
        value is passed to ``env.step``.
    param : any, optional
        Forwarded unchanged to ``decision_function`` on every step.

    Returns
    -------
    int
        Number of ``env.step`` calls made before the finished flag was set.
    """
    state = env.reset()
    steps_taken = 0
    while True:
        chosen_action = decision_function(env, state, param)
        # Reward and info are not used: only the episode length is measured.
        state, _reward, finished, _info = env.step(chosen_action)
        steps_taken += 1
        if finished:
            break
    return steps_taken

# Play multiple episodes
def play_multiple_episodes(env, decision_function, N, play_video = True, param = None, print_results=True):
    """Play ``N`` episodes and return the mean episode duration.

    The first ``N - 1`` episodes run on ``env`` as given. The last episode
    runs on ``env`` wrapped in ``wrappers.Monitor`` when ``play_video`` is
    truthy, so that only that episode is recorded under ``./gym-results``.

    Parameters
    ----------
    env : object
        Environment passed to ``play_episode``. ``env.close()`` is called on
        it (or on its Monitor wrapper) before this function returns.
    decision_function : callable
        Policy forwarded to ``play_episode``.
    N : int
        Number of episodes to play; must be at least 1.
    play_video : bool, optional
        When truthy, record the last episode and display it inline.
    param : any, optional
        Forwarded to ``play_episode`` / ``decision_function``.
    print_results : bool, optional
        When truthy, print min / mean / max / std of the episode durations.

    Returns
    -------
    numpy.float64
        Mean duration (in steps) over the ``N`` episodes.

    Raises
    ------
    ValueError
        If ``N`` is smaller than 1 (there would be no episode to average).
    """
    if N < 1:
        raise ValueError("N must be at least 1, got %r" % (N,))

    episodes_idx = np.empty(N)
    for i in range(N-1):
        episodes_idx[i] = play_episode(env, decision_function, param)

    # Only the final episode is recorded: wrap the env right before playing it.
    if play_video:
        env = wrappers.Monitor(env, "./gym-results", force=True)
    episodes_idx[N-1] = play_episode(env, decision_function, param)

    # Closed before the video is read (presumably this lets the Monitor
    # finish writing the file -- confirm against the gym Monitor docs).
    env.close()

    if print_results:
        print("Min Duration: ",episodes_idx.min())
        print("Average Duration: ",episodes_idx.mean())
        print("Max Duration: ",episodes_idx.max())
        print("Standard Deviation: ",round(episodes_idx.std(),2))

    # Show the video
    if play_video:
        # `display` is only injected as a builtin inside IPython sessions;
        # importing it keeps this branch working elsewhere too.
        from IPython.display import display

        video_path = './gym-results/openaigym.video.%s.video000000.mp4' % env.file_infix
        # Read-only binary mode, and the handle is closed deterministically.
        with io.open(video_path, 'rb') as video_file:
            video = video_file.read()
        encoded = base64.b64encode(video)
        graph = HTML(data='''<video width="980" height="auto" alt="test" controls>
        <source src="data:video/mp4;base64,{0}" type="video/mp4" /></video>'''.format(encoded.decode('ascii')))
        display(graph)

    return(episodes_idx.mean())

## CartPole
A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. The pendulum starts upright, and the goal is to prevent it from falling over by increasing and reducing the cart's velocity.

In [3]:
# Instantiate gym's 'CartPole-v0' environment; this single instance is reused
# by every CartPole cell below.
CartPole = gym.make('CartPole-v0')

In [4]:
# Random decision function
def decision_cartpole_random(env, observation,param=None):
    """Return an action sampled from ``env.action_space``.

    ``observation`` and ``param`` are accepted only so the signature matches
    the other decision functions; neither is used.
    """
    return env.action_space.sample()

# Linear Regression decision function
def decision_dot_product_cartpole(env, observation,param):
    """Choose a CartPole action from the sign of ``param . observation``.

    Returns 0 when the dot product is strictly positive and 1 otherwise
    (including when it is exactly zero). ``env`` is not used.
    """
    weights = np.array(param)
    features = np.array(observation)
    return 0 if np.dot(weights, features) > 0 else 1

# Tuning the parameters of the L.R.
def random_search_tuning_cartpole(env,iterations,N):
    """Random search for the weights used by ``decision_dot_product_cartpole``.

    For each of ``iterations`` trials, one weight per observation dimension is
    drawn from ``np.random.random()*2-1`` and scored by the mean duration of
    ``N`` episodes (no video, no printing). Longer is better here.

    Returns the weight list with the highest mean duration, or ``None`` if no
    trial scored above 0 (e.g. ``iterations == 0``).
    """
    top_score = 0
    top_weights = None
    for _trial in range(iterations):
        n_features = env.observation_space.shape[0]
        candidate = [2 * np.random.random() - 1 for _ in range(n_features)]
        score = play_multiple_episodes(
            env=env,
            decision_function=decision_dot_product_cartpole,
            N=N,
            play_video=False,
            param=candidate,
            print_results=False,
        )
        if score > top_score:
            top_score, top_weights = score, candidate
    return top_weights


In [5]:
# Baseline: 1000 CartPole episodes driven by randomly sampled actions
play_multiple_episodes(
    env=CartPole,
    decision_function=decision_cartpole_random,
    N=1000,
)

Min Duration:  8.0
Average Duration:  22.315
Max Duration:  95.0
Standard Deviation:  11.12


22.315

In [6]:
# Random search over the dot-product weights, then evaluate the best
# weights found on 1000 CartPole episodes
best_param = random_search_tuning_cartpole(env=CartPole, iterations=200, N=20)
play_multiple_episodes(
    env=CartPole,
    decision_function=decision_dot_product_cartpole,
    N=1000,
    param=best_param,
)

Min Duration:  200.0
Average Duration:  200.0
Max Duration:  200.0
Standard Deviation:  0.0


200.0

## MountainCar
Get an underpowered car to the top of a hill (top = position 0.5).

In [7]:
# Instantiate gym's 'MountainCar-v0' environment; this single instance is
# reused by every MountainCar cell below.
MountainCar = gym.make('MountainCar-v0')

In [8]:
# Random decision function
def decision_mountaincar_random(env, observation,param=None):
    """Return an action sampled from ``env.action_space``.

    ``observation`` and ``param`` exist only to match the common decision
    function signature and are ignored.
    """
    sampled_action = env.action_space.sample()
    return sampled_action

# Linear Regression decision function
def decision_dot_product_mountaincar(env, observation, param):
    """Choose a MountainCar action from the sign of ``param . observation``.

    Returns 0 when the dot product is strictly positive, otherwise 2
    (presumably push-left / push-right in MountainCar-v0 -- confirm against
    the gym docs). ``env`` is not used.
    """
    projection = np.dot(np.array(param), np.array(observation))
    if projection > 0:
        return 0
    return 2

# Tuning the parameters of the L.R.
def random_search_tuning_mountaincar(env,iterations,N):
    """Random search for the weights used by ``decision_dot_product_mountaincar``.

    For each of ``iterations`` trials, one weight per observation dimension is
    drawn from ``np.random.random()*2-1`` and scored by the mean duration of
    ``N`` episodes (no video, no printing). Shorter is better here, so the
    candidate with the lowest mean duration wins; ties keep the earliest one.

    Parameters
    ----------
    env : object
        Environment exposing ``observation_space.shape`` and accepted by
        ``play_multiple_episodes``.
    iterations : int
        Number of random candidates to evaluate.
    N : int
        Episodes played per candidate.

    Returns
    -------
    list of float or None
        Best weight vector found; ``None`` only when ``iterations`` is 0.
    """
    # Start from +inf rather than a fixed 200: with a hard-coded threshold and
    # a strict `<`, a search where no candidate averaged below 200 returned
    # None, which then broke the dot-product policy downstream.
    best_idx = float("inf")
    best_parameters = None
    for i in range(iterations):
        param = [np.random.random()*2-1 for _ in range(env.observation_space.shape[0])]
        random_mean = play_multiple_episodes(env = env,decision_function = decision_dot_product_mountaincar, N = N, play_video = False,param=param,print_results=False)
        if random_mean < best_idx:
            best_idx = random_mean
            best_parameters = param
    return(best_parameters)

In [9]:
# Baseline: 1000 MountainCar episodes driven by randomly sampled actions
play_multiple_episodes(
    env=MountainCar,
    decision_function=decision_mountaincar_random,
    N=1000,
)

Min Duration:  200.0
Average Duration:  200.0
Max Duration:  200.0
Standard Deviation:  0.0


200.0

In [10]:
# Random search over the dot-product weights, then evaluate the best
# weights found on 1000 MountainCar episodes
best_param = random_search_tuning_mountaincar(
    env=MountainCar,
    iterations=1000,
    N=25,
)
play_multiple_episodes(
    env=MountainCar,
    decision_function=decision_dot_product_mountaincar,
    N=1000,
    param=best_param,
)

Min Duration:  113.0
Average Duration:  120.077
Max Duration:  130.0
Standard Deviation:  4.09


120.077