In [2]:
# !pip install gym
# !pip install pygame
import gym
import time
from random import randint

Collecting gym
[?25l  Downloading https://files.pythonhosted.org/packages/34/e8/c8953e7fb2e3b3a232a21f87248b87fe9354b8db74e79ece99f53ce31a3d/gym-0.24.0.tar.gz (694kB)
[K     |████████████████████████████████| 696kB 4.1MB/s eta 0:00:01
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h    Preparing wheel metadata ... [?25ldone
Collecting gym-notices>=0.0.4 (from gym)
  Downloading https://files.pythonhosted.org/packages/81/7d/1facd00576b34597a8caa3ba59e3e576c34ba8210fe7c28ac0c270d19070/gym_notices-0.0.7-py3-none-any.whl
Collecting cloudpickle>=1.2.0 (from gym)
  Downloading https://files.pythonhosted.org/packages/25/40/2c9db9cfb85a8a21c61528f6660c47662b3e59576efac610d8268d47abba/cloudpickle-2.1.0-py3-none-any.whl
Collecting importlib-metadata>=4.8.0; python_version < "3.10" (from gym)
  Downloading https://files.pythonhosted.org/packages/ab/b5/1bd220dd470b0b912fc31499e0d9c652007a60caf137995867ccc4b98cb6/importlib_met

# Cargo el entorno

In [3]:
# Identifier of the Gym environment used by every cell below.
ENV_ID = 'CartPole-v1'
env = gym.make(ENV_ID)

# Simulación

In [5]:
# Roll out a few episodes with a uniformly random action at every step,
# as a smoke test of the environment before any learning happens.
RENDER = False  # set to True to draw every step (and slow the loop down to watch it)

for _ in range(10):
    # A single reset per episode is enough: the next iteration of this
    # loop starts the following episode, so no extra reset is needed
    # once `done` becomes True.
    obs = env.reset()
    done = False
    while not done:
        action = randint(0, 1)  # random choice between action 0 and action 1
        obs, reward, done, info = env.step(action)
        if RENDER:
            env.render()
            # The pause only matters when a human is watching the animation.
            time.sleep(0.05)
env.close()

In [9]:
import numpy as np
def createEpsilonGreedyPolicy(Q, epsilon, num_actions):
    """
    Build an epsilon-greedy policy on top of a tabular Q-function.

    Parameters
    ----------
    Q : mapping
        Maps a state key (a tuple holding the components of the
        observation) to a sequence of ``num_actions`` action values.
        The table is read with ``Q[key]``, so a ``defaultdict`` will
        create a row for a state that has not been seen before.
    epsilon : float
        Probability mass spread uniformly over all actions.
    num_actions : int
        Number of available actions.

    Returns
    -------
    callable
        ``policyFunction(state)``, which returns a numpy array of length
        ``num_actions`` holding the probability of taking each action.
    """
    def policyFunction(state):
        # Every action receives an equal share of the exploration mass.
        action_probabilities = np.full(num_actions,
                                       epsilon / num_actions,
                                       dtype=float)

        # Flatten the observation into a hashable key. This works for an
        # observation of any length; for a 4-component observation it is
        # the same key as (state[0], state[1], state[2], state[3]).
        state_key = tuple(np.ravel(state))

        # The greedy action gets all of the remaining probability.
        best_action = np.argmax(Q[state_key])
        action_probabilities[best_action] += (1.0 - epsilon)
        return action_probabilities

    return policyFunction

In [7]:
from collections import defaultdict
import gym
import itertools
import matplotlib
import matplotlib.style
import numpy as np
import pandas as pd
import sys

def qLearning(env, num_episodes, discount_factor = 1.0,
                            alpha = 0.6, epsilon = 0.1):
    """
    Q-Learning algorithm: off-policy TD control.

    Learns action values for the greedy policy while collecting
    experience with an epsilon-greedy policy.

    Parameters
    ----------
    env :
        Environment object. ``env.reset()`` must return an observation,
        ``env.step(action)`` must return a 4-tuple
        ``(next_state, reward, done, info)``, and ``env.action_space.n``
        must give the number of actions.
    num_episodes : int
        Number of episodes to run.
    discount_factor : float
        Weight applied to the value of the next state in the TD target.
    alpha : float
        Step size of the TD update.
    epsilon : float
        Exploration rate handed to the epsilon-greedy policy.

    Returns
    -------
    collections.defaultdict
        Maps a state key (tuple of the observation's components) to a
        numpy array with one action value per action.

    NOTE(review): the table is keyed by the raw observation values. If the
    observations are continuous, the same key will presumably almost never
    occur twice, so the table cannot generalize - consider discretizing
    the observation before using it as a key.
    """
    num_actions = env.action_space.n

    # Action-value table: state key -> array of action values.
    # Unseen states start with all-zero values.
    Q = defaultdict(lambda: np.zeros(num_actions))

    # Behavior policy; it reads Q live, so updates below take effect at once.
    policy = createEpsilonGreedyPolicy(Q, epsilon, num_actions)

    for _episode in range(num_episodes):
        state = env.reset()
        done = False

        while not done:
            # Sample an action from the epsilon-greedy distribution.
            action_probabilities = policy(state)
            action = np.random.choice(len(action_probabilities),
                                      p = action_probabilities)

            next_state, reward, done, _info = env.step(action)

            # Hashable keys for the table. Works for observations of any
            # length; for 4 components it is the same key as
            # (state[0], state[1], state[2], state[3]).
            state_key = tuple(np.ravel(state))
            next_state_key = tuple(np.ravel(next_state))

            # TD update towards reward + discounted best next value.
            # The bootstrap term is used even when `done` is True; the row
            # of a state that was never updated is all zeros.
            td_target = reward + discount_factor * np.max(Q[next_state_key])
            td_delta = td_target - Q[state_key][action]
            Q[state_key][action] += alpha * td_delta

            state = next_state

    return Q



In [10]:
# Number of training episodes for the tabular Q-learning run.
NUM_EPISODES = 1000
Q = qLearning(env, num_episodes=NUM_EPISODES)