In [13]:
#------------------------------------------------------------------------------------------------------------------
#   Acrobot example for reinforcement learning
#------------------------------------------------------------------------------------------------------------------
import gymnasium as gym
import numpy as np
import random
from time import sleep
from sklearn.neural_network import MLPRegressor

In [14]:
###### Q-Learning ######
# Shared setup for the notebook: the environment, the Q-learning
# hyper-parameters, and the neural network that approximates Q(state, action).
# Every name defined here is read by the cells below.

# Create the environment (no rendering while training)
env = gym.make("Acrobot-v1", render_mode=None)

# Parameters
alpha = 0.1   # Blending weight between the old Q-value and the new TD target
gamma = 0.6   # Discount applied to the best next-state Q-value
 
epsilon = 1.0           # Start with high exploration
epsilon_decay = 0.995   # Decay factor for epsilon (applied once per training episode)
min_epsilon = 0.01      # Minimum epsilon value

# Observation and action space dimensions
obs_space = env.observation_space.shape[0]  # Length of the observation vector (6 for Acrobot-v1)
num_actions = env.action_space.n  # Number of discrete actions (3 for Acrobot-v1 per the Gymnasium docs)

# Neural network: maps one observation to one Q-value per action
model = MLPRegressor(hidden_layer_sizes=(50, 50), activation='relu', solver='adam', max_iter=10)

# Initial model training with dummy data.
# The random targets carry no meaning; this fit only initializes the network
# with the right input/output widths so that predict() can be called before
# any real transition has been learned.
X = np.random.rand(10, obs_space)
y = np.random.rand(10, num_actions)
model.fit(X, y)





In [15]:
# Epsilon-greedy action selection
def choose_action(state, epsilon):
    """Pick an action for `state` using an epsilon-greedy policy.

    With probability `epsilon` a random action is sampled from the global
    `env`'s action space; otherwise the action whose Q-value predicted by the
    global `model` is largest is returned.

    Args:
        state: a single observation, passed to the model as a one-row batch.
        epsilon: exploration probability; 0.0 is fully greedy.

    Returns:
        The index of the chosen action.
    """
    explore = np.random.rand() < epsilon
    if not explore:
        # Greedy branch: the model returns one row of Q-values per input row.
        predicted = model.predict([state])
        return np.argmax(predicted[0])

    return env.action_space.sample()


In [16]:
def calculate_height(state):
    """Return the height of the tip of the second link for an observation.

    The first four entries of `state` are read as
    (cos θ1, sin θ1, cos θ2, sin θ2). The joint angles are recovered with
    arctan2(sin, cos) and the tip height is -cos(θ1) - cos(θ1 + θ2), i.e.
    -2 when both links hang straight down and +2 when both point straight up.
    """
    cos1, sin1 = state[0], state[1]
    cos2, sin2 = state[2], state[3]

    # Recover the joint angles from their cosine/sine encoding.
    angle1 = np.arctan2(sin1, cos1)
    angle2 = np.arctan2(sin2, cos2)

    # Height of the end effector relative to the pivot.
    return -np.cos(angle1) - np.cos(angle1 + angle2)

In [17]:
# Training phase
#
# One-step Q-learning with the neural network `model` as function
# approximator. For every transition the reward is replaced by a shaped
# reward, the Q-value of the taken action is moved towards the TD target, and
# the network is nudged towards the updated Q-vector with partial_fit.
#
# Reward shaping:
#   * terminated (the environment reports its terminal/goal condition):
#     fixed bonus of 100.
#   * otherwise: 100 * (tip height after the step - tip height before it).
# A time-limit truncation is NOT a success, so it gets the ordinary shaped
# reward rather than the bonus.
episodes = 1000
for i in range(episodes):
    state = env.reset()[0]  # Reset the environment and get the initial state
    epochs = 0              # Steps taken in this episode (name kept for the status line)
    terminated = False
    truncated = False

    while not terminated and not truncated:

        # Select the next action (epsilon-greedy)
        action = choose_action(state, epsilon)

        # Perform the selected action
        next_state, reward, terminated, truncated, info = env.step(action)

        # Shaped reward. The heights are computed from THIS transition on
        # every step, so no stale or undefined values are ever used.
        if terminated:
            reward = 100
        else:
            height_gain = calculate_height(next_state) - calculate_height(state)
            reward = height_gain * 100

        # Current Q-values for the state we acted in
        q_values = model.predict([state])[0]

        # TD target: do not bootstrap past a terminal state. A truncated
        # episode was merely cut off, so its next state still has value.
        if terminated:
            target = reward
        else:
            next_q_values = model.predict([next_state])[0]
            target = reward + gamma * np.max(next_q_values)

        # Update the Q-value of the taken action
        q_values[action] = (1 - alpha) * q_values[action] + alpha * target

        # Train the model with the new Q-values
        model.partial_fit([state], [q_values])

        # Update state and count steps
        state = next_state
        epochs += 1

    # Decay exploration once per episode
    epsilon = max(min_epsilon, epsilon * epsilon_decay)
    # Show training status
    print(f"Episode: {i}, epochs: {epochs}")

print("Training finished.\n")


Episode: 0, epochs: 500
Episode: 1, epochs: 500
Episode: 2, epochs: 500
Episode: 3, epochs: 500
Episode: 4, epochs: 500
Episode: 5, epochs: 500
Episode: 6, epochs: 500
Episode: 7, epochs: 500
Episode: 8, epochs: 500
Episode: 9, epochs: 500
Episode: 10, epochs: 500
Episode: 11, epochs: 500
Episode: 12, epochs: 500
Episode: 13, epochs: 500
Episode: 14, epochs: 500
Episode: 15, epochs: 500
Episode: 16, epochs: 500
Episode: 17, epochs: 500
Episode: 18, epochs: 500
Episode: 19, epochs: 500
Episode: 20, epochs: 500
Episode: 21, epochs: 461
Episode: 22, epochs: 500
Episode: 23, epochs: 500
Episode: 24, epochs: 500
Episode: 25, epochs: 500
Episode: 26, epochs: 402
Episode: 27, epochs: 415
Episode: 28, epochs: 500
Episode: 29, epochs: 500
Episode: 30, epochs: 500
Episode: 31, epochs: 500
Episode: 32, epochs: 500
Episode: 33, epochs: 500
Episode: 34, epochs: 500
Episode: 35, epochs: 500
Episode: 36, epochs: 500
Episode: 37, epochs: 500
Episode: 38, epochs: 500
Episode: 39, epochs: 345
Episode: 4

In [18]:
# Execution phase
#
# Runs the greedy policy (epsilon = 0) for 10 rendered episodes.
#
# The environment returned by gym.make is used as-is. Unwrapping it with
# `.env` would drop the time-limit wrapper, and an episode the policy cannot
# solve would then never end. In 'human' render mode the window is refreshed
# by reset()/step() themselves, so no explicit render() calls are needed.
env = gym.make("Acrobot-v1", render_mode='human')

try:
    for k in range(10):
        state = env.reset()[0]

        epochs = 0          # Steps taken in this episode
        terminated = False
        truncated = False

        while not terminated and not truncated:

            # Select the action with the best Q-value
            action = choose_action(state, epsilon=0.0)

            # Perform the selected action
            state, reward, terminated, truncated, info = env.step(action)

            # Show current state
            print(f'State: {state}  Action: {action}  Reward: {reward}')

            # Count steps
            epochs += 1

            # Slow the animation down so it can be followed by eye
            sleep(.02)

        print(f"Timesteps taken: {epochs}")
finally:
    # Always release the render window, even if the run is interrupted.
    env.close()

State: [ 0.9998897   0.01485289  0.99841696  0.05624565 -0.1393924   0.26314676]  Action: 2  Reward: -1.0
State: [ 0.999674   -0.0255324   0.9907726   0.13553447 -0.25356245  0.51311684]  Action: 2  Reward: -1.0
State: [ 0.9967182  -0.08094938  0.96865624  0.24840504 -0.28651723  0.6091096 ]  Action: 2  Reward: -1.0
State: [ 0.99104935 -0.13349588  0.93319434  0.35937214 -0.22816539  0.52950776]  Action: 2  Reward: -1.0
State: [ 0.9860177  -0.16664053  0.8990428   0.43786076 -0.0985437   0.3090892 ]  Action: 2  Reward: -1.0
State: [ 0.98538977 -0.1703145   0.88405293  0.4673868   0.06243928  0.01710454]  Action: 2  Reward: -1.0
State: [ 0.98973256 -0.1429317   0.8958132   0.4444307   0.2085331  -0.26704127]  Action: 2  Reward: -1.0
State: [ 0.99580836 -0.09146454  0.92674345  0.37569487  0.29746643 -0.46752968]  Action: 2  Reward: -1.0
State: [ 0.99954146 -0.03027976  0.96017426  0.27940175  0.30039445 -0.5261428 ]  Action: 2  Reward: -1.0
State: [ 0.9997507   0.02232914  0.98280597  0

KeyboardInterrupt: 

: 