In [1]:
import numpy as np
import gym
import gym_sokoban
import time
from sokoban_env import SokobanEnv

In [2]:
import sys

# Print the interpreter version string so the Python build used for this run
# is captured in the cell output.
print(sys.version)

3.9.7 (default, Sep 16 2021, 16:59:28) [MSC v.1916 64 bit (AMD64)]


# Q-learning Implementation

In [3]:
# Build the Sokoban environment used by every later cell.
# The room dimensions and box count are named so they are easy to find and tune.
room_shape = (10, 10)
box_count = 2
env = SokobanEnv(dim_room=room_shape, num_boxes=box_count)

In [4]:
# Reset the environment and keep the initial observation it returns.
observation = env.reset()
# Render the freshly reset environment. This call is the cell's last
# expression, so its return value is echoed as the cell output.
env.render(mode='human')

True

In [5]:
# Action lookup: whatever the unwrapped environment's get_action_lookup() returns
# (presumably a mapping from action index to a readable action name -- TODO confirm).
ACTION_LOOKUP = env.unwrapped.get_action_lookup()
# Convert state to tuple representation (hashable key for the tabular Q-learning table)
def state_to_tuple(state):
    """Return the elements of ``state`` as a flat tuple.

    ``state`` is flattened in row-major order and its elements are packed
    into a tuple, which is hashable and can therefore be used as a
    dictionary key for the Q-table.
    """
    flattened = state.reshape(-1)
    return tuple(flattened)

In [None]:
# Q-learning parameters
num_episodes = 500        # number of training episodes to run
learning_rate = 0.5       # step size (alpha) applied to the TD error in the update
discount_factor = 0.99    # discount (gamma) applied to the best next-state Q-value
exploration_prob = 0.01   # probability (epsilon) of taking a random action

# Q-table initialization: maps a state tuple to an array holding one Q-value per action
q_table = {}

# Q-learning algorithm
# The whole run is wrapped in try/finally so the environment is closed even
# when training raises or the kernel is interrupted part-way through.
try:
    for episode in range(num_episodes):
        state = env.reset()
        done = False
        state_tuple = state_to_tuple(state)
        total_reward = 0

        # Initialize Q-values for the current state if not present
        if state_tuple not in q_table:
            q_table[state_tuple] = np.zeros(env.action_space.n)

        while not done:
            # Draw every step and pause briefly between frames
            env.render(mode='human')
            time.sleep(0.01)

            # Choose the action based on epsilon-greedy policy:
            # random with probability exploration_prob, otherwise the
            # action with the highest current Q-value for this state.
            if np.random.rand() < exploration_prob:
                action = env.action_space.sample()
            else:
                action = np.argmax(q_table[state_tuple])

            # Take the chosen action
            next_state, reward, done, _ = env.step(action)
            next_state_tuple = state_to_tuple(next_state)

            # Initialize Q-values for the next state if not present
            if next_state_tuple not in q_table:
                q_table[next_state_tuple] = np.zeros(env.action_space.n)

            # Q-learning Q-value update (off-policy: bootstraps from the max over next actions):
            #   Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a))
            q_value = q_table[state_tuple][action]
            max_next_q_value = np.max(q_table[next_state_tuple])
            q_table[state_tuple][action] = q_value + learning_rate * (reward + discount_factor * max_next_q_value - q_value)

            state = next_state.copy()  # Copy the next_state into the state variable
            state_tuple = next_state_tuple
            total_reward += reward

        # The loop above only exits once `done` is True, so the episode summary
        # is reported here rather than via an in-loop check and break.
        print("Episode: {}, Total Reward: {}".format(episode + 1, total_reward))
finally:
    # Always close the environment so it is not left open after a failure or interrupt
    env.close()

Episode: 1, Total Reward: -40.10000000000006
Episode: 2, Total Reward: -44.10000000000005
Episode: 3, Total Reward: -33.10000000000006
Episode: 4, Total Reward: -77.60000000000005
Episode: 5, Total Reward: -56.300000000000054
Episode: 6, Total Reward: -77.60000000000005
Episode: 7, Total Reward: -51.60000000000006
Episode: 8, Total Reward: -18.600000000000037
Episode: 9, Total Reward: -57.100000000000044
Episode: 10, Total Reward: -57.10000000000005
Episode: 11, Total Reward: -73.30000000000004
Episode: 12, Total Reward: -34.80000000000005
Episode: 13, Total Reward: -57.10000000000005
Episode: 14, Total Reward: -55.600000000000044
Episode: 15, Total Reward: -57.10000000000005
Episode: 16, Total Reward: -58.300000000000054
Episode: 17, Total Reward: -56.30000000000005
Episode: 18, Total Reward: -53.10000000000005
Episode: 19, Total Reward: -73.30000000000005
Episode: 20, Total Reward: -56.10000000000005
Episode: 21, Total Reward: -56.10000000000005
Episode: 22, Total Reward: -52.1000000