# Logica del Juego

In [144]:
import numpy as np
import gym
import random
from termcolor import colored, cprint
import time
import math
from IPython.display import clear_output
from matplotlib import pyplot as plot

In [145]:
# 0 = empty_space; 1 = player; 2 = goal 
def draw_map(size, player_position, goal_position):
    map = np.zeros((size[1], size[0]), dtype=np.int)
    map[player_position[1], player_position[0]] = 1
    map[goal_position[1], goal_position[0]] = 2
    
    return map

In [146]:

class GameEnviroment:
    def __init__(self, map, size, player_initial_state, goal_position):
        self.map = map
        self.size = size
        self.player_state = player_initial_state
        self.goal = goal_position 
        self.rewards = {
            "walk": -1,
            "fall": -5,
            "goal": 20
        }

    def __str__(self):
        return str(self.map)
    
    def reset(self, initial_map, player_initial_state):
        self.map = initial_map
        self.player_state = player_initial_state
        return ((10 * self.player_state[0]) + self.player_state[1])

    def is_in_goal(self):
        return (self.player_state[0] == self.goal[0] and self.player_state[1] == self.goal[1])
    
    def has_fallen_of_map(self):
        x_range = np.array(range(self.size[0]))
        y_range = np.array(range(self.size[1]))
        return (self.player_state[1] not in y_range or self.player_state[0] not in x_range)

    def move_player_left(self):
        self.player_state[0] = self.player_state[0] - 1
        if self.is_in_goal():
            return (10*self.player_state[0] + self.player_state[1]), self.rewards["goal"], True
        if self.has_fallen_of_map():
            return (10*self.player_state[0] + self.player_state[1]), self.rewards["fall"], True
        self.map = draw_map(self.size, self.player_state, self.goal)
        return (10*self.player_state[0] + self.player_state[1]), self.rewards["walk"], False
    
    def move_player_right(self):
        self.player_state[0] = self.player_state[0] + 1
        if self.is_in_goal():
            return (10*self.player_state[0] + self.player_state[1]), self.rewards["goal"], True
        if self.has_fallen_of_map():
            return (10*self.player_state[0] + self.player_state[1]), self.rewards["fall"], True
        self.map = draw_map(self.size, self.player_state, self.goal)
        return (10*self.player_state[0] + self.player_state[1]), self.rewards["walk"], False

    def move_player_up(self):
        self.player_state[1] = self.player_state[1] - 1
        if self.is_in_goal():
            return (10*self.player_state[0] + self.player_state[1]), self.rewards["goal"], True
        if self.has_fallen_of_map():
            return (10*self.player_state[0] + self.player_state[1]), self.rewards["fall"], True
        self.map = draw_map(self.size, self.player_state, self.goal)
        return (10*self.player_state[0] + self.player_state[1]), self.rewards["walk"], False
    
    def move_player_down(self):
        self.player_state[1] = self.player_state[1] + 1
        if self.is_in_goal():
            return (10*self.player_state[0] + self.player_state[1]), self.rewards["goal"], True
        if self.has_fallen_of_map():
            return (10*self.player_state[0] + self.player_state[1]), self.rewards["fall"], True
        self.map = draw_map(self.size, self.player_state, self.goal)
        return (10*self.player_state[0] + self.player_state[1]), self.rewards["walk"], False
        
    # 0 = LEFT, 1 = RIGHT, 2 = UP, 3 = DOWN
    def step(self, action):
        if action == 0:
            new_state, reward, done = self.move_player_left()
        if action == 1:
            new_state, reward, done = self.move_player_right()
        if action == 2:
            new_state, reward, done = self.move_player_up()
        if action == 3:
            new_state, reward, done = self.move_player_down()

        # applying wind
        PrA = 0.1
        PrB = 0.2
        PrC = 0.15
        if self.player_state[0] == 3:
            random_number = np.random.uniform(low=0.0, high=1.0)
            if random_number <= PrA:
                new_state, reward, done = self.move_player_up()
        
        if self.player_state[0] == 4:
            random_number = np.random.uniform(low=0.0, high=1.0)
            if random_number <= PrB:
                new_state, reward, done = self.move_player_up()
                new_state, reward, done = self.move_player_up()

        if self.player_state[0] == 5:
            random_number = np.random.uniform(low=0.0, high=1.0)
            if random_number <= PrC:
                new_state, reward, done = self.move_player_up()
        
        return new_state, reward, done

In [147]:
map_size = [9, 7]
player_initial_pos = [1, 1]
goal_pos = [7, 4]

initial_map = draw_map(map_size, player_initial_pos, goal_pos)
env = GameEnviroment(initial_map, map_size, player_initial_pos, goal_pos)
print(env)

[[0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 2 0]
 [0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0]]


Inicializando parametros 

In [148]:
num_episodes = 15000
max_steps_per_episode = 100

learning_rate = 0.2
discount_rate = 0.95

rewards_avg = []

action_space_size = 4
state_space_size = 200

q_table = np.zeros((state_space_size, action_space_size))

Corriendo el algoritmo Q-Learning

In [149]:
# This cycle is to calculate the average reward/episodes and its only purpose is to plot the nice graph below that
# shows how the agent learn how to maximize the reward.
for it in range(100):
    print('average ', it)
    rewards_all_episodes=[]
    
    # exporation-exploitation trade-off params
    exploration_rate = 1
    max_exploration_rate = 1
    min_exploration_rate = 0.01
    exploration_decay_rate = 0.005
    
    # init q table in zeros
    q_table = np.zeros((state_space_size, action_space_size))

    # iterate over the episodes
    for episode in range(num_episodes):
        state = env.reset(initial_map, player_initial_pos)
        done = False
        rewards_current_episode = 0
        
        # iterate over the steps for an episode
        for step in range(max_steps_per_episode):
            # Exploration-exploitation trade-off
            exploration_rate_threshold = np.random.uniform(low=0.0, high=1.0)
            if exploration_rate_threshold <= exploration_rate:
                # Exploration time
                action = np.random.randint(0, action_space_size)
            else:
                # Explotation time
                action = np.argmax(q_table[state])

            # Take action
            new_state, reward, done = env.step(action)

            # Update Q-table for Q(s,a)
            print(f"state: {state}, action: {action}, new state: {new_state}, reward: {reward}, done?: {done}")
            if done == True: 
                break
            q_table[state, action] = (1 - learning_rate) * q_table[state, action] + learning_rate * (reward + discount_rate * np.max(q_table[new_state]))
            # transition next state

            state = new_state
            rewards_current_episode += reward
            
            


        # Exploration rate decay
        exploration_rate = min_exploration_rate + (max_exploration_rate - min_exploration_rate) * (math.e ** (-exploration_decay_rate * episode))

        rewards_all_episodes.append(rewards_current_episode)
    rewards_avg.append(rewards_all_episodes)

average  0
state: 11, action: 1, new state: 21, reward: -1, done?: False
state: 21, action: 1, new state: 31, reward: -1, done?: False
state: 31, action: 1, new state: 41, reward: -1, done?: False
state: 41, action: 3, new state: 42, reward: -1, done?: False
state: 42, action: 0, new state: 32, reward: -1, done?: False
state: 32, action: 3, new state: 32, reward: -1, done?: False
state: 32, action: 3, new state: 33, reward: -1, done?: False
state: 33, action: 0, new state: 23, reward: -1, done?: False
state: 23, action: 3, new state: 24, reward: -1, done?: False
state: 24, action: 2, new state: 23, reward: -1, done?: False
state: 23, action: 0, new state: 13, reward: -1, done?: False
state: 13, action: 2, new state: 12, reward: -1, done?: False
state: 12, action: 3, new state: 13, reward: -1, done?: False
state: 13, action: 1, new state: 23, reward: -1, done?: False
state: 23, action: 3, new state: 24, reward: -1, done?: False
state: 24, action: 3, new state: 25, reward: -1, done?: Fal

IndexError: index -204 is out of bounds for axis 0 with size 200

In [None]:
x = [i for i in range(0,num_episodes)]
y = np.mean(rewards_avg, axis=0)
plot.xlabel('Episodes')
plot.ylabel('Reward')
plot.plot(x, y,'o')