In [None]:
import numpy as np 
from game import env
import random
from tqdm import tqdm  # modified import

### Graphics


In [None]:

# Initializing the structure
def make_matrix(x):
    return [[" 0 " for i in range(x)] for i in range(x)]


# UI related functions
def display(matrix):
    for i in range(len(matrix)):
        for j in range(len(matrix)):
            print(matrix[i][j], end=" ")
        print()

def space():
    print("\n"*5)

### Sprite logic


In [None]:
from random import randint

class sprite():
    
    def __init__(self, char, matrix, type):
        if type == "checkpoint":
            self.x = len(matrix)-1
            self.y = len(matrix)-1
        else:
            self.x = randint(0,len(matrix)-1)
            self.y = randint(0,len(matrix)-2)
        self.char = char
        self.background = matrix

    def move_up(self):
        if self.y - 1 in range(len(self.background)):
            self.y -=1

    def move_down(self):
        if self.y + 1 in range(len(self.background)):
            self.y +=1

    def move_right(self):
        if self.x + 1 in range(len(self.background)):
            self.x +=1

    def move_left(self):
        if self.x - 1 in range(len(self.background)):
            self.x -=1
    
    def set_position(self):
        self.background[self.y][self.x] = self.char


### Game class

In [None]:
import time
class env():
    def __init__(self):
        self.matrix = make_matrix(5)
        self.character = sprite(" ^ ", self.matrix, "")
        self.checkpoint = sprite(" X ", self.matrix, "checkpoint")
        self.character.set_position()
        self.checkpoint.set_position()
        self.distance = (self.character.x - self.checkpoint.x)**2 + (self.character.y - self.checkpoint.y)**2
        
        self.score = 0
        self.steps = 0

    def get_state(self):
        self.state =   self.character.x + self.character.y * len(self.matrix)
        return self.state
    
    def reset(self):
        self.steps = 0
        self.character.set_position()
        self.checkpoint.set_position()
        return self.get_state()

    def check_coincide(self):
        return self.character.x == self.checkpoint.x and self.character.y == self.checkpoint.y
    
    def update_matrix(self):
        for i in range(len(self.matrix)):
            for j in range(len(self.matrix)):
                if self.matrix[i][j] == " ^ ":
                    self.matrix[i][j] = " 0 "
        self.matrix[self.character.y][self.character.x] = " ^ "
    def check_coincide(self):
        return self.character.x == self.checkpoint.x and self.character.y == self.checkpoint.y
    
    def step(self, action):
        if action == 0:
            self.character.move_up()
        elif action == 1:
            self.character.move_down()
        elif action == 2:
            self.character.move_left()
        elif action == 3:
            self.character.move_right()

        new_distance = (self.character.x - self.checkpoint.x)**2 + (self.character.y - self.checkpoint.y)**2
        if self.check_coincide():
            reward = 10
            done = True
        elif new_distance < self.distance:
            reward = 1
            done = False
        else:
            reward = -1
            done = False

        self.state = self.get_state()
        self.update_matrix()
        return self.matrix, reward, done
    
    def render(self):
        display(self.matrix)
        self.steps += 1
        if self.steps > 10:
            done = True
        else:
            done = False
        return done
    
    def possible_states(self):
        return len(self.matrix)**2


In [None]:
# Training parameters
n_training_episodes = 10000  # Total training episodes
learning_rate = 0.7          # Learning rate

# Evaluation parameters
n_eval_episodes = 100        # Total number of test episodes

# Environment parameters
env_id = "FrozenLake-v1"     # Name of the environment
max_steps = 99               # Max steps per episode
gamma = 0.95                 # Discounting rate
eval_seed = []               # The evaluation seed of the environment

# Exploration parameters
max_epsilon = 1.0             # Exploration probability at start
min_epsilon = 0.05            # Minimum exploration probability
decay_rate = 0.0005            # Exponential decay rate for exploration prob

In [None]:
game = env()
state_space = len(game.matrix)**2 # Make this the number of states in your environment
action_space = 4

# AI part


In [None]:
def initialize_q_table(state_space, action_space):
  Qtable = np.zeros((state_space,action_space))
  return Qtable

In [None]:
def greedy_policy(Qtable, state):
  # Exploitation: take the action with the highest state, action value
  action = np.argmax(Qtable[state][:])
  return action

In [None]:
def epsilon_greedy_policy(Qtable, state, epsilon):
  # Randomly generate a number between 0 and 1
  random_num = random.uniform(0,1)
  # if random_num > greater than epsilon --> exploitation
  if random_num > epsilon:
    # Take the action with the highest value given a state
    # np.argmax can be useful here
    action = np.argmax(Qtable[state][:])
  # else --> exploration
  else:
    action = random.randint(0,3)# Take a random action

  return action

In [None]:
def train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, env, max_steps, Qtable):
  for episode in tqdm(range(n_training_episodes)):
    # Reduce epsilon (because we need less and less exploration)
    epsilon = min_epsilon + (max_epsilon - min_epsilon)*np.exp(-decay_rate*episode)
    # Reset the environment
    state = env.reset()
    terminated = False

    # repeat
    for step in range(max_steps):
      # Choose the action At using epsilon greedy policy
      action = epsilon_greedy_policy(Qtable, state, epsilon)

      # Take action At and observe Rt+1 and St+1
      # Take the action (a) and observe the outcome state(s') and reward (r)
      matrix, reward, terminated = env.step(action)
      new_state = env.get_state()
      env.matrix = matrix
      env.render()
      # Update Q(s,a):= Q(s,a) + lr [R(s,a) + gamma * max Q(s',a') - Q(s,a)]
      Qtable[state][action] = Qtable[state][action] + learning_rate * (reward + gamma * np.argmax(Qtable[new_state] - Qtable[state]))

      # If terminated or truncated finish the episode
      if terminated:
        break

      # Our next state is the new state
      state = new_state
  return Qtable

In [None]:
Qtable = initialize_q_table(state_space, action_space)

### Training

In [None]:
Qtable = train(n_training_episodes, min_epsilon, max_epsilon, decay_rate, game, max_steps, Qtable)

In [None]:
Qtable

### Testing

In [176]:
state = game.reset()
terminated = False

game.render()
# for step in range(max_steps):
#     game.render()
#     time.sleep(0.5)
#     # Choose the action At using epsilon greedy policy
#     action = greedy_policy(Qtable, state)
    
#     # Take action At and observe Rt+1 and St+1
#     # Take the action (a) and observe the outcome state(s') and reward (r)
#     matrix, reward, terminated = game.step(action)
#     new_state = game.get_state()
#     game.matrix = matrix
    
#     # If terminated finish the game
#     if terminated:
#         break
    
#     # Our next state is the new state
#     state = new_state

 0   0   0   0   0  
 0   0   0   0   0  
 0   0   0   0   0  
 0   0   0   0   0  
 0   0   0   0   X  


False