In [46]:
# Display the kernel's current working directory.
import os
os.getcwd()

'/home/tiger_cheerag/DQN'

The `Environment` is the surrounding or setting in which the agent performs actions. The agent interacts with the environment, and each action changes the environment's state.

## *Custom Environment for Tic-tac-toe*

Instead of two players, this simplified Tic-tac-toe has only `one player`. The player picks positions at random; the game ends as soon as the player picks a position that has already been taken, or when every position on the board has been filled.

In [3]:
import tensorflow as tf
import numpy as np
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts

An environment can be either a Python environment or a TensorFlow environment. Python environments are simpler to implement, but TensorFlow environments are more efficient.

The simplest approach is to implement a Python environment and then use one of the TF-Agents wrappers to convert it automatically into a TensorFlow environment.

In [42]:
class SimplifiedTicTacToe(py_environment.PyEnvironment):
    '''
    Single-player Tic-tac-toe implemented as a TF-Agents Python environment.

    The board is a flat list of 9 cells (0 = empty, 1 = occupied) and an
    action is the index (0-8) of the cell to mark. Marking an empty cell
    yields a small positive reward; marking the last empty cell ends the
    episode with reward 1; choosing an occupied cell ends it with reward -1.

    action_spec: describes the action (an ArraySpec) expected by step

    observation_spec: describes the observations (an ArraySpec) provided by
    the environment; here a (1, 9) int32 array holding the board

    _reset: returns the current situation (TimeStep) after resetting the environment

    _step: applies the action and returns the new situation (TimeStep)

    '''
    # Number of cells on the board.
    _BOARD_SIZE = 9
    # Reward for marking an empty cell while the game continues.
    _STEP_REWARD = 0.05
    # Reward for marking the last empty cell (board completely filled).
    _BOARD_FILLED_REWARD = 1
    # Reward for choosing a cell that is already occupied.
    _OCCUPIED_SPOT_REWARD = -1

    def __init__(self):
        # Initialise the base-class state (e.g. the cached current time step)
        # so that step()/current_time_step() are usable before the first reset().
        super().__init__()

        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0,
            maximum=self._BOARD_SIZE - 1, name='play')

        self._observation_spec = array_spec.BoundedArraySpec(
            shape=(1, self._BOARD_SIZE), dtype=np.int32,
            minimum=0, maximum=1, name='board')

        self._state = [0] * self._BOARD_SIZE

        self._episode_ended = False

    def action_spec(self):
        '''Returns the spec of the action accepted by step.'''
        return self._action_spec

    def observation_spec(self):
        '''Returns the spec of the observation emitted by the environment.'''
        return self._observation_spec

    def _observation(self):
        '''Returns the current board as a (1, 9) int32 array.'''
        return np.array([self._state], dtype=np.int32)

    def _reset(self):
        '''Clears the board and returns the first TimeStep of a new episode.'''
        # state at the start of the game
        self._state = [0] * self._BOARD_SIZE
        self._episode_ended = False
        return ts.restart(self._observation())

    # checking if some spot is empty and if all the spots are occupied.
    def __is_spot_empty(self, ind):
        return self._state[ind] == 0

    def __all_spots_occupied(self):
        return all(i == 1 for i in self._state)

    def _step(self, action):
        '''
        It applies the action and returns the new situation in the game.
        The situation is of the class TimeStep in TF-Agents.
        TimeStep(step_type, reward, discount, observation)

        action: index (0-8) of the cell to mark.

        Raises IndexError if the action is outside the board. Without this
        check a negative action would silently wrap around through Python
        list indexing and mark the wrong cell.
        '''

        if self._episode_ended:
            # The previous action ended the episode: ignore the current
            # action and start a new episode.
            return self._reset()

        if not 0 <= action < self._BOARD_SIZE:
            raise IndexError(
                '`action` should be between 0 and {}, got {}.'.format(
                    self._BOARD_SIZE - 1, action))

        if not self.__is_spot_empty(action):
            # Illegal move: the chosen cell is already occupied.
            self._episode_ended = True
            return ts.termination(
                self._observation(), self._OCCUPIED_SPOT_REWARD)

        self._state[action] = 1

        if self.__all_spots_occupied():
            self._episode_ended = True
            return ts.termination(
                self._observation(), self._BOARD_FILLED_REWARD)

        return ts.transition(
            self._observation(), reward=self._STEP_REWARD, discount=1.0)

In [43]:
# Instantiate the Python environment and wrap it as a TensorFlow environment.
python_environment = SimplifiedTicTacToe()
tf_env = tf_py_environment.TFPyEnvironment(python_environment)

In [44]:
# Play `number_of_episodes` games with a uniformly random policy and record,
# per episode, the total reward and the number of non-terminating moves.
time_step = tf_env.reset()
rewards = []
steps = []
number_of_episodes = 10000

for _ in range(number_of_episodes):
    episode_steps = 0
    episode_reward = 0
    tf_env.reset()
    while True:
        # Random policy: pick one of the 9 cells uniformly (maxval is exclusive).
        action = tf.random.uniform([1], 0, 9, dtype=tf.int32)
        next_time_step = tf_env.step(action)
        # Accumulate the reward BEFORE checking for termination, so that the
        # terminal reward is included in the episode's total.
        episode_reward += next_time_step.reward.numpy()
        if next_time_step.is_last():
            break
        # Only moves that did not end the episode are counted as steps.
        episode_steps += 1
    rewards.append(episode_reward)
    steps.append(episode_steps)
    
# for _ in range(num_episodes):
#     episode_reward = 0
#     episode_steps = 0
#     tf_env.reset()
#     while not tf_env.current_time_step().is_last():
#         action = tf.random_uniform([1], 0, 9, dtype=tf.int32)
#         next_time_step = tf_env.step(action)
#         episode_steps += 1
#         episode_reward += next_time_step.reward.numpy()
#     rewards.append(episode_reward)
#     steps.append(episode_steps)

In [45]:
# Average of the per-episode step counts collected in `steps`.
mean_no_of_steps = np.mean(steps)
mean_no_of_steps

3.4569