<a href="https://colab.research.google.com/github/Aravind-11/IITM_Saastra/blob/main/multi_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import gym

class Football:  # The class encapsulating the environment
    '''
    Actions [0 : Stand, 1 : Up, 2 : Right, 3 : Down, 4 : Left]
    These are the representing no.s for the mentioned actions
    '''

    def __init__(self, length=8, width=8, goalPositions=[4, 4]):
        
        # The player start at random locations
        
        self.pA=np.array([np.random.randint(length), np.random.randint(length)]) 
        self.pB=np.array([np.random.randint(length), np.random.randint(length)]) 
        
        self.h = length   # Length of the Football Pitch    
        self.w = width    # Width of the Football Pitch
        
        self.goalPositions = np.array(goalPositions)   # This means that the middle 4 positions at the right and left are the goals
        
        self.reward = np.array([0,0])
        
                                  # Initially the reward is 0
        
        self.observation=np.random.rand(5,)
        self.done = bool(0)    
        self.observation_space=gym.spaces.Box(low=-8, high=8,
                                        shape=(5,), dtype=np.float32)
        self.ballOwner = np.random.randint(2)
        self.action_space=gym.spaces.Discrete(5)
    

    def isInBoard(self, x, y):
        if(x<0 or x>8):
          return 0
        if(y<0 or y>8):
          return 0 
        return 1
    
    def actionToMove(self, action):
        switcher = {
            0: [0, 0],
            1: [0, 1],
            2: [1, 0],
            3: [0, -1],
            4: [-1, 0],
        }
        return switcher.get(action)

In [24]:
class Agent_A(Football,gym.Env):
  def __init__(self, length=8, width=8, goalPositions=[2, 6]):
    super().__init__()
    
    
  def reset(self):
        self.done = bool(0)
        
        self.pA = np.array([np.random.randint(self.h), np.random.randint(self.h)])
        self.pB = np.array([np.random.randint(self.h), np.random.randint(self.h)])
        return np.array((*self.pA,*self.pB,(4-self.pA[0]),(4-self.pA[1]))).astype(np.float32)
  def step(self, action):
        if self.done == 1:
          self.reset()
        self.move(action)                   # We chose the first player at random !!! important thing to consider - how to choose first player . 
        if self.done == 1:
          return self.observation, self.reward[0], self.done

        return self.observation, self.reward[0].astype(float), self.done,{}
  
  def move(self, action):
        
        newPosition = self.pA + self.actionToMove(action)

        # If it's opponent position
        if (newPosition == self.pB).any():
            self.ballOwner = 1
            self.reward[0]=-20
            self.reward[1]=20
        # If it's a goal
        elif self.ballOwner is 0 and self.isInGoal(*newPosition) >= 0:
            self.done = 1
            return 1 - self.isInGoal(*newPosition)
        # If it's in the board
        elif self.isInBoard(*newPosition):
            self.reward[0] =  -0.1 * ((((abs(newPosition[0]-self.pB[0])+0.1)+(abs(newPosition[1]-self.pB[1])+0.1))) -  (self.ballOwner) )
            self.pA = newPosition
        self.observation=np.array((*self.pA,*self.pB,self.ballOwner)).astype(np.float32)
        return -1
  def reset(self):
        self.done = bool(0)
        self.reward = np.array([0,0])
        
        self.pA = np.array([np.random.randint(self.h), np.random.randint(self.h)])
        self.pB = np.array([np.random.randint(self.h), np.random.randint(self.h)]) 
        return np.array((*self.pA,*self.pB,self.ballOwner)).astype(np.float32)
  def render(self,mode='console'):
        board = ''
        for y in range(self.h)[::-1]:
            for x in range(self.w):
                if ([x, y] == self.pA).all():
                    board += 'A' if self.ballOwner is 0 else 'a'
                elif ([x, y] == self.pB).all():
                    board += 'B' if self.ballOwner is 1 else 'b'
                else:
                    board += '-'
            board += '\n'

  def isInGoal(self, x, y):
        g1, g2 = self.goalPositions
        if (g1 <= y <= g2):
            if x == 0:
                self.done = bool(1)
                self.reward[0] = -20 # if the ball reaches the right goal post, then the rewards shall be -1
                return 1 
            elif x == (self.w-1):
                self.done = bool(1)
                self.reward[0] = 20 # if the ball reaches the right goal post, then the rewards shall be 1
                return 0
        return -1

  def seed():
      return 0 
  def metadata(x):
      return 0 
  def legal_actions(self):
    return gym.spaces.Discrete(5)
  def close(self):
    pass

In [25]:
env1=Agent_A(Football,gym.Env)

In [26]:
class Agent_B(Football,gym.Env):
  def __init__(self, length=8, width=8, goalPositions=[2, 6]):
    super().__init__()
    
    
  def reset(self):
        self.done = bool(0)
        
        self.pA = np.array([np.random.randint(self.h), np.random.randint(self.h)])
        self.pB = np.array([np.random.randint(self.h), np.random.randint(self.h)])
        return np.array((*self.pB,*self.pA,self.ballOwner)).astype(np.float32)
  def step(self, action):
        if self.done == 1:
          self.reset()
        self.move(action)                   # We chose the first player at random !!! important thing to consider - how to choose first player . 
        if self.done == 1:
          return self.observation, self.reward[1], self.done

        return self.observation, self.reward[1].astype(np.float), self.done,{}
  
  def move(self, action):
        
        newPosition = self.pB + self.actionToMove(action)

        # If it's opponent position
        if (newPosition == self.pA).any():
            self.ballOwner = 0
            self.reward[1]=-20
            self.reward[0]=20
        # If it's a goal
        elif self.ballOwner is 1 and self.isInGoal(*newPosition) >= 0:
            self.done = 1
            return 1 - self.isInGoal(*newPosition)
        # If it's in the board
        elif self.isInBoard(*newPosition):
            self.reward[1] =  -0.1 * ((((abs(newPosition[0]-self.pA[0])+0.1)+(abs(newPosition[1]-self.pA[1])+0.1))) + (self.ballOwner) )
            self.pB = newPosition
        self.observation=np.array((*self.pB,*self.pA,self.ballOwner)).astype(np.float32)
        return -1
  def reset(self):
        self.done = bool(0)
        self.reward = np.array([0,0])

        self.pA = np.array([np.random.randint(self.h), np.random.randint(self.h)])
        self.pB = np.array([np.random.randint(self.h), np.random.randint(self.h)]) 
        return np.array((*self.pB,*self.pA,self.ballOwner)).astype(np.float32)
  def render(self,mode='console'):
        board = ''
        for y in range(self.h)[::-1]:
            for x in range(self.w):
                if ([x, y] == self.pA).all():
                    board += 'A' if ballOwner is 0 else 'a'
                elif ([x, y] == self.pB).all():
                    board += 'B' if ballOwner is 1 else 'b'
                else:
                    board += '-'
            board += '\n'
  def isInGoal(self, x, y):
        g1, g2 = self.goalPositions
        if (g1 <= y <= g2):
            if x == 0:
                self.done = bool(1)
                self.reward[1] = -20 # if the ball reaches the right goal post, then the rewards shall be -1
                return 1 
            elif x == (self.w-1):
                self.done = bool(1)
                self.reward[1] = 20 # if the ball reaches the right goal post, then the rewards shall be 1
                return 0
        return -1
  def seed():
      return 0 
  def metadata(x):
      return 0 
  def legal_actions(self):
    return gym.spaces.Discrete(5)
  def close(self):
    pass

In [15]:
env2=Agent_B(Football,gym.Env)

In [6]:
# Stable Baselines only supports tensorflow 1.x for now
%tensorflow_version 1.x
!pip install stable-baselines[mpi]==2.10.0

TensorFlow 1.x selected.
Collecting stable-baselines[mpi]==2.10.0
  Downloading stable_baselines-2.10.0-py3-none-any.whl (248 kB)
[K     |████████████████████████████████| 248 kB 5.5 MB/s 
Installing collected packages: stable-baselines
  Attempting uninstall: stable-baselines
    Found existing installation: stable-baselines 2.2.1
    Uninstalling stable-baselines-2.2.1:
      Successfully uninstalled stable-baselines-2.2.1
Successfully installed stable-baselines-2.10.0


In [None]:
from stable_baselines.common.env_checker import check_env

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [None]:
check_env(env, warn=True)

In [7]:
from stable_baselines import DQN, PPO2, A2C, ACKTR
from stable_baselines.common.cmd_util import make_vec_env

# Instantiate the env
#env = GoLeftEnv(grid_size=10)
# wrap it
env1 = make_vec_env(lambda: env1, n_envs=1)
env2 = make_vec_env(lambda: env2, n_envs=1)

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [27]:
# Train the agent

model1 = PPO2('MlpPolicy', env1, verbose=1).learn(5000)
model2 = PPO2('MlpPolicy', env2, verbose=1).learn(5000)

Wrapping the env in a DummyVecEnv.
--------------------------------------
| approxkl           | 0.0003761745  |
| clipfrac           | 0.0           |
| explained_variance | -0.00265      |
| fps                | 348           |
| n_updates          | 1             |
| policy_entropy     | 1.6090072     |
| policy_loss        | -0.0042407834 |
| serial_timesteps   | 128           |
| time_elapsed       | 1.6e-05       |
| total_timesteps    | 128           |
| value_loss         | 232.13898     |
--------------------------------------
-------------------------------------
| approxkl           | 0.00045641   |
| clipfrac           | 0.0          |
| explained_variance | -0.00419     |
| fps                | 851          |
| n_updates          | 2            |
| policy_entropy     | 1.6057125    |
| policy_loss        | -0.006642286 |
| serial_timesteps   | 256          |
| time_elapsed       | 0.368        |
| total_timesteps    | 256          |
| value_loss         | 543.44415    |
--

In [28]:
##VALIDATION 
obs1 = env1.reset()
obs2=env2.reset()
n_steps = 20
for step in range(n_steps):
  action_a, _ = model1.predict(obs1, deterministic=True)
  action_b,_ = model2.predict(obs2, deterministic=True)
  print("Step {}".format(step + 1))
  print("Action:a", action_a)
  print("Action:b",action_b)
  obs1, reward1, done, info = env1.step(action_a)
  obs2, reward2, done, info = env2.step(action_b)
  print('obs a =', obs1, 'reward a =', reward1, 'done=', done)
  print('obs b =', obs2, 'reward b =', reward2, 'done=', done)
  env1.render(mode='console')
  if done:
    # Note that the VecEnv resets automatically
    # when a done signal is encountered
    print("Goal reached!", "reward_a=", reward1)
    print("Goal reached!","reward_b=",reward2)
    break

Step 1
Action:a 3
Action:b 1
obs a = [0. 3. 7. 4. 1.] reward a = 0.0 done= False
obs b = [4. 2. 4. 3. 0.] reward b = -20.0 done= False
Step 2
Action:a 3
Action:b 1
obs a = [0. 2. 7. 4. 1.] reward a = 0.0 done= False
obs b = [4. 2. 4. 3. 0.] reward b = -20.0 done= False
Step 3
Action:a 3
Action:b 1
obs a = [0. 1. 7. 4. 1.] reward a = 0.0 done= False
obs b = [4. 2. 4. 3. 0.] reward b = -20.0 done= False
Step 4
Action:a 3
Action:b 1
obs a = [0. 0. 7. 4. 1.] reward a = -1.0 done= False
obs b = [4. 2. 4. 3. 0.] reward b = -20.0 done= False
Step 5
Action:a 3
Action:b 1
obs a = [0. 0. 7. 4. 1.] reward a = -1.0 done= False
obs b = [4. 2. 4. 3. 0.] reward b = -20.0 done= False
Step 6
Action:a 3
Action:b 1
obs a = [0. 0. 7. 4. 1.] reward a = -1.0 done= False
obs b = [4. 2. 4. 3. 0.] reward b = -20.0 done= False
Step 7
Action:a 3
Action:b 1
obs a = [0. 0. 7. 4. 1.] reward a = -1.0 done= False
obs b = [4. 2. 4. 3. 0.] reward b = -20.0 done= False
Step 8
Action:a 3
Action:b 1
obs a = [0. 0. 7. 4. 1