1. Import Dependencies

In [1]:
import os
from stable_baselines3 import PPO
from stable_baselines3.common.evaluation import evaluate_policy
from stable_baselines3.common.env_checker import check_env

# Building an Environment

Check Training Environment

In [2]:
import gym
from gym import Env
from gym.spaces import Discrete, Dict, MultiBinary, MultiDiscrete
import numpy as np
import random

class DeterministicScenario:
    """Rules engine for a six-ship strike scenario.

    The class keeps no state of its own: every method receives the scenario
    ``state`` dict and, where noted, mutates it in place.  A state holds

    * ``"missiles"``           - remaining missile count (plain int),
    * ``"expectedShipDamage"`` - hits wanted on each of the six ships,
    * ``"currentShipDamage"``  - hits landed so far on each ship,
    * ``"target<N>Defense"``   - six values per ship; every positive entry is
      rolled against in ``defendShip``,
    * ``"target<N>Targets"``   - six 0/1 flags per row; two ships may be
      attacked in the same step only if some row flags both of them,
    * ``"assets"``             - ``[jets, pilots]``.
    """

    _NUMBER_OF_TARGETS = 6

    _DEFENSE_KEYS = ("target1Defense", "target2Defense", "target3Defense",
                     "target4Defense", "target5Defense", "target6Defense")
    _SORTIE_KEYS = ("target1Targets", "target2Targets", "target3Targets",
                    "target4Targets", "target5Targets", "target6Targets")

    # Fixed layout shared by every state this class builds.  Kept in ONE place
    # so the randomized (training) and explicit (evaluation) states cannot drift.
    _DEFENSE_LAYOUT = ((0, 0, 25, 0, 15, 0),
                       (0, 0, 0, 25, 0, 15),
                       (0, 0, 30, 0, 0, 0),
                       (0, 0, 0, 30, 0, 0),
                       (0, 0, 0, 0, 40, 0),
                       (0, 0, 0, 0, 0, 40))
    _SORTIE_LAYOUT = ((1, 0, 1, 0, 0, 0),
                      (0, 1, 0, 1, 0, 0),
                      (1, 0, 1, 0, 0, 0),
                      (0, 1, 0, 1, 0, 0),
                      (0, 0, 0, 0, 1, 1),
                      (0, 0, 0, 0, 1, 1))

    def getActionSpace(self):
        """Return the action space: [ship1 index, ship2 index, extra shot at ship1, extra shot at ship2]."""
        return MultiDiscrete([6, 6, 2, 2])

    def getObservationSpace(self):
        """Return the Dict observation space whose keys mirror the state dict."""
        spaces = {
            "missiles": Discrete(100),
            "expectedShipDamage": MultiDiscrete([100] * self._NUMBER_OF_TARGETS),
            "currentShipDamage": MultiDiscrete([100] * self._NUMBER_OF_TARGETS),
        }
        for key in self._DEFENSE_KEYS:
            spaces[key] = MultiDiscrete([100] * self._NUMBER_OF_TARGETS)
        for key in self._SORTIE_KEYS:
            spaces[key] = MultiBinary(6)
        spaces["assets"] = MultiDiscrete([100, 100])
        return Dict(spaces)

    def _buildState(self, missiles, expectedDamage, jets, pilots):
        """Assemble a fresh state dict (new arrays every call) around the fixed layout."""
        state = {
            "missiles": missiles,
            "expectedShipDamage": np.array(expectedDamage),
            "currentShipDamage": np.array([0] * self._NUMBER_OF_TARGETS),
        }
        for key, row in zip(self._DEFENSE_KEYS, self._DEFENSE_LAYOUT):
            state[key] = np.array(row)
        for key, row in zip(self._SORTIE_KEYS, self._SORTIE_LAYOUT):
            state[key] = np.array(row)
        state["assets"] = np.array([jets, pilots])
        return state

    def getRandomizedState(self):
        """Return a start state with random resources and random expected damage.

        Missiles, jets and pilots are drawn from 1..99 and each ship's expected
        damage from 0..4.  The draws happen in the order missiles, the six
        expected-damage values, jets, pilots, so a seeded ``random`` reproduces
        the same state sequence.
        """
        missiles = random.randint(1, 99)
        expectedDamage = [random.randint(0, 4) for _ in range(self._NUMBER_OF_TARGETS)]
        jets = random.randint(1, 99)
        pilots = random.randint(1, 99)
        return self._buildState(missiles, expectedDamage, jets, pilots)

    def getState(self, numberOfmissiles, numberOfJets, numberOfPilots, tD1, tD2, tD3, tD4, tD5, tD6):
        """Return a start state with the given resources and per-ship expected damage (tD1..tD6)."""
        return self._buildState(numberOfmissiles,
                                [tD1, tD2, tD3, tD4, tD5, tD6],
                                numberOfJets, numberOfPilots)

    def canAttack(self, state, sortiArray, ship1, ship2):
        """Check whether any sortie row flags both ships.

        Returns:
            (True, 20) if some row in ``sortiArray`` has both ``ship1`` and
            ``ship2`` set, otherwise (False, -1000).
        """
        for sorti in sortiArray:
            if state[sorti][ship1] and state[sorti][ship2]:
                return True, 20
        return False, -1000

    def shouldAttack(self, state, defenseArray, ship):
        """Score the choice of ``ship`` as a target; does not modify ``state``.

        Returns:
            (True, 100)   if the ship is still below its expected damage;
            (True, 0)     if it is at/above expected damage but untouched and
                          its own defense entry is positive (treated as a threat);
            (False, -200) otherwise.
        """
        current = state["currentShipDamage"][ship]
        if current < state["expectedShipDamage"][ship]:
            # targeted a ship worth targeting
            return True, 100
        if current == 0 and state[defenseArray[ship]][ship] > 0:
            # Reward for damaging a ship that is a threat?
            return True, 0
        # no threat and already at expected damage
        return False, -200

    def defendShip(self, state, defenseArray, ship):
        """Roll the ship's defenses; return True if any roll shoots the attacker down.

        Each positive entry of the ship's defense array gets its own roll drawn
        from 0..100 inclusive, and succeeds when the roll is <= the entry.
        Every positive entry is rolled even after a success, so the number of
        random draws per call does not depend on the outcome.
        """
        shotDown = False
        for defense in state[defenseArray[ship]]:
            if defense > 0:
                roll = random.randint(0, 100)
                if roll <= defense:
                    shotDown = True
        return shotDown

    def shootShip(self, state, defenseArray, ship):
        """Register one hit on ``ship`` (mutates ``state``); always returns reward 0."""
        state["currentShipDamage"][ship] += 1
        return 0

    def _fireMissile(self, state, defenseArray, ship):
        """Score one shot at ``ship``, land it and spend a missile; return the shot's reward.

        The should-attack flag is advisory only: the shot is taken regardless
        and only the associated reward is used.
        """
        _, shotReward = self.shouldAttack(state, defenseArray, ship)
        self.shootShip(state, defenseArray, ship)
        state["missiles"] -= 1
        return shotReward

    def step(self, state, action):
        """Apply one action to ``state`` in place.

        Args:
            state: scenario state dict (see class docstring); mutated in place.
            action: ``[ship1, ship2, extra1, extra2]``.  One missile is always
                fired at each selected ship; ``extra1``/``extra2`` equal to 1
                fire a second missile at ship1/ship2 respectively.

        Returns:
            ``(state, reward, done, info)``.  If the ship pair cannot be
            attacked together, nothing changes and the result is
            ``(state, -1000, False, {})`` - the +10 extra-shot bonuses are not
            part of that reward.
        """
        info = {}
        done = False

        ship1Index = action[0]
        ship2Index = action[1]
        extraShot1 = action[2] == 1
        extraShot2 = action[3] == 1

        canAttack, canAttackReward = self.canAttack(state, self._SORTIE_KEYS, ship1Index, ship2Index)
        if not canAttack:
            # Can't attack we are done
            return state, canAttackReward, False, info

        # Should we reward here or after hit or even after checking against expected damage?
        reward = canAttackReward
        if extraShot1:
            reward += 10
        if extraShot2:
            reward += 10

        # Both defenses are always rolled; when the same ship is selected
        # twice, only the first roll can cost a jet and a pilot.
        shotDown1 = self.defendShip(state, self._DEFENSE_KEYS, ship1Index)
        shotDown2 = self.defendShip(state, self._DEFENSE_KEYS, ship2Index)
        losses = 1 if shotDown1 else 0
        if shotDown2 and ship1Index != ship2Index:
            losses += 1
        state["assets"][0] -= losses
        state["assets"][1] -= losses

        # Firing order: ship1, optional extra at ship1, ship2, optional extra at ship2.
        for ship, extraShot in ((ship1Index, extraShot1), (ship2Index, extraShot2)):
            reward += self._fireMissile(state, self._DEFENSE_KEYS, ship)
            if extraShot:
                reward += self._fireMissile(state, self._DEFENSE_KEYS, ship)

        # ############# Ideas #################
        # Count up assets instead of down. Add a cap and remove end condition for assets
        # Change Reward/Loss system for expected damage.
        #   Expected damage is now Expected hits. Remove Rolls
        # Add a calculated ratio reward for threats in the defending arrays
        # Add in standard deviation for rewards to encourage spreading out more hits when missiles are available.
        #   Prio higher expected hits targets.

        # Penalties for spending more than was available
        if state["missiles"] < 0:
            reward -= 2000
        if state["assets"][0] < 0:
            reward -= 200
        if state["assets"][1] < 0:
            reward -= 200

        isExpectedDamageMet = all(
            state["currentShipDamage"][shipIndex] >= state["expectedShipDamage"][shipIndex]
            for shipIndex in range(self._NUMBER_OF_TARGETS)
        )
        if isExpectedDamageMet:
            # Completion bonus plus a bonus for every resource left unspent
            reward += 100
            reward += max(0, state["missiles"]) * 10
            reward += max(0, state["assets"][0]) * 5
            reward += max(0, state["assets"][1]) * 5
            done = True

        # The episode also ends once missiles, jets or pilots run out
        if state["missiles"] <= 0:
            done = True
        if state["assets"][0] <= 0 or state["assets"][1] <= 0:
            done = True

        # Return step information
        return state, reward, done, info

class DeterministicScenarioEnvironment(Env):
    """Gym environment that always starts from one fixed, caller-supplied scenario.

    All game rules live in ``DeterministicScenario``; this class only stores the
    start configuration, owns the current state dict and forwards ``step``.

    Args:
        numberOfmissiles: missiles available at the start of every episode.
        numberOfJets, numberOfPilots: starting values of the two ``"assets"`` entries.
        tD1..tD6: expected damage for ships 1 to 6.
    """

    def __init__(self, numberOfmissiles, numberOfJets, numberOfPilots, tD1, tD2, tD3, tD4, tD5, tD6):
        self.manager = DeterministicScenario()
        # Action: [ship1 index, ship2 index, extra shot at ship1, extra shot at ship2]
        self.action_space = self.manager.getActionSpace()
        # Observation: Dict space mirroring the keys of the state dict
        self.observation_space = self.manager.getObservationSpace()
        # Remember the start configuration so reset() can rebuild it
        self.numberOfmissiles = numberOfmissiles
        self.numberOfJets = numberOfJets
        self.numberOfPilots = numberOfPilots
        self.tD1 = tD1
        self.tD2 = tD2
        self.tD3 = tD3
        self.tD4 = tD4
        self.tD5 = tD5
        self.tD6 = tD6
        # Set start state
        self.state = self._initialState()

    def _initialState(self):
        """Build a fresh state dict from the stored start configuration."""
        return self.manager.getState(
            self.numberOfmissiles, self.numberOfJets, self.numberOfPilots,
            self.tD1, self.tD2, self.tD3, self.tD4, self.tD5, self.tD6)

    def step(self, action):
        """Apply ``action`` to the current state (mutated in place); return (obs, reward, done, info)."""
        return self.manager.step(self.state, action)

    def render(self, mode="human"):
        """No visualisation yet.

        ``mode`` is accepted (and ignored) so that callers which pass it, such
        as vectorised-env wrappers calling ``render(mode=...)``, do not raise
        ``TypeError``.
        """
        pass

    def reset(self):
        """Restore the configured start state and return it as the first observation."""
        self.state = self._initialState()
        return self.state

class TrainingDeterministicScenarioEnvironment(Env):
    """Gym environment whose every episode starts from a freshly randomized scenario.

    All game rules live in ``DeterministicScenario``; this class only owns the
    current state dict, re-randomizes it on ``reset`` and forwards ``step``.
    """

    def __init__(self):
        self.manager = DeterministicScenario()
        # Action: [ship1 index, ship2 index, extra shot at ship1, extra shot at ship2]
        self.action_space = self.manager.getActionSpace()
        # Observation: Dict space mirroring the keys of the state dict
        self.observation_space = self.manager.getObservationSpace()
        # Set start state
        self.state = self.manager.getRandomizedState()

    def step(self, action):
        """Apply ``action`` to the current state (mutated in place); return (obs, reward, done, info)."""
        return self.manager.step(self.state, action)

    def render(self, mode="human"):
        """No visualisation yet.

        ``mode`` is accepted (and ignored) so that callers which pass it, such
        as vectorised-env wrappers calling ``render(mode=...)``, do not raise
        ``TypeError``.
        """
        pass

    def reset(self):
        """Draw a new randomized start state and return it as the first observation."""
        self.state = self.manager.getRandomizedState()
        return self.state


Check Scenario Environment

# Train Model

In [3]:
# Directory that PPO receives as `tensorboard_log` (relative to the working directory).
log_dir_parts = ('Training', 'Logs')
log_path = os.path.join(*log_dir_parts)

In [4]:
# Build the randomized training environment and validate it against the gym
# API before handing it to PPO.
# Fixed-scenario alternative:
#trainingEnv=DeterministicScenarioEnvironment(13, 19, 21, 4, 2, 2, 2, 0, 0)
trainingEnv = TrainingDeterministicScenarioEnvironment()
check_env(trainingEnv, warn=True)
trainingEnv.reset()

# The observation space is a Dict, hence the multi-input policy.
model = PPO(
    policy="MultiInputPolicy",
    env=trainingEnv,
    verbose=1,
    tensorboard_log=log_path,
)

# Earlier training budgets, kept for reference:
# 30ish minute training
#model.learn(total_timesteps=300000*5)
#model.learn(total_timesteps=700000*2)

# Current budget: 150000 * 2 * 4 = 1,200,000 timesteps.
model.learn(total_timesteps=150000 * 2 * 4)

model.save('RandomDeterministicTraining3')

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
Logging to Training\Logs\PPO_90
----------------------------------
| rollout/           |           |
|    ep_len_mean     | 29.7      |
|    ep_rew_mean     | -2.19e+04 |
| time/              |           |
|    fps             | 530       |
|    iterations      | 1         |
|    time_elapsed    | 3         |
|    total_timesteps | 2048      |
----------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 28.9         |
|    ep_rew_mean          | -2.12e+04    |
| time/                   |              |
|    fps                  | 442          |
|    iterations           | 2            |
|    time_elapsed         | 9            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 7.202296e-06 |
|    clip_fraction        | 0            |
|    cli

In [5]:
# Train the same model for another 1,200,000 timesteps and overwrite the saved file.
# NOTE(review): learn() is called with its default `reset_num_timesteps`, so this
# run is logged as a new TensorBoard run (the output shows PPO_91 starting again
# at 2048 timesteps) - confirm that is intended.
additional_timesteps = 150000 * 2 * 4
model.learn(total_timesteps=additional_timesteps)

model.save('RandomDeterministicTraining3')

Logging to Training\Logs\PPO_91
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 25.2     |
|    ep_rew_mean     | -2e+04   |
| time/              |          |
|    fps             | 768      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------
------------------------------------------
| rollout/                |              |
|    ep_len_mean          | 25           |
|    ep_rew_mean          | -1.95e+04    |
| time/                   |              |
|    fps                  | 502          |
|    iterations           | 2            |
|    time_elapsed         | 8            |
|    total_timesteps      | 4096         |
| train/                  |              |
|    approx_kl            | 8.694321e-05 |
|    clip_fraction        | 0            |
|    clip_range           | 0.2          |
|    entropy_loss         | -4.62        |
|    explained_variance   

# Save Model

# Evaluate Model

In [26]:
# Roll the trained policy out once on a fixed scenario:
# 13 missiles, 19 jets, 21 pilots, expected damage per ship [4, 2, 2, 2, 0, 0].
scenarioEnv = DeterministicScenarioEnvironment(13, 19, 21, 4, 2, 2, 2, 0, 0)
sampleEnvObs = scenarioEnv.reset()
score = 0
done = False

# NOTE(review): predict() is called with its default (sampled) action selection.
# An action the scenario rejects leaves the state unchanged with done=False, so a
# fully deterministic policy could repeat it forever - keep that in mind before
# switching to deterministic=True.
while not done:
    action, _ = model.predict(sampleEnvObs)
    sampleEnvObs, reward, done, info = scenarioEnv.step(action)
    score += reward
    stateSummary = [
        sampleEnvObs["missiles"],
        sampleEnvObs["expectedShipDamage"],
        sampleEnvObs["currentShipDamage"],
        sampleEnvObs["assets"],
    ]
    print('Score:{} Action:{} State:{}'.format(score, action, stateSummary))
scenarioEnv.close()

Score:-360 Action:[5 4 1 1] State:[9, array([4, 2, 2, 2, 0, 0]), array([0, 0, 0, 0, 2, 2]), array([18, 20])]
Score:-1120 Action:[5 4 1 1] State:[5, array([4, 2, 2, 2, 0, 0]), array([0, 0, 0, 0, 4, 4]), array([16, 18])]
Score:-1880 Action:[5 4 1 1] State:[1, array([4, 2, 2, 2, 0, 0]), array([0, 0, 0, 0, 6, 6]), array([14, 16])]
Score:-4640 Action:[5 4 1 1] State:[-3, array([4, 2, 2, 2, 0, 0]), array([0, 0, 0, 0, 8, 8]), array([13, 15])]
