# Minimalist Tower RTS Agent
Sam Greydanus

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import torch, time, os, glob
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F

from towers import Towers
from rtsenv import RTSEnv

This implementation generally follows the documentation of Zoe's version. Changes: tower locations, within quadrants, are randomized. Tower values ("healths" aka "hit points"), within quadrants, are randomized. Agent value is randomized. Final reward is "tower value - agent value IF agent value is greater ELSE reward is -3". Friend towers are treated the same as enemy towers in the reward computation, except their magnitudes are negative.

Channel Overview
 * channel 1 - hit point channel **NON BINARY**
 * channel 2 - agent mask
 * channel 3 - small tower mask
 * channel 4 - large tower mask
 * channel 5 - friendly mask
 * channel 6 - enemy mask

## Define a network

In [2]:
class NNPolicy(torch.nn.Module): # an actor-critic neural network
    def __init__(self, num_actions):
        super(NNPolicy, self).__init__()
        self.conv1 = nn.Conv2d(6, 16, 2, stride=2, padding=1)
        self.conv2 = nn.Conv2d(16, 16, 2, stride=2, padding=1)
        self.conv3 = nn.Conv2d(16, 16, 2, stride=2, padding=1)
        self.flat_dim = flat_dim = 16 * 3 * 3
        self.critic_linear, self.actor_linear = nn.Linear(flat_dim, 1), nn.Linear(flat_dim, num_actions)

    def forward(self, inputs):
        x = F.elu(self.conv1(inputs))
        x = F.elu(self.conv2(x))
        x = F.elu(self.conv3(x))
        hx = x.view(-1, self.flat_dim)
        value, probs = self.critic_linear(hx), F.softmax(self.actor_linear(hx), dim=1)
        return value, probs
    
    def try_load(self, save_dir):
        paths = glob.glob(save_dir + '*.tar') ; rew = None
        if len(paths) > 0:
            ckpts = [float(s.split('_')[-2]) for s in paths]
            ix = np.argmax(ckpts) ; rew = ckpts[ix]
            self.load_state_dict(torch.load(paths[ix]))
        print("\tno saved models") if rew is None else print("\tloaded model: {}".format(paths[ix]))
        return rew

## Define some hyperparameters

In [3]:
class Args():
    pass
args = Args()
args.lr = 1e-3
args.num_actions = 4
args.batch_size = 128
args.total_steps = 20000 # just set this to a big number
args.printevery = 5 # print stats every n seconds
args.saveevery = 30 # save model every n seconds
args.rewardthresh = -0.3
args.save_dir = './saved/'

os.makedirs(args.save_dir) if not os.path.exists(args.save_dir) else None # make dir to save models etc.

## Explore the environment, train an agent

PyTorch has a great API for reinforcement learning now

In [4]:
model = NNPolicy(num_actions=args.num_actions)
optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
run_vloss, run_ploss, run_reward = None, None, None

print('training a simple policy gradient CNN agent on the towers RTS game')
print_t = save_t = time.time()
for step in range(args.total_steps + 1):
    envs = [Towers() for i in range(args.batch_size)]
    obs = np.stack([e.reset() for e in envs])
    state = Variable(torch.Tensor(obs))
    value, probs = model(state)

    m = torch.distributions.Categorical(probs)
    actions = m.sample()
    raw_reward = [e.step(actions.data.numpy().ravel()[i])[1] for i, e in enumerate(envs)]
    raw_reward = Variable(torch.Tensor(raw_reward))
    norm_reward = (raw_reward - raw_reward.mean())/raw_reward.std() # this sometimes helps convergence

    vloss = 0.5*(value.view(-1) - norm_reward).pow(2).mean()
    ploss = (-m.log_prob(actions) * norm_reward).mean()
    (vloss + ploss).backward()
    optimizer.step()
    optimizer.zero_grad()

    np_vloss = vloss.data.view(-1)[0]
    np_ploss = ploss.data.view(-1)[0]
    np_reward = raw_reward.mean().data.view(-1)[0]

    run_vloss = np_vloss if run_vloss is None else 0.99*run_ploss + 0.01*np_ploss
    run_ploss = np_ploss if run_ploss is None else 0.99*run_ploss + 0.01*np_ploss
    run_reward = np_reward if run_reward is None else 0.99*run_reward + 0.01*np_reward
    
    if time.time() - print_t > args.printevery:
        print_t = time.time()
        print('\tstep {}, value loss {:.3f}, policy loss {:.3f}, reward {:.3f}'
              .format(step, run_vloss, run_ploss, run_reward))
    
    if time.time() - save_t > args.saveevery:
        save_t = time.time()
        paths = glob.glob(args.save_dir + '*.tar') ; rew_best_saved_model = -10000
        if len(paths) > 0:
            ckpts = [float(s.split('_')[-2]) for s in paths]
            ix = np.argmax(ckpts) ; rew_best_saved_model = ckpts[ix]
        if rew_best_saved_model < run_reward:
            name = 'model_step_{:.0f}_reward_{:.2f}_.tar'.format(step,run_reward)
            print('\t\tsaved {}'.format(name))
            torch.save(model.state_dict(), args.save_dir + name)
        else:
            print('\t\tlol your model sucks keep training')
        
    if run_reward > args.rewardthresh:
        print("environment solved")
        name = 'model_step_{:.0f}_reward_{:.2f}_.tar'.format(step,run_reward) ; print('\t\tsaved {}'.format(name))
        torch.save(model.state_dict(), args.save_dir + name)
        break

training a simple policy gradient CNN agent on the towers RTS game
	step 139, value loss -0.003, policy loss -0.003, reward -1.079
	step 293, value loss -0.023, policy loss -0.023, reward -1.062
	step 428, value loss -0.127, policy loss -0.127, reward -0.634
	step 569, value loss -0.088, policy loss -0.088, reward -0.430
	step 695, value loss -0.084, policy loss -0.084, reward -0.385
		saved model_step_830_reward_-0.35_.tar
	step 832, value loss -0.088, policy loss -0.088, reward -0.354
	step 974, value loss -0.098, policy loss -0.098, reward -0.347
	step 1102, value loss -0.086, policy loss -0.086, reward -0.352
	step 1231, value loss -0.083, policy loss -0.083, reward -0.338
	step 1379, value loss -0.082, policy loss -0.082, reward -0.329
	step 1493, value loss -0.093, policy loss -0.093, reward -0.334
		saved model_step_1587_reward_-0.34_.tar
	step 1592, value loss -0.095, policy loss -0.095, reward -0.334
	step 1712, value loss -0.099, policy loss -0.099, reward -0.332
	step 1831, 

## Try loading a saved model

In [6]:
model = NNPolicy(num_actions=args.num_actions)
ckpt_reward = model.try_load(args.save_dir)
print('this saved model had a mean reward of {:.2f}'.format(ckpt_reward))

	loaded model: ./saved/model_step_2242_reward_-0.30_.tar
this saved model had a mean reward of -0.30
