In [None]:
from MultiActorEnv import MultiActorEnv
from FCNN import FCNN

In [None]:
# Build the environment with a fixed seed and show its initial grid.
env = MultiActorEnv(actor_number=4, height=10, width=10, obs_count=5, random_seed=100)
print(env.grid)
env.display()

# Manual smoke test: apply four hand-written joint actions (one entry per
# actor) and show the grid and the returned rewards after each step.
demo_joint_actions = (
    ['r', 'l', 'r', 'l'],
    ['u', 'd', 'u', 'd'],
    ['u', 'd', 'u', 'd'],
    ['d', 'u', 'd', 'u'],
)
for joint_action in demo_joint_actions:
    print("-------------------------------------------------------------")
    rewards, dones = env.step(joint_action)
    print(env.grid)
    env.display()
    print(rewards)

In [None]:
# Call env.reset() and then print/display the grid to inspect the state
# the environment is in after the reset.
env.reset()
print(env.grid)
env.display()

In [None]:
# Width of the model input passed to FCNN.
# NOTE(review): presumably 4 planes of the 10x10 grid plus a 5-element action
# encoding -- confirm against FCNN and MultiActorEnv.get_state, and keep in
# sync with the height/width used to build `env`.
input_dim = 4 * 10 * 10 + 5
model = FCNN(input_dim)

In [None]:
def init_seed(torch_seed=100, random_seed=100, np_random_seed=100):
    """Seed every random number generator used by the notebook.

    Args:
        torch_seed: seed handed to ``torch.manual_seed``.
        random_seed: seed handed to the stdlib ``random`` module.
        np_random_seed: seed handed to NumPy's global generator.

    Also switches PyTorch to deterministic algorithms.
    """
    seeders = (
        (torch.manual_seed, torch_seed),
        (random.seed, random_seed),
        (np.random.seed, np_random_seed),
    )
    for apply_seed, seed_value in seeders:
        apply_seed(seed_value)

    torch.use_deterministic_algorithms(True)

def init_weights(m):
    """Initialise one module in place; meant to be used with ``model.apply``.

    Plain ``nn.Linear`` layers get Xavier-uniform weights and a zero bias.
    Every other module is left untouched.

    Args:
        m: the module visited by ``nn.Module.apply``.
    """
    # Exact-type match (not isinstance) on purpose: subclasses of nn.Linear
    # were never re-initialised by this hook and still are not.
    if type(m) is not nn.Linear:
        return

    torch.nn.init.xavier_uniform_(m.weight)
    # nn.Linear(..., bias=False) stores ``bias`` as None; skip it instead of
    # failing with AttributeError.
    if m.bias is not None:
        torch.nn.init.zeros_(m.bias)

import numpy as np
import matplotlib.pyplot as plt
def MovingAveragePlot(input_list, window_size):
    """Plot a series next to its moving average.

    The left panel shows ``input_list`` as given; the right panel shows its
    moving average computed with a flat window in 'valid' mode, so it has
    ``len(input_list) - window + 1`` points.

    Args:
        input_list: 1-D sequence of numbers.
        window_size: requested window length. It is truncated to an int and
            clamped to ``[1, len(input_list)]`` so that a window longer than
            the data (or a non-positive one) cannot produce a meaningless
            convolution.
    """
    values = np.asarray(input_list, dtype=float)

    # Create both panels in a single call. Calling plt.subplots() and then
    # plt.subplot(1, 2, k) would leave an extra full-size axes underneath.
    fig, (raw_ax, avg_ax) = plt.subplots(1, 2, figsize=(14, 7))

    raw_ax.plot(range(len(values)), values)
    raw_ax.set_title("Raw values")
    raw_ax.set_xlabel("index")

    if len(values) > 0:
        window_len = max(1, min(int(window_size), len(values)))
        window = np.ones(window_len) / float(window_len)
        ave_values = np.convolve(values, window, 'valid')
        avg_ax.plot(range(len(ave_values)), ave_values)
        avg_ax.set_title("Moving average (window = {})".format(window_len))
    else:
        # np.convolve rejects empty input; leave the panel empty instead.
        avg_ax.set_title("Moving average (no data)")
    avg_ax.set_xlabel("index")

    plt.show()

In [None]:
import torch
import torch.nn as nn  # neural network modules
import torch.nn.functional as F  # activation functions
import torch.optim as optim  # optimizer
from torch.autograd import Variable # add gradients to tensors
from torch.nn import Parameter # model parameter functionality
import random
import numpy as np

In [None]:
def _epsilon_greedy_choice(preds, epsilon):
    """Return an action index chosen epsilon-greedily from per-action Q predictions."""
    # The uniform draw is always consumed, whichever branch is taken.
    p = np.random.uniform(0, 1)
    if p < epsilon:
        # Explore over every available action rather than a hard-coded count,
        # so exploration covers the same index range as the greedy branch.
        return np.random.randint(0, len(preds))

    list_pred = [x.item() for x in preds]
    max_pred = np.amax(list_pred)
    # Break ties between equally good actions at random.
    max_positions = [pos for pos, q in enumerate(list_pred) if q == max_pred]
    return random.choice(max_positions)


def TrainQlearning(model: nn.Module, env: MultiActorEnv, loss: torch.nn.modules.loss = nn.MSELoss()):
    '''
    Train ``model`` with Q-learning on ``env`` and return the logged targets.

    The model is called as ``model(state, action)`` and must return the
    predicted Q-value for that pair. For every play the environment is reset
    and, at each step, every active actor picks an action epsilon-greedily;
    the joint action is applied with ``env.step`` and the model is fitted to
    the one-step targets ``r + gamma * max_a Q(s', a)`` (no bootstrap term for
    actors reported as done).

    Side effects: reseeds the RNGs via ``init_seed()``, re-initialises the
    model with ``init_weights``, puts the model in training mode, and prints a
    progress line every 100 plays.

    Args:
        model: network to train (modified in place).
        env: environment providing ``reset``, ``get_actor_number``,
            ``get_state``, ``actions``, ``step`` and ``remove_dones``.
        loss: loss module applied to (predicted Q, target) batches.

    Returns:
        list with one entry per actor per step holding the target value
        ``r + gamma * max_a Q(s', a)`` (not the raw immediate reward).
    '''
    lr = 0.001
    num_of_play, max_play_length = 1000, 50
    gamma, epsilon, epsilon_low, epsilon_step = 0.99, 0.5, 0.1, 0.05
    optimizer = optim.Adam(model.parameters(), lr=lr)
    loss_func = loss

    init_seed()
    model.apply(init_weights)
    model.train()

    rewards = []

    for i in range(1, num_of_play+1):
        env.reset()

        if i % 100 == 0:
            print("play round: {}, ave reward (last {}): {:.4f}".format(
                i, 100, sum(rewards[-100:])/100))
            # Clamp instead of testing `epsilon > epsilon_low`: repeated float
            # subtraction lands slightly above the floor (0.10000000000000007),
            # which used to allow one extra decrement below epsilon_low.
            epsilon = max(epsilon_low, epsilon - epsilon_step)

        counter = 0
        # NOTE(review): `<=` allows max_play_length + 1 steps per play; kept
        # unchanged because the evaluation loops in this notebook use the same
        # bound.
        while counter <= max_play_length:
            counter += 1
            actor_number = env.get_actor_number()
            if actor_number == 0:
                break

            chosen_q = []
            choices = []
            for actor_id in range(actor_number):
                # Q(s_t, a) for all actions of this actor
                state = env.get_state(actor_id)
                preds = [model(state, action) for action in env.actions]

                choice = _epsilon_greedy_choice(preds, epsilon)
                choices.append(choice)
                # Keep the graph-attached prediction of the chosen action.
                chosen_q.append(preds[choice])

            # take the joint action, s_t,a -> s_{t+1}, and get the immediate rewards
            imm_reward, dones = env.step(choices)

            targets = []
            for actor_id in range(actor_number):
                future_reward = 0
                # Bootstrap from max_a Q(s_{t+1}, a) unless the actor is done.
                if not dones[actor_id]:
                    with torch.no_grad():
                        state = env.get_state(actor_id)
                        future_reward = max(
                            model(state, action).item() for action in env.actions)

                # Q(s,a|t) = r + gamma*max[Q(s,a|t+1)]
                tot_reward = imm_reward[actor_id] + gamma * future_reward
                rewards.append(tot_reward)
                targets.append(torch.Tensor([tot_reward]))

            batch_q = torch.stack(chosen_q, dim=0)
            batch_targets = torch.stack(targets, dim=0)

            # Clear stale gradients first so nothing accumulated before this
            # update can leak into it.
            optimizer.zero_grad()
            step_loss = loss_func(batch_q, batch_targets)
            step_loss.backward()
            optimizer.step()

            # Done actors are only dropped after their last transition was used.
            env.remove_dones()

    return rewards

In [None]:
# Train the model on the environment. `rewards` is rebound here to the list
# returned by TrainQlearning (it previously held the rewards of the manual
# env.step calls above).
rewards = TrainQlearning(model, env)

In [None]:
# Show the values logged during training next to their 100-point moving average.
MovingAveragePlot(rewards, 100)

In [None]:
def get_optimal_movs(env, model):
    """Pick the greedy action of every active actor and its plot displacement.

    For each actor the model is evaluated as ``model(state, action)`` on every
    entry of ``env.actions``; the index with the highest prediction is chosen,
    with ties broken at random.

    Args:
        env: environment providing ``get_actor_number``, ``get_state`` and
            ``actions``.
        model: callable returning an object with ``.item()`` for a
            (state, action) pair.

    Returns:
        ``(movs, choices)`` where ``movs`` is ``[dx_list, dy_list]`` (the two
        arrow-component lists RoutePlot hands to ``quiver``) and ``choices``
        is the list of chosen action indices, one per actor. With no active
        actor this is ``([[], []], [])``, so callers can always unpack two
        values.

    Raises:
        AssertionError: if a chosen index has no known displacement.
    """
    # action index -> (dx, dy)
    displacement = {
        0: (0, -1),
        1: (0, 1),
        2: (-1, 0),
        3: (1, 0),
        4: (0, 0),
    }

    movs = [[], []]
    # Local result list: the function must not depend on (or append to) a
    # notebook-level `choices` variable.
    choices = []

    for actor_id in range(env.get_actor_number()):
        state = env.get_state(actor_id)
        # Inference only: no autograd graph is needed for these predictions.
        with torch.no_grad():
            list_pred = [model(state, action).item() for action in env.actions]

        max_pred = np.amax(list_pred)
        max_positions = [pos for pos, q in enumerate(list_pred) if q == max_pred]
        choice = random.choice(max_positions)
        choices.append(choice)

        assert choice in displacement, "Unknow operation"
        dx, dy = displacement[choice]
        movs[0].append(dx)
        movs[1].append(dy)

    return movs, choices

In [None]:
# Greedy roll-out of the trained model, printing the grid after every step.
env.reset()
print(env.grid)
env.display()
print(env.get_actor_number())
print("-------------------------------------------------------------")

counter = 0
max_play_length = 50
model.eval()
while counter <= max_play_length and env.get_actor_number() > 0:
    counter += 1
    actor_number = env.get_actor_number()
    if actor_number == 0:
        break

    choices = []
    for actor_id in range(actor_number):
        # Q(s_t, a) for every action of this actor
        state = env.get_state(actor_id)
        list_pred = [model(state, action).item() for action in env.actions]
        print(list_pred)

        # Greedy pick; equally good actions are tie-broken at random.
        max_pred = np.amax(list_pred)
        max_positions = [pos for pos, q in enumerate(list_pred) if q == max_pred]
        choices.append(random.choice(max_positions))

    print(choices)
    env.step(choices)
    print(env.grid)
    env.display()
    env.remove_dones()
    print(env.get_actor_number())
    print("-------------------------------------------------------------")

In [None]:
# Reset, show the grid, apply one hand-written joint action (one entry per
# actor) and show the grid again to inspect the effect of the step.
env.reset()
print(env.grid)
env.display()

env.step(['r', 'l', 'l', 'r'])
print(env.grid)
env.display()

In [None]:
def RoutePlot(width, height, obs, tgts, locs, movs=None, image_name='foo.png'):
    """Draw one frame of the grid world and save it to ``image_name``.

    Args:
        width, height: grid size in cells; also used as the figure size.
        obs: ``obs[0]`` / ``obs[1]`` are the x / y coordinates drawn as red
            crosses.
        tgts: ``tgts[0]`` / ``tgts[1]`` are x / y coordinates drawn as blue
            discs, each annotated with the matching entry of ``tgts[2]``.
        locs: ``locs[0]`` / ``locs[1]`` are x / y coordinates drawn as green
            discs, each annotated with the matching entry of ``locs[2]``.
        movs: optional ``[dx_list, dy_list]`` of arrow components drawn from
            each ``locs`` position. If it is shorter than ``locs`` the missing
            entries are treated as zero-length arrows. The caller's lists are
            not modified.
        image_name: path the figure is saved to.
    """
    fig, ax = plt.subplots(figsize=(width, height))

    # Cell centres sit on integer coordinates, so limits and grid lines are
    # placed on the half-integers between them.
    ax.set_xlim(-0.5, width - 0.5)
    ax.set_ylim(-0.5, height - 0.5)
    ax.set_xticks(np.arange(-0.5, width + 0.5, 1))
    ax.set_yticks(np.arange(-0.5, height + 0.5, 1))
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.tick_params(axis="x", direction="in")
    ax.tick_params(axis="y", direction="in")
    ax.grid()

    # y grows downwards in the plot.
    ax.invert_yaxis()

    # show obs
    ax.scatter(obs[0], obs[1], s=500, marker='x', color='r')

    # show tgts
    ax.scatter(tgts[0], tgts[1], s=1000, marker='o', color=(0.5, 0.5, 1.0, 0.75))
    for x, y, l in zip(tgts[0], tgts[1], tgts[2]):
        ax.annotate("{}".format(l), (x, y), size=20, textcoords="offset points",
                    xytext=(0, -7), ha='center')

    # show mov direction
    if movs is not None:
        # Pad copies rather than appending to the caller's lists, so the
        # argument is left exactly as it was passed in.
        padding = [0] * max(0, len(locs[0]) - len(movs[0]))
        mov_x = list(movs[0]) + padding
        mov_y = list(movs[1]) + padding
        ax.quiver(locs[0], locs[1], mov_x, mov_y, color=(0, 0.2, 0),
                  angles='xy', scale_units='xy', scale=1)

    # show locs
    ax.scatter(locs[0], locs[1], s=1000, marker='o', color=(0.0, 0.5, 0.0))
    for x, y, l in zip(locs[0], locs[1], locs[2]):
        ax.annotate("{}".format(l), (x, y), size=20, textcoords="offset points",
                    xytext=(0, -7), ha='center')

    fig.savefig(image_name)

In [None]:
# Plot a single frame right after a reset: the positions returned by the
# environment plus the arrows for the actions the model currently prefers.
width = 10
height = 10
env.reset()
obs = env.get_obs()
tgts = env.get_tgts()
locs = env.get_locs()
# Only the arrow components are needed here; the chosen action indices are discarded.
movs, _ = get_optimal_movs(env, model)
RoutePlot(width, height, obs, tgts, locs, movs)

In [None]:
# Greedy roll-out of the trained model that saves one image per step
# ("0.png", "1.png", ...) plus a final frame without arrows.
env.reset()

width = 10
height = 10

counter = 0
max_play_length = 50
model.eval()
while (counter <= max_play_length and env.get_actor_number() > 0):
    # Snapshot the positions before the step so the arrows start from them.
    obs = env.get_obs()
    tgts = env.get_tgts()
    locs = env.get_locs()

    # Select the actions exactly once per frame and use that single selection
    # both for the arrows and for env.step. Selecting twice could tie-break
    # differently and plot moves that were never taken. The list is reset
    # first so that one action per active actor is stepped.
    choices = []
    movs, choices = get_optimal_movs(env, model)
    RoutePlot(width, height, obs, tgts, locs, movs, str(counter)+".png")

    env.step(choices)
    env.remove_dones()
    counter += 1

# Final frame: positions after the last step, no arrows.
obs = env.get_obs()
tgts = env.get_tgts()
locs = env.get_locs()
RoutePlot(width, height, obs, tgts, locs, None, str(counter)+".png")