In [1]:
import torch
import torch.nn as nn
import datetime
import torch.nn.functional as F
from env_parallel import Env
from ppo import Agent
import torch.nn.init as init
import numpy as np

In [2]:
class PolicyValueModel(nn.Module):
    """Two-headed MLP that produces action logits and a scalar value.

    The policy head (``fc_p1`` -> ReLU -> ``fc_p2``) and the value head
    (``fc_v1`` -> ReLU -> ``fc_v2``) are independent two-layer perceptrons
    that read the same input; no layers are shared between them.

    Args:
        count_of_actions: Number of logits emitted by the policy head. When
            ``input_dim`` is not given it is also used as the size of the
            input feature vector (the original behaviour).
        hidden_size: Width of the single hidden layer in each head.
            Defaults to 16, the previously hard-coded value.
        input_dim: Size of the last dimension of the input tensor. ``None``
            (default) falls back to ``count_of_actions``.
    """

    def __init__(self, count_of_actions, hidden_size=16, input_dim=None):
        super().__init__()

        if input_dim is None:
            input_dim = count_of_actions

        # Layers are created policy-head first, then value-head, exactly as
        # before, so a seeded run draws the same random numbers and ends up
        # with the same initial weights.
        self.fc_p1 = nn.Linear(input_dim, hidden_size)
        self.fc_p2 = nn.Linear(hidden_size, count_of_actions)

        self.fc_v1 = nn.Linear(input_dim, hidden_size)
        self.fc_v2 = nn.Linear(hidden_size, 1)

        # Xavier-normal weights and zero biases for every linear layer.
        for layer in (self.fc_p1, self.fc_p2, self.fc_v1, self.fc_v2):
            nn.init.xavier_normal_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Run both heads on ``x``.

        Args:
            x: Tensor whose last dimension matches the input size the model
                was built with.

        Returns:
            Tuple ``(logit, value)``: ``logit`` has ``count_of_actions``
            entries in its last dimension (raw, un-normalised scores) and
            ``value`` has a single entry in its last dimension.
        """
        logit = self.fc_p2(F.relu(self.fc_p1(x)))
        value = self.fc_v2(F.relu(self.fc_v1(x)))

        return logit, value


In [5]:
# Pick the GPU when one is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print('device: ', device)
start_date = datetime.datetime.now()

# Seed the RNGs before the environment and the network are built so that the
# initial weights (and any torch/numpy sampling) repeat across
# "Restart & Run All". NOTE(review): Env/Agent may use other RNG sources
# (e.g. Python's `random`) that are not covered here -- confirm.
seed = 0
torch.manual_seed(seed)
np.random.seed(seed)

# Run configuration. The first three values are handed to Env positionally
# (env_p, env_candidates, env_count); env_candidates also sizes the network
# and the training input_dim below.
env_candidates = 87
env_p = 8
env_count = 50
results_path = 'results/'

env = Env(env_p, env_candidates, env_count, 'data/BA-87', device)

net = PolicyValueModel(env_candidates)
# Alternative: resume from a saved model instead of a fresh network.
# torch.load unpickles arbitrary objects -- only load files you trust.
# net = torch.load('models/save.net')

agent = Agent(net, device=device,
              lr=0.01, name='p_med', results_path=results_path)

agent.train(env=env, count_of_envs=env_count, input_dim=(env_candidates,),
            count_of_iterations=5, count_of_steps=512, batch_size=512)

# agent.test(env)

# Wall-clock time elapsed since start_date (covers env setup and training).
print(datetime.datetime.now() - start_date)


device:  cpu
optimizer: Adam
Training is starting
iteration:        0 	epsiode:  50 	score:  -104.4750 	avg score:  -61.9758 	best score:  -33.4000 	best avg score:  -61.9758 	Best obj: 33400.0
iteration:        0 	epsiode:  100 	score:  -51.5100 	avg score:  -62.2805 	best score:  -33.4000 	best avg score:  -61.9758 	Best obj: 33400.0
iteration:        0 	epsiode:  150 	score:  -60.8280 	avg score:  -60.3605 	best score:  -33.4000 	best avg score:  -60.3605 	Best obj: 33400.0
iteration:        0 	epsiode:  200 	score:  -54.0540 	avg score:  -60.0893 	best score:  -33.4000 	best avg score:  -60.0893 	Best obj: 33400.0
iteration:        0 	epsiode:  250 	score:  -38.3200 	avg score:  -61.2453 	best score:  -33.1340 	best avg score:  -60.0893 	Best obj: 33134.0
iteration:        0 	epsiode:  300 	score:  -67.8260 	avg score:  -63.3155 	best score:  -33.1340 	best avg score:  -60.0893 	Best obj: 33134.0
iteration:        0 	epsiode:  350 	score:  -38.1950 	avg score:  -62.8943 	best score