In [1]:
import random
import math
from collections import defaultdict, namedtuple
from itertools import count

import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

torch.manual_seed(3001)

from agent import *
from game import *
from trainer import *
from utils import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
class DQN(nn.Module):

    def __init__(self, inp_size, emb_size, hid_size, out_size):
        super(DQN, self).__init__()
        self.emb = nn.Linear(inp_size, emb_size)
        
        self.mid = nn.Linear(emb_size, hid_size)
        self.out = nn.Linear(hid_size, out_size)
        
    def forward(self, x):
        emb = F.relu(self.emb(x))
        out = F.relu(self.mid(emb))
        
        act = self.out(out)
        return F.softmax(act, dim=-1)

In [3]:
def stateMax(params):
    return torch.tensor([params['idx']/params['n_idx'], params['val']/params['hi']]).unsqueeze(0)

In [4]:
dqn_net_params = {'inp_size':2,
                  'emb_size':100,
                  'hid_size':256,
                  'out_size':2
                 }

policy_net = DQN(inp_size=2, emb_size=100, hid_size=256, out_size=2).to(device)
target_net = DQN(inp_size=2, emb_size=100, hid_size=256, out_size=2).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

memory = ReplayMemory(100_000)

dqn_params = {'batch_size':128,
              'gamma':0.9,
              'eps':0.1,
              'eps_decay':1e-5,
              'target_update':1000,
              'p_to_s':stateMax,
              'p_net':policy_net,
              't_net':target_net,
              'optimizer':optim.RMSprop(policy_net.parameters()),
              'loss':F.mse_loss,
              'memory':memory
              }

agent = DQAgent(**dqn_params)

In [3]:
nn.L1Loss()

L1Loss()

In [5]:
game_params = {'lo':1,
               'hi':10000,
               'n_idx':50,
               'replace':False,
               'reward_fn': rewardTopN,
               'reward':{'n':11, 'pos':10, 'neg':-10}
              }
  
game = Game(**game_params)

In [7]:
curr_params = {"epoch":100, 
               'params':{"pos":0,
                         "neg":0,
                         "op":"-",
                         "n":1
                        }
              }

trainer = DQTrainer()
trainer.train(game, agent, 1_000, 100, 0, curr_params)
trainer.eval(game, agent, 100, 10, 0)

TRAINING COMPLETE |	 FINAL VICTORY PERCENTAGE: 0.03


0.03

TRAINING COMPLETE |	 FINAL VICTORY PERCENTAGE: 0.14


0.14

'Eval'

167