In [1]:
import gym
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

In [2]:
# CARTPOLE GAME SETTINGS
OBSERVATION_SPACE_DIMS = 4
ACTION_SPACE = [0,1]

# AGENT/NETWORK HYPERPARAMETERS
HIDDEN_UNITS = 64
EPSILON_INITIAL = 0.5 # exploration rate
EPSILON_DECAY = 0.99
EPSILON_MIN = 0.01
ALPHA = 0.001 # learning rate
GAMMA = 0.99 # discount factor
TAU = 0.1 # target network soft update hyperparameter
EXPERIENCE_REPLAY_BATCH_SIZE = 32
AGENT_MEMORY_LIMIT = 2000
MIN_MEMORY_FOR_EXPERIENCE_REPLAY = 500

dtype = torch.float64

In [3]:
# same environment as last week

class Environment:
    def __init__(self, ip = "127.0.0.1", port = 13000):
        self.client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        self.ip     = ip
        self.port   = port

        self.client.connect((ip, port))

    def reset(self):
        self._send(0, 0)
        return self._receive()

    def step(self, action):
        self._send(action, 1)
        return self._receive()

    def _receive(self):
        data = self.client.recv(19)
        reward = data[0]
        state = [struct.unpack("@f", data[1 + i * 4: 5 + i * 4])[0] for i in range(4)]
        status = [data[17], data[18]]
        return reward, state, status

    def _send(self, action, command):
        self.client.send(bytes([action, command]))

In [4]:
class DQN(nn.Module):
    
    def __init__(self):
        super(DQN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(OBSERVATION_SPACE_DIMS, HIDDEN_UNITS),
            nn.ReLU(),
            nn.Linear(HIDDEN_UNITS, len(ACTION_SPACE))
        )
        
    def forward(self, x):
        return self.model(x)
    
    
class DQN(nn.Module):
    
    def __init__(self):
        super(DQN, self).__init__()

        self.model = torch.nn.Sequential(
            torch.nn.Linear(OBSERVATION_SPACE_DIMS, HIDDEN_UNITS),
            torch.nn.BatchNorm1d(HIDDEN_UNITS),
            torch.nn.PReLU(),
            torch.nn.Linear(HIDDEN_UNITS, HIDDEN_UNITS),
            torch.nn.BatchNorm1d(HIDDEN_UNITS),
            torch.nn.PReLU(),
            torch.nn.Linear(HIDDEN_UNITS, len(ACTION_SPACE))
        )
        
    def forward(self, x):
        return self.model(x)


In [5]:
class DoubleDQNAgent:
    
    def __init__(self):
        self.memory = []
        self.qnet = DQN()
        self.qhat = DQN()
        self.epsilon = EPSILON_INITIAL
        self.has_talked = False #??
        
    def _step(self, state):
        if self.epsilon > np.random.rand(): # explore
            return np.random.choice(ACTION_SPACE)
        else: # exploit
            state = torch.Tensor(state, dtype=dtype)
            q_val = self.qnet(state).detach().numpy()
            return np.argmax(q_val)
    
    def optimize(self, states, qvals):
        pass
    
    def experience_replay(self):
        minibatch = random.sample(self.memory, EXPERIENCE_REPLAY_BATCH_SIZE)
        minibatch_new_qvals = []
        minibatch_states = []
        
        for exp in minibatch:
            state, action, reward, next_state, done = exp
            minibatch_states.append(state)
            
            state = torch.Tensor(state)
            qvals = self.qnet(state)
            if done:
                q_update = reward
            else:
                next_state = torch.Tensor(next_state)
                selected_action = np.argmax(qvals)
                
                targetq = self.qhat(next_state)[selected_action]
                q_update = reward + GAMMA * targetq
            
            qvals[action] = q_update
            minibatch_new_qvals.append(qvals)
        
        minibatch_new_qvals = np.array(minibatch_new_qvals)
        self.optimize(minibatch_states, minibatch_new_qvals)
        
    def update_qhat(self):
        qnet_theta = self.qnet.get_weights()
        qhat_theta = self.qhat.get_weights()
        counter = 0
        for q_weight, target_weight in zip(qnet_theta, qhat_theta):
            target_weight = target_weight * (1-TAU) + q_weight * TAU
            target_network_theta[counter] = target_weight
            counter += 1
        #self.qhat.     #target_network.set_weights(target_network_theta)                

In [6]:
env = gym.make("CartPole-v1")
env.seed(1)

net = DQN()

  result = entry_point.load(False)


In [54]:
env.reset()
state, reward, done, _ = env.step(0)
net(torch.Tensor(state))

ValueError: expected 2D or 3D input (got 1D input)

In [64]:
#net.state_dict()["model.0.bias"].shape