Commit: OOP refactoring
adityasidharta committed Dec 9, 2018
1 parent b161120 commit e4371db
Showing 15 changed files with 174 additions and 54 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -100,5 +100,8 @@ venv.bak/
 # mkdocs documentation
 /site
 
+# idea
+.idea/
+
 # mypy
 .mypy_cache/
File renamed without changes.
7 changes: 0 additions & 7 deletions model/agent.py

This file was deleted.

13 changes: 0 additions & 13 deletions model/config.py

This file was deleted.

2 changes: 1 addition & 1 deletion notebooks/reinforcement_q_learning.ipynb
@@ -682,7 +682,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.6.6"
+   "version": "3.6.5"
   }
  },
 "nbformat": 4,
…
Empty file added src/__init__.py
Empty file.
12 changes: 12 additions & 0 deletions src/agent.py
@@ -0,0 +1,12 @@
class Agent(object):
    def __init__(self, learner, memory, policy, value_function, envs, config):
        self.learner = learner
        self.memory = memory
        self.policy = policy
        self.value_function = value_function
        self.envs = envs
        self.config = config

    def train_agent(self, n_iteration):
        pass
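Editor's note: train_agent is committed as a stub. A minimal sketch of the loop it could grow into, assuming a Gym-style envs object (reset, and step returning a 4-tuple) and the Memory, Policy, and learner APIs introduced elsewhere in this commit; none of this is the author's code:

    def train_agent(self, n_iteration):
        for _ in range(n_iteration):
            state = self.envs.reset()
            finish = False
            while not finish:
                action = self.policy.select_action(self.value_function)
                next_state, reward, finish, _ = self.envs.step(action)
                # store the transition, then take one optimization step
                self.memory.save(list(state), int(action), float(reward), list(next_state), int(finish))
                self.learner.optimize_new_qnet(self.memory, self.config)
                state = next_state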

21 changes: 21 additions & 0 deletions src/config.py
@@ -0,0 +1,21 @@
import torch

BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 200
TARGET_UPDATE = 10
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class Config:
    def __init__(self, batch_size, gamma, eps_start, eps_end, eps_decay, target_update, device):
        self.batch_size = batch_size
        self.gamma = gamma
        self.eps_start = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay
        self.target_update = target_update
        self.device = device
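Editor's note: a minimal usage sketch wiring the module-level defaults above into the new Config class (purely illustrative, not part of the commit):

    config = Config(
        batch_size=BATCH_SIZE,
        gamma=GAMMA,
        eps_start=EPS_START,
        eps_end=EPS_END,
        eps_decay=EPS_DECAY,
        target_update=TARGET_UPDATE,
        device=DEVICE,
    )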

File renamed without changes.
29 changes: 17 additions & 12 deletions model/q_function.py → src/learner.py
@@ -1,39 +1,44 @@
 import torch
 
 
 # TODO finish Q-Function
-class Q_Function(object):
+class Optimize_Model(object):
     def __init__(self, old_qnet, new_qnet, optimizer):
         self.new_qnet = new_qnet
         self.old_qnet = old_qnet
         self.old_qnet.load_state_dict(self.new_qnet.state_dict())
         self.old_qnet = self.old_qnet.eval()
         self.optimizer = optimizer
-        self.torch_device = self.config['DEVICE']
-        self.gamma = self.config['GAMMA']
-        self.batch_size = self.config['BATCH_SIZE']
 
     def calc_q(self, state_tensor):
         with torch.no_grad():
             return self.old_qnet(state_tensor).cpu().numpy()
 
-    def optimize_new_qnet(self, batch_size, memory, config):
+    def optimize_new_qnet(self, memory, config):
+        torch_device = config.device
+        gamma = config.gamma
+        batch_size = config.batch_size
+
         if len(memory) < batch_size:
             pass
         else:
             state_tensor, action_tensor, reward_tensor, next_state_tensor, finish_tensor = memory.sample(
-                batch_size, torch=True, device=config["DEVICE"]
+                batch_size, return_tensor=True, torch_device=torch_device
             )
 
+            finish_index = torch.nonzero(finish_tensor.view(-1)).view(-1)
             cur_q = self.new_qnet(state_tensor)
-            cur_qa = cur_q.gather(action_tensor)
+            n_actions = cur_q.shape[1]
+            cur_qa = cur_q.gather(1, action_tensor)
 
             with torch.no_grad():
-                unfinished_next_state_tensor = next_state_tensor[~finish_tensor]
-                next_q = torch.zeros_like(cur_q)
-                next_q[~finish_tensor] = old_net(unfinished_next_state_tensor)
+                unfinished_next_state_tensor = next_state_tensor[finish_index, :]
+                next_q = self.old_qnet(unfinished_next_state_tensor)
                 next_qa = next_q.max(1)[0]
-            exp_qa = reward_tensor + (config["GAMMA"] * next_qa)
+                exp_qa = reward_tensor + (gamma * next_qa)
 
-            # LOSS FUNCTION
+            # LOSS FUNCTION
 
             raise NotImplementedError()
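Editor's note: as committed, optimize_new_qnet still ends in NotImplementedError, and the new indexing looks inverted: finish_index selects the finished transitions, yet the following line treats them as the unfinished ones (the deleted ~finish_tensor logic was the reverse). A minimal sketch of how the TODO could be completed, following the standard DQN recipe; this is editorial, not the author's code, and it assumes the tensors produced above:

    # indices of transitions that did NOT end the episode
    unfinished_index = torch.nonzero((finish_tensor == 0).view(-1)).view(-1)
    next_qa = torch.zeros(batch_size, device=torch_device)
    with torch.no_grad():
        # bootstrap only from next states that exist (non-terminal)
        next_q = self.old_qnet(next_state_tensor[unfinished_index, :])
        next_qa[unfinished_index] = next_q.max(1)[0]
    exp_qa = reward_tensor + gamma * next_qa

    # Huber loss between predicted Q(s, a) and the bootstrapped target
    loss = torch.nn.functional.smooth_l1_loss(cur_qa.squeeze(1), exp_qa)
    self.optimizer.zero_grad()
    loss.backward()
    self.optimizer.step()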
39 changes: 20 additions & 19 deletions model/memory.py → src/memory.py
@@ -1,14 +1,15 @@
 import random
 import numpy as np
 import torch
 
 
 # State, Action, Reward, Next State
 class Memory(object):
     def __init__(self, capacity, n_state):
         self.capacity = capacity
-        self.memory = list()
-        self.position = 0
+        self.n_state = n_state
+        self.memory = []
+        self.position = 0
 
     def __len__(self):
         return len(self.memory)
@@ -21,33 +22,33 @@ def update_position(self):
         if self.position == self.capacity:
             self.position = 0
 
-    def save(self, state, action, reward, next_state, finish):
-        if finish:
-            next_state = [np.nan] * self.n_state
+    def save(self, state: list, action: int, reward: float, next_state: list, finish: int):
+        assert len(state) == self.n_state
+        next_state = [np.nan] * self.n_state if finish else next_state
         if self.is_memory_full():
             self.memory[self.position] = (state, action, reward, next_state, finish)
         else:
-            self.memory.extend([state, action, reward, next_state, finish])
+            self.memory.append([state, action, reward, next_state, finish])
         self.update_position()
 
-    def sample(self, sample_size, torch=False, device=None):
+    def sample(self, sample_size, return_tensor=False, torch_device=None):
+        if torch_device is None:
+            torch_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         if sample_size > len(self.memory):
             raise ValueError("Sample size is bigger than the memory size")
         else:
             sample_list = random.sample(self.memory, sample_size)
-            state = np.stack([x[0] for x in sample_list])
-            action = np.stack([x[1] for x in sample_list])
-            reward = np.stack([x[2] for x in sample_list])
-            next_state = np.stack([x[3] for x in sample_list])
-            finish = np.stack([x[4] for x in sample_list])
-            if torch:
-                assert device is not None
-                state_tensor = torch.from_numpy(state).to(device)
-                action_tensor = torch.from_numpy(action).to(device)
-                reward_tensor = torch.from_numpy(reward).to(device)
-                next_state_tensor = torch.from_numpy(next_state).to(device)
-                finish_tensor = torch.from_numpy(finish).to(device)
+            state = np.stack([x[0] for x in sample_list]).astype(float)
+            action = np.stack([x[1] for x in sample_list]).astype(int)
+            reward = np.stack([x[2] for x in sample_list]).astype(float)
+            next_state = np.stack([x[3] for x in sample_list]).astype(float)
+            finish = np.stack([x[4] for x in sample_list]).astype(int)
+            if return_tensor:
+                state_tensor = torch.from_numpy(state).to(torch_device, dtype=torch.float)
+                action_tensor = torch.from_numpy(action).to(torch_device, dtype=torch.int)
+                reward_tensor = torch.from_numpy(reward).to(torch_device, dtype=torch.float)
+                next_state_tensor = torch.from_numpy(next_state).to(torch_device, dtype=torch.float)
+                finish_tensor = torch.from_numpy(finish).to(torch_device, dtype=torch.int)
                 return (
                     state_tensor,
                     action_tensor,
…
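Editor's note: a quick usage sketch of the refactored Memory API (illustrative only; it assumes sample returns the five values in the order unpacked above):

    memory = Memory(capacity=100, n_state=4)
    memory.save([0.1, 0.2, 0.3, 0.4], action=1, reward=1.0, next_state=[0.2, 0.3, 0.4, 0.5], finish=0)
    state_t, action_t, reward_t, next_state_t, finish_t = memory.sample(
        1, return_tensor=True, torch_device=torch.device("cpu")
    )
    # state_t: shape (1, 4), dtype torch.float32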
3 changes: 3 additions & 0 deletions model/policy.py → src/policy.py
@@ -19,6 +19,9 @@ def calc_eps_threshold(self):
     def update_n_actions(self):
         self.n_actions = self.n_actions + 1
 
+    def reset_n_actions(self):
+        self.n_actions = 0
+
     def select_action(self, value_function):
         eps_threshold = self.calc_eps_threshold()
         self.update_n_actions()
…
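Editor's note: calc_eps_threshold is collapsed in this view. Given the EPS_START, EPS_END, and EPS_DECAY constants in src/config.py, it plausibly follows the exponential schedule from the PyTorch DQN tutorial that notebooks/reinforcement_q_learning.ipynb is based on; this is an assumption about the hidden body, sketched as:

    import math

    def calc_eps_threshold(eps_start, eps_end, eps_decay, n_actions):
        # decay from eps_start toward eps_end as the action counter grows
        return eps_end + (eps_start - eps_end) * math.exp(-n_actions / eps_decay)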
4 changes: 2 additions & 2 deletions model/arch.py → src/value_function.py
@@ -1,9 +1,9 @@
 from torch import nn as nn
 
 
-class CartNet(nn.Module):
+class ValueFunction(nn.Module):
     def __init__(self, hidden_dim):
-        super(CartNet, self).__init__()
+        super(ValueFunction, self).__init__()
         self.hidden_dim = hidden_dim
         self.model_list = [
             nn.Linear(4, hidden_dim),
…
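Editor's note: the rest of this file is collapsed. One caveat worth flagging: a plain Python list of layers is not registered by nn.Module, so its parameters stay invisible to optimizers and to .to(device). Unless the hidden lines already wrap it (e.g. nn.Sequential(*self.model_list) or nn.ModuleList), something like this sketch is the usual fix (layer sizes illustrative; 4 inputs and 2 actions match CartPole):

    self.model = nn.Sequential(
        nn.Linear(4, hidden_dim),
        nn.ReLU(),
        nn.Linear(hidden_dim, 2),
    )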
Empty file added test/__init__.py
Empty file.
95 changes: 95 additions & 0 deletions test/test_memory.py
@@ -0,0 +1,95 @@
from src.memory import Memory
import random
import torch


def create_transitions(state=None,
                       action=None,
                       reward=None,
                       next_state=None,
                       finish=None,
                       n_state=None):
    n_state = 4 if n_state is None else n_state
    state = [random.random() for x in range(n_state)] if state is None else state
    action = random.sample([0, 1], 1)[0] if action is None else action
    reward = random.random() if reward is None else reward
    next_state = [random.random() for x in range(n_state)] if next_state is None else next_state
    finish = random.sample([False, True], 1)[0] if finish is None else finish
    return state, action, reward, next_state, finish


def test_init():
    memory = Memory(10, 4)
    assert len(memory) == 0
    assert not memory.is_memory_full()
    assert memory.position == 0
    assert memory.n_state == 4
    assert memory.capacity == 10


def test_len():
    memory = Memory(10, 4)
    for idx in range(5):
        state, action, reward, next_state, finish = create_transitions()
        memory.save(state, action, reward, next_state, finish)
    assert len(memory) == 5


def test_is_memory_full():
    memory = Memory(10, 4)
    assert not memory.is_memory_full()
    for idx in range(5):
        state, action, reward, next_state, finish = create_transitions()
        memory.save(state, action, reward, next_state, finish)
    assert not memory.is_memory_full()
    for idx in range(5):
        state, action, reward, next_state, finish = create_transitions()
        memory.save(state, action, reward, next_state, finish)
    assert memory.is_memory_full()
    for idx in range(5):
        state, action, reward, next_state, finish = create_transitions()
        memory.save(state, action, reward, next_state, finish)
    assert memory.is_memory_full()


def test_update_position():
    memory = Memory(10, 4)
    assert memory.position == 0
    for idx in range(5):
        state, action, reward, next_state, finish = create_transitions()
        memory.save(state, action, reward, next_state, finish)
    assert memory.position == 5
    for idx in range(5):
        state, action, reward, next_state, finish = create_transitions()
        memory.save(state, action, reward, next_state, finish)
    assert memory.position == 0
    for idx in range(5):
        state, action, reward, next_state, finish = create_transitions()
        memory.save(state, action, reward, next_state, finish)
    assert memory.position == 5


def test_sample():
    memory = Memory(10, 4)
    for idx in range(5):
        state, action, reward, next_state, finish = create_transitions(
            state=[idx, idx, idx, idx],
            reward=idx,
            next_state=[idx, idx, idx, idx],
        )
        memory.save(state, action, reward, next_state, finish)
    sampled_state, sampled_action, sampled_reward, sampled_next_state, sampled_finish = memory.sample(3)
    assert sampled_state.shape == (3, 4)
    assert sampled_next_state.shape == (3, 4)
    assert (sampled_state[:, 0] == sampled_reward).all()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tensor_state, tensor_action, tensor_reward, tensor_next_state, tensor_finish = memory.sample(
        3, return_tensor=True, torch_device=device
    )
    assert tensor_state.shape == torch.Size([3, 4])
    assert tensor_next_state.shape == torch.Size([3, 4])

    # check dtypes rather than type strings so the test passes on both CPU and GPU machines
    assert tensor_state.dtype == torch.float32
    assert tensor_action.dtype == torch.int32
    assert tensor_reward.dtype == torch.float32
    assert tensor_next_state.dtype == torch.float32
    assert tensor_finish.dtype == torch.int32
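Editor's note: with the new src/ and test/ packages (both carry an __init__.py), the suite should run from the repository root with pytest, assuming pytest is installed: pytest test/test_memory.py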
