In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

from collections import deque
from tqdm import tqdm

In [2]:
class DQN(nn.Module):
    """Fully connected Q-network bundled with its own optimizer and loss.

    The network is a stack of three ``nn.Linear`` layers with ReLU
    activations between them; the last layer emits one value per action.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        input_dims: Size of the input feature vector.
        fc1_dims: Width of the first hidden layer.
        fc2_dims: Width of the second hidden layer.
        n_actions: Number of output units (one per action).
    """

    def __init__(self, lr=3e-4, input_dims=None,
                 fc1_dims=256, fc2_dims=256,
                 n_actions=3):
        super().__init__()

        self.lr = lr
        self.input_dims = input_dims
        self.fc1_dims = fc1_dims
        self.fc2_dims = fc2_dims
        self.n_actions = n_actions

        # Build Linear -> ReLU pairs from consecutive layer widths, then
        # drop the trailing ReLU so the output layer stays linear.
        widths = (self.input_dims, self.fc1_dims, self.fc2_dims, self.n_actions)
        layers = []
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.ReLU())
        layers.pop()
        self.net = nn.Sequential(*layers)

        self.optimizer = optim.Adam(params=self.parameters(), lr=lr)
        self.loss = nn.MSELoss()

        # Prefer the GPU when one is available and move the parameters there.
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = torch.device(device_name)
        self.to(self.device)

    def forward(self, observation):
        """Return the network output for a batch of observations."""
        return self.net(observation)
    

In [3]:
class ReplayMemory(object):
    """Fixed-capacity ring buffer of transitions backed by numpy arrays.

    Args:
        max_mem_size: Number of transitions the buffer can hold.
        input_dims: Length of a single state vector.
    """

    def __init__(self, max_mem_size, input_dims):
        self.mem_cntr = 0  # total number of transitions ever stored
        self.mem_size = max_mem_size
        self.input_dims = input_dims

        state_shape = (max_mem_size, input_dims)
        self.state_memory = np.zeros(state_shape, dtype=np.float32)
        self.new_state_memory = np.zeros(state_shape, dtype=np.float32)
        self.action_memory = np.zeros(max_mem_size, dtype=np.int32)
        self.reward_memory = np.zeros(max_mem_size, dtype=np.float32)
        self.terminal_memory = np.zeros(max_mem_size, dtype=np.int32)

    def store_transition(self, state, action, reward,
                         state_, terminal):
        """Write one transition into the buffer, overwriting the oldest slot once full.

        Args:
            state: State before the action.
            action: Index of the action taken.
            reward: Reward received.
            state_: State after the action.
            terminal: Flag marking the end of an episode (stored as an int).
        """
        # The write position wraps around once the counter passes capacity.
        slot = self.mem_cntr % self.mem_size

        writes = (
            (self.state_memory, state),
            (self.new_state_memory, state_),
            (self.reward_memory, reward),
            (self.action_memory, action),
            (self.terminal_memory, terminal),
        )
        for buffer, value in writes:
            buffer[slot] = value

        self.mem_cntr += 1
        

In [4]:
class Agent(object):
    """Epsilon-greedy DQN agent with a uniformly sampled replay buffer.

    Args:
        gamma: Discount factor applied to the bootstrapped next-state value.
        epsilon: Initial exploration probability.
        lr: Learning rate of the Q-network optimizer.
        input_dims: Length of a single observation vector.
        batch_size: Number of transitions sampled per training step.
        n_actions: Number of discrete actions.
        max_mem_size: Capacity of the replay buffer.
        eps_end: Lower bound for epsilon.
        eps_dec: Amount subtracted from epsilon after each training step.
    """

    def __init__(self, gamma, epsilon, lr, input_dims, batch_size,
                n_actions, max_mem_size=100000,
                eps_end=0.05, eps_dec=5e-4):
        self.gamma = gamma
        self.epsilon = epsilon
        self.lr = lr
        self.input_dims = input_dims
        self.batch_size = batch_size
        self.n_actions = n_actions
        self.mem_size = max_mem_size
        self.eps_min = eps_end
        self.eps_dec = eps_dec

        self.iter_cntr = 0  # number of completed training steps

        self.action_space = list(range(self.n_actions))

        self.Q_eval = DQN(lr=lr, n_actions=n_actions,
                         input_dims=input_dims,
                         fc1_dims=256, fc2_dims=256)
        self.memory = ReplayMemory(max_mem_size=max_mem_size,
                                  input_dims=input_dims)
        # Expose the buffer's writer directly on the agent.
        self.store_transition = self.memory.store_transition

    def take_action(self, observation):
        """Choose an action with an epsilon-greedy policy.

        Args:
            observation: A single observation vector.

        Returns:
            int: Index of the selected action.
        """
        if np.random.random() > self.epsilon:
            # Exploit: pick the action with the highest predicted value.
            state = torch.tensor(
                np.array([observation], dtype=np.float32)
            ).to(self.Q_eval.device)

            # Action selection needs no gradients.
            with torch.no_grad():
                actions = self.Q_eval.forward(state)
            action = torch.argmax(actions).item()
        else:
            # Explore: uniform random action, returned as a plain int so both
            # branches yield the same type.
            action = int(np.random.choice(self.action_space))

        return action

    def train(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. After the step, the iteration counter is incremented and
        epsilon is decayed linearly, clamped at ``eps_min``.
        """
        # Make sure there is enough experience in the replay buffer.
        if self.memory.mem_cntr < self.batch_size:
            return

        device = self.Q_eval.device
        self.Q_eval.optimizer.zero_grad()

        # Only sample from slots that have actually been written.
        max_mem = min(self.memory.mem_size, self.memory.mem_cntr)
        batch = np.random.choice(max_mem, self.batch_size, replace=False)

        state_batch = torch.tensor(self.memory.state_memory[batch]).to(device)
        new_state_batch = torch.tensor(
            self.memory.new_state_memory[batch]).to(device)
        reward_batch = torch.tensor(self.memory.reward_memory[batch]).to(device)
        # Index tensors are cast to long explicitly for advanced indexing.
        action_batch = torch.tensor(
            self.memory.action_memory[batch], dtype=torch.long).to(device)
        # The terminal flags must be a boolean mask: an integer tensor of 0/1
        # values would be interpreted as row indices 0 and 1 instead.
        terminal_batch = torch.tensor(
            self.memory.terminal_memory[batch]).bool().to(device)

        batch_idx = torch.arange(self.batch_size, device=device)
        q_eval = self.Q_eval.forward(state_batch)[batch_idx, action_batch]

        # The bootstrap target is treated as a constant, so it is computed
        # without tracking gradients.
        with torch.no_grad():
            q_next = self.Q_eval.forward(new_state_batch)
            q_next[terminal_batch] = 0.0  # no future value after a terminal step
            q_target = reward_batch + self.gamma * torch.max(q_next, dim=1)[0]

        loss = self.Q_eval.loss(q_target, q_eval)
        loss.backward()
        self.Q_eval.optimizer.step()

        self.iter_cntr += 1
        # Linear epsilon decay that never drops below the configured minimum.
        self.epsilon = max(self.eps_min, self.epsilon - self.eps_dec)

In [5]:
import sys
import os

# Add the directory two levels above the working directory to the module
# search path. Presumably this is where the `crypto_env` package imported in
# the next cell lives — TODO confirm against the repository layout.
sys.path.append('../../')

In [6]:
from crypto_env.dataloader.ethloader import ETHLoader
from crypto_env.algorithm import Algorithm
from crypto_env import CryptoEnv, Recorder, Visualizer

In [7]:
class ENV(CryptoEnv):
    """CryptoEnv variant that only pays out a reward when the episode is done."""

    def get_reward(self):
        """Return 0 while the episode is running, else 10x the recorder's ROI."""
        if self._is_done:
            # Episode finished: the reward is the recorder's ROI scaled by 10.
            return self.recorder.get_roi() * 10

        # Intermediate steps carry no reward.
        return 0

In [8]:
# Seed every random number generator the notebook relies on. The agent draws
# its exploration decisions and replay minibatches from numpy's global RNG, so
# seeding torch alone does not make a run reproducible.
SEED = 0
torch.manual_seed(SEED)
np.random.seed(SEED)

# Columns requested from the ETH dataset; 'PriceUSD' is also handed to the
# recorder as its price list.
features = ['PriceUSD', 'AdrActCnt', 'AdrBal1in100KCnt']
loader = ETHLoader(base_dir=os.getcwd(), features=features,
                  start_idx=100, end_idx=200, dropna=True, download=True)
recorder = Recorder(price_list=loader.get_feature('PriceUSD'))

# One constant fee value of 0.01 per loader entry, registered in the loader's
# 'percentage' mode.
transaction_fee = [0.01] * len(loader)
loader.load_transaction_fee(transaction_fee, 'percentage')

KeyboardInterrupt: 

In [None]:
# Run-level constants.
HISTORY_DEQUE_SIZE = 2  # not referenced by any code visible in this notebook
n_runs = 500            # number of training episodes

# NOTE(review): `env` ends up bound to the return value of reset(); the
# training loop later calls env.reset() on it, so reset() presumably returns
# the environment itself — confirm.
env = ENV(
    max_sell=10,
    max_buy=10,
    min_sell=0,
    min_buy=0,
    dataloader=loader,
    recorder=recorder,
)
env = env.reset()

# Three input features and three discrete actions.
agent = Agent(
    gamma=0.99,
    epsilon=1.0,
    batch_size=32,
    n_actions=3,
    eps_end=0.01,
    input_dims=3,
    lr=3e-4,
)

In [None]:
# Show the layer structure of the agent's Q-network.
print(agent.Q_eval)

DQN(
  (net): Sequential(
    (0): Linear(in_features=3, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=256, bias=True)
    (3): ReLU()
    (4): Linear(in_features=256, out_features=3, bias=True)
  )
  (loss): MSELoss()
)


Action References:

| Action Index | Description         |
|--------------|---------------------|
| 0            | Buy 1 Ether         |
| 1            | Sell 1 Ether        |
| 2            | Hold and do nothing |

In [None]:
# Final ROI of each training episode, as reported by the environment's recorder.
rewards = []

for episode in tqdm(range(n_runs)):
    env.reset()
    observation = env.first_observation()['features']
    done = False

    while not done:
        action = agent.take_action(observation)

        # Map the action index onto the environment call
        # (0: buy 1, 1: sell 1, 2: hold).
        if action == 0:
            observation_, reward, done, info = env.buy(1)
        elif action == 1:
            observation_, reward, done, info = env.sell(1)
        elif action == 2:
            observation_, reward, done, info = env.hold()
        else:
            raise ValueError(f'unexpected action index: {action}')

        observation_ = observation_['features']

        agent.store_transition(observation, action,
                               reward, observation_, done)
        agent.train()

        # Advance to the next state; without this the agent would keep acting
        # on, and storing, the first observation of the episode.
        observation = observation_

    rewards.append(env.recorder.get_roi())

 79%|████████████████████████████████▍        | 396/500 [01:42<00:31,  3.25it/s]

In [None]:
import matplotlib.pyplot as plt

# Window length, in episodes, used when smoothing the per-episode reward curve.
MA_WINDOW = 50

In [None]:
# Trailing moving average of the per-episode rewards.
#
# One value is produced per episode so the result lines up index-for-index
# with `rewards`. Each value averages the current episode and up to
# MA_WINDOW - 1 preceding ones; the first few episodes use a shorter window.
rewards = np.array(rewards, dtype=np.float32)
rewards_ma = [
    rewards[max(0, end - MA_WINDOW):end].mean()
    for end in range(1, len(rewards) + 1)
]

In [None]:
# Plot the most recent episodes (up to 100) together with the smoothed curve.
plt.figure(dpi=100)

# Slice with [-100:] so the latest episode is included in the plot.
plt.plot(rewards[-100:], label='reward')
plt.plot(rewards_ma[-100:], label=f'moving average (window={MA_WINDOW})')

plt.xlabel('episode (index within the plotted range)')
plt.ylabel('ROI')
plt.grid()
plt.legend()

plt.show()