In [3]:
import math
import random
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from collections import namedtuple
from itertools import count
from PIL import Image

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as T

# Place tensors and models on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# One stored experience. ``next_state`` is None for a terminal transition.
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):
    """Fixed-capacity ring buffer of ``Transition`` records.

    The buffer grows until it holds ``capacity`` records; after that each new
    record overwrites the oldest slot.

    Attributes:
        capacity: maximum number of records kept.
        memory: list holding the stored records.
        position: index of the slot the next ``push`` writes to.
    """

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, *args):
        """Saves a transition.

        Args:
            *args: the four ``Transition`` fields, in order
                (state, action, next_state, reward).

        Raises:
            TypeError: if the wrong number of fields is supplied. The buffer
                is left untouched in that case.
        """
        # Build the record before touching the buffer: if construction fails,
        # no None placeholder is left behind for sample() to hand out later.
        transition = Transition(*args)
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = transition
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        """Return ``batch_size`` distinct stored records chosen at random.

        Raises:
            ValueError: if ``batch_size`` exceeds the number of stored records
                (raised by ``random.sample``).
        """
        return random.sample(self.memory, batch_size)

    def __len__(self):
        """Number of records currently stored."""
        return len(self.memory)

In [5]:
class DQN(nn.Module):
    """Small fully connected network: inputs -> 48 -> 10 -> outputs.

    Args:
        inputs: size of the last dimension of the input tensor.
        outputs: size of the last dimension of the returned tensor.
    """

    def __init__(self, inputs, outputs):
        super().__init__()
        wide, narrow = 48, 10
        # Attribute names l1/l2/l3 determine the state_dict keys.
        self.l1 = nn.Linear(inputs, wide)
        self.l2 = nn.Linear(wide, narrow)
        self.l3 = nn.Linear(narrow, outputs)

    def forward(self, x):
        """Apply two ReLU-activated linear layers, then a final linear layer."""
        activated = F.relu(self.l1(x))
        activated = F.relu(self.l2(activated))
        return self.l3(activated)

In [6]:
# Number of transitions sampled from replay memory per optimisation step.
BATCH_SIZE = 8
# Discount factor multiplied into the target network's next-state value.
GAMMA = 0.999
# Epsilon-greedy schedule used by select_action: the exploration threshold
# starts at EPS_START and decays exponentially towards EPS_END, with
# EPS_DECAY as the decay constant measured in select_action calls.
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 20
# The training loop copies policy weights into the target network whenever
# the episode index is a multiple of this value.
TARGET_UPDATE = 3

# Width of one state vector and number of discrete actions (0, 1, 2).
inputs = 3
n_actions = 3

# Two networks with identical architecture; the target network starts as an
# exact copy of the policy network and is kept in eval mode.
policy_net = DQN(inputs, n_actions).to(device)
target_net = DQN(inputs, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# Only the policy network is optimised (Adam with its default settings).
optimizer = optim.Adam(policy_net.parameters())
memory = ReplayMemory(10000)

# Global call counter for select_action; drives the epsilon decay.
steps_done = 0


def select_action(state):
    """Pick an action for ``state`` with an epsilon-greedy rule.

    The exploration threshold decays exponentially from EPS_START towards
    EPS_END as the global ``steps_done`` counter grows; the counter is
    incremented on every call.

    Args:
        state: tensor fed directly to ``policy_net``.

    Returns:
        A (1, 1) long tensor holding the chosen action index.
    """
    global steps_done
    draw = random.random()
    decay = math.exp(-1. * steps_done / EPS_DECAY)
    eps_threshold = EPS_END + (EPS_START - EPS_END) * decay
    steps_done += 1

    if draw <= eps_threshold:
        # Explore: uniformly random action.
        return torch.tensor([[random.randrange(n_actions)]], device=device, dtype=torch.long)

    # Exploit: index of the largest value the policy network predicts.
    with torch.no_grad():
        _, best_action = policy_net(state).max(1)
        return best_action.view(1, 1)


# Starts empty; nothing in the cells visible here appends to or reads this list.
episode_durations = []

In [7]:
def optimize_model():
    """Run one optimisation step of ``policy_net`` on a random minibatch.

    Does nothing until ``memory`` holds at least BATCH_SIZE transitions.
    The target for each sampled transition is
    ``reward + GAMMA * max_a target_net(next_state)[a]``, with the second term
    equal to zero when ``next_state`` is None (terminal transition). The loss
    is the smooth L1 (Huber) loss between that target and the policy
    network's value for the action actually taken.

    Returns:
        None.
    """
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose a list of Transitions into one Transition of tuples.
    batch = Transition(*zip(*transitions))

    non_final_mask = torch.tensor([s is not None for s in batch.next_state],
                                  device=device, dtype=torch.bool)
    non_final_next_states = [s for s in batch.next_state if s is not None]

    state_batch = torch.cat(batch.state)
    action_batch = torch.cat(batch.action)
    reward_batch = torch.cat(batch.reward)

    # Value predicted by the policy network for the action that was taken.
    state_action_values = policy_net(state_batch).gather(1, action_batch)

    next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # torch.cat raises on an empty list, so skip the target-network pass when
    # every sampled transition is terminal; the zeros are already correct.
    if non_final_next_states:
        with torch.no_grad():
            next_state_values[non_final_mask] = target_net(
                torch.cat(non_final_next_states)).max(1)[0]

    expected_state_action_values = (next_state_values * GAMMA) + reward_batch
    loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))

    optimizer.zero_grad()
    loss.backward()
    # Clamp every gradient element to [-1, 1] before the update.
    nn.utils.clip_grad_value_(policy_net.parameters(), 1)
    optimizer.step()

In [8]:
class CryptoEnvironment:
    """Long-only, single-position trading environment over a price frame.

    Actions are 0 = hold, 1 = buy, 2 = sell. While a "Buy" position is open,
    hold (and the buy itself) advances one row and yields the one-step change
    in "Close" divided by the entry price; positive rewards are multiplied
    by 5. Selling closes the position and yields zero reward, since the
    profit or loss was already accrued step by step in ``realizedProfit``.

    NOTE(review): ``index`` only advances while a position is open. Holding
    while flat, buying while long, and selling all leave the clock where it
    is, so the observed state does not change on those steps. Confirm this is
    intended.

    Args:
        frame: row-indexable table (uses ``.iloc`` and ``len``) whose rows
            expose a "Close" field.
        clip: fraction of ``frame`` that makes up one episode.
    """

    def __init__(self, frame, clip=0.7):
        self.frame = frame
        self.clip = clip
        self.index = 0
        # Every position ever opened or closed, in order.
        self.positions = []
        # Most recent position: {} when none yet, else {"type", "price"}.
        self.lastPosition = {}
        # Sum of the unscaled per-step returns collected while long.
        self.realizedProfit = 0
        # Episode ends once realizedProfit falls to -stopLoss or below.
        self.stopLoss = 0.2
        # Not read or updated by any method of this class.
        self.positionReward = 0

    def getState(self):
        """Return the frame row at the current index."""
        return self.frame.iloc[self.index]

    def isEnd(self):
        """Return True when the stop-loss is hit or the last usable row is reached."""
        if self.realizedProfit <= (-1 * self.stopLoss):
            return True
        # ">=" rather than "==": once the end has been reached, the episode
        # stays ended even if a caller keeps stepping past it.
        return self.index + 1 >= self.count()

    def count(self):
        """Number of rows in one episode: ``int(len(frame) * clip)``."""
        return int(len(self.frame) * self.clip)

    def _isLong(self):
        """True when the most recent position is an open "Buy"."""
        return self.lastPosition.get("type") == "Buy"

    def _advanceWhileLong(self):
        """Move one row forward and return (reward, done) for the open position."""
        current = self.getState()
        self.index += 1
        after = self.getState()
        reward = (after["Close"] - current["Close"]) / self.lastPosition["price"]
        self.realizedProfit += reward
        if reward > 0:
            # Gains are weighted five times as heavily as losses.
            reward *= 5
        return reward, self.isEnd()

    def act(self, action):
        """Apply ``action`` and return ``(reward, done)``.

        Args:
            action: 0 (hold), 1 (buy) or 2 (sell). Any other value, or an
                action that is not applicable in the current position, is a
                no-op with zero reward.

        Returns:
            Tuple of the step reward and the result of ``isEnd()``.
        """
        if action == 0:
            # hold
            if self._isLong():
                return self._advanceWhileLong()
        elif action == 1:
            # buy, unless already long
            if not self._isLong():
                self.lastPosition = {
                    "type": "Buy",
                    "price": self.getState()["Close"]
                }
                self.positions.append(self.lastPosition)
                return self._advanceWhileLong()
        elif action == 2:
            # sell, only if long
            if self._isLong():
                self.lastPosition = {
                    "type": "Sell",
                    "price": self.getState()["Close"]
                }
                self.positions.append(self.lastPosition)
        return 0, self.isEnd()

In [9]:
import pandas as pd
# Read the CSV and keep only the three columns that make up a state vector.
cols = ["Close","Volume","Number-of-trades"]
data = pd.read_csv("dotusdt-1m-20000.csv")[cols]

In [13]:
# Fixed row windows are used here instead of a proportional 70/30 split:
# rows 16000-18999 are used for training, rows 19000 onward for evaluation.
dataTrain = data.iloc[16000:19000]
dataTest = data.iloc[19000:]

In [11]:
from tqdm import tqdm

In [14]:
num_episodes = 20
# One contiguous chunk of training rows per episode. The row positions are
# split (not the DataFrame itself) because np.array_split on a DataFrame goes
# through DataFrame.swapaxes, which pandas has deprecated.
dfList = [dataTrain.iloc[rows]
          for rows in np.array_split(np.arange(len(dataTrain)), num_episodes)]

def s2t(st):
    """Turn one row into a float32 tensor of shape (1, n_values) on ``device``.

    Args:
        st: object exposing ``to_list()`` (the rows returned by
            ``CryptoEnvironment.getState`` are passed in here).
    """
    values = torch.tensor(st.to_list())
    return values.to(device).reshape(1, -1).float()

# Each epoch replays every training chunk once, in a freshly shuffled order.
for epoch in tqdm(range(100)):
    np.random.shuffle(dfList)
    profit = 0
    for i_episode in range(num_episodes):
        env = CryptoEnvironment(dfList[i_episode], clip=1)
        state =  s2t(env.getState())
        # Bounded by the chunk length, so the episode stops even when the
        # environment never reports done.
        for t in range(env.count()):
            action = select_action(state)
            reward, done = env.act(action.item())
            reward = torch.tensor([reward], device=device).float()

            # A terminal transition is stored with next_state = None.
            next_state = None if done else s2t(env.getState())
            memory.push(state, action, next_state, reward)
            state = next_state
            optimize_model()
            if done:
                break
        profit += env.realizedProfit
        # Periodically sync the target network with the policy network.
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
    # tqdm.write keeps the log line from being spliced into the progress bar.
    tqdm.write("epoch {} profit: {} mean: {}".format(epoch, profit, profit/num_episodes))

  1%|          | 1/100 [00:09<15:41,  9.51s/it]epoch 0 profit: 0.001872643837767446 mean: 9.36321918883723e-05
  2%|▏         | 2/100 [00:18<15:22,  9.41s/it]epoch 1 profit: -0.019741383797026814 mean: -0.0009870691898513407
  3%|▎         | 3/100 [00:27<15:05,  9.33s/it]epoch 2 profit: 0.014682761151196787 mean: 0.0007341380575598393
  4%|▍         | 4/100 [00:36<14:48,  9.26s/it]epoch 3 profit: 0.025149851987222114 mean: 0.0012574925993611057
  5%|▌         | 5/100 [00:45<14:28,  9.15s/it]epoch 4 profit: 0.03932786543458307 mean: 0.0019663932717291534
  6%|▌         | 6/100 [00:54<14:02,  8.96s/it]epoch 5 profit: 0.04040120301979643 mean: 0.0020200601509898215
  7%|▋         | 7/100 [01:02<13:43,  8.85s/it]epoch 6 profit: 0.016875597151975633 mean: 0.0008437798575987816
  8%|▊         | 8/100 [01:12<13:42,  8.94s/it]epoch 7 profit: 0.060197842603334654 mean: 0.0030098921301667326
  9%|▉         | 9/100 [01:20<13:32,  8.93s/it]epoch 8 profit: 0.02259726443181978 mean: 0.00112986322159

In [17]:
def select_action2(state):
    """Greedy (no exploration) action from ``target_net``.

    Returns:
        A (1, 1) tensor holding the index of the largest predicted value.
    """
    with torch.no_grad():
        _, greedy = target_net(state).max(1)
    return greedy.view(1, 1)

# Greedy roll-out over the held-out rows; every chosen action is recorded in acts.
evalEnv = CryptoEnvironment(dataTest, clip=1)
state = s2t(evalEnv.getState())
acts = []
for t in tqdm(range(evalEnv.count())):
    action = select_action2(state)
    acts.append(action.item())
    reward, done = evalEnv.act(acts[-1])
    # No state follows a terminal step.
    next_state = None if done else s2t(evalEnv.getState())
    state = next_state
    if done:
        break

100%|██████████| 1000/1000 [00:00<00:00, 1522.55it/s]
