In [45]:
import pandas as pd
import numpy as np
import random
import torch
from torch import nn
from collections import *
from sklearn.preprocessing import StandardScaler
import yfinance as yf
from collections import deque

In [46]:
STATE_SPACE = 28
ACTION_SPACE = 3

ACTION_LOW = -1
ACTION_HIGH = 1

GAMMA = 0.9995
TAU = 1e-3
EPS_START = 1.0
EPS_END = 0.1
EPS_DECAY = 0.9

MEMORY_LEN = 10000
MEMORY_THRESH = 500
BATCH_SIZE = 200

LR_DQN = 5e-4

LEARN_AFTER = MEMORY_THRESH
LEARN_EVERY = 5
UPDATE_EVERY = 9

COST = 3e-4
CAPITAL = 100000
NEG_MUL = 2

In [47]:
## Py torch

# DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
DEVICE = torch.device("cpu")

EPOCHS = 100
BACTH_SIZE = 64

# Define the learning rate
LR = 5e-3

# Define the discount factor
GAMMA = 0.99

# Memory capacity
MEM_CAP = 10000
MEM_MIN = 500 # learn after having this amount of memory

ACTIONS = [-1, 0, 1]
LEN_ACTIONS = len(ACTIONS)
INPUT_DIM = 28

In [48]:
class NN_DuellingDQN(nn.Module):
    def __init__(self):
        super(NN_DuellingDQN, self).__init__()
        self.input_dim = INPUT_DIM
        self.output_dim = LEN_ACTIONS

        self.l1 = nn.Linear(self.input_dim, 500)
        self.l2 = nn.Linear(500, 500)
        self.l3 = nn.Linear(500, 300)
        self.l4 = nn.Linear(300, 200)
        self.l5 = nn.Linear(200 10)

        self.ls = nn.Linear(10 1)
        self.lp = nn.Linear(10, self.output_dim)

        self.relu = nn.ReLU()

    def forward(self, state):
        x = self.relu(self.l1(state))
        x = self.relu(self.l2(x))
        x = self.relu(self.l3(x))
        x = self.relu(self.l4(x))
        x = self.relu(self.l5(x))
        xs = self.relu(self.ls(x))
        xp = self.relu(self.lp(x))

        x = xs + xp - xp.mean()
        return x

In [49]:
class ShortMemory:
    def __init__(self, capacity=MEM_CAP):
        self.memory = deque(maxlen=capacity) # queue to remove older mem cells

    def store(self, output : tuple): # contains everything from env output
        self.memory.append(output)
        # state, new_state, action, reward, done

    def sample(self, n):
        output = random.sample(self.memory, n)
        return output

    def __len__(self):
        return len(self.memory)

In [50]:
class Agent:
    def __init__(self):
        self.nnql = NN_DuellingDQN().to(DEVICE) # local
        self.nnqt = NN_DuellingDQN().to(DEVICE) # target
        self.nnqt.load_state_dict(self.nnql.state_dict())
        self.nnqt.eval()

        self.memory = ShortMemory()

        self.criterion = nn.MSELoss()
        self.optim = torch.optim.Adam(self.nnql.parameters(), lr=LR)

        self.epsilon = 1.0
        self.step = 0

    def inference_step(self, state): # forward pass
        self.nnql.eval() # don't update nn
        with torch.no_grad(): # don't update gradients
            actions = self.nnql(state)
        self.nnql.train() # THEN update => forward
        return actions

    def epsilon_greedy_pol(self, state, test=False) -> int:
        if not test:
            self.step += 1
        state = torch.from_numpy(state).float().to(DEVICE).view(1, -1)

        actions = self.inference_step(state)
        if random.random() > self.epsilon or test == True:
            act = np.argmax(actions.cpu().data.numpy())
        else:
            act = random.choice(np.arange(LEN_ACTIONS))
        return ACTIONS[int(act)]

    def calc_bellman(self, states, next_states, actions, rewards, dones) -> tuple:
        next_state_values = self.nnqt(next_states).max(1)[0].unsqueeze(1)
        y = rewards + (1-dones) * GAMMA * next_state_values
        state_values = self.nnql(states).gather(1, actions.type(torch.int64).add(1))
        return y, state_values

    def learn(self):
        if len(self.memory) <= MEM_MIN:
            return

        if self.step % LEARN_EVERY != 0:
            return

        batch = self.memory.sample(BACTH_SIZE)

        states = np.vstack([t[0] for t in batch])
        states = torch.from_numpy(states).float().to(DEVICE)

        next_states = np.vstack([t[1] for t in batch])
        next_states = torch.from_numpy(next_states).float().to(DEVICE)

        actions = np.vstack([t[2] for t in batch])
        actions = torch.from_numpy(actions).float().to(DEVICE)

        rewards = np.vstack([t[3] for t in batch])
        rewards = torch.from_numpy(rewards).float().to(DEVICE)

        dones = np.vstack([t[4] for t in batch]).astype(np.uint8)
        dones = torch.from_numpy(dones).float().to(DEVICE)

        # Bellman :
        y, state_values = self.calc_bellman(states, next_states, actions, rewards, dones)

        loss = self.criterion(y, state_values)
        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if self.step % UPDATE_EVERY == 0:
            self.update_param()

    def update_param(self):
        for target_param, local_param in zip(self.nnqt.parameters(), self.nnql.parameters()):
            target_param.data.copy_(LR * local_param.data + (1.0 - LR) * target_param.data)

    def train(self, env, test_env, epochs : int) -> list:
        scores = []
        train_values = []
        test_values = []
        eps = EPS_START
        self.step = 0
        for e in range(epochs):
            print("epochs nb:", e, "epsilon:", self.epsilon)
            episode_score = 0
            episode_score2 = 0
            test_score = 0
            test_score2 = 0
            score = 0
            state = env.reset()
            state = state.reshape(-1, INPUT_DIM)
            while True:
                action = self.epsilon_greedy_pol(state)
                next_state, reward, done, _ = env.step(action)
                next_state = next_state.reshape(-1, INPUT_DIM)

                self.memory.store((state, next_state, action, reward, done))
                self.learn()

                state = next_state
                score += reward
                if done:
                    break

            self.epsilon = max(self.epsilon * 0.9, 0.1)

            episode_score += score
            episode_score2 += (env.store['running_capital'][-1] - env.store['running_capital'][0])
            scores.append(score)

            state = test_env.reset()
            done = False
            score_te = 0
            scores_te = [score_te]
            while True:
                action = self.epsilon_greedy_pol(state)
                next_state, reward, done, _ = test_env.step(action)
                next_state = next_state.reshape(-1, STATE_SPACE)
                state= next_state
                score_te += reward
                scores_te.append(score_te)
                if done:
                    break

            test_score += score_te
            test_score2 += (test_env.store['running_capital'][-1] - test_env.store['running_capital'][0])

            print(f"Episode: {e}, Train Score: {episode_score:.5f}, Validation Score: {test_score:.5f}")
            print(f"Episode: {e}, Train Value: ${episode_score2:.5f}, Validation Value: ${test_score2:.5f}", "\n")
            train_values.append(episode_score2)
            test_values.append(test_score2)

        df = pd.DataFrame(list(zip(env.store['action_store'], env.store['running_capital'], env.store['current_price'])),
                                   columns =['Actions', 'Capital', 'Price'])
        df.to_csv("./out.csv", index=False)
        df_perf = pd.DataFrame(list(zip(train_values, test_values)), columns=['Train', 'Test'])
        df_perf.to_csv("./perf.csv")

        return scores

In [51]:
class DataGetter:
    """
    The class for getting data for assets.
    """

    def __init__(self, asset="BTC-USD", start_date=None, end_date=None, freq="1d",
                timeframes=[1, 2, 5, 10, 20, 40]):
        self.asset = asset
        self.sd = start_date
        self.ed = end_date
        self.freq = freq

        self.timeframes = timeframes
        self.getData()

        self.scaler = StandardScaler()
        self.scaler.fit(self.data[:, 1:])


    def getData(self):
        asset = self.asset
        if self.sd is not None and self.ed is not None:
            df =  yf.download([asset], start=self.sd, end=self.ed, interval=self.freq)
            df_spy = yf.download(["BTC-USD"], start=self.sd, end=self.ed, interval=self.freq)
        elif self.sd is None and self.ed is not None:
            df =  yf.download([asset], end=self.ed, interval=self.freq)
            df_spy = yf.download(["BTC-USD"], end=self.ed, interval=self.freq)
        elif self.sd is not None and self.ed is None:
            df =  yf.download([asset], start=self.sd, interval=self.freq)
            df_spy = yf.download(["BTC-USD"], start=self.sd, interval=self.freq)
        else:
            df = yf.download([asset], period="max", interval=self.freq)
            df_spy = yf.download(["BTC-USD"], interval=self.freq)

        # Reward - Not included in Observation Space.
        df["rf"] = df["Adj Close"].pct_change().shift(-1)

        # Returns and Trading Volume Changes
        for i in self.timeframes:
            df_spy[f"spy_ret-{i}"] = df_spy["Adj Close"].pct_change(i)
            df_spy[f"spy_v-{i}"] = df_spy["Volume"].pct_change(i)

            df[f"r-{i}"] = df["Adj Close"].pct_change(i)
            df[f"v-{i}"] = df["Volume"].pct_change(i)

        # Volatility
        for i in [5, 10, 20, 40]:
            df[f'sig-{i}'] = np.log(1 + df["r-1"]).rolling(i).std()

        # Moving Average Convergence Divergence (MACD)
        df["macd_lmw"] = df["r-1"].ewm(span=26, adjust=False).mean()
        df["macd_smw"] = df["r-1"].ewm(span=12, adjust=False).mean()
        df["macd_bl"] = df["r-1"].ewm(span=9, adjust=False).mean()
        df["macd"] = df["macd_smw"] - df["macd_lmw"]

        # Relative Strength Indicator (RSI)
        rsi_lb = 5
        pos_gain = df["r-1"].where(df["r-1"] > 0, 0).ewm(rsi_lb).mean()
        neg_gain = df["r-1"].where(df["r-1"] < 0, 0).ewm(rsi_lb).mean()
        rs = np.abs(pos_gain/neg_gain)
        df["rsi"] = 100 * rs/(1 + rs)

        # Bollinger Bands
        bollinger_lback = 10
        df["bollinger"] = df["r-1"].ewm(bollinger_lback).mean()
        df["low_bollinger"] = df["bollinger"] - 2 * df["r-1"].rolling(bollinger_lback).std()
        df["high_bollinger"] = df["bollinger"] + 2 * df["r-1"].rolling(bollinger_lback).std()

        # Filtering
        for c in df.columns:
            df[c].interpolate('linear', limit_direction='both', inplace=True)
        df.replace([np.inf, -np.inf], np.nan, inplace=True)
        df.dropna(inplace=True)

        df.to_csv('./prices.csv')

        self.frame = df
        self.data = np.array(df.iloc[:, 6:])
        return

    def scaleData(self):
        self.scaled_data = self.scaler.fit_transform(self.data[:, 1:])
        return

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx, col_idx=None):
        if col_idx is None:
            return self.data[idx]
        elif col_idx < len(list(self.data.columns)):
            return self.data[idx][col_idx]
        else:
            raise IndexError


In [52]:
class SingleAssetTradingEnvironment:
    """
    Trading Environment for trading a single asset.
    The Agent interacts with the environment class through the step() function.
    Action Space: {-1: Sell, 0: Do Nothing, 1: Buy}
    """

    def __init__(self, asset_data,
               initial_money=CAPITAL, trans_cost=COST, store_flag=1, asset_ph=0,
               capital_frac=0.2, running_thresh=0.1, cap_thresh=0.3):

        self.past_holding = asset_ph
        self.capital_frac = capital_frac # Fraction of capital to invest each time.
        self.cap_thresh = cap_thresh
        self.running_thresh = running_thresh
        self.trans_cost = trans_cost

        self.asset_data = asset_data
        self.terminal_idx = len(self.asset_data) - 1
        self.scaler = self.asset_data.scaler

        self.initial_cap = initial_money

        self.capital = self.initial_cap
        self.running_capital = self.capital
        self.asset_inv = self.past_holding

        self.pointer = 0
        self.next_return, self.current_state = 0, None
        self.prev_act = 0
        self.current_act = 0
        self.current_reward = 0
        self.current_price = self.asset_data.frame.iloc[self.pointer, :]['Adj Close']
        self.done = False

        self.store_flag = store_flag
        if self.store_flag == 1:
            self.store = {"action_store": [],
                        "reward_store": [],
                        "running_capital": [],
                        "current_price": [],
                        "port_ret": []}

    def reset(self):
        self.capital = self.initial_cap
        self.running_capital = self.capital
        self.asset_inv = self.past_holding

        self.pointer = 0
        self.next_return, self.current_state = self.get_state(self.pointer)
        self.prev_act = 0
        self.current_act = 0
        self.current_reward = 0
        self.current_price = self.asset_data.frame.iloc[self.pointer, :]['Adj Close']
        self.done = False

        if self.store_flag == 1:
            self.store = {"action_store": [],
                        "reward_store": [],
                        "running_capital": [],
                        "current_price": [],
                        "port_ret": []}

        return self.current_state

    def step(self, action):
        self.current_act = action
        self.current_price = self.asset_data.frame.iloc[self.pointer, :]['Adj Close']
        self.current_reward = self.calculate_reward()
        self.prev_act = self.current_act
        self.pointer += 1
        self.next_return, self.current_state = self.get_state(self.pointer)
        self.done = self.check_terminal()

        if self.done:
            reward_offset = 0
            ret = (self.store['running_capital'][-1]/self.store['running_capital'][-0]) - 1
            if self.pointer < self.terminal_idx:
                reward_offset += -1 * max(0.5, 1 - self.pointer/self.terminal_idx)
            if self.store_flag:
                reward_offset += 10 * ret
            self.current_reward += reward_offset

        if self.store_flag:
            self.store["action_store"].append(self.current_act)
            self.store["reward_store"].append(self.current_reward)
            self.store["running_capital"].append(self.capital)
            self.store["current_price"].append(self.current_price)
            info = self.store
        else:
            info = None

        return self.current_state, self.current_reward, self.done, info


    def calculate_reward(self):
        investment = self.running_capital * self.capital_frac
        reward_offset = 0

        # Buy Action
        if self.current_act == 1:
            if self.running_capital > self.initial_cap * self.running_thresh:
                self.running_capital -= investment
                asset_units = investment/self.current_price
                self.asset_inv += asset_units
                self.current_price *= (1 - self.trans_cost)

        # Sell Action
        elif self.current_act == -1:
            if self.asset_inv > 0:
                self.running_capital += self.asset_inv * self.current_price * (1 - self.trans_cost)
                self.asset_inv = 0

        # Do Nothing
        elif self.current_act == 0:
            if self.prev_act == 0:
                reward_offset += -0.1
            pass

        # Reward to give
        prev_cap = self.capital
        self.capital = self.running_capital + (self.asset_inv) * self.current_price
        reward = 100*(self.next_return) * self.current_act - np.abs(self.current_act - self.prev_act) * self.trans_cost
        if self.store_flag==1:
            self.store['port_ret'].append((self.capital - prev_cap)/prev_cap)

        if reward < 0:
            reward *= NEG_MUL  # To make the Agent more risk averse towards negative returns.
        reward += reward_offset

        return reward


    def check_terminal(self):
        if self.pointer == self.terminal_idx:
            return True
        elif self.capital <= self.initial_cap * self.cap_thresh:
            return True
        else:
            return False


    def get_state(self, idx):
        state = self.asset_data[idx][1:]
        state = self.scaler.transform(state.reshape(1, -1))

        state = np.concatenate([state, [[self.capital/self.initial_cap,
                                        self.running_capital/self.capital,
                                        self.asset_inv * self.current_price/self.initial_cap,
                                        self.prev_act]]], axis=-1)

        next_ret = self.asset_data[idx][0]
        return next_ret, state


In [53]:
Transition = namedtuple("Transition", ["States", "Actions", "Rewards", "NextStates", "Dones"])


class ReplayMemory:
    """
    Implementation of Agent memory
    """
    def __init__(self, capacity=MEMORY_LEN):
        self.memory = deque(maxlen=capacity)

    def store(self, t):
        self.memory.append(t)

    def sample(self, n):
        a = random.sample(self.memory, n)
        return a

    def __len__(self):
        return len(self.memory)


In [54]:
# Environment and Agent Initiation

## Cryptocurrency Tickers
asset_code = "ETH-USD"

## Training and Testing Environments
assets = DataGetter(asset_code, start_date="2015-01-01", end_date="2021-10-01")
test_assets = DataGetter(asset_code, start_date="2021-10-01", end_date="2022-05-01", freq="1d")
env = SingleAssetTradingEnvironment(assets)
test_env = SingleAssetTradingEnvironment(test_assets)

## Agent
memory = ReplayMemory()
agent = Agent()

# Main training loop
N_EPISODES = 30 # No of episodes/epochs

agent.train(env, test_env, int(N_EPISODES))

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
epochs nb: 0 epsilon: 1.0
Episode: 0, Train Score: -2002.18831, Validation Score: -289.90040
Episode: 0, Train Value: $-30614.72360, Validation Value: $-17833.00067 

epochs nb: 1 epsilon: 0.9
Episode: 1, Train Score: -1776.69489, Validation Score: -314.04708
Episode: 1, Train Value: $176674.18744, Validation Value: $-5198.95317 

epochs nb: 2 epsilon: 0.81
Episode: 2, Train Score: -1299.40595, Validation Score: -111.80590
Episode: 2, Train Value: $245065.71533, Validation Value: $-888.98308 

epochs nb: 3 epsilon: 0.7290000000000001
Episode: 3, Train Score: -1749.00052, Validation Score: -184.25716
Episode: 3, Train Value: $47578.33506, Validation Value: $-3490.46129 

epochs nb: 4 epsilon: 0.6561000000000001
Epis

[-2002.1883126329099,
 -1776.6948903998948,
 -1299.405948515509,
 -1749.0005199167015,
 -888.3006643196677,
 -862.0449132472559,
 -896.0363783507147,
 -528.349446372211,
 -349.45517420990154,
 87.43637452375623,
 1300.982973107973,
 1023.0477413626594,
 1424.8258191116072,
 1066.2658746891125,
 274.51106638286694,
 453.30684509233794,
 1542.3892333438287,
 1566.4324207979937,
 2092.1203654244086,
 2457.0768052363255,
 2127.3182615016885,
 2109.9715158644112,
 1211.6869109422023,
 1962.5569308996432,
 1654.5773846450895,
 1402.314811555477,
 2925.580035397481,
 272.6328749715848,
 269.0380623522531,
 396.5267697825386]