In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import lightgbm as ltb
from sklearn.metrics import accuracy_score

### Read all the data

In [2]:
work_path = os.path.abspath(os.path.dirname(os.getcwd()))

In [3]:
price_dji = pd.read_csv(work_path+"/Data/Dow Jones Industrial Average Historical Data - Daily.csv")[["Date", "Price"]]
price_hs = pd.read_csv(work_path+"/Data/Hang Seng Historical Data - Daily.csv")[["Date", "Price"]]
price_sp500 = pd.read_csv(work_path+"/Data/S&P 500 Historical Data - Daily.csv")[["Date", "Price"]]
price_sh = pd.read_csv(work_path+"/Data/Shanghai Composite Historical Data - Daily.csv")[["Date", "Price"]]
price_szse = pd.read_csv(work_path+"/Data/SZSE Component Historical Data - Daily.csv")[["Date", "Price"]]

### m-eSAX method

In [4]:
def triplet_function(a, b, c, a_t, b_t, c_t):
    # Output a, b, c in the order of a_t, b_t, c_t
    if a_t > b_t > c_t:
        return [a, b, c]
    elif a_t > c_t > b_t:
        return [a, c, b]
    elif b_t > a_t > c_t:
        return [b, a, c]
    elif b_t > c_t > a_t:
        return [b, c, a]
    elif c_t > b_t > a_t:
        return [c, b, a]
    elif c_t > a_t > b_t:
        return [c, a, b]
    else:
        return [a, b, c]
    
def uniform_symbol(triplet, x_min, x_max, K):
    # triplet -> symbol, here the breakpoints is the uniform breakpoints fixed.
    diff = x_max - x_min
    a, b, c = triplet[0], triplet[1], triplet[2]
    return [int((a-x_min)*K/diff), int((b-x_min)*K/diff), int((c-x_min)*K/diff)]

def given_symbol(triplet, K_array):
    # triplet -> symbol, with the K_array as the set of the breakpoints.
    a, b, c = triplet[0], triplet[1], triplet[2]
    np.sum(np.where(a - K_array>=0, 1, 0))
    return [np.sum(np.where(a - K_array>=0, 1, 0)), 
            np.sum(np.where(b - K_array>=0, 1, 0)),
            np.sum(np.where(c - K_array>=0, 1, 0))]


class m_eSAX():
    def __init__(self, window_size, R, data, K) -> None:
        assert window_size%R == 0
        self.window_size = window_size
        self.R = R # Length of the group
        self.data = data.copy()
        self.c = int(window_size/R) # Number of the groups in each window
        self.K = np.power(2, K) # Number of the breakpoints

    def preprocess(self):
        # data cleaning
        length = len(self.data.index)
        self.data.columns = self.data.columns.map(lambda x: x.lower())
        self.data.loc[:, "price"] = self.data["price"].map(lambda x: x.replace(",", ""))
        self.data.loc[:, "price"] = self.data["price"].astype("float")

        self.start_time = self.data["date"][length-1]
        self.data["time"] = self.data.index

    def add_window_group(self):
        # Divide the window and the group
        length = len(self.data.index)
        window_num = int(length/self.window_size) + 1
        group_num = self.c + 1
        group = np.array([i for i in range(int(group_num*window_num))]).repeat(self.R)[:length]
        window = np.array([i for i in range(int(window_num))]).repeat(self.window_size)[:length]
        self.data.loc[:, "window"] = window
        self.data.loc[:, "group"] = group
        self.data.loc[:, "index"] = self.data.index
        self.data.loc[:, "diff"] = self.data["price"].diff()[1:].reset_index().drop(columns=["index"])

        return self.data
    
    def group_info(self):
        # Compute the min_price, max_price, mean_price, and the corresponding time for each group
        self.group_res = self.data.groupby(by="group").agg({"time": ["mean", "max"], "price": ["mean", "min", "max"], "window":["min"]})
        self.group_res["argmin"] = self.data.groupby('group')['price'].idxmin()
        self.group_res["argmax"] = self.data.groupby('group')['price'].idxmax()
        self.group_res.columns = ["time_mean", "time_end", "mean", "min", "max", "window", "argmin", "argmax"]

        self.group_res = self.group_res.merge(self.data[["index", "time"]], how="left", left_on="argmin", right_on="index")
        self.group_res = self.group_res.merge(self.data[["index", "time"]], how="left", left_on="argmax", right_on="index", suffixes=["_min", "_max"])
        self.group_res = self.group_res.drop(columns=["index_min", "index_max", "argmin", "argmax"])

    def group_triplet(self):
        # Output the triplet with the correct order
        self.group_res["triplet"] = self.group_res.apply(lambda x: triplet_function(
            x["mean"], x["min"], x["max"], x["time_mean"], x["time_min"], x["time_max"]
        ), axis=1)

    def get_triplet(self):
        # Pipeline complet for getting the triplet
        self.preprocess()
        self.add_window_group()
        self.group_info()
        self.group_triplet()

    def uniform_breakpoint(self):
        # Get the symbols with uniform breakpoints
        x_min = self.data["price"].min()
        x_max = self.data["price"].max()
        self.group_res["uniform_symbol"] = self.group_res.apply(lambda x: uniform_symbol(
            x["triplet"], x_min, x_max, self.K), axis=1)

    def given_breakpoint(self, K_array):
        # Get the symbols with K_array as the breakpoints
        self.group_res["uniform_symbol"] = self.group_res.apply(lambda x: given_symbol(
            x["triplet"], K_array), axis=1)

In [5]:
def dataset(price, window_size=50, R=10, K=5):
    # Get the symbol representation for each window and corresponding label for the up-down binary classification.
    y = []
    X = []
    for i in range(window_size):

        price_use = price.copy()[i:].reset_index().drop(columns=["index"]) # Rolling the data to get more window.
        model = m_eSAX(window_size, R, price_use, K)
        model.get_triplet()
        model.uniform_breakpoint() # Get the symbols with uniform breakpoints for each group.
        represenation = model.group_res[["window", "uniform_symbol"]].groupby("window").agg({"uniform_symbol": ["sum"]})
        # Get the symbols with uniform breakpoints for each window by connecting each group's representation.
        represenation.columns = ["represenation"]
        represenation.index.name = ""
        represenation["window"] =  represenation.index
        diff = model.data.groupby("window").apply(lambda x: x.iloc[-1])
        diff.index.name = ""
        diff = diff.reset_index()
        represenation = represenation.merge(diff[["window", "diff"]], on="window")
        represenation["label"] = represenation["diff"].apply(lambda x: 1 if x>=0 else 0)
        # Get the right label, if it goes up the next day it's 1 and vice versa it's 0.
        represenation = represenation[:-1]
        X  = X + np.array(represenation["represenation"]).tolist()
        y  = y + np.array(represenation["label"]).tolist()

    return np.array(X), np.array(y)

In [6]:
def test_accuracy_uniforme(price, name):
    price = price[::-1].reset_index().drop(columns=["index"])
    test_size = 300
    price_train = price[:-test_size]
    price_test = price[-test_size:]
    
    X_train, y_train = dataset(price_train)
    X_test, y_test = dataset(price_test)
    
    classifier = ltb.LGBMClassifier(random_state=123) 
    classifier.fit(X_train, y_train)
    # Input feature is the representation for each window.
    # Model is a LightGBM classifier
    y_predict = classifier.predict(X_test)
    print(f"The accuracy score for the {name} test set is {accuracy_score(y_predict, y_test)}\n")

In [7]:
test_accuracy_uniforme(price_dji, "Dow Jones")
test_accuracy_uniforme(price_hs, "Hang Seng")
test_accuracy_uniforme(price_sp500, "S&P 500")
test_accuracy_uniforme(price_sh, "Shanghai Commposite")
test_accuracy_uniforme(price_szse, "SZSE Component")

[LightGBM] [Info] Number of positive: 492, number of negative: 417
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000432 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 909, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.541254 -> initscore=0.165392
[LightGBM] [Info] Start training from score 0.165392
The accuracy score for the Dow Jones test set is 0.536

[LightGBM] [Info] Number of positive: 444, number of negative: 440
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000295 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 433
[LightGBM] [Info] Number of data points in the train set: 884, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.502262 -> initscore=0.009050
[LightGBM] [Info] Start tra

In [8]:
def dataset_learningarray(price, K_array, window_size=50, R=10, K=5):
    # Get the symbol representation for each window and corresponding label for the up-down binary classification.
    y = []
    X = []
    for i in range(window_size):

        price_use = price.copy()[i:].reset_index().drop(columns=["index"])
        model = m_eSAX(window_size, R, price_use, K)
        model.get_triplet()
        model.given_breakpoint(K_array) # Get the symbols with K_arrat as the breakpoints.
        represenation = model.group_res[["window", "uniform_symbol"]].groupby("window").agg({"uniform_symbol": ["sum"]})
        # Get the symbols with uniform breakpoints for each window by connecting each group's representation.
        represenation.columns = ["represenation"]
        represenation.index.name = ""
        represenation["window"] =  represenation.index
        diff = model.data.groupby("window").apply(lambda x: x.iloc[-1])
        diff.index.name = ""
        diff = diff.reset_index()
        represenation = represenation.merge(diff[["window", "diff"]], on="window")
        represenation["label"] = represenation["diff"].apply(lambda x: 1 if x>=0 else 0)
        represenation = represenation[:-1]
        # Get the right label, if it goes up the next day it's 1 and vice versa it's 0.
        X  = X + np.array(represenation["represenation"]).tolist()
        y  = y + np.array(represenation["label"]).tolist()

    return np.array(X), np.array(y)

In [9]:
def test_accuracy_learningarray(price, name):
    price = price[::-1].reset_index().drop(columns=["index"])
    test_size = 300
    valid_size = 300
    price_train = price[:-test_size-valid_size]
    price_test = price[-test_size:]
    price_valid = price[-test_size-valid_size:-test_size]
    best_K = -1
    best_acc = 0
    for i in range(100):
        # Search 100 K_array and calculate the accuracy_score in the validation set, find the best K_array for validation set
        num = np.random.randint(1, 16)
        K_array = (np.cumsum(np.random.rand(num))*2 - num/2) * 500 + 25000
        X_train, y_train = dataset_learningarray(price_train, K_array)
        X_valid, y_valid = dataset_learningarray(price_valid, K_array)
        classifier = ltb.LGBMClassifier(verbose=-1)
        classifier.fit(X_train, y_train)
        y_predict = classifier.predict(X_valid)
        if accuracy_score(y_predict, y_valid) > best_acc:
            best_acc = accuracy_score(y_predict, y_valid)
            best_K = K_array
    # Use the best K_array for the validation set as the breakpoints to get the representation for the test set
    X_test, y_test = dataset_learningarray(price_test, best_K)
    classifier = ltb.LGBMClassifier(verbose=-1)
    classifier.fit(X_train, y_train)
    y_predict = classifier.predict(X_test)
    print(f"The accuracy score for the {name} test set is {accuracy_score(y_predict, y_test)}\n")

In [10]:
test_accuracy_learningarray(price_dji, "Dow Jones")
test_accuracy_learningarray(price_hs, "Hang Seng")
test_accuracy_learningarray(price_sp500, "S&P 500")
test_accuracy_learningarray(price_sh, "Shanghai Commposite")
test_accuracy_learningarray(price_szse, "SZSE Component")

[LightGBM] [Info] Number of positive: 348, number of negative: 261
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000822 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 100
[LightGBM] [Info] Number of data points in the train set: 609, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.571429 -> initscore=0.287682
[LightGBM] [Info] Start training from score 0.287682
The accuracy score for the Dow Jones test set is 0.552

[LightGBM] [Info] Number of positive: 304, number of negative: 280
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 165
[LightGBM] [Info] Number of data points in the train set: 584, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.520548 -> initscore=0.082238
[LightGBM] [Info] Start tra