SPLIT AND LSTM MODEL

In [None]:
"""Split time-series in traning and test(trading) for classification """
import logging
import argparse
import numpy as np
import pandas as pd
import tqdm

In [None]:
def read_filepath(file_path):
    """
    Read a csv data set and log basic information about it.

    The column names, the shape and the fraction of missing cells are
    reported through the ``logging`` module.

    Parameters
    ----------
    file_path: str
        Path to the csv file. The '.csv' suffix is matched
        case-insensitively.

    Returns
    -------
    df: pandas dataframe
        Pandas Dataframe of the read file

    Raises
    ------
    SystemExit
        If file_path does not end with '.csv'. The error is logged first
        and the exit status is 1.
    """
    # Test the dotted suffix: comparing only the last '.'-separated token
    # would accept a bare name such as 'csv' that has no extension at all.
    if not str(file_path).lower().endswith('.csv'):
        logging.error('The file is not a csv one')
        raise SystemExit(1)

    df = pd.read_csv(file_path, encoding='latin-1')

    logging.info('DATA INFO, attributes: %s', df.columns)
    logging.info('DATA INFO, shape: %s', df.shape)

    total_data = df.shape[0] * df.shape[1]
    total_missing = df.isnull().sum().sum()
    # A header-only file has zero cells; report 0% instead of dividing by 0.
    missing_ratio = total_missing / total_data if total_data else 0.0

    logging.info('DATA QUALITY, missing values: %s',
                 format(missing_ratio, '.2%'))

    return df

In [None]:
def split_Tperiod(df_returns, df_binary, len_period=1308, len_test=327):
    """
    Cut the data into overlapping study periods with a rolling window.

    A window of len_period rows is slid along the data in steps of len_test
    rows. The same windows are applied to the returns and to the binary
    targets, so the two output lists are aligned element by element.


    Parameters
    ----------
    df_returns: pandas dataframe
        Pandas dataframe of returns.

    df_binary: pandas dataframe
        Pandas dataframe of binary targets.

    len_period: integer(optional)
        Length of each study period (window size).

    len_test: integer(optional)
        Length of the trading set; it is also the step between two
        consecutive windows.

    Returns
    -------
    periods_ret: list of pandas dataframe
        Windows of length len_period taken from df_returns.

    periods_bin: list of pandas dataframe
        Windows of length len_period taken from df_binary, at the same
        positions as periods_ret.
    """
    # Largest start offset that still leaves room for a full window.
    last_start = len(df_returns) - len_period
    window_starts = range(0, last_start + 1, len_test)

    periods_ret = []
    periods_bin = []
    for start in window_starts:
        stop = start + len_period
        periods_ret.append(df_returns[start:stop])
        periods_bin.append(df_binary[start:stop])

    return periods_ret, periods_bin

In [None]:
def split_sequences(returns, targets, n_steps=240):
    """
    Returns the input sequences and target label for classification task.

    Each input sequence is a window of n_steps consecutive elements of
    returns; its label is the element of targets located right after the
    window.


    Parameters
    ----------
    returns: list, numpy array, pandas series/dataframe
        time-series data of returns to split.

    targets: list, numpy array, pandas series/dataframe
        time-series data of target to split. It must have the same length of returns

    n_steps: integer(optional)
        number of time steps for each instance. Default = 240

    Returns
    -------
    X: list
        Input windows, len(returns)-n_steps items of n_steps elements each

    y: list
        Target labels, len(targets)-n_steps items
    """
    # Convert each argument on its own: with a shared try/except, a plain
    # list of returns would leave a pandas `targets` unconverted and the
    # integer lookups below would become label-based instead of positional.
    if hasattr(returns, "to_numpy"):
        returns = returns.to_numpy()
    if hasattr(targets, "to_numpy"):
        targets = targets.to_numpy()

    X = [returns[i:i+n_steps] for i in range(len(returns)-n_steps)]
    y = [targets[i+n_steps] for i in range(len(targets)-n_steps)]

    return X, y

In [None]:
def get_train_set(df_returns, df_binary, n_steps=240):
    """
    Return the input windows and labels for the LSTM.

    Every column of df_returns except the first is windowed with
    split_sequences, using the column of df_binary with the same name as
    label source. The windows of all columns are then stacked together.

    Parameters
    ----------
    df_returns: pandas dataframe
        Dataframe of returns

    df_binary: pandas dataframe
        Dataframe of binary target associated to data returns. It has the same shape of df_returns

    n_steps: integer(optional)
        Number of time steps of each window, forwarded to split_sequences.
        Default = 240

    Returns
    -------
    list_tot_X: numpy array
        Array of input data for LSTM, one row of n_steps values per window

    list_tot_y: numpy array
        1-D array of the target class of each window
    """

    windows_by_column = []
    labels_by_column = []
    # The first column is skipped -- presumably a date/index column;
    # TODO confirm against the csv layout.
    for comp in df_returns.columns[1:]:
        X_comp, y_comp = split_sequences(df_returns[comp], df_binary[comp],
                                         n_steps=n_steps)
        windows_by_column.append(X_comp)
        labels_by_column.append(y_comp)

    list_tot_X = np.array(windows_by_column)
    list_tot_y = np.array(labels_by_column)

    # Merge the two leading axes (column, window) into a single sample axis.
    list_tot_X = np.reshape(
        list_tot_X,
        (list_tot_X.shape[0] * list_tot_X.shape[1], list_tot_X.shape[2]))
    list_tot_y = np.reshape(
        list_tot_y, (list_tot_y.shape[0] * list_tot_y.shape[1]))

    return list_tot_X, list_tot_y

In [None]:
# Load the returns table and the binary-target table from the working directory.
df_returns = pd.read_csv("ReturnsData.csv")
df_binary = pd.read_csv("ReturnsBinary.csv")

# Window the two whole tables at once, with the default n_steps of
# split_sequences (every column is kept inside each window).
X_train, y_train = split_sequences(df_returns, df_binary)

LSTM MODEL

In [None]:
"""LSTM model"""
import numpy as np
import pandas as pd
import logging
import argparse
from keras.layers import Input, Dense, LSTM, Dropout
from keras.models import Model, Sequential
from keras.models import load_model
import sys
import os
sys.path.append(os.path.dirname(os.path.abspath("..")))
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [None]:
# Reload both tables through read_filepath, which also logs the columns,
# the shape and the share of missing values of each file.
df_returns = read_filepath("ReturnsData.csv")
df_binary = read_filepath("ReturnsBinary.csv")

In [None]:
def all_data_LSTM(df_returns, df_binary, period, len_train=981, len_test=327):
    """
    Build the LSTM train and test arrays for one study period.

    The data are cut into study periods of len_train + len_test rows with
    split_Tperiod. Inside the selected period, the first len_train rows form
    the training part and the remaining rows the test (trading) part. Each
    part is turned into windows and labels with get_train_set.

    Parameters
    ----------
    df_returns: pandas dataframe
        Dataframe of returns

    df_binary: pandas dataframe
        Dataframe of binary targets, same layout as df_returns

    period: integer
        Index of the study period to use in the list given by split_Tperiod

    len_train: integer(optional)
        Number of rows of the training part of a study period

    len_test: integer(optional)
        Number of rows of the test part; also the step between two periods

    Returns
    -------
    X_train, y_train, X_test, y_test: numpy arrays
        The X arrays have shape (samples, time steps, 1); the y arrays are
        1-D with one label per sample.
    """
    # Forward both lengths so that non-default values really drive the split.
    periods_returns, periods_binary = split_Tperiod(
        df_returns, df_binary,
        len_period=len_train + len_test, len_test=len_test)
    T1_input = periods_returns[period]
    T1_target = periods_binary[period]

    X_input_train, y_input_train = T1_input[:len_train], T1_target[:len_train]

    # The test part starts where the training part ends, so the two never
    # share rows (slicing from len_test would overlap the training rows).
    # NOTE(review): if one prediction per trading day is wanted, the slice
    # should start n_steps rows earlier to supply the look-back -- confirm.
    X_input_test, y_input_test = T1_input[len_train:], T1_target[len_train:]

    X_train, y_train = get_train_set(X_input_train, y_input_train)
    X_train, y_train = np.array(X_train), np.array(y_train)

    X_test, y_test = get_train_set(X_input_test, y_input_test)
    X_test, y_test = np.array(X_test), np.array(y_test)

    # Add the trailing feature axis to the windowed arrays (not to the raw
    # table), keeping the samples aligned with their labels.
    X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
    X_test = np.reshape(X_test, (X_test.shape[0], X_test.shape[1], 1))
    return X_train, y_train, X_test, y_test

In [None]:
len_train = 981
len_test = 327
periods_returns, periods_binary = split_Tperiod(df_returns, df_binary)
T1_input = periods_returns[1]
T1_target = periods_binary[1]

# Training rows come first; the test rows start where training ends so the
# two parts do not overlap.
X_input_train, y_input_train = T1_input[:len_train], T1_target[:len_train]
X_test, y_test = T1_input[len_train:], T1_target[len_train:]

# Fit the scaler on the training rows only and reuse those statistics for
# the test rows; re-fitting on the test rows would scale the two parts
# differently and leak test information.
scaler = StandardScaler()
X_input_train = scaler.fit_transform(X_input_train)
X_test = scaler.transform(X_test)
print(X_input_train)
# NOTE: windowing and reshaping for the network are done by all_data_LSTM.

In [None]:
def LSTM_model(n_steps=240, n_units=25, dropout_rate=0.1):
    """
    Build and compile the LSTM classifier.

    The network is made of one LSTM layer, a dropout layer and a two-unit
    softmax output, compiled with the adam optimizer and the accuracy
    metric.

    Parameters
    ----------
    n_steps: integer(optional)
        Number of time steps of each input window. Default = 240

    n_units: integer(optional)
        Number of units of the LSTM layer. Default = 25

    dropout_rate: float(optional)
        Dropout rate applied to the LSTM output. Default = 0.1

    Returns
    -------
    model: keras Model
        Compiled model taking inputs of shape (n_steps, 1)
    """
    inputs = Input(shape=(n_steps, 1))
    hidden = LSTM(n_units)(inputs)
    drop = Dropout(dropout_rate)(hidden)
    outputs = Dense(2, activation='softmax')(drop)

    model = Model(inputs=inputs, outputs=outputs)
    # The labels built by get_train_set are a 1-D vector, one class per
    # sample, so the sparse loss matches the two-unit softmax output;
    # binary_crossentropy would need targets with the output's (n, 2) shape.
    # Assumes the classes are encoded as integers 0/1 -- TODO confirm.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam',
                  metrics=['accuracy'])
    return model

In [None]:
model = LSTM_model()
# summary() prints the table itself and returns None, so it is not wrapped
# in print().
model.summary()
# Model for the study period at index 1
X_train, y_train, X_test, y_test = all_data_LSTM(df_returns, df_binary, 1)
history = model.fit(X_train, y_train, epochs=1000, batch_size=32, validation_split=0.2, verbose=1)
# model.save("LSTM_0_period.h5")