In [2]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_percentage_error



import talib

In [1]:
# Define the Linear Regression model- to support multi-output
class LinearRegressionMultiOutputBatch:
    """Linear regression with one or more outputs, trained by mini-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    n_epochs : int
        Number of full passes over the training data.
    batch_size : int
        Number of consecutive samples per mini-batch (the last batch may be smaller).

    Attributes
    ----------
    weights : numpy.ndarray or None
        Shape (num_features, num_outputs); ``None`` until ``fit`` has been called.
    bias : numpy.ndarray or None
        Shape (num_outputs,); ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, n_epochs=1, batch_size=32):
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs  # Number of passes over the entire dataset
        self.batch_size = batch_size  # Size of each mini-batch
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Fit the weights and bias, restarting from zeros on every call.

        Batches are taken in their original order (no shuffling).

        Parameters
        ----------
        X : array-like of shape (num_samples, num_features)
        y : array-like of shape (num_samples,) or (num_samples, num_outputs)
            A 1-D target is treated as a single output column.

        Raises
        ------
        ValueError
            If ``batch_size`` is smaller than 1 or ``X`` and ``y`` disagree on
            the number of samples.
        """
        if self.batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        # A 1-D target must become a column: otherwise (batch, 1) - (batch,)
        # broadcasts the residuals to (batch, batch) and the update breaks.
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        num_samples, num_features = X.shape
        if y.shape[0] != num_samples:
            raise ValueError("X and y must contain the same number of samples")
        num_outputs = y.shape[1]

        # Initialize weights and biases for multiple outputs
        self.weights = np.zeros((num_features, num_outputs))  # Shape: (num_features, num_outputs)
        self.bias = np.zeros(num_outputs)  # Shape: (num_outputs,)

        num_batches = int(np.ceil(num_samples / self.batch_size))

        for _ in range(self.n_epochs):
            # Process the data in batches without shuffling
            for batch_idx in range(num_batches):
                start_idx = batch_idx * self.batch_size
                end_idx = min(start_idx + self.batch_size, num_samples)

                X_batch = X[start_idx:end_idx]
                y_batch = y[start_idx:end_idx]

                # Residuals of the current predictions: (batch_size, num_outputs)
                residuals = np.dot(X_batch, self.weights) + self.bias - y_batch

                # Average over the actual batch length (the last batch may be short)
                batch_size_actual = X_batch.shape[0]
                dw = np.dot(X_batch.T, residuals) / batch_size_actual  # (num_features, num_outputs)
                db = np.sum(residuals, axis=0) / batch_size_actual  # (num_outputs,)

                # Update weights and biases
                self.weights -= self.learning_rate * dw
                self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return predictions of shape (num_samples, num_outputs); call ``fit`` first."""
        return np.dot(X, self.weights) + self.bias

# Feature Engineering 
def multivariateFeatureEngineering(data):
    """Return a copy of ``data`` extended with technical-indicator columns.

    Added columns: ``50_sma``, ``200_sma``, ``50_ema``, ``100_ema``,
    ``MACD_line``, ``Signal_line``, ``ADX``, ``RSI``, ``stoch_k``,
    ``stoch_d`` and ``ATR``.  Rows containing any missing value (for example
    the warm-up period of the rolling windows) are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide ``Close``, ``High`` and ``Low`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the caller's ``data`` is left unmodified.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()
    close, high, low = data['Close'], data['High'], data['Low']

    # --- Trend-following indicators ---

    # SMA - identify the long-term trend
    data['50_sma'] = close.rolling(window=50).mean()
    data['200_sma'] = close.rolling(window=200).mean()

    # EMA - trend analysis with more weight on recent points
    data['50_ema'] = close.ewm(span=50, adjust=False).mean()
    data['100_ema'] = close.ewm(span=100, adjust=False).mean()

    # MACD: the 12/26 EMAs are only intermediates, so they are kept as local
    # series instead of being added as columns and dropped again afterwards.
    ema_12 = close.ewm(span=12, adjust=False).mean()
    ema_26 = close.ewm(span=26, adjust=False).mean()
    data['MACD_line'] = ema_12 - ema_26
    # 9-period EMA of the MACD line
    data['Signal_line'] = data['MACD_line'].ewm(span=9, adjust=False).mean()

    # ADX (14-period) via TA-Lib
    data['ADX'] = talib.ADX(high, low, close, timeperiod=14)

    # --- Momentum indicators ---

    # RSI (14-period)
    data['RSI'] = talib.RSI(close, timeperiod=14)

    # Stochastic oscillator
    data['stoch_k'], data['stoch_d'] = talib.STOCH(high, low, close,
                                                   fastk_period=14, slowk_period=3, slowd_period=3)

    # --- Volatility indicators ---

    # ATR (14-period)
    data['ATR'] = talib.ATR(high, low, close, timeperiod=14)

    # Drop rows that have NA in any column
    return data.dropna()


def multivariateFeatureLagMultiStep(data, n_past, future_steps, target_column=1, group_feature_lags=1):
    """Build a lagged design matrix and multi-step targets from a DataFrame.

    Each sample uses the ``n_past`` rows immediately before an anchor row
    ``i`` as features; its targets are the ``target_column`` values at rows
    ``i + step - 1`` for every ``step`` in ``future_steps``.

    Parameters
    ----------
    data : pandas.DataFrame
        Source rows; every column is used as a feature.
    n_past : int
        Number of lagged rows per sample (must be at least 1).
    future_steps : sequence of int
        Forecast horizons, in rows, relative to the end of the look-back window.
    target_column : int
        Positional index of the target column (negative values count from the end).
    group_feature_lags : int or bool
        If truthy (default), a sample's features are ordered feature by
        feature (all lags of column 0, then all lags of column 1, ...).
        If falsy, they are ordered row by row (all columns of the oldest
        row first).

    Returns
    -------
    features_flat : numpy.ndarray of shape (num_samples, n_past * num_features)
    response : numpy.ndarray of shape (num_samples, len(future_steps))
        Both have zero rows when ``data`` is too short to produce a sample.

    Raises
    ------
    ValueError
        If ``n_past`` is smaller than 1 or ``future_steps`` is empty.
    """
    future_steps = list(future_steps)
    if n_past < 1:
        raise ValueError("n_past must be at least 1")
    if not future_steps:
        raise ValueError("future_steps must contain at least one horizon")

    max_future_step = max(future_steps)

    # Convert once: slicing NumPy arrays is far cheaper than repeated .iloc
    # look-ups inside the sample loop.
    values = data.to_numpy()
    target = data.iloc[:, target_column].to_numpy()
    num_rows, num_features = values.shape

    # Anchor rows are bounded so the furthest horizon stays inside the data.
    anchors = range(n_past, num_rows - max_future_step + 1)
    if len(anchors) == 0:
        return (np.empty((0, n_past * num_features)),
                np.empty((0, len(future_steps))))

    if group_feature_lags:
        # Transposing the (n_past, num_features) window groups lags per feature.
        rows = [values[i - n_past:i].T.ravel() for i in anchors]
    else:
        rows = [values[i - n_past:i].ravel() for i in anchors]
    features_flat = np.array(rows)

    # Row index of every (sample, horizon) target, gathered in one step.
    target_rows = np.asarray(anchors)[:, None] + (np.asarray(future_steps) - 1)[None, :]
    response = target[target_rows]

    return features_flat, response


# Data loading
def data_loader(filepath):
    """Read a CSV file and return it indexed by its parsed ``Time`` column.

    ``Time`` values are parsed with the format ``%Y-%m-%d %H:%M:%S``.
    """
    frame = pd.read_csv(filepath)
    timestamps = pd.to_datetime(frame['Time'], format='%Y-%m-%d %H:%M:%S')
    frame['Time'] = timestamps
    return frame.set_index('Time')

# Function to evaluate model predictions
def evaluate_model(y_true, y_pred):
    """Score predictions against the ground truth.

    Returns the tuple ``(mse, mae, mape, mbe, rmse, r2)``.  The mean bias
    error is the mean of ``y_pred - y_true``, so a positive value means the
    predictions are too high on average.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return (
        squared_error,
        mean_absolute_error(y_true, y_pred),
        mean_absolute_percentage_error(y_true, y_pred),
        np.mean(y_pred - y_true),
        np.sqrt(squared_error),
        r2_score(y_true, y_pred),
    )


# Initialize CSV file and write headers
def initialize_csv(file_name):
    """Write a CSV at ``file_name`` that contains only the results header row."""
    run_columns = ['lookback_window', 'features_used', 'learning_rate', 'batch_size', 'epochs']
    one_day_columns = ['MSE_1_day', 'MAE_1_day', 'MAPE_1_day', 'MBE_1_day', 'RMSE_1_day', 'R2_1_day']
    three_day_columns = ['MSE_3_day', 'MAE_3_day', 'MAPE_3_day', 'MBE_3_day', 'RMSE_3_day', 'R2_3_day']
    five_day_columns = ['MSE_5_day', 'MAE_5_day', 'MAPE_5_day', 'MBE_5_day', 'RMSE_5_day', 'R2_5_day']

    header_row = run_columns + one_day_columns + three_day_columns + five_day_columns

    # An empty frame with named columns serialises to just the header line.
    pd.DataFrame(columns=header_row).to_csv(file_name, index=False)

# Append a single result (row) to the CSV file
def append_result_to_csv(file_name, result):
    """Append ``result`` (a dict) to ``file_name`` as one CSV row.

    No header is written and values are emitted in the dict's key order, so
    that order has to match the header already present in the file.
    """
    single_row = pd.DataFrame([result])
    single_row.to_csv(file_name, mode='a', header=False, index=False)

#                                    /* Feature combinations*/
def featuresComblist(features):
    """Return every subset of ``features`` (including the empty one) with 'Close' appended.

    Subsets are listed by increasing size and, within a size, in the order
    produced by ``itertools.combinations``.
    """
    import itertools

    anchor = ['Close']  # the closing price is part of every combination

    subset_sizes = range(len(features) + 1)
    all_subsets = itertools.chain.from_iterable(
        itertools.combinations(features, size) for size in subset_sizes
    )
    return [list(subset) + anchor for subset in all_subsets]

############################################# Load data and Feature Engineer ###############################################

# filepath = './Data/EURUSD_D1.csv' 
# dataset = data_loader(filepath)

# # Generate additional Features 
# multiVarData = multivariateFeatureEngineering(dataset) 
# cols  = [col for col in multiVarData.columns if col!='Close'] + ['Close'] # Put target on End
# multiVarData = multiVarData[cols]


# ################################  Generate different hyper-parameter values for grid search #################################


# features = ['Open', 'High', 'Low', 'Volume', '50_sma', '200_sma', '50_ema',
#        '100_ema', 'MACD_line', 'Signal_line', 'ADX', 'RSI', 'stoch_k',
#        'stoch_d', 'ATR']

# feature_combinations = featuresComblist(features) # feature cominations
# lookback_windows =  [1 , 3, 5, 7, 10]  # look back window
# future_steps = [1, 3, 5]  # model outputs
# learning_rate = [0.1, 0.01, 0.001] 
# batch_sizes = [16, 32, 64]
# epochs = [20, 50, 100]

# target_col = -1  # response

# ######## build model based on this combination

# features, response = multivariateFeatureLagMultiStep(multiVarData, n_past,future_steps,  target_col)

# #################################### Build Forecast model ########################

# #train test split
# X_train, X_test, Y_train, Y_test = train_test_split(features, response, test_size=0.2, random_state=12, shuffle=False)

# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


# learning_rate = 0.01
# n_epochs = 5
# batch_size =  32
# model_lr = LinearRegressionMultiOutputBatch(learning_rate, n_epochs,batch_size)
# model_lr.fit(X_train, Y_train)

# predict = model_lr.predict(X_test)
# print(predict.shape, Y_test.shape)



############################ Hyper-Parameter Tuning: Grid Search #######################


# Grid search through all combinations of hyperparameters
def grid_search():
    """Exhaustively evaluate feature subsets and training hyper-parameters.

    For every combination of feature subset, look-back window, learning
    rate, batch size and epoch count, a ``LinearRegressionMultiOutputBatch``
    model is trained on the chronologically first 80% of the lagged samples
    (no shuffling) and scored on the remaining 20% for each forecast horizon
    (1, 3 and 5 steps ahead).  One row per configuration is appended to
    ``hyperparameter_tuning_results.csv`` as soon as it has been evaluated,
    so partial results survive an interrupted run.

    Returns
    -------
    None
    """
    # Initialize CSV to store results
    file_name = 'hyperparameter_tuning_results.csv'
    initialize_csv(file_name)

    # Define hyperparameter grid
    candidate_features = ['Open', 'High', 'Low', 'Volume', '50_sma', '200_sma', '50_ema',
                          '100_ema', 'MACD_line', 'Signal_line', 'ADX', 'RSI', 'stoch_k',
                          'stoch_d', 'ATR']

    # NOTE: this yields 2**15 feature subsets, each combined with every
    # setting below, so a complete run is very long.
    feature_combinations = featuresComblist(candidate_features)
    lookback_windows = [1, 3, 5, 7, 10, 15]
    learning_rates = [0.1, 0.01, 0.001]
    batch_sizes = [16, 32, 64]
    epochs = [20, 50, 100]
    future_steps = [1, 3, 5]  # model outputs

    # Labels and metric order must line up with the CSV header row and with
    # the tuple returned by evaluate_model.
    horizon_labels = ['1_day', '3_day', '5_day']
    metric_names = ['MSE', 'MAE', 'MAPE', 'MBE', 'RMSE', 'R2']

    # Load the data and derive the indicator columns
    filepath = './Data/EURUSD_D1.csv'
    dataset = data_loader(filepath)
    multiVarData = multivariateFeatureEngineering(dataset)
    print(multiVarData.head())

    # Every combination from featuresComblist ends with 'Close', so the
    # target is always the last column of the selected subset.
    target_col = -1

    for features_used in feature_combinations:
        # Restrict the frame to the current subset; without this selection
        # every "combination" would train on all columns and give the same result.
        # Assumes the CSV provides 'Open' and 'Volume' columns - TODO confirm.
        subset = multiVarData[features_used]

        for lookback_window in lookback_windows:
            # The lagged matrix, split and scaling depend only on the feature
            # subset and the window, so build them once per pair rather than
            # once per optimiser setting.
            lagged_X, response = multivariateFeatureLagMultiStep(
                subset, lookback_window, future_steps, target_col)

            X_train, X_test, Y_train, Y_test = train_test_split(
                lagged_X, response, test_size=0.2, random_state=12, shuffle=False)

            # Standardize features using training statistics only
            scaler = StandardScaler()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            for lr in learning_rates:
                for batch_size in batch_sizes:
                    for epoch in epochs:
                        # Train the model with the current hyperparameter combination
                        model_lr = LinearRegressionMultiOutputBatch(lr, epoch, batch_size)
                        model_lr.fit(X_train, Y_train)

                        predictions = model_lr.predict(X_test)

                        result = {
                            'lookback_window': lookback_window,
                            'features_used': features_used,
                            'learning_rate': lr,
                            'batch_size': batch_size,
                            'epochs': epoch,
                        }

                        # Evaluate the model for each forecast horizon
                        for col, label in enumerate(horizon_labels):
                            horizon_pred = predictions[:, col]
                            if np.all(np.isfinite(horizon_pred)):
                                metrics = evaluate_model(Y_test[:, col], horizon_pred)
                            else:
                                # A diverged fit (e.g. learning rate too large)
                                # yields inf/NaN predictions, which the sklearn
                                # metrics reject; record NaN for this run
                                # instead of aborting the whole search.
                                metrics = (np.nan,) * len(metric_names)

                            for metric_name, value in zip(metric_names, metrics):
                                result[f'{metric_name}_{label}'] = value

                        # Append the result to the CSV file
                        append_result_to_csv(file_name, result)

    print("Grid search completed and results saved to CSV!")


############################### HYPER-PARAMETER TUNING ###############################

# Run the full grid search; each evaluated configuration is appended to the results CSV.
grid_search() #start search

NameError: name 'pd' is not defined