In [None]:
# Mount Google Drive so the dataset and saved checkpoints under
# /content/drive/MyDrive/... are readable by the cells below (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import os
import time
import math
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset

# Run on the GPU when one is available, otherwise fall back to the CPU.
# This module-level `device` is read by create_dataset() and the model-building cells.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Get Org Dataset
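Load the data set used for training and testing, replace infinite values, fill missing values, and build the regression (`Close`) and classification (`up_down`) frames. Note: no standardization or logarithmic transformation is applied in this cell. Standardization is performed later with `StandardScaler` on each model's selected columns, and the code that verified the inverse scaling / inverse log transformation is currently commented out.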

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Read the CSV file
org_df = pd.read_csv('/content/drive/MyDrive/CUNY/Capstone/Data/model/Data_set_for_training_and_testing_final.csv')

# Convert 'Date' to datetime and use it as the index
org_df['Date'] = pd.to_datetime(org_df['Date'])
org_df = org_df.set_index('Date')

# Split data into features (X) and targets (y)
X = org_df.drop(["Close", "up_down"], axis=1).copy()
y = org_df[["Close", "up_down"]].copy()

# Record which columns contain infinite values BEFORE replacing them;
# once they are replaced the check below can only ever report an empty list.
inf_columns_before = X.columns[X.isin([np.inf, -np.inf]).any()].tolist()
print("Columns with infinite values before replacement:", inf_columns_before)

# Replace infinite values with NaN
X = X.replace([np.inf, -np.inf], np.nan)

# Verify that no infinite values remain after the replacement
inf_columns = X.columns[X.isin([np.inf, -np.inf]).any()].tolist()
print("Columns with infinite values:", inf_columns)

# Fill NaN values with the maximum value of each column
# (a column that is entirely NaN stays NaN and shows up in the counts below)
X = X.fillna(X.max())

# Check the count of NaN values in each column
print("NaN counts per column:")
print(X.isnull().sum())

# No scaling happens here: the features are kept on their original scale and
# are standardized later, per model, on the selected column subset.
X_df = X.copy()

# Combine each target with the (unscaled) features
org_linear_regression_df = pd.concat([X_df, y['Close']], axis=1)
org_binary_classification_df = pd.concat([X_df, y['up_down']], axis=1)


# # Revert transformations
# # 1. Inverse Scaling
# # Restore to original scale
# X_return = scaler.inverse_transform(X_scaled_df)

# # Create a DataFrame for the restored values
# X_return_df = pd.DataFrame(X_return, columns=X.columns, index=X.index)

# linear_regression_return_df = pd.concat([y['Close'], X_return_df], axis=1)

# # 2. Inverse Log Transformation for 'Close'
# linear_regression_return_df['Close'] = np.expm1(org_linear_regression_df['Close'])

# # Create a new dataframe with the original values
# original_values_df = pd.concat([y['Close'], X_return_df], axis=1)

# # Compare to verify the transformations
# comparison_df = original_values_df - linear_regression_return_df

# print(comparison_df.head())


Columns with infinite values: []
NaN counts per column:
previous_date_close_1    0
previous_date_close_2    0
previous_date_close_3    0
previous_date_close_4    0
previous_date_close_5    0
                        ..
WILLR_7_delta            0
CCI_14_delta             0
ROC_14_delta             0
WILLR_14_delta           0
Change_abs               0
Length: 231, dtype: int64


In [None]:
def select_dataset(org_linear_regression_df, org_binary_classification_df, model_type, analysis):
    """Return the feature subset for one analysis type plus the matching target column.

    Args:
        org_linear_regression_df: frame holding all features and the 'Close' target.
        org_binary_classification_df: frame holding all features and the 'up_down' target.
        model_type: "linear" (features + 'Close') or "binary" (features + 'up_down').
        analysis: key selecting the feature list: 'FA', 'FA1', 'SA', 'TA', 'CA', 'PA' or 'PAS'.

    Returns:
        A DataFrame with the selected feature columns followed by the target column.

    Raises:
        ValueError: if `analysis` or `model_type` is not one of the supported values.
    """

    # Column selections per analysis type.
    # NOTE: 'FA' and 'FA1' list 'Close_delta' and 'Open_delta' twice. The duplicates are
    # kept on purpose: they are selected twice and therefore count towards the feature
    # dimension (the recorded run reports an input size of 45 for FA1).
    columns_dict = {
        'FA': [
              'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
              'SOFR', 'HQMCB10YR', 'WM1NS', 'WM2NS', 'CC4WSA', 'CPIAUCSL', 'CSUSHPINSA', 'DCOILWTICO', 'FYFSD', 'GFDEBTN', 'GDPC1', 'ICSA', 'PAYEMS',
              'USSLIND', 'USALOLITONOSTSAM', 'DTWEXBGS', 'DEXUSEU', 'DEXJPUS', 'DEXCHUS', 'DEXUSUK', 'BOGMBASE', 'DGORDER', 'PMI_Actual', 'PMI_Forecast',
              'PMI_Previous', 'VIXCLS', 'Close_delta', 'Open_delta',
              'SOFR_delta', 'HQMCB10YR_delta', 'WM1NS_delta', 'WM2NS_delta', 'CC4WSA_delta', 'CPIAUCSL_delta', 'CSUSHPINSA_delta', 'DCOILWTICO_delta',
              'FYFSD_delta', 'GFDEBTN_delta', 'GDPC1_delta', 'ICSA_delta', 'PAYEMS_delta', 'USSLIND_delta', 'USALOLITONOSTSAM_delta', 'DTWEXBGS_delta',
              'DEXUSEU_delta', 'DEXJPUS_delta', 'DEXCHUS_delta', 'DEXUSUK_delta', 'BOGMBASE_delta', 'DGORDER_delta', 'PMI_Actual_delta', 'PMI_Forecast_delta',
              'PMI_Previous_delta', 'VIXCLS_delta',
              'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5'
        ],

        'FA1': [
              'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
              'SOFR', 'HQMCB10YR', 'WM1NS', 'WM2NS', 'CC4WSA', 'CPIAUCSL', 'CSUSHPINSA', 'DCOILWTICO', 'FYFSD', 'GFDEBTN', 'GDPC1', 'ICSA', 'PAYEMS',
              'USSLIND', 'USALOLITONOSTSAM', 'DTWEXBGS', 'DEXUSEU', 'DEXJPUS', 'DEXCHUS', 'DEXUSUK', 'BOGMBASE', 'DGORDER', 'PMI_Actual', 'PMI_Forecast',
              'PMI_Previous', 'VIXCLS', 'Close_delta', 'Open_delta',
              'VIXCLS_delta',
              'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5'
        ],

        'SA': [
              'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
              'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5',
              'negative', 'neutral', 'positive', 'negative_weight', 'neutral_weight', 'positive_weight',
              'SA_simple_score', 'SA_weighted_score',
              'Positive_count', 'Neutral_count', 'Negative_count',
              'positive_score_percentage_simple', 'negative_score_percentage_simple',
              'positive_negative_score_ratio',
              'positive_score_percentage_weighted', 'negative_score_percentage_weighted',
              'positive_negative_score_ratio_weighted',
              'positive_count_percentage', 'negative_count_percentage',
              'positive_negative_count_percentage_ratio',
              'Positive_count_weighted', 'Neutral_count_weighted', 'Negative_count_weighted',
              'positive_count_percentage_weighted', 'negative_count_percentage_weighted',
              'positive_negative_count_percentage_ratio_weighted'
        ],

        'TA': [
              'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
              'VIXCLS', 'SMA_5', 'SMA_12', 'SMA_26', 'EMA_5', 'EMA_12', 'EMA_26', 'SMA_50', 'SMA_200',
              'macd_12_26_9', 'macd_h_12_26_9', 'macd_s_12_26_9', 'macd_5_9_3', 'macd_h_5_9_3', 'macd_s_5_9_3',
              'macd_24_52_18', 'macd_h_24_52_18', 'macd_s_24_52_18',
              'STOCHk_9_3', 'STOCHd_9_3', 'STOCHk_18_4', 'STOCHd_18_4', 'STOCHk_35_8', 'STOCHd_35_8',
              'RSI_7', 'RSI_14',
              'BB_lower_band_5', 'BB_middle_band_5', 'BB_upper_band_5', 'BB_Width_5', 'BB_Percent_5',
              'BB_lower_band_10', 'BB_middle_band_10', 'BB_upper_band_10', 'BB_Width_10', 'BB_Percent_10',
              'BB_lower_band_20', 'BB_middle_band_20', 'BB_upper_band_20', 'BB_Width_20', 'BB_Percent_20',
              'OBV',
              'ADX_14', 'DMP_14', 'DMN_14', 'ADX_30', 'DMP_30', 'DMN_30', 'ADX_50', 'DMP_50', 'DMN_50',
              'Fib_0.236_5', 'Fib_0.382_5', 'Fib_0.618_5', 'Fib_0.236_20', 'Fib_0.382_20', 'Fib_0.618_20',
              'Fib_0.236_100', 'Fib_0.382_100', 'Fib_0.618_100',
              'CCI_7', 'ROC_7', 'WILLR_7', 'CCI_14', 'ROC_14', 'WILLR_14',
              'VIXCLS_delta', 'SMA_5_delta', 'SMA_12_delta', 'SMA_26_delta', 'EMA_5_delta', 'EMA_12_delta', 'EMA_26_delta',
              'SMA_50_delta', 'SMA_200_delta',
              'macd_12_26_9_delta', 'macd_h_12_26_9_delta', 'macd_s_12_26_9_delta', 'macd_5_9_3_delta', 'macd_h_5_9_3_delta', 'macd_s_5_9_3_delta',
              'macd_24_52_18_delta', 'macd_h_24_52_18_delta', 'macd_s_24_52_18_delta',
              'STOCHk_9_3_delta', 'STOCHd_9_3_delta', 'STOCHk_18_4_delta', 'STOCHd_18_4_delta', 'STOCHk_35_8_delta', 'STOCHd_35_8_delta',
              'RSI_7_delta', 'RSI_14_delta',
              'BB_lower_band_5_delta', 'BB_middle_band_5_delta', 'BB_upper_band_5_delta', 'BB_Width_5_delta', 'BB_Percent_5_delta',
              'BB_lower_band_10_delta', 'BB_middle_band_10_delta', 'BB_upper_band_10_delta', 'BB_Width_10_delta', 'BB_Percent_10_delta',
              'BB_lower_band_20_delta', 'BB_middle_band_20_delta', 'BB_upper_band_20_delta', 'BB_Width_20_delta', 'BB_Percent_20_delta',
              'OBV_delta',
              'ADX_14_delta', 'DMP_14_delta', 'DMN_14_delta', 'ADX_30_delta', 'DMP_30_delta', 'DMN_30_delta', 'ADX_50_delta', 'DMP_50_delta', 'DMN_50_delta',
              'Fib_0.236_5_delta', 'Fib_0.382_5_delta', 'Fib_0.618_5_delta', 'Fib_0.236_20_delta', 'Fib_0.382_20_delta', 'Fib_0.618_20_delta',
              'Fib_0.236_100_delta', 'Fib_0.382_100_delta', 'Fib_0.618_100_delta',
              'CCI_7_delta', 'ROC_7_delta', 'WILLR_7_delta', 'CCI_14_delta', 'ROC_14_delta', 'WILLR_14_delta',
              'Change_abs',
              'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5'
        ],

        'CA': [
              'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
              'far_month_openInterest_0', 'near_month_openInterest', 'total_month_openInterest', 'far_month_openInterest_0_delta', 'near_month_openInterest_delta', 'total_month_openInterest_delta',
              'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5'
        ],

        'PA': [
              'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
              'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5'
        ],

        'PAS': [
              'Open', 'High', 'Low', 'Vol.',
        ]

    }

    # Reject unknown analysis types up front
    if analysis not in columns_dict:
        raise ValueError(f"Invalid analysis type: {analysis}")

    selected_columns = columns_dict[analysis]

    if model_type == "linear":
        return org_linear_regression_df[selected_columns + ['Close']]

    if model_type == "binary":
        # The target name must be wrapped in a list: `list + str` raises TypeError.
        return org_binary_classification_df[selected_columns + ['up_down']]

    raise ValueError(f"Invalid model type: {model_type}")




In [None]:
# Feature-column lists per analysis type, keyed by the prefix used in the saved
# model file names ('FA', 'FA1', 'SA', 'TA', 'CA', 'PA', 'PAS').
# NOTE(review): this appears to mirror the dictionary defined inside
# select_dataset(); the two copies have to be kept in sync by hand.
# NOTE: 'FA' and 'FA1' contain 'Close_delta' and 'Open_delta' twice. Do not
# de-duplicate them without retraining: the lengths reported below (e.g. 45 for
# FA1 in the recorded output) include the duplicated entries.
columns_dict = {
    'FA': [
          'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
          'SOFR', 'HQMCB10YR', 'WM1NS', 'WM2NS', 'CC4WSA', 'CPIAUCSL', 'CSUSHPINSA', 'DCOILWTICO', 'FYFSD', 'GFDEBTN', 'GDPC1', 'ICSA', 'PAYEMS',
          'USSLIND', 'USALOLITONOSTSAM', 'DTWEXBGS', 'DEXUSEU', 'DEXJPUS', 'DEXCHUS', 'DEXUSUK', 'BOGMBASE', 'DGORDER', 'PMI_Actual', 'PMI_Forecast',
          'PMI_Previous', 'VIXCLS', 'Close_delta', 'Open_delta',
          'SOFR_delta', 'HQMCB10YR_delta', 'WM1NS_delta', 'WM2NS_delta', 'CC4WSA_delta', 'CPIAUCSL_delta', 'CSUSHPINSA_delta', 'DCOILWTICO_delta',
          'FYFSD_delta', 'GFDEBTN_delta', 'GDPC1_delta', 'ICSA_delta', 'PAYEMS_delta', 'USSLIND_delta', 'USALOLITONOSTSAM_delta', 'DTWEXBGS_delta',
          'DEXUSEU_delta', 'DEXJPUS_delta', 'DEXCHUS_delta', 'DEXUSUK_delta', 'BOGMBASE_delta', 'DGORDER_delta', 'PMI_Actual_delta', 'PMI_Forecast_delta',
          'PMI_Previous_delta', 'VIXCLS_delta',
          'previous_date_close_1',	'previous_date_close_2',	'previous_date_close_3',	'previous_date_close_4',	'previous_date_close_5'
    ],

    # Same as 'FA' but without the macro-indicator *_delta columns (only 'VIXCLS_delta' is kept).
    'FA1': [
          'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
          'SOFR', 'HQMCB10YR', 'WM1NS', 'WM2NS', 'CC4WSA', 'CPIAUCSL', 'CSUSHPINSA', 'DCOILWTICO', 'FYFSD', 'GFDEBTN', 'GDPC1', 'ICSA', 'PAYEMS',
          'USSLIND', 'USALOLITONOSTSAM', 'DTWEXBGS', 'DEXUSEU', 'DEXJPUS', 'DEXCHUS', 'DEXUSUK', 'BOGMBASE', 'DGORDER', 'PMI_Actual', 'PMI_Forecast',
          'PMI_Previous', 'VIXCLS', 'Close_delta', 'Open_delta',
            'VIXCLS_delta',
          'previous_date_close_1',	'previous_date_close_2',	'previous_date_close_3',	'previous_date_close_4',	'previous_date_close_5'
    ],

    # Price columns plus sentiment scores, counts and ratios.
    'SA': [
    'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
    'previous_date_close_1',	'previous_date_close_2',	'previous_date_close_3',	'previous_date_close_4',	'previous_date_close_5',
    'negative', 'neutral', 'positive', 'negative_weight', 'neutral_weight', 'positive_weight',
    'SA_simple_score', 'SA_weighted_score',
    'Positive_count', 'Neutral_count', 'Negative_count',
    'positive_score_percentage_simple', 'negative_score_percentage_simple',
    'positive_negative_score_ratio',
    'positive_score_percentage_weighted', 'negative_score_percentage_weighted',
    'positive_negative_score_ratio_weighted',
    'positive_count_percentage', 'negative_count_percentage',
    'positive_negative_count_percentage_ratio',
    'Positive_count_weighted', 'Neutral_count_weighted', 'Negative_count_weighted',
    'positive_count_percentage_weighted', 'negative_count_percentage_weighted',
    'positive_negative_count_percentage_ratio_weighted'
    ],

    # Price columns plus technical indicators and their *_delta counterparts.
    'TA': [
    'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
    'VIXCLS', 'SMA_5', 'SMA_12', 'SMA_26', 'EMA_5', 'EMA_12', 'EMA_26', 'SMA_50', 'SMA_200',
    'macd_12_26_9', 'macd_h_12_26_9', 'macd_s_12_26_9', 'macd_5_9_3', 'macd_h_5_9_3', 'macd_s_5_9_3',
    'macd_24_52_18', 'macd_h_24_52_18', 'macd_s_24_52_18',
    'STOCHk_9_3', 'STOCHd_9_3', 'STOCHk_18_4', 'STOCHd_18_4', 'STOCHk_35_8', 'STOCHd_35_8',
    'RSI_7', 'RSI_14',
    'BB_lower_band_5', 'BB_middle_band_5', 'BB_upper_band_5', 'BB_Width_5', 'BB_Percent_5',
    'BB_lower_band_10', 'BB_middle_band_10', 'BB_upper_band_10', 'BB_Width_10', 'BB_Percent_10',
    'BB_lower_band_20', 'BB_middle_band_20', 'BB_upper_band_20', 'BB_Width_20', 'BB_Percent_20',
    'OBV',
    'ADX_14', 'DMP_14', 'DMN_14', 'ADX_30', 'DMP_30', 'DMN_30', 'ADX_50', 'DMP_50', 'DMN_50',
    'Fib_0.236_5', 'Fib_0.382_5', 'Fib_0.618_5', 'Fib_0.236_20', 'Fib_0.382_20', 'Fib_0.618_20',
    'Fib_0.236_100', 'Fib_0.382_100', 'Fib_0.618_100',
    'CCI_7', 'ROC_7', 'WILLR_7', 'CCI_14', 'ROC_14', 'WILLR_14',
    'VIXCLS_delta', 'SMA_5_delta', 'SMA_12_delta', 'SMA_26_delta', 'EMA_5_delta', 'EMA_12_delta', 'EMA_26_delta',
    'SMA_50_delta', 'SMA_200_delta',
    'macd_12_26_9_delta', 'macd_h_12_26_9_delta', 'macd_s_12_26_9_delta', 'macd_5_9_3_delta', 'macd_h_5_9_3_delta', 'macd_s_5_9_3_delta',
    'macd_24_52_18_delta', 'macd_h_24_52_18_delta', 'macd_s_24_52_18_delta',
    'STOCHk_9_3_delta', 'STOCHd_9_3_delta', 'STOCHk_18_4_delta', 'STOCHd_18_4_delta', 'STOCHk_35_8_delta', 'STOCHd_35_8_delta',
    'RSI_7_delta', 'RSI_14_delta',
    'BB_lower_band_5_delta', 'BB_middle_band_5_delta', 'BB_upper_band_5_delta', 'BB_Width_5_delta', 'BB_Percent_5_delta',
    'BB_lower_band_10_delta', 'BB_middle_band_10_delta', 'BB_upper_band_10_delta', 'BB_Width_10_delta', 'BB_Percent_10_delta',
    'BB_lower_band_20_delta', 'BB_middle_band_20_delta', 'BB_upper_band_20_delta', 'BB_Width_20_delta', 'BB_Percent_20_delta',
    'OBV_delta',
    'ADX_14_delta', 'DMP_14_delta', 'DMN_14_delta', 'ADX_30_delta', 'DMP_30_delta', 'DMN_30_delta', 'ADX_50_delta', 'DMP_50_delta', 'DMN_50_delta',
    'Fib_0.236_5_delta', 'Fib_0.382_5_delta', 'Fib_0.618_5_delta', 'Fib_0.236_20_delta', 'Fib_0.382_20_delta', 'Fib_0.618_20_delta',
    'Fib_0.236_100_delta', 'Fib_0.382_100_delta', 'Fib_0.618_100_delta',
    'CCI_7_delta', 'ROC_7_delta', 'WILLR_7_delta', 'CCI_14_delta', 'ROC_14_delta', 'WILLR_14_delta',
    'Change_abs',
    'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5'
    ],

    # Price columns plus open-interest columns.
    'CA': [
      'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
      'far_month_openInterest_0', 'near_month_openInterest', 'total_month_openInterest', 'far_month_openInterest_0_delta', 'near_month_openInterest_delta', 'total_month_openInterest_delta',
      'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5'
    ],

    # Price columns, their deltas and the five lagged closes only.
    'PA': [
      'Open', 'High', 'Low', 'Vol.', 'Change %', 'Close_delta', 'Open_delta', 'High_delta', 'Low_delta', 'Vol._delta', 'Change %_delta',
      'previous_date_close_1', 'previous_date_close_2', 'previous_date_close_3', 'previous_date_close_4', 'previous_date_close_5'
    ],

    # Smallest set: the four raw price/volume columns.
    'PAS': [
      'Open', 'High', 'Low', 'Vol.',
    ]

}

# Sanity check: number of entries in the 'TA' list (149 in the recorded output).
len(columns_dict.get('TA'))

149

In [None]:
def create_dataset(dataset, lookback):
    """Build sliding-window samples for one-step-ahead prediction.

    Each sample uses `lookback` consecutive rows of every column except the
    last as features, and the last column of the row immediately after the
    window as the target.

    Args:
        dataset: 2-D array-like of shape (rows, columns); the last column is the target.
        lookback: number of consecutive rows in each input window.

    Returns:
        (X, y) float32 tensors on the module-level `device`:
        X has shape (rows - lookback, lookback, columns - 1) and
        y has shape (rows - lookback, 1). When there are not enough rows for a
        single window, both tensors are empty but keep those trailing dimensions.
    """
    dataset = np.asarray(dataset)
    n_windows = len(dataset) - lookback
    n_features = dataset.shape[1] - 1

    if n_windows > 0:
        # Stack into one ndarray first: building a tensor from a Python list of
        # ndarrays is very slow and triggers a PyTorch warning.
        windows = np.stack([dataset[i:i + lookback, :-1] for i in range(n_windows)])
        # Target of window i is the last column of row i + lookback.
        targets = dataset[lookback:, -1]
    else:
        windows = np.empty((0, lookback, n_features))
        targets = np.empty((0,))

    # torch.tensor always copies, so the results never alias `dataset`.
    X = torch.tensor(windows, dtype=torch.float32)
    y = torch.tensor(targets, dtype=torch.float32).view(-1, 1)
    return X.to(device), y.to(device)



In [None]:
def loss_curve(epochs, train_loss, test_loss):
    """Show the training and validation loss per epoch as a Plotly line chart.

    Args:
        epochs: number of epochs; the x axis is 0 .. epochs - 1.
        train_loss: per-epoch training loss values.
        test_loss: per-epoch validation loss values.
    """
    curves = (
        ('Train Loss', train_loss),
        ('Validation Loss', test_loss),
    )

    fig = go.Figure()
    for label, values in curves:
        trace = go.Scatter(x=np.arange(epochs), y=values, mode='lines', name=label)
        fig.add_trace(trace)

    layout = {
        'title': "Loss curve for single lstm",
        'xaxis_title': "epochs",
        'yaxis_title': "rmse",
    }
    fig.update_layout(**layout)
    fig.show()
# loss_curve(epochs, train_loss, test_loss)

In [None]:
import plotly.graph_objects as go
from sklearn.preprocessing import StandardScaler


def predict_plot(model, X_train, X_val, data, model_name):
    """Plot train/validation predictions against the true target in price units.

    Relies on the notebook-level globals `lookback`, `scaler` (the fitted
    StandardScaler used to produce `data`) and `dataset` (whose index supplies
    the x axis).

    Args:
        model: trained model mapping windowed inputs to one prediction per window.
        X_train: windowed training inputs (first 80% of `data`).
        X_val: windowed validation inputs (last 20% of `data`).
        data: standardized 2-D array whose last column is the target.
        model_name: used to pick the plot title ('GRU' or 'LSTM' in the name).
    """
    n_rows = len(data)
    split_index = int(0.8 * n_rows)

    # NaN-filled series so that only the predicted ranges are drawn.
    train_plot = np.full(n_rows, np.nan)
    test_plot = np.full(n_rows, np.nan)

    with torch.no_grad():
        # Predictions on the training set (the first `lookback` rows have no window)
        y_pred_train = model(X_train)
        train_plot[lookback:split_index] = y_pred_train.view(-1).cpu().numpy()

        # Predictions on the validation set
        y_pred_val = model(X_val)
        test_plot[split_index + lookback:] = y_pred_val.view(-1).cpu().numpy()

    # The target is the LAST column of `data`, so undo the standardization with
    # the last column's statistics (index 0 would be the first feature's).
    target_scale = scaler.scale_[-1]
    target_mean = scaler.mean_[-1]
    train_price = (train_plot * target_scale) + target_mean
    test_price = (test_plot * target_scale) + target_mean
    real_price = (data[:, -1] * target_scale) + target_mean

    # Create the plot
    fig = go.Figure()
    for label, series in (('Train', train_price), ('Validation', test_price), ('True', real_price)):
        fig.add_trace(go.Scatter(x=dataset.index, y=series,
                                 mode='lines',
                                 name=label))

    if 'GRU' in model_name:
        fig.update_layout(
            title="Single GRU prediction of S&P 500 E-mini futures Close Price",
            xaxis_title="Dates",
            yaxis_title="Standardized Stock Price"
        )

    elif 'LSTM' in model_name:
        fig.update_layout(
            title="Single LSTM prediction of S&P 500 E-mini futures Close Price",
            xaxis_title="Dates",
            yaxis_title="Standardized Stock Price"
        )
    fig.show()

# predict_plot(model, X_train, X_val, data)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import plotly.graph_objects as go

def statistics_index(model, X_val, y_val):
    """Compute and print regression metrics of `model` on a validation set.

    Args:
        model: callable mapping `X_val` to predictions.
        X_val: validation inputs.
        y_val: validation targets.

    Returns:
        Tuple (mse, mae, rmse, r2).
    """
    predictions = model(X_val)
    targets = y_val

    # scikit-learn needs CPU NumPy arrays; convert only when both are tensors.
    both_are_tensors = isinstance(predictions, torch.Tensor) and isinstance(targets, torch.Tensor)
    if both_are_tensors:
        predictions = predictions.cpu().detach().numpy()
        targets = targets.cpu().detach().numpy()

    mse = mean_squared_error(targets, predictions)
    mae = mean_absolute_error(targets, predictions)
    rmse = np.sqrt(mse)
    r2 = r2_score(targets, predictions)

    # Summary
    summary_lines = (
        f"\nMSE: {mse}",
        f"MAE: {mae}",
        f"RMSE: {rmse}",
        f"R^2 Score: {r2}",
    )
    for line in summary_lines:
        print(line)

    return mse, mae, rmse, r2

# Example usage:
# mse, mae, rmse, r2 = statistics_index(model, X_val, y_val)


In [None]:
import torch
import datetime



def save_model(model, model_name, save_dir="/content/drive/MyDrive/CUNY/Capstone/Data/model/S_LSTM"):
    """Save the model's state_dict to a time-stamped .pth file.

    Args:
        model: torch module whose `state_dict()` is saved.
        model_name: file-name prefix; it is also printed.
        save_dir: target directory. Defaults to the directory this notebook
            has always written to.

    Returns:
        The US/Eastern timestamp string ("%Y%m%d_%H%M%S") used in the file name.
    """
    # Imported here because the notebook only imports pytz in a later cell.
    import pytz

    eastern = pytz.timezone('US/Eastern')
    current_time = datetime.datetime.now(eastern).strftime("%Y%m%d_%H%M%S")
    print(model_name)

    torch.save(model.state_dict(), f"{save_dir}/{model_name}_{current_time}.pth")
    return current_time


In [None]:
import pandas as pd
import datetime
import pytz

def save_record(csv_file_path):
    """Append one experiment record to the CSV log at `csv_file_path`.

    The record is built from notebook-level globals that must be set before
    calling: `analyst`, `model_name`, `model_task`, `hidden_size1`, `dropout1`,
    `learning_rate`, `mse`, `mae`, `rmse` and `r2`. The CSV file must already exist.

    Args:
        csv_file_path: path of the existing CSV file to read, extend and rewrite.
    """
    # Read the existing CSV file into a DataFrame
    df = pd.read_csv(csv_file_path)

    # One row describing the current experiment
    results = {
        'Time': datetime.datetime.now(pytz.timezone('US/Eastern')).strftime("%Y-%m-%d %H:%M:%S"),
        'analyst': analyst,
        'model_name': model_name,
        'model_task': model_task,
        'hidden_size1': hidden_size1,
        'dropout1': dropout1,
        'learning_rate': learning_rate,
        'mse': mse,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
    }

    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead.
    df = pd.concat([df, pd.DataFrame([results])], ignore_index=True)

    # Save the updated DataFrame back to the CSV file
    df.to_csv(csv_file_path, index=False)


# LSTM

In [None]:
import itertools
from tensorflow.keras.callbacks import EarlyStopping

# Single-layer LSTM regressor
class S_LSTM(nn.Module):
    """One LSTM layer, dropout, then a linear head producing a single value.

    The attribute names (`lstm1`, `dropout1`, `linear`) determine the
    state_dict keys of saved checkpoints and must not be renamed.
    """

    def __init__(self, input_size, hidden_size1, dropout1):
        super().__init__()
        self.lstm1 = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size1,
            num_layers=1,
            batch_first=True,
        )
        self.dropout1 = nn.Dropout(p=dropout1)
        self.linear = nn.Linear(in_features=hidden_size1, out_features=1)

    def forward(self, x):
        """Return the prediction computed from the last time step of each sequence."""
        sequence_output, _state = self.lstm1(x)
        dropped = self.dropout1(sequence_output)
        last_step = dropped[:, -1, :]  # keep only the final time step
        return self.linear(last_step)

    def hyperparameters(self):
        """Return the hidden size and dropout probability as a dict."""
        return dict(
            hidden_size1=self.lstm1.hidden_size,
            dropout1=self.dropout1.p,
        )

class S_GRU(nn.Module):
    """One GRU layer, dropout, then a linear head producing a single value.

    The recurrent layer is stored under the attribute name `lstm1` even though
    it is a GRU; that name is part of the state_dict keys of saved checkpoints
    and must not be renamed.
    """

    def __init__(self, input_size, hidden_size1, dropout1):
        super().__init__()
        self.lstm1 = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size1,
            num_layers=1,
            batch_first=True,
        )
        self.dropout1 = nn.Dropout(p=dropout1)
        self.linear = nn.Linear(in_features=hidden_size1, out_features=1)

    def forward(self, x):
        """Return the prediction computed from the last time step of each sequence."""
        sequence_output, _state = self.lstm1(x)
        dropped = self.dropout1(sequence_output)
        last_step = dropped[:, -1, :]  # keep only the final time step
        return self.linear(last_step)

    def hyperparameters(self):
        """Return the hidden size and dropout probability as a dict."""
        return dict(
            hidden_size1=self.lstm1.hidden_size,
            dropout1=self.dropout1.p,
        )



# Training loop
def trainer(epochs, loader, X_train, y_train, X_val, y_val, model, criterion, optimizer):
    """Train `model` and record the train/validation RMSE after every epoch.

    Args:
        epochs: number of passes over `loader`.
        loader: iterable yielding (inputs, targets) mini-batches.
        X_train, y_train: full training set, used for the per-epoch train RMSE.
        X_val, y_val: validation set, used for the per-epoch validation RMSE.
        model: torch module being trained.
        criterion: loss function; the RMSE is the square root of its value.
        optimizer: optimizer updating the model's parameters.

    Returns:
        (train_loss, test_loss): two lists with one RMSE value per epoch.
    """
    def _rmse(inputs, targets):
        # Square root of the criterion evaluated on the whole set.
        return np.sqrt(criterion(model(inputs), targets).item())

    train_loss = []
    test_loss = []

    for epoch in range(epochs):
        # One optimization pass over all mini-batches.
        model.train()
        for x, y_true in loader:
            loss = criterion(model(x), y_true)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Evaluate without dropout and without tracking gradients.
        model.eval()
        with torch.no_grad():
            train_rmse = _rmse(X_train, y_train)
            train_loss.append(train_rmse)
            test_rmse = _rmse(X_val, y_val)
            test_loss.append(test_rmse)

            is_report_epoch = (epoch + 1) % 100 == 0
            if is_report_epoch:
                print('epoch %d train rmse %.4f test rmse %.4f' % (epoch+1, train_rmse, test_rmse))

    return train_loss, test_loss


# Generate meta_features

In [None]:
# Specify the target date: the cell below keeps all rows from the start of the
# dataset up to and including this date when building the base-model inputs.
target_date = '2022-01-20'

In [None]:
import os
import torch
import re
import copy

model_task = 'linear'
path = "/content/drive/MyDrive/CUNY/Capstone/Data/model/Final"

# Get all entries in the directory and keep regular files only.
# NOTE: the os.listdir order is preserved on purpose; the next cell labels the
# meta-feature columns by position, so the order must not be changed here.
all_files = os.listdir(path)
files_only = [f for f in all_files if os.path.isfile(os.path.join(path, f))]

model_dict = {}
meta_features = []

lookback = 5  # use the previous five days to predict the next day
criterion = nn.MSELoss()

# Loop-invariant: all rows from the start of the data up to and including target_date.
target_index = org_linear_regression_df.index.get_loc(target_date)
target_df = org_linear_regression_df.iloc[0: target_index + 1, :]

for model_name in files_only:

    # Construct the full path to the saved PyTorch checkpoint
    model_path = f'{path}/{model_name}'

    # Parse the analysis type, hyperparameters and architecture from the file name.
    analyst_match_obj = re.search(r'([A-Z]+\d*)', model_name)
    analyst_match = analyst_match_obj.group(1) if analyst_match_obj else None
    match = re.search(r'linear_hidden_size1_(\d+)_dropout1_([\d.]+)_lr_', model_name)

    if 'GRU' in model_name:
        model_class = S_GRU
    elif 'LSTM' in model_name:
        model_class = S_LSTM
    else:
        model_class = None

    # Skip files that cannot be interpreted instead of silently reusing the
    # hyperparameters or the model of the previous iteration.
    if match is None or model_class is None or analyst_match not in columns_dict:
        print(f"Skipping {model_name}: cannot determine analysis type, hyperparameters or model type from the file name.")
        continue

    # Load the checkpoint once; map_location lets GPU-saved weights load on a CPU-only runtime.
    loaded_model = torch.load(model_path, map_location=device)
    print(f"Model {model_name} loaded successfully!")
    print(analyst_match)

    hidden_size = int(match.group(1))
    dropout_rate = float(match.group(2))
    input_size = len(columns_dict.get(analyst_match))

    print(f"Model Name: {model_name}")
    print(f"Input Size: {input_size}")
    print(f"Hidden Size: {hidden_size}")
    print(f"Dropout Rate: {dropout_rate}")

    analyst = analyst_match
    dataset = select_dataset(target_df, org_binary_classification_df, model_task, analyst)

    # Standardize the selected features together with the target (last column)
    data = dataset.values
    scaler = StandardScaler()
    data = scaler.fit_transform(data)

    # Chronological 80/20 train/validation split
    split_index = int(0.8 * len(data))
    train, test = data[:split_index], data[split_index:]

    X_train, y_train = create_dataset(train, lookback = lookback)
    X_val, y_val = create_dataset(test, lookback = lookback)
    X_ALL, y_ALL = create_dataset(data, lookback = lookback)

    feature_amt = X_ALL.shape[2]

    # Rebuild the architecture with the parsed hyperparameters and restore the weights
    model = model_class(input_size=feature_amt, hidden_size1=hidden_size,  dropout1=dropout_rate).to(device)
    model.load_state_dict(loaded_model)

    # Ensure the model is in evaluation mode (dropout disabled)
    model.eval()

    # mse, mae, rmse, r2 = statistics_index(model, X_val, y_val)

    # predict_plot(model, X_train, X_val, data, model_name)

    # Predict every window; move to the CPU first so this also works on a GPU runtime.
    with torch.no_grad():
        model_predictions = model(X_ALL).cpu().numpy()
    print("model_predictions.shape: ", model_predictions.shape)
    meta_features.append(model_predictions)

    model_dict[f'model_{analyst_match}'] = copy.deepcopy(model)

    # Now you can access the model using the key
    desired_model = model_dict[f'model_{analyst_match}']
    print("_"*50)


if not meta_features:
    raise RuntimeError(f"No usable model checkpoints were found in {path}")

# Standardized targets aligned with the predictions (taken from the last processed model)
y_ALL_np = y_ALL.cpu().numpy()

# Reshape to a single column: (n_samples, 1)
y_ALL_np_reshaped = y_ALL_np.reshape(-1, 1)
print("y_ALL_np_reshaped.shape:", y_ALL_np_reshaped.shape)

meta_features.append(y_ALL_np_reshaped)

# One column per base model followed by the target column
meta_features_np = np.hstack(meta_features)
print("meta_features.shape: ", meta_features_np.shape)

meta_features = pd.DataFrame(meta_features_np)


print(model_dict)

Model SA_S_GRU_linear_hidden_size1_128_dropout1_0.4_lr_0.005_20240106_004316.pth loaded successfully!
SA
Model Name: SA_S_GRU_linear_hidden_size1_128_dropout1_0.4_lr_0.005_20240106_004316.pth
Input Size: 42
Hidden Size: 128
Dropout Rate: 0.4
model_predictions.shape:  (2579, 1)
__________________________________________________
Model TA_S_GRU_linear_hidden_size1_256_dropout1_0.4_lr_0.001_20240106_201450.pth loaded successfully!
TA
Model Name: TA_S_GRU_linear_hidden_size1_256_dropout1_0.4_lr_0.001_20240106_201450.pth
Input Size: 149
Hidden Size: 256
Dropout Rate: 0.4
model_predictions.shape:  (2579, 1)
__________________________________________________
Model FA1_S_LSTM_linear_hidden_size1_256_dropout1_0.7_lr_0.001_20240108_001558.pth loaded successfully!
FA1
Model Name: FA1_S_LSTM_linear_hidden_size1_256_dropout1_0.7_lr_0.001_20240108_001558.pth
Input Size: 45
Hidden Size: 256
Dropout Rate: 0.7
model_predictions.shape:  (2579, 1)
__________________________________________________
Model P

In [None]:
meta_features = meta_features.reset_index(drop=True)
# Column labels are positional: they assume the base models were processed in
# this order (the order returned by os.listdir), followed by the target column.
meta_features.columns = ['SA', 'TA', 'FA1', 'PAS', 'PA', 'CA', 'Close']
# Every column is expressed in standardized 'Close' units, and 'Close' is the
# LAST column the scaler was fitted on, so undo the scaling with the last
# column's statistics (index 0 would be the first feature's).
meta_features_reverse_df = meta_features * scaler.scale_[-1] + scaler.mean_[-1]
meta_features_reverse_df


Unnamed: 0,SA,TA,FA1,PAS,PA,CA,Close
0,1310.574951,1151.083984,1317.894409,1334.795410,1404.732056,1370.412476,1284.484863
1,1170.866577,1154.481079,1305.574219,1332.548950,1406.138184,1380.174561,1286.732300
2,1146.857300,1161.816284,1327.841675,1344.779419,1408.576172,1390.992310,1290.228149
3,1146.430786,1172.538696,1300.602051,1345.630371,1391.576416,1403.239014,1287.481323
4,1180.813721,1173.067749,1306.505249,1349.450928,1407.931396,1386.057373,1287.731079
...,...,...,...,...,...,...,...
2574,4624.595703,4416.663086,4592.349121,4676.078125,4917.037109,5037.064453,4649.325195
2575,4515.420898,4267.285156,4473.973633,4627.247070,4893.059082,4845.583008,4651.572754
2576,4668.897461,4372.765137,4581.015625,4748.414062,4806.936523,4826.265625,4565.921875
2577,4730.856934,4406.278809,4484.264648,4668.801270,4829.968750,4937.955078,4518.976562


In [None]:
import os
import torch
import re
import copy

# ---------------------------------------------------------------------------
# Load the trained meta model, rebuild its input windows from the first-level
# predictions, and collect its predictions next to the standardized targets.
#
# Inputs from earlier cells: `meta_features_reverse_df`, `create_dataset`,
# `S_GRU`, `S_LSTM`, `device`, `StandardScaler`, `np`, `pd`, `nn`.
# Outputs used later: `scaler`, `meta_features` (DataFrame), `model_dict`.
# ---------------------------------------------------------------------------
model_task = 'linear'
path = "/content/drive/MyDrive/CUNY/Capstone/Data/model/Final/meta_model/"
model_name = 'meta_model_S_GRU_linear_hidden_size1_256_dropout1_0.2_lr_0.005_20240111_220612.pth'

model_dict = {}

# os.path.join does not double the separator when `path` already ends in "/".
model_path = os.path.join(path, model_name)

# Read the checkpoint once and reuse it below.
# SECURITY NOTE: torch.load unpickles the file -- only load trusted checkpoints.
# map_location lets a checkpoint saved on a GPU be restored on a CPU-only runtime.
loaded_model = torch.load(model_path, map_location=device)
print(f"Model {model_name} loaded successfully!")

# First run of capital letters (plus optional digits) in the file name.
# NOTE(review): for a name starting with lowercase "meta_model_" this yields
# "S", so the dictionary key below becomes "model_S" -- confirm that is intended.
analyst_match_obj = re.search(r'([A-Z]+\d*)', model_name)
analyst_match = analyst_match_obj.group(1) if analyst_match_obj else None
print(analyst_match)

# Read the hyperparameters from the file name instead of hard-coding them, and
# fail loudly when the name does not follow the expected pattern.
match = re.search(r'linear_hidden_size1_(\d+)_dropout1_([\d.]+)_lr_', model_name)
if match is None:
    raise ValueError(f"Cannot parse hyperparameters from model name: {model_name}")
hidden_size = int(match.group(1))
dropout_rate = float(match.group(2))

# NOTE: this cell reads `meta_features_reverse_df` as produced by the
# first-level models; re-running it after that name has been reassigned
# elsewhere will feed the wrong columns into the model.
dataset = meta_features_reverse_df
# Raw values of every column.
data = dataset.values
# Standardize each column. `scaler.mean_[i]` / `scaler.scale_[i]` follow the
# column order of `dataset`.
# NOTE(review): the scaler is fit on the full series (train + validation);
# keep as-is only if the model was trained with the same preprocessing.
scaler = StandardScaler()
data = scaler.fit_transform(data)
# Chronological 80/20 split into training and validation parts.
split_index = int(0.8 * len(data))
train, test = data[:split_index], data[split_index:]

lookback = 5  # use the previous five days to predict the next day
X_train, y_train = create_dataset(train, lookback=lookback)
X_val, y_val = create_dataset(test, lookback=lookback)
X_ALL, y_ALL = create_dataset(data, lookback=lookback)
# loader = DataLoader(TensorDataset(X_train, y_train), shuffle = False, batch_size = 32)
feature_amt = X_ALL.shape[2]
# The network's input width is whatever the windows actually contain.
input_size = feature_amt

print(f"Model Name: {model_name}")
print(f"Input Size: {input_size}")
print(f"Hidden Size: {hidden_size}")
print(f"Dropout Rate: {dropout_rate}")

print('X_val.shape: ', X_val.shape)
print('y_val.shape: ', y_val.shape)

print('X_ALL.shape: ', X_ALL.shape)
print('y_ALL.shape: ', y_ALL.shape)

# Pick the architecture encoded in the file name.
if 'GRU' in model_name:
    model = S_GRU(input_size=feature_amt, hidden_size1=hidden_size, dropout1=dropout_rate).to(device)
elif 'LSTM' in model_name:
    model = S_LSTM(input_size=feature_amt, hidden_size1=hidden_size, dropout1=dropout_rate).to(device)
else:
    # Previously this branch left `model` undefined and crashed later with a
    # NameError; report the real problem instead.
    raise ValueError(f"Unsupported architecture in model name: {model_name}")

criterion = nn.MSELoss()

# Restore the trained weights and switch to evaluation mode (disables dropout).
model.load_state_dict(loaded_model)
model.eval()

# mse, mae, rmse, r2 = statistics_index(model, X_val, y_val)
# predict_plot(model, X_train, X_val, data, model_name)

# Run inference on the model's device without tracking gradients, then bring
# the result back to the CPU before converting to NumPy.
with torch.no_grad():
    model_predictions = model(X_ALL.to(device)).detach().cpu().numpy()
print("model_predictions.shape: ", model_predictions.shape)

model_dict[f'model_{analyst_match}'] = copy.deepcopy(model)

# The stored copy can be retrieved by its key.
desired_model = model_dict[f'model_{analyst_match}']
print("_" * 50)

# Targets as a single NumPy column, one row per window.
y_ALL_np = y_ALL.numpy()
y_ALL_np_reshaped = y_ALL_np.reshape(-1, 1)
print("y_ALL_np_reshaped.shape:", y_ALL_np_reshaped.shape)

# Column 0: meta-model predictions; column 1: standardized targets.
meta_features_np = np.hstack([model_predictions, y_ALL_np_reshaped])
print("meta_features.shape: ", meta_features_np.shape)

meta_features = pd.DataFrame(meta_features_np)

print(model_dict)

Model meta_model_S_GRU_linear_hidden_size1_256_dropout1_0.2_lr_0.005_20240111_220612.pth loaded successfully!
S
Model Name: meta_model_S_GRU_linear_hidden_size1_256_dropout1_0.2_lr_0.005_20240111_220612.pth
Input Size: 6
Hidden Size: 256
Dropout Rate: 0.2
X_val.shape:  torch.Size([511, 5, 6])
y_val.shape:  torch.Size([511, 1])
X_ALL.shape:  torch.Size([2574, 5, 6])
y_ALL.shape:  torch.Size([2574, 1])
model_predictions.shape:  (2574, 1)
__________________________________________________
y_ALL_np_reshaped.shape: (2574, 1)
meta_features.shape:  (2574, 2)
{'model_S': S_GRU(
  (lstm1): GRU(6, 256, batch_first=True)
  (dropout1): Dropout(p=0.2, inplace=False)
  (linear): Linear(in_features=256, out_features=1, bias=True)
)}


In [None]:
# Column 0 holds the meta-model predictions, column 1 the matching targets;
# both are expressed in standardized closing-price units.
meta_features = meta_features.reset_index(drop=True)
meta_features.columns = ['meta_model', 'Close']

# Undo the standardization with the statistics of the 'Close' column.
# `scaler` was fit on `dataset.values`, so its mean_/scale_ arrays follow the
# column order of `dataset`, where position 0 is 'SA', not 'Close'. Using
# index 0 here rescaled the prices with SA's mean/std and shifted every
# restored value (e.g. a true close of 4518.98 came back as 4564.64).
# NOTE(review): `create_dataset` is not visible from this cell; the recorded
# outputs indicate its target is the 'Close' column -- confirm.
close_position = dataset.columns.get_loc('Close')
meta_features_reverse_df = (
    meta_features * scaler.scale_[close_position] + scaler.mean_[close_position]
)
meta_features_reverse_df

Unnamed: 0,meta_model,Close
0,1466.070923,1298.940552
1,1455.370850,1307.302490
2,1470.386597,1307.555908
3,1536.008667,1307.809326
4,1484.609009,1308.316162
...,...,...
2569,4628.852539,4696.914551
2570,4659.015137,4699.195312
2571,4530.191895,4612.281250
2572,4665.945801,4564.644531


In [None]:
# Newest entry of the prediction column (column 0, last row).
final_prediction = meta_features_reverse_df.iloc[:, 0].iloc[-1]
# Entry of the second column one row before the last.
last_day = meta_features_reverse_df.iloc[:, 1].iloc[-2]

# A forecast at or above the previous value counts as a rise.
up_down = "predict to rise" if final_prediction >= last_day else "predict to fall"

print(last_day, final_prediction, up_down)


4564.6445 4623.142 predict to rise
