In [1]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import GRU
from sklearn.metrics import accuracy_score
import xgboost as xgb
from tensorflow.keras.layers import Input, LayerNormalization, MultiHeadAttention, Dropout
from tensorflow.keras.models import Model

In [None]:
# Define the file path and data types
file_path = "data/preprocessed_stock_data.csv"
dtype_dict = {'Ticker': 'category', 'Close': 'float32', 'Volume': 'float32', 'Price_Change': 'float32', 
              'Daily_Return': 'float32', 'Volatility': 'float32', 'MA_5': 'float32', 'MA_10': 'float32'}

# Load the data
df = pd.read_csv(
    file_path,
    na_values=['null'],
    index_col='Date',
    parse_dates=True,
    infer_datetime_format=True,
    dtype=dtype_dict
)

# Display basic information about the dataset
print("First few rows of dataset:")
print(df.head())

print("Dataset information:")
print(df.info())

# Check for the shape of the dataframe and missing values
print("Dataframe Shape: ", df.shape)
print("Null Values Present: ", df.isnull().values.any())


In [None]:
# Group the data by Ticker and plot the 'Close' prices
grouped = df.groupby('Ticker')

# Plot 'Close' values for each 'Ticker'
for ticker, data in grouped:
    plt.figure(figsize=(10, 5))  # Set figure size for each plot
    data['Close'].plot(title=f"Closing prices for {ticker}")
    plt.xlabel("Date")
    plt.ylabel("Close")
    plt.grid()
    plt.show()


In [None]:
features= ['Volume', 'Price_Change', 'Daily_Return', 'Volatility','MA_5', 'MA_10']
scaler = MinMaxScaler()
# Check for NaN or infinite values
print("Checking for NaN or infinite values in the data...")
print(df[features].isnull().sum())  # Check for NaN values
print((df[features] == np.inf).sum())  # Check for infinity values
print((df[features] == -np.inf).sum())  # Check for negative infinity values

# Replace or drop NaN or infinite values
df[features] = df[features].replace([np.inf, -np.inf], np.nan)  # Replace infinities with NaN
df = df.dropna(subset=features)  # Drop rows with NaN values

# Now scale the data
scaled_data_list = []  # Use a list to store data for faster concatenation

for ticker, group in df.groupby('Ticker'):
    # Scale features within each ticker group
    scaled_features = scaler.fit_transform(group[features])
    
    # Create a DataFrame for the scaled features
    scaled_group = pd.DataFrame(
        data=scaled_features,
        columns=features,
        index=group.index
    )
    
    # Add the 'Ticker' column back to the scaled group
    scaled_group['Ticker'] = ticker
    
    # Append the scaled group to the list
    scaled_data_list.append(scaled_group)

# Concatenate all scaled groups at once
scaled_data = pd.concat(scaled_data_list)

# Display the first few rows of the scaled data
print("Scaled data (first few rows):")
print(scaled_data.head())


In [None]:
# Sort the DataFrame by 'Ticker' and 'Date' to maintain time order
df = df.sort_values(by=['Ticker', 'Date'])
# Initialize TimeSeriesSplit with 10 splits
timesplit = TimeSeriesSplit(n_splits=10)
output_var = ['Price_Change']
# Initialize lists to store the train and test data for each split
X_train_list, X_test_list, y_train_list, y_test_list = [], [], [], []

# Perform time series split
for train_index, test_index in timesplit.split(df):
    # Use loc instead of iloc for clarity with index handling
    train_data, test_data = df.iloc[train_index], df.iloc[test_index]
    
    # Extract features and target variables for train and test
    X_train_list.append(train_data[features].values)  # Convert to NumPy for efficiency
    X_test_list.append(test_data[features].values)
    y_train_list.append(train_data[output_var].values)
    y_test_list.append(test_data[output_var].values)

# Concatenate all splits into single arrays
X_train_all = np.vstack(X_train_list)  # Stack arrays vertically
X_test_all = np.vstack(X_test_list)
y_train_all = np.concatenate(y_train_list)
y_test_all = np.concatenate(y_test_list)

# Print the shapes to verify
print("X_train_all shape:", X_train_all.shape)
print("X_test_all shape:", X_test_all.shape)
print("y_train_all shape:", y_train_all.shape)
print("y_test_all shape:", y_test_all.shape)


In [None]:
# Reshape the data for LSTM
X_train_all = X_train_all.reshape(X_train_all.shape[0], 1, X_train_all.shape[1])  # Add time step dimension (1)
X_test_all = X_test_all.reshape(X_test_all.shape[0], 1, X_test_all.shape[1])  # Add time step dimension (1)

# Now define the LSTM model
lstm = Sequential()
lstm.add(LSTM(32, input_shape=(1, X_train_all.shape[2]), activation='relu', return_sequences=False))
lstm.add(Dense(1))

# Compile the model
lstm.compile(loss='mean_squared_error', optimizer='adam')

# Display the model summary
plot_model(lstm, show_shapes=True, show_layer_names=True)
lstm.summary()


In [None]:
# Add callbacks for optimization
early_stopping = EarlyStopping(
    monitor='loss', 
    patience=5,  # Stop training if loss doesn't improve for 5 epochs
    restore_best_weights=True
)
reduce_lr = ReduceLROnPlateau(
    monitor='loss', 
    factor=0.5, 
    patience=3,  # Reduce learning rate if loss doesn't improve for 3 epochs
    min_lr=1e-6
)

# Train the model with fewer epochs and callbacks
history = lstm.fit(
    X_train_all, 
    y_train_all, 
    epochs=50,  # Reduce epochs
    batch_size=16,  # Larger batch size for faster training
    verbose=1, 
    shuffle=False, 
    callbacks=[early_stopping, reduce_lr]
)


LSTM MODEL

In [None]:
# LSTM Prediction
y_pred_all = lstm.predict(X_test_all, batch_size=8, verbose=1)

# Flatten the predicted values and actual values for comparison
y_pred_all = y_pred_all.flatten()  # Flatten predictions
y_test_all = y_test_all.flatten()  # Flatten true values (if necessary)

# Print the first few predictions and corresponding actual values for verification
print("First 10 predictions:", y_pred_all[:10])
print("First 10 actual values:", y_test_all[:10])

# Calculate performance metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

mse = mean_squared_error(y_test_all, y_pred_all)
mae = mean_absolute_error(y_test_all, y_pred_all)
r2 = r2_score(y_test_all, y_pred_all)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)
print("R-squared (R2 Score):", r2)


GRU MODEL

In [None]:
#GRU Model Intergration
#Reshape the date for GRU
X_train_all_gru = X_train_all.reshape(X_train_all.shape[0], X_train_all.shape[1], X_train_all.shape[2])

#Define the Gru model
gru = Sequential()
gru.add(GRU(32, input_shape=(X_train_all_gru.shape[1], X_train_all_gru.shape[2]), activation='relu', return_sequences=False))
gru.add(Dense(1))

#Compile the gru model
gru.compile(optimizer='adam', loss='mean_squared_error')

In [None]:
#Gru prediction
y_pred_gru = gru.predict(X_test_all.reshape(X_test_all.shape[0], X_test_all.shape[1], X_test_all.shape[2]))
y_pred_gru = y_pred_gru.flatten()

#Calculate GRU performance metrics
mse_gru = mean_squared_error(y_test_all, y_pred_gru)
mae_gru = mean_absolute_error(y_test_all, y_pred_gru)
r2_gru = r2_score(y_test_all, y_pred_gru)

print("GRU Mean Squared Error:", mse_gru)
print("GRU Mean Absolute Error:", mae_gru)
print("GRU R-squared:", r2_gru)

TRANSFORMER

In [12]:
#Transformer model integration
#define the transformer model
def transformer_model (input_shape):
    inputs = Input(shape=input_shape)
    x = MultiHeadAttention(num_heads=2, key_dim=16)(inputs, inputs)
    x = LayerNormalization()(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.2)(x)
    outputs = Dense(1)(x)
    
    model = Model(inputs, outputs)
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

transformer = transformer_model((X_train_all.shape[1], X_train_all.shape[2]))

#Train the model
history_transformer = transformer.fit(X_train_all,y_train_all, epochs=50, batch_size=16, verbose=1)

#calculate
y_pred_transformer = transformer.predict(X_test_all)

#calculate performance
mse_xgb = mean_squared_error(y_test_all,y_pred_transformer)
mae_xgb = mean_absolute_error(y_test_all,y_pred_transformer)
r2_xgb = r2(y_test_all,y_pred_transformer)

print("XGBoost Mean Squared Error:", mse_xgb)
print("XGBoost Mean Absolute Error:", mae_xgb)
print("XGBoost R-Squared:", r2_xgb)

[1m15490/21537[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m17s[0m 3ms/step - loss: 0.0092

KeyboardInterrupt: 

XGBOOST 

In [None]:
#XGBoost Model Intergration
#Prepare dât for model
X_train_xgb = X_train_all.reshape(X_train_all.shape[0],X_train_all.shape[1], X_train_all.shape[2])
X_test_xgb = X_test_all.reshape(X_test_all.shape[0],X_test_all.shape[2])

#create dmtraix for model
dtrain = xgb.DMatrix(X_train_xgb,label=y_train_all)
dtest = xgb.DMatrix(X_test_xgb,label=X_test_all)

#set parameters for model
params={
    'objective':'reg:squarederror',
    'max_depth':6,
    'learning_rate':0.1,
    'n_estimators':100,
    'eval_metric':'rmse'
}

#train
xgb_model=xgb.train(params,dtrain)

#prediction
y_pred_xgb = xgb_model.predict(dtest)

#calculate performance
mse_xgb = mean_squared_error(y_test_all,y_pred_xgb)
mae_xgb = mean_absolute_error(y_test_all,y_pred_xgb)
r2_xgb = r2(y_test_all,y_pred_xgb)

print("XGBoost Mean Squared Error:", mse_xgb)
print("XGBoost Mean Absolute Error:", mae_xgb)
print("XGBoost R-Squared:", r2_xgb)

Main

In [None]:
# Create a map of indices for each ticker
ticker_indices = {}
start_index = 0
for ticker, group in grouped:
    group_len = len(group)
    ticker_indices[ticker] = range(start_index, start_index + group_len)
    start_index += group_len

# LSTM Prediction
y_pred_all = lstm.predict(X_test_all, batch_size=8, verbose=0).flatten()  # Flatten predictions

# Plot Predicted vs True for each ticker
for ticker, indices in ticker_indices.items():
    # Extract valid indices (ensure they are within the test set range)
    valid_indices = [idx for idx in indices if idx < len(y_test_all)]

    if not valid_indices:
        print(f"No valid indices for ticker {ticker}. Skipping.")
        continue

    # Extract true and predicted values for the current ticker
    y_test_ticker = y_test_all[valid_indices]
    y_pred_ticker = y_pred_all[valid_indices]

    # Plot the true and predicted values
    plt.figure(figsize=(10, 6))
    plt.plot(y_test_ticker, label=f'True Value for {ticker}', color='blue', linestyle='--')
    plt.plot(y_pred_ticker, label=f'LSTM Predicted Value for {ticker}', color='red', linestyle='-')
    plt.title(f"Prediction by LSTM for {ticker}")
    plt.xlabel('Time Scale')
    plt.ylabel('Scaled Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


In [None]:
# Vẽ đồ thị GRU
for ticker, indices in ticker_indices.items():
    valid_indices = [idx for idx in indices if idx < len(y_test_all)]
    
    if not valid_indices:
        print(f"No valid indices for ticker {ticker}. Skipping.")
        continue

    y_test_ticker = y_test_all[valid_indices]
    y_pred_ticker = y_pred_gru[valid_indices]

    plt.figure(figsize=(10, 6))
    plt.plot(y_test_ticker, label=f'True Value for {ticker}', color='blue', linestyle='--')
    plt.plot(y_pred_ticker, label=f'GRU Predicted Value for {ticker}', color='green', linestyle='-')
    plt.title(f"Prediction by GRU for {ticker}")
    plt.xlabel('Time Scale')
    plt.ylabel('Scaled Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Vẽ đồ thị XGBoost
for ticker, indices in ticker_indices.items():
    valid_indices = [idx for idx in indices if idx < len(y_test_all)]
    
    if not valid_indices:
        print(f"No valid indices for ticker {ticker}. Skipping.")
        continue

    y_test_ticker = y_test_all[valid_indices]
    y_pred_ticker = y_pred_xgb[valid_indices]

    plt.figure(figsize=(10, 6))
    plt.plot(y_test_ticker, label=f'True Value for {ticker}', color='blue', linestyle='--')
    plt.plot(y_pred_ticker, label=f'XGBoost Predicted Value for {ticker}', color='orange', linestyle='-')
    plt.title(f"Prediction by XGBoost for {ticker}")
    plt.xlabel('Time Scale')
    plt.ylabel('Scaled Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

In [None]:
# Vẽ đồ thị Transformer
for ticker, indices in ticker_indices.items():
    valid_indices = [idx for idx in indices if idx < len(y_test_all)]
    
    if not valid_indices:
        print(f"No valid indices for ticker {ticker}. Skipping.")
        continue

    y_test_ticker = y_test_all[valid_indices]
    y_pred_ticker = y_pred_transformer[valid_indices]

    plt.figure(figsize=(10, 6))
    plt.plot(y_test_ticker, label=f'True Value for {ticker}', color='blue', linestyle='--')
    plt.plot(y_pred_ticker, label=f'Transformer Predicted Value for {ticker}', color='purple', linestyle='-')
    plt.title(f"Prediction by Transformer for {ticker}")
    plt.xlabel('Time Scale')
    plt.ylabel('Scaled Value')
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()