In [31]:
import numpy as np
import pandas as pd
from hmmlearn import hmm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


# Load the pre-processed stock data, clean it, and standardize the model features.
# Produces the module-level names `data`, `numeric_columns` and `scaler`.
data = pd.read_csv('processed_stock_data.csv')

# The four columns that are standardized below.
feature_columns = ['Close-Open/Open', 'High-Open/Open', 'Open-Low/Open', 'Volume']
# The two columns that are forced to be non-negative below.
range_columns = ['High-Open/Open', 'Open-Low/Open']

# Remove rows with Volume equal to 0, then rows containing NaN values
data = data[data['Volume'] != 0].dropna()

# Remove rows containing infinite values in any numeric column
# (one vectorized row mask instead of re-filtering once per column).
numeric_columns = data.select_dtypes(include=[np.number]).columns
has_infinite = np.isinf(data[numeric_columns]).any(axis=1)
data = data[~has_infinite]

# Remove rows where the first three attributes are all 0
all_zero_moves = (data[['Close-Open/Open', 'High-Open/Open', 'Open-Low/Open']] == 0).all(axis=1)

# Take an explicit copy: the column assignments below would otherwise write
# into a boolean-filtered frame and raise pandas' SettingWithCopyWarning.
data = data[~all_zero_moves].copy()

# Take absolute values for 'High-Open/Open' and 'Open-Low/Open' columns
data[range_columns] = data[range_columns].abs()

# Standardize the data (zero mean, unit variance per feature column).
# NOTE(review): the scaler is fit on every row, before any train/validation
# split, so validation rows contribute to the scaling statistics. Consider
# fitting the scaler on training rows only if this matters for the evaluation.
scaler = StandardScaler()
data[feature_columns] = scaler.fit_transform(data[feature_columns])

# Check the final shape of the data
print("Final shape of the data:", data.shape)

Final shape of the data: (191327, 6)


In [None]:
import time

# Select the number of HMM hidden states with 5-fold cross-validation in which
# folds are disjoint sets of tickers. Models are compared by held-out
# log-likelihood per sample (higher is better).
#
# The previous criterion, mean_squared_error(X_val[:, 0], predicted_states),
# compared a standardized feature with arbitrary integer state labels
# (0 .. n_states-1). Those labels carry no numeric meaning, so the resulting
# number was not a usable model-selection score.

n_folds = 5
state_range = range(2, 10)
max_em_iterations = 1000  # formerly a global named `iter`, which shadowed the builtin
random_seed = 42          # fixed so repeated runs give the same fits
feature_columns = ['Close-Open/Open', 'High-Open/Open', 'Open-Low/Open', 'Volume']


def _ticker_sequences(frame):
    """Return ``(X, lengths)`` for ``frame`` with each ticker's rows contiguous.

    ``X`` is the feature matrix and ``lengths`` holds the number of rows of
    each ticker, in the same order as the rows of ``X``. The sort is stable,
    so the existing row order inside each ticker is preserved.
    """
    ordered = frame.sort_values('Ticker', kind='stable')
    # sort=False keeps first-appearance order, which matches `ordered`.
    lengths = ordered.groupby('Ticker', sort=False).size().to_numpy()
    return ordered[feature_columns].to_numpy(), lengths


started = time.perf_counter()

# Group the dataset by Ticker
grouped_data = data.groupby('Ticker')

# Get the list of unique Tickers after grouping
unique_tickers = list(grouped_data.groups.keys())
if len(unique_tickers) < n_folds:
    raise ValueError(
        f"Need at least {n_folds} distinct tickers for {n_folds}-fold "
        f"cross-validation, found {len(unique_tickers)}"
    )

# Calculate the number of Tickers per fold
tickers_per_fold = len(unique_tickers) // n_folds

# Split the data into non-overlapping subsets based on Ticker
ticker_splits = []
for fold in range(n_folds):
    first = fold * tickers_per_fold
    # The last fold also takes any tickers left over by the integer division.
    last = len(unique_tickers) if fold == n_folds - 1 else first + tickers_per_fold
    ticker_splits.append(data[data['Ticker'].isin(unique_tickers[first:last])])

# The per-fold matrices do not depend on the number of states, so build them once.
split_sequences = [_ticker_sequences(split) for split in ticker_splits]

# Average held-out log-likelihood per sample, keyed by number of hidden states
loglik_dict = {}

# Iterate over the possible hidden state numbers
for n_states in state_range:
    fold_scores = []

    # Use one split for validation and the others for training
    for val_fold, (X_val, val_lengths) in enumerate(split_sequences):
        train_parts = [seq for fold, seq in enumerate(split_sequences) if fold != val_fold]
        X_train = np.concatenate([features for features, _ in train_parts])
        train_lengths = np.concatenate([lengths for _, lengths in train_parts])

        # Fit the HMM model with the current hidden state number. Passing
        # `lengths` makes every ticker its own sequence instead of treating
        # the end of one ticker and the start of the next as a transition.
        model = hmm.GaussianHMM(
            n_components=n_states,
            covariance_type="diag",
            n_iter=max_em_iterations,
            random_state=random_seed,
        )
        model.fit(X_train, train_lengths)

        # Normalize by sample count so folds of different sizes are comparable.
        fold_scores.append(model.score(X_val, val_lengths) / len(X_val))

    # Calculate the average score for the current hidden state number
    loglik_dict[n_states] = np.mean(fold_scores)

    # Report the wall-clock seconds spent on this hidden state number
    now = time.perf_counter()
    print(now - started)
    started = now
    print(f"Hidden state number: {n_states}, Average validation log-likelihood per sample: {loglik_dict[n_states]}")

# Find the hidden state number with the highest average held-out log-likelihood
optimal_hidden_states = max(loglik_dict, key=loglik_dict.get)
print(f"Optimal hidden state number: {optimal_hidden_states}")

In [None]:
import time

from sklearn.model_selection import TimeSeriesSplit

# Select the number of HMM hidden states with forward-chaining (time-ordered)
# cross-validation. Models are compared by held-out log-likelihood per sample
# (higher is better).
#
# The previous criterion, mean_squared_error(X_val[:, 0], predicted_states),
# compared a standardized feature with arbitrary integer state labels, which
# is not a meaningful model-selection score.

state_range = range(2, 4)
# EM stops as soon as it converges; this only caps a fit that does not.
# The former cap of 1000000000 was effectively unbounded.
max_em_iterations = 1000
random_seed = 42  # fixed so repeated runs give the same fits
feature_columns = ['Close-Open/Open', 'High-Open/Open', 'Open-Low/Open', 'Volume']


def _chronological_ticker_sequences(frame):
    """Return ``(X, lengths)`` for ``frame`` with each ticker's rows contiguous.

    ``frame`` is expected to be in date order already; the stable sort by
    'Ticker' keeps that order inside each ticker. ``lengths`` holds the row
    count of each ticker in the same order as the rows of ``X``.
    """
    ordered = frame.sort_values('Ticker', kind='stable')
    # sort=False keeps first-appearance order, which matches `ordered`.
    lengths = ordered.groupby('Ticker', sort=False).size().to_numpy()
    return ordered[feature_columns].to_numpy(), lengths


started = time.perf_counter()

# Convert 'Date' to datetime, sort chronologically and reset the index.
# Built as one new frame (no in-place writes), so re-running the cell is safe
# and no SettingWithCopyWarning is raised; the stable sort makes the order of
# rows that share a date deterministic.
data = (
    data
    .assign(Date=pd.to_datetime(data['Date']))
    .sort_values(by='Date', kind='stable')
    .reset_index(drop=True)
)

# Prepare the dataset for cross-validation
X = data[feature_columns].values

# Initialize the TimeSeriesSplit splitter with 5 splits
tscv = TimeSeriesSplit(n_splits=5)

# The date-sorted rows interleave tickers, so each split is regrouped into
# one sequence per ticker. The splits do not depend on the number of states,
# so they are built once.
fold_sequences = [
    (
        _chronological_ticker_sequences(data.iloc[train_index]),
        _chronological_ticker_sequences(data.iloc[val_index]),
    )
    for train_index, val_index in tscv.split(X)
]

# Average held-out log-likelihood per sample, keyed by number of hidden states
loglik_dict = {}
for n_states in state_range:
    fold_scores = []

    for (X_train, train_lengths), (X_val, val_lengths) in fold_sequences:
        # Passing `lengths` makes every ticker its own sequence rather than
        # one long series that jumps between tickers.
        model = hmm.GaussianHMM(
            n_components=n_states,
            covariance_type="diag",
            n_iter=max_em_iterations,
            random_state=random_seed,
        )
        model.fit(X_train, train_lengths)

        # Normalize by sample count so folds of different sizes are comparable.
        fold_scores.append(model.score(X_val, val_lengths) / len(X_val))

    loglik_dict[n_states] = np.mean(fold_scores)

    # Report the wall-clock seconds spent on this hidden state number
    now = time.perf_counter()
    print(now - started)
    started = now
    print(f"Hidden state number: {n_states}, Average validation log-likelihood per sample: {loglik_dict[n_states]}")

# Pick the hidden state number with the highest average held-out log-likelihood
optimal_hidden_states = max(loglik_dict, key=loglik_dict.get)
print(f"Optimal hidden state number: {optimal_hidden_states}")