In [2]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
from hmmlearn.hmm import CategoricalHMM, GaussianHMM
from scipy.stats import norm
from scipy import signal

In [3]:
dec_factor=10
num_states = 20
zeropadding=False

In [4]:
Carl_all_rating_pd=pd.read_pickle("../data/CBP_rating.pkl")
PID_list=[]
Carl_all_rating=[]
for index,raw in Carl_all_rating_pd.iterrows():
    if raw['PID'] not in PID_list:
        PID_list.append(raw['PID'])
    else:
        continue
    if raw['D1_rating']!=[]:
        Carl_all_rating.append(raw['D1_rating'])
    if raw['D2_rating']!=[]:
        Carl_all_rating.append(raw['D2_rating'])
    if raw['D3_rating']!=[]:
        Carl_all_rating.append(raw['D3_rating'])
    if raw['D4_rating']!=[]:
        Carl_all_rating.append(raw['D4_rating'])
# Carl_rating_AllinOne = np.array([item for part in Carl_all_rating for item in part])
if zeropadding==True:
    Carl_rating_AllinOne = [item for part in Carl_all_rating for item in part+[0]*80]
else:
    Carl_rating_AllinOne = [item for part in Carl_all_rating for item in part]
Carl_rating_AllinOne_dec=signal.decimate(Carl_rating_AllinOne, dec_factor)

In [5]:
# Reshape the observed data to a 2D array (required by hmmlearn)
observed_data = Carl_rating_AllinOne_dec.reshape(-1, 1)

# Create and train the Gaussian HMM model
# num_states = 10
model = GaussianHMM(n_components=num_states, n_iter=1000)
model.fit(observed_data)

GaussianHMM(n_components=20, n_iter=1000)

In [6]:
model_df=pd.DataFrame({
    "model":[model]
})
if zeropadding==True:
    zp='_zp'
else:
    zp=''
filename="GHMM_s"+str(num_states)+"_dec"+str(dec_factor)+zp+'.pkl'
model_df.to_pickle("../data/model_dataset/"+filename)

In [111]:
# Predict the hidden states of the observations
predicted_states = model.predict(observed_data)

In [7]:
# observed_data = Carl_rating_AllinOne.reshape(-1, 1)
log_likelihood = model.score(observed_data)
print("log likelihood:", log_likelihood)

# Get the log-likelihood of the data given the model
log_likelihood = model.score(observed_data)

# Get the number of parameters in the model
num_params = model.n_features * model.n_components + (model.n_components - 1) * model.n_components

# Calculate the Akaike Information Criterion (AIC)
aic = -2 * log_likelihood + 2 * num_params

# Calculate the Bayesian Information Criterion (BIC)
bic = -2 * log_likelihood + num_params * np.log(len(observed_data))

# Display the AIC and BIC
print("Akaike Information Criterion (AIC):", aic)
print("Bayesian Information Criterion (BIC):", bic)

log likelihood: -162449.66074246092
Akaike Information Criterion (AIC): 325699.32148492185
Bayesian Information Criterion (BIC): 329401.50117875205


In [None]:
# Time Series Plot with Hidden States
plt.figure(figsize=(30, 6))
# plt.plot(observed_data, label='Observations')
plt.scatter(np.arange(len(observed_data)), observed_data, c=predicted_states, cmap='rainbow', label='Hidden States',s=0.5)
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('Time Series with Inferred Hidden States')
plt.legend()
plt.show()

In [None]:
# Time Series Plot with Hidden States
plt.figure(figsize=(20, 6))
# plt.plot(observed_data, label='Observations')
plt.scatter(np.arange(len(observed_data)), observed_data, c=predicted_states, cmap='rainbow', label='Hidden States',s=1)
plt.xlabel('Time')
plt.ylabel('Value')
plt.title('Time Series with Inferred Hidden States')
plt.legend()
plt.show()

In [None]:
# State Transition Matrix
# sns.heatmap(model.transmat_, annot=True, cmap='Blues')
sns.heatmap(model.transmat_, cmap='Blues')
plt.xlabel('To State')
plt.ylabel('From State')
plt.title('State Transition Matrix')
plt.show()

In [None]:
# State Duration Distribution
state_durations = np.diff(np.where(np.diff(np.concatenate(([False], predicted_states == 0, [False]))))[0])
plt.hist(state_durations, bins=range(1, max(state_durations) + 1), align='left', rwidth=0.8)
plt.xlabel('State Duration')
plt.ylabel('Frequency')
plt.title('State Duration Distribution')
plt.show()

In [None]:
# State Emission Distributions
state_means = model.means_.flatten()
state_std_devs = np.sqrt(model.covars_.flatten())

plt.bar(np.arange(num_states), state_means, yerr=state_std_devs, align='center', alpha=0.7, capsize=5)
plt.xlabel('Hidden State')
plt.ylabel('Mean of Emission Distribution')
plt.title('State Emission Distributions')
plt.xticks(np.arange(num_states), labels=['1', '2','3','4','5','6','7','8','9','10'])
plt.show()

In [None]:
# Get the means and covariances of the Gaussian emission distributions from the trained model
means = model.means_
covars = model.covars_
# Reshape the observed data to a 1D array (if it's not already)
observed_data = observed_data.ravel()
# Create a figure to hold the plots of emission distributions
num_states = len(means)
fig, axs = plt.subplots(num_states, figsize=(8, 6 * num_states))
# Plot the Gaussian emission distribution for each hidden state
for state in range(num_states):
    # Compute the probability density function (PDF) using the Gaussian parameters
    x = np.linspace(min(observed_data), max(observed_data), 1000)
    pdf = norm.pdf(x, loc=means[state], scale=np.sqrt(covars[state]))
    pdf=np.reshape(pdf,(1000))
    # Plot the histogram of observed data
    axs[state].hist(observed_data, bins=20, density=True, alpha=0.6, label='Observed Data')

    # Plot the Gaussian PDF
    axs[state].plot(x, pdf, color='red', label=f'Gaussian (State {state + 1})')

    # Set plot labels and title
    axs[state].set_xlabel('Observations')
    axs[state].set_ylabel('Probability Density')
    axs[state].set_title(f'Emission Distribution - State {state + 1}')
    axs[state].legend()
plt.tight_layout()
plt.show()

In [None]:
# Get the means and covariances of the Gaussian emission distributions from the trained model
means = model.means_
covars = model.covars_

# Reshape the observed data to a 1D array (if it's not already)
# observed_data = observed_data.ravel()

# Create a figure to hold the plot of emission distributions
plt.figure(figsize=(10, 6))

# Plot the histogram of observed data
plt.hist(observed_data, bins=20, density=True, alpha=0.6, label='Observed Data')

# Plot the Gaussian PDFs for each hidden state
x = np.linspace(min(observed_data), max(observed_data), 1000)
for state in range(len(means)):
    # Compute the probability density function (PDF) using the Gaussian parameters
    pdf = norm.pdf(x, loc=means[state], scale=np.sqrt(covars[state]))
    pdf=np.reshape(pdf,(1000))
    # Plot the Gaussian PDF for the current state with a different color for each state
    plt.plot(x, pdf, label=f'Gaussian (State {state + 1})')

# Set plot labels and title
plt.xlabel('Observations')
plt.ylabel('Probability Density')
plt.title('Emission Distributions - Gaussian HMM')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Get the initial state probabilities from the trained model
initial_state_probs = model.startprob_

# Create a figure to hold the plot
plt.figure(figsize=(6, 4))

# Plot the Initial State Probabilities as a bar chart
num_states = len(initial_state_probs)
state_labels = [f'State {i+1}' for i in range(num_states)]
plt.bar(state_labels, initial_state_probs)
plt.xlabel('Hidden States')
plt.ylabel('Initial State Probabilities')
plt.title('Initial State Probabilities in Gaussian HMM')
plt.xticks(rotation=45)
plt.ylim(0, 1)  # Set the y-axis range between 0 and 1
plt.tight_layout()
plt.show()

In [None]:
# Get the log-likelihood of the data given the model
log_likelihood = model.score(observed_data)

# Get the number of parameters in the model
num_params = model.n_features * model.n_components + (model.n_components - 1) * model.n_components

# Calculate the Akaike Information Criterion (AIC)
aic = -2 * log_likelihood + 2 * num_params

# Calculate the Bayesian Information Criterion (BIC)
bic = -2 * log_likelihood + num_params * np.log(len(observed_data))

# Display the AIC and BIC
print("Akaike Information Criterion (AIC):", aic)
print("Bayesian Information Criterion (BIC):", bic)