In [1]:
!pip install hmmlearn



In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.utils import check_random_state
from scipy.special import softmax
from hmmlearn.hmm import GaussianHMM
import random

class SyntheticDataGenerator:
    """Generate synthetic binary outcomes driven by a two-state Gaussian HMM.

    A single observation sequence of length ``N`` is sampled from a fixed
    two-state HMM with one emitted feature.  That feature is repeated three
    times to form the feature matrix ``z``, whose columns are then sliced by
    ``F1`` / ``F2`` / ``F3`` into the inputs of two logistic "experts" and of
    the mixing policy.  A Bernoulli outcome is drawn for every time step from
    the posterior-weighted mixture of the two expert probabilities.

    NOTE(review): the hard-coded weights have length 1 and ``z`` always has
    three columns, so the generator presumably only works with
    ``F1 == F2 == F3 == 1`` -- confirm before using other sizes.  ``K`` and
    ``F3`` are stored but not otherwise read by this class.
    """

    def __init__(self, N, F1, F2, F3, K, seed=546):
        """Store the problem sizes and build the random state.

        Args:
            N: Number of time steps (rows) to generate.
            F1: Number of leading columns of ``z`` fed to the first expert.
            F2: Number of following columns of ``z`` fed to the second expert.
            F3: Stored only; the policy receives all remaining columns.
            K: Stored only.
            seed: Seed (or random state) passed to ``check_random_state``.
        """
        self.N = N
        self.F1 = F1
        self.F2 = F2
        self.F3 = F3
        self.K = K
        self.rs = check_random_state(seed)

    def compute_policy_with_states(self, Z_F_3, thetas, p_s):
        """Return the state-averaged two-way mixing policy.

        For every hidden state ``s`` the policy is ``softmax([0, thetas[s]])``;
        the per-state policies are averaged with the weights ``p_s``.

        Args:
            Z_F_3: Policy features.  Accepted for interface compatibility but
                not used by the computation.
            thetas: Per-state logit of choosing the second expert.
            p_s: Per-state weights (e.g. posterior state probabilities).

        Returns:
            ``[c0, c1]``: the averaged probabilities of the first and second
            expert respectively.
        """
        c0_sum, c1_sum = 0, 0
        for s, state_weight in enumerate(p_s):
            # The reward of the first expert is fixed at 0, so only theta
            # decides how the two experts are weighted in this state.
            c = softmax([0, thetas[s]])
            c0_sum += c[0] * state_weight
            c1_sum += c[1] * state_weight
        return [c0_sum, c1_sum]

    def _sigmoid(self, x):
        """Return the logistic function ``1 / (1 + exp(-x))``."""
        return 1 / (1 + np.exp(-x))

    def _outcome_probability(self, z_n, posterior_n, weights, bias, theta):
        """Return P(outcome = 1) for one time step."""
        f1_end = self.F1
        f2_end = self.F1 + self.F2
        mu_1 = self._sigmoid(np.dot(weights[0], z_n[:f1_end]) + bias[0])
        mu_2 = self._sigmoid(np.dot(weights[1], z_n[f1_end:f2_end]) + bias[1])
        policy_c = self.compute_policy_with_states(z_n[f2_end:], theta, posterior_n)
        return policy_c[0] * mu_1 + policy_c[1] * mu_2

    def _draw_outcomes(self, probs):
        """Draw one Bernoulli outcome per probability from ``self.rs``."""
        # Clip only to absorb floating-point round-off in the mixture sum.
        return self.rs.binomial(1, np.clip(probs, 0.0, 1.0))

    def generate_synthetic_data(self):
        """Sample an HMM sequence and the binary outcomes derived from it.

        All randomness comes from ``self.rs``, so two generators built with
        the same seed produce the same data.

        Returns:
            A tuple ``(z, outcomes, true_weights, true_bias, true_theta,
            posterior)`` where ``z`` is the ``(N, 3)`` feature matrix,
            ``outcomes`` the ``N`` sampled 0/1 outcomes, the three ``true_*``
            values the hard-coded generating parameters, and ``posterior``
            the per-step state probabilities from ``predict_proba``.
        """
        model = GaussianHMM(2, init_params="")
        # One emitted feature, matching the single column of ``means_`` below.
        model.n_features = 1
        model.startprob_ = np.array([1.0, 0])
        model.transmat_ = np.array([[0.8, 0.2], [0.1, 0.9]])
        model.means_ = np.array([[1], [2.5]])
        # NOTE(review): ``covars_`` holds variances, yet a square root is
        # applied here -- confirm this is intended.  Values kept unchanged.
        model.covars_ = np.sqrt([[0.001], [0.01]])

        observations, _ = model.sample(self.N, random_state=self.rs)
        posterior_ = model.predict_proba(observations, [self.N])

        # The single emitted feature is shared by both experts and the policy.
        z = np.repeat(np.asarray(observations), 3, axis=1)

        true_weights = [[0.5], [3.0]]
        true_bias = [0, 0]
        true_theta = [-2, 1.5]

        probs = np.array([
            self._outcome_probability(z_n, posterior_n, true_weights, true_bias, true_theta)
            for z_n, posterior_n in zip(z, posterior_)
        ])
        # Draw from the generator's own random state: seeding the stdlib
        # ``random`` module does not affect NumPy's global RNG, so the
        # previous draws were not reproducible.
        outcomes = self._draw_outcomes(probs)

        return z, np.array(outcomes), true_weights, true_bias, true_theta, posterior_

# Demo: build a 50k-step dataset, report what was generated, and persist it.
generator = SyntheticDataGenerator(N=50000, F1=1, F2=1, F3=1, K=2)
(
    z,
    outcomes,
    true_weights,
    true_bias,
    true_theta,
    posteriors,
) = generator.generate_synthetic_data()

# Summary lines, printed in a fixed order as "<label> <value>".
for _label, _value in (
    ("Generated feature matrix (z):", z.shape),
    ("Generated outcomes (u):", outcomes),
    ("True weights (beta):", true_weights),
    ("True bias:", true_bias),
    ("True mixing coefficients:", true_theta),
):
    print(_label, _value)

# Persist the observations and the sampled outcomes next to the notebook.
for _path, _array in (
    ("synthetic_data_observations_new.npy", z),
    ("synthetic_data_outcomes_new.npy", outcomes),
):
    np.save(_path, _array)

Generated feature matrix (z): (50000, 3)
Generated outcomes (u): [0 1 1 ... 1 1 1]
True weights (beta): [[0.5], [3.0]]
True bias: [0, 0]
True mixing coefficients: [-2, 1.5]
