In [None]:
import numpy as np
def generate_heterogeneous_data(w2, mu_2, n, p, imbalance_factor=10, random_seed=39):
    """Simulate a linear-regression dataset with a two-component Gaussian design.

    Rows of ``X`` come from two multivariate normal components:

    * component 1: mean ``0`` and identity covariance;
    * component 2: mean ``mu_2`` in every coordinate and covariance
      ``imbalance_factor * I``.

    The component sizes are fixed, not sampled: ``n1 = floor(n * (1 - w2))``
    rows come from component 1 and the remaining ``n - n1`` rows from
    component 2. The rows are then shuffled and the response is built as
    ``Y = X @ beta + epsilon`` with standard-normal ``beta`` and ``epsilon``.

    Parameters
    ----------
    w2 : float
        Proportion of rows drawn from component 2; must lie in ``[0, 1]``.
    mu_2 : float
        Value of every coordinate of the component-2 mean vector.
    n : int
        Total number of rows (samples).
    p : int
        Number of columns (features).
    imbalance_factor : float, default 10
        Scalar multiplying the identity to form the component-2 covariance.
    random_seed : int, default 39
        Seed passed to ``np.random.seed``.

    Returns
    -------
    X : numpy.ndarray of shape (n, p)
        Shuffled design matrix.
    Y : numpy.ndarray of shape (n,)
        Response vector.
    beta : numpy.ndarray of shape (p,)
        Coefficient vector used to generate ``Y``.

    Raises
    ------
    ValueError
        If ``w2`` is outside ``[0, 1]`` or ``n`` is negative.

    Notes
    -----
    This function reseeds NumPy's *global* legacy random state, so it
    affects any later ``np.random`` calls made by the caller.
    """
    # Fail early with a readable message instead of NumPy's
    # "negative dimensions are not allowed" further down.
    if not 0 <= w2 <= 1:
        raise ValueError(f"w2 must lie in [0, 1], got {w2!r}")
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n!r}")

    np.random.seed(random_seed)

    # Component 1: standard variance, centered at 0.
    mu1 = np.zeros(p)
    sigma1 = np.eye(p)

    # Component 2: shifted mean and scaled (typically larger) variance.
    mu2 = np.ones(p) * mu_2
    sigma2 = np.eye(p) * imbalance_factor

    # Mixing proportions: (1 - w2) for component 1, w2 for component 2.
    # Round away floating-point noise before truncating: 1 - 0.8 evaluates to
    # 0.19999999999999996, so a bare int(10 * (1 - 0.8)) would yield 1
    # instead of the intended 2.
    n1 = int(round(n * (1 - w2), 9))
    n2 = n - n1

    # Draw each component separately (same RNG call order as before, so
    # seeded results are unchanged whenever n1/n2 are unchanged).
    X1 = np.random.multivariate_normal(mu1, sigma1, size=n1)
    X2 = np.random.multivariate_normal(mu2, sigma2, size=n2)

    # Stack and shuffle rows in place so the components are interleaved.
    X = np.vstack((X1, X2))
    np.random.shuffle(X)

    # Linear model with unit-variance Gaussian noise and coefficients.
    epsilon = np.random.normal(0, 1, size=n)
    beta = np.random.normal(0, 1, size=p)
    Y = np.dot(X, beta) + epsilon
    return X, Y, beta

# Demo: simulate one dataset and display the resulting design matrix.
n, p = 8192, 1000        # rows and columns of the design matrix
w2, mu_2 = 0.33, 5       # share and per-coordinate mean of the second component
imbalance_factor = 10    # covariance scale of the second component
random_seed = 39         # fixed seed so the run is reproducible

X, Y, beta = generate_heterogeneous_data(
    w2=w2,
    mu_2=mu_2,
    n=n,
    p=p,
    imbalance_factor=imbalance_factor,
    random_seed=random_seed,
)

print("The simulated matrix is:")
print(X)

The simulated matrix is:
[[ 9.60202861  7.34913641 10.46704979 ...  4.51092732  4.25803758
   7.02848288]
 [ 0.2784616   0.78676902  0.48015767 ...  1.12275026 -1.07446052
  -0.90121966]
 [ 1.33068146  1.09419909 -0.68157343 ...  1.06718981  0.0259245
   1.43630967]
 ...
 [ 1.08496791  1.24762091  0.17954013 ...  0.82150075 -0.46264187
   1.89470357]
 [ 0.30990171  6.31978879  7.17868151 ...  4.73108328  4.65036315
   6.25113074]
 [-1.11382748 -0.2775839  -1.20271314 ... -0.36770772  0.29824728
   0.25907182]]
