# Set up

In [1]:
import torch
import pandas as pd
import numpy as np
import tqdm as tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import pyro.distributions as dist

import GP_functions.Tools as Tools

In [2]:
from scipy.cluster.vq import kmeans2
from scipy.spatial import distance
from scipy.stats import qmc, multivariate_normal
from itertools import product

# Create parameters

In [None]:
def generate_10d_grouped_sobol_scipy(n_per_group=200, low=0.1, high=5.0, base_seed=1017):
    """Build a 10-column design in which one pair of columns varies at a time.

    The ten columns are split into five pairs. For each pair, ``n_per_group``
    scrambled 2-D Sobol points are drawn and rescaled to the ``low``/``high``
    bounds; they fill that pair's two columns while the remaining eight
    columns are held at 1.0. The five batches are stacked in pair order.

    NOTE(review): SciPy documents that the balance properties of Sobol points
    require a power-of-two sample count; the default of 200 is not one --
    confirm this is acceptable for callers relying on the default.

    Args:
        n_per_group: Number of rows generated for each pair; must be positive.
        low: Lower bound used when rescaling the unit-square samples.
        high: Upper bound used when rescaling; must be greater than ``low``.
        base_seed: Pair ``k`` (0-based) seeds its Sobol engine with
            ``base_seed + k``.

    Returns:
        Float array of shape ``(5 * n_per_group, 10)``.

    Raises:
        ValueError: If ``n_per_group`` is not positive or ``low`` is not
            strictly below ``high``.
    """
    # Explicit exceptions instead of ``assert`` so the checks survive ``python -O``.
    if not n_per_group > 0:
        raise ValueError(f"n_per_group must be positive, got {n_per_group!r}")
    if not low < high:
        raise ValueError(f"low must be smaller than high, got low={low!r}, high={high!r}")

    n_cols = 10
    groups = [(0, 1), (2, 3), (4, 5), (6, 7), (8, 9)]
    batches = []

    for g_idx, (c1, c2) in enumerate(groups):
        # Columns outside the active pair keep the constant value 1.0.
        Xg = np.ones((n_per_group, n_cols), dtype=float)

        # A separate, reproducibly seeded engine per pair.
        eng = qmc.Sobol(d=2, scramble=True, seed=base_seed + g_idx)
        U = eng.random(n_per_group)

        S = qmc.scale(U, l_bounds=[low, low], u_bounds=[high, high])

        # Index the two columns explicitly so the pair need not be adjacent.
        Xg[:, [c1, c2]] = S
        batches.append(Xg)

    return np.vstack(batches)

# Generate data



In [None]:
# 128 Sobol points for each of the 5 column pairs -> 640 rows, 10 columns.
X_all = generate_10d_grouped_sobol_scipy(n_per_group=128, low=0.1, high=5.0, base_seed=2025)

In [None]:
# Spread the 10 sampled parameters over a 34-column matrix: each source
# column is copied into every target column listed for it, and the lists
# together cover columns 0..33 exactly once.
# NOTE(review): what the 34 target columns represent is not visible here --
# presumably the simulator's input layout; confirm.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33],
}

m, n = X_all.shape
new_data = np.zeros((m, 34))

for orig_col, new_cols in mapping.items():
    # The (m, 1) source column broadcasts across all of its target columns.
    new_data[:, new_cols] = X_all[:, [orig_col]]

In [None]:
# Write the expanded 34-column design with 4 decimals per value.
np.savetxt("X_edge.txt", new_data, fmt='%0.4f')

In [None]:
def hypercube_vertices_10d_product(low=0.1, high=5.0, d=10):
    """Return all ``2**d`` corners of the box ``[low, high]**d`` as a float array.

    Rows follow the order of ``itertools.product([low, high], repeat=d)``:
    the first row is all ``low``, the last row is all ``high``, and the last
    column alternates fastest.

    Args:
        low: Coordinate value used for the "lower" side of each axis.
        high: Coordinate value used for the "upper" side of each axis.
        d: Number of dimensions.

    Returns:
        Array of shape ``(2**d, d)`` with dtype ``float``.
    """
    endpoints = np.array([low, high], dtype=float)
    row_ids = np.arange(1 << d)[:, None]
    # Column j reads bit (d - 1 - j) of the row number, so the right-most
    # column toggles on every row, exactly like the product ordering.
    shifts = np.arange(d - 1, -1, -1)
    bits = (row_ids >> shifts) & 1
    return endpoints[bits]



In [None]:
# All 2**10 = 1024 corners of the default [0.1, 5.0] box in 10 dimensions.
X_corner = hypercube_vertices_10d_product()

In [None]:
# Spread the 10 corner coordinates over a 34-column matrix: each source
# column is copied into every target column listed for it, and the lists
# together cover columns 0..33 exactly once.
# NOTE(review): what the 34 target columns represent is not visible here --
# presumably the simulator's input layout; confirm.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33],
}

X_all = X_corner
m, n = X_all.shape
new_data = np.zeros((m, 34))

for orig_col, new_cols in mapping.items():
    # The (m, 1) source column broadcasts across all of its target columns.
    new_data[:, new_cols] = X_all[:, [orig_col]]

In [None]:
# Write the expanded 34-column corner design with 4 decimals per value.
np.savetxt("X_corner.txt", new_data, fmt='%0.4f')

# Read Data

In [3]:
def _read_simulation_results(path):
    """Load a header-less, space-delimited results file without its last column."""
    frame = pd.read_csv(path, header=None, delimiter=' ')
    # NOTE(review): the last column is discarded unconditionally -- presumably
    # an empty column produced by a trailing delimiter; confirm against the files.
    return frame.drop(columns=[frame.columns[-1]])


# Simulator outputs for the main train/test designs.
Y_data_train = _read_simulation_results('Data/simulation_results_train.txt')
Y_data_test = _read_simulation_results('Data/simulation_results_test.txt')

# Simulator outputs for the LocalDisease cases.
Y_data_1 = _read_simulation_results('LocalDisease/simulation_results_X_1.txt')
Y_data_1_1 = _read_simulation_results('LocalDisease/simulation_results_X_1_1.txt')
Y_data_1_5 = _read_simulation_results('LocalDisease/simulation_results_X_1_5.txt')
Y_data_2 = _read_simulation_results('LocalDisease/simulation_results_X_2.txt')

# Simulator outputs for the edge design.
Y_data_edge = _read_simulation_results('Data/simulation_results_X_edge.txt')

# Standardize

In [4]:
# Column-wise standardization. The scaler is fitted on the training outputs
# only and then reused unchanged for every other data set, so all of them are
# expressed on the training scale.
scaler = StandardScaler()
scaler.fit(Y_data_train)


def _to_standardized(frame):
    """Apply the training-fitted scaler and return a plain ndarray."""
    return np.asarray(scaler.transform(frame))


Y_data_train_standardized = _to_standardized(Y_data_train)
Y_data_test_standardized = _to_standardized(Y_data_test)
Y_data_1_standardized = _to_standardized(Y_data_1)
Y_data_1_1_standardized = _to_standardized(Y_data_1_1)
Y_data_1_5_standardized = _to_standardized(Y_data_1_5)
Y_data_2_standardized = _to_standardized(Y_data_2)
Y_data_edge_standardized = _to_standardized(Y_data_edge)

In [None]:
# Per-column range spanned by the standardized edge-design outputs (NaNs ignored).
col_min = np.nanmin(Y_data_edge_standardized, axis=0)
col_max = np.nanmax(Y_data_edge_standardized, axis=0)


# For each X_1_1 row, count how many output columns fall inside that range.
((Y_data_1_1_standardized >= col_min) & (Y_data_1_1_standardized <= col_max)).sum(axis=1)

In [None]:
# Boolean mask: True where an X_1_5 output lies within the edge-design column range.
A = (Y_data_1_5_standardized >= col_min) & (Y_data_1_5_standardized <= col_max)

In [None]:
# Display the in-range mask computed for the X_1_5 outputs.
A

In [5]:
# Persist the standardized X_1_5 outputs as comma-separated values with 8 decimals.
np.savetxt("LocalDisease/Y_data_1_5_std.csv", Y_data_1_5_standardized, delimiter=",", fmt="%.8f")

# Mapping back X

In [10]:
# Collapse the 34-column test input file back to the 10 underlying parameters.
# mapping[k] lists the file columns that belong to parameter k.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33],
}

X_test = pd.read_csv('Data/X_test.txt', header=None, delimiter=' ').values
m = X_test.shape[0]

# Parameter k is the row-wise mean over its columns.
# NOTE(review): presumably those columns hold identical copies, so the mean
# just recovers the shared value -- confirm.
X_all_recovered = np.column_stack(
    [X_test[:, mapping[k]].mean(axis=1) for k in range(len(mapping))]
)

X_test = np.round(X_all_recovered, decimals=4)

In [11]:
# Collapse the 34-column training input file back to the 10 underlying parameters.
# mapping[k] lists the file columns that belong to parameter k.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33],
}

# Keep only as many input rows as there are standardized training outputs.
n_train_rows = Y_data_train_standardized.shape[0]
X_train = pd.read_csv('Data/X_train.txt', header=None, delimiter=' ').head(n_train_rows).values
m = X_train.shape[0]

# Parameter k is the row-wise mean over its columns.
# NOTE(review): presumably those columns hold identical copies, so the mean
# just recovers the shared value -- confirm.
X_all_recovered = np.column_stack(
    [X_train[:, mapping[k]].mean(axis=1) for k in range(len(mapping))]
)

X_train = np.round(X_all_recovered, decimals=4)

In [7]:
# Collapse the 34-column X_1_5 input file back to the 10 underlying parameters.
# mapping[k] lists the file columns that belong to parameter k.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33],
}

X_1_5 = pd.read_csv('LocalDisease/X_1_5.txt', header=None, delimiter=' ').values
m = X_1_5.shape[0]

# Parameter k is the row-wise mean over its columns.
# NOTE(review): presumably those columns hold identical copies, so the mean
# just recovers the shared value -- confirm.
X_all_recovered = np.column_stack(
    [X_1_5[:, mapping[k]].mean(axis=1) for k in range(len(mapping))]
)

X_1_5 = np.round(X_all_recovered, decimals=4)

In [9]:
# Persist the recovered 10-parameter X_1_5 inputs as comma-separated values.
np.savetxt("LocalDisease/X_1_5.csv", X_1_5, delimiter=",", fmt="%.4f")

# Outlier

In [12]:
# NOTE(review): Tools.get_outlier_indices_iqr is defined outside this file; it
# presumably returns the row indices flagged by an IQR rule with the given
# `outbound` setting -- confirm against GP_functions.Tools.
outlier_indices_train = Tools.get_outlier_indices_iqr(Y_data_train_standardized, outbound = 6.5)
outlier_indices_test = Tools.get_outlier_indices_iqr(Y_data_test_standardized, outbound = 6.5)

# Delete the flagged rows from outputs and inputs with the same indices so the
# two arrays stay row-aligned.
Y_data_train_standardized = np.delete(Y_data_train_standardized, outlier_indices_train, axis=0)
X_train = np.delete(X_train, outlier_indices_train, axis=0)

Y_data_test_standardized = np.delete(Y_data_test_standardized, outlier_indices_test, axis=0)
X_test = np.delete(X_test, outlier_indices_test, axis=0)

In [13]:
# Move the first 6 and the last 380 test rows into the training set.
# NOTE(review): this runs after the scaler was fitted, so the moved rows are
# on the training scale but did not contribute to it -- confirm that is intended.
front_indices = np.arange(6)
back_indices = np.arange(-380, 0)
moved_indices = np.concatenate((front_indices, back_indices))

# Pull the rows out of the test arrays (inputs and outputs use the same indices).
X_test_front, X_test_back = X_test[front_indices], X_test[back_indices]
Y_test_front = Y_data_test_standardized[front_indices]
Y_test_back = Y_data_test_standardized[back_indices]

# Append them to the training arrays, front rows first.
X_train = np.concatenate((X_train, X_test_front, X_test_back), axis=0)
Y_data_train_standardized = np.concatenate(
    (Y_data_train_standardized, Y_test_front, Y_test_back), axis=0
)

# Drop the moved rows from the test arrays.
X_test = np.delete(X_test, moved_indices, axis=0)
Y_data_test_standardized = np.delete(Y_data_test_standardized, moved_indices, axis=0)

In [None]:
B = Y_data_test_standardized[:, 1:]  # drop the first column

In [None]:
mask = np.isin(np.arange(B.shape[1]) % 17, [2, 8])  # columns repeat with period 17; keep the 3rd and 9th of each period (0-based: 2, 8)
# Select only the masked columns of B.
out = B[:, mask]

In [None]:
# Save only the first two recovered parameters of the training inputs.
np.savetxt("X_train_anterior.csv", X_train[:,0:2], delimiter=",", fmt="%.8f")

# PCA

In [14]:
def _sign_flip_scores(components, scores):
    """Fix the sign of each component so its largest-magnitude loading is non-negative.

    For every row of ``components`` whose entry of largest absolute value is
    negative, that row and the matching column of ``scores`` are both
    multiplied by -1. The inputs are left untouched; copies are returned.

    Args:
        components: 2-D array with one component per row.
        scores: 2-D array whose column ``i`` holds the scores of component ``i``.

    Returns:
        Tuple ``(components_fixed, scores_fixed)``.
    """
    flipped_components = components.copy()
    flipped_scores = scores.copy()
    for idx, loadings in enumerate(flipped_components):
        pivot = np.abs(loadings).argmax()  # position of the dominant loading
        if loadings[pivot] < 0:
            # `loadings` is a view of the copied row, so this flips it in place.
            loadings *= -1
            flipped_scores[:, idx] *= -1
    return flipped_components, flipped_scores

def split_and_apply_pca(train_data, test_data, variance_threshold=0.999,
                        svd_solver='full', random_state=0):
    """Reduce every column except the first with a PCA fitted on the training rows.

    The first column of both arrays bypasses PCA and is re-attached in front
    of the component scores. The number of components is the smallest count
    whose cumulative explained-variance ratio reaches ``variance_threshold``;
    if the threshold is never reached, all components are kept.

    Args:
        train_data: 2-D array; the PCA is fitted on ``train_data[:, 1:]``.
        test_data: 2-D array with the same columns; only transformed.
        variance_threshold: Target cumulative explained-variance ratio.
        svd_solver: Passed through to ``sklearn.decomposition.PCA``.
        random_state: Passed through to ``sklearn.decomposition.PCA``.

    Returns:
        Tuple ``(train_final, test_final, n_components)``, where each array is
        the untouched first column followed by ``n_components`` sign-fixed
        score columns.
    """
    # 1) Split off the first column.
    train_first_col = train_data[:, 0].reshape(-1, 1)
    test_first_col  = test_data[:, 0].reshape(-1, 1)

    train_remaining = train_data[:, 1:]
    test_remaining  = test_data[:, 1:]

    # 2) Fit a full PCA first, only to read the cumulative explained variance.
    pca_full = PCA(svd_solver=svd_solver, random_state=random_state)
    pca_full.fit(train_remaining)
    cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

    # 3) Choose the number of components. argmax of an all-False mask is 0,
    #    which would silently select one component when the threshold is never
    #    reached (e.g. a threshold of 1.0 missed by rounding), so that case is
    #    handled explicitly by keeping every component.
    reached = cumulative_variance >= variance_threshold
    if reached.any():
        n_components = int(np.argmax(reached) + 1)
    else:
        n_components = int(cumulative_variance.size)

    # 4) Refit with n_components, using the same solver settings.
    pca = PCA(n_components=n_components, svd_solver=svd_solver, random_state=random_state)
    train_scores = pca.fit_transform(train_remaining)  # Z_train
    test_scores  = pca.transform(test_remaining)       # Z_test

    # 5) Apply the same sign convention to train and test scores; both calls
    #    derive the flips from the same components, so they stay consistent.
    comps_fixed, train_scores_fixed = _sign_flip_scores(pca.components_, train_scores)
    _,           test_scores_fixed  = _sign_flip_scores(pca.components_, test_scores)

    # (Optional) To store the sign-fixed components on the pca object, uncomment:
    # pca.components_ = comps_fixed
    # Writing them back is for record keeping only; nothing below reads them.

    # 6) Re-attach the first column in front of the scores.
    train_final = np.hstack((train_first_col, train_scores_fixed))
    test_final  = np.hstack((test_first_col,  test_scores_fixed))

    return train_final, test_final, n_components


In [15]:
# Fit the PCA on the training outputs and project the X_1_5 outputs with it.
# Note that the "test" argument here is the X_1_5 case, not Y_data_test_standardized.
Y_train_final, Y_test_final, n_components = split_and_apply_pca(
    Y_data_train_standardized,
    Y_data_1_5_standardized,
    variance_threshold=0.999
)

In [16]:
# Display how many principal components were retained.
n_components

20

In [None]:
# Display the training matrix (first output column followed by the PCA scores).
Y_train_final

# Save data

In [17]:
# Only the PCA-projected X_1_5 outputs are written in this run; the other
# exports below are kept commented out.
# np.savetxt("X_train.csv", X_train, delimiter=",", fmt="%.4f")
# np.savetxt("X_test.csv", X_test, delimiter=",", fmt="%.4f")

# np.savetxt("LocalDisease/Y_train_std_pca.csv", Y_train_final, delimiter=",", fmt="%.8f")
np.savetxt("LocalDisease/Y_data_1_5_pca.csv", Y_test_final, delimiter=",", fmt="%.8f")

LocalDisease/

# End