# Set up

In [1]:
import pandas as pd
import numpy as np
import tqdm as tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

import pyro.distributions as dist


# Read Data

In [2]:
# Training responses: comma-separated, no header row; every column is kept here.
Y_data_train = pd.read_csv('../Data/Data20251215/Y_data_train.csv', sep=',', header=None)

# Test responses: space-separated, no header row; the last column is dropped
# right after loading.
# NOTE(review): presumably that column is an empty artefact of a trailing
# delimiter in the text file — confirm against the raw data.
Y_data_test = pd.read_csv('../Data/simulation_results_test.txt', sep=' ', header=None)
Y_data_test = Y_data_test.drop(columns=Y_data_test.columns[-1:])

In [10]:
# Space-separated file without a header row; the last column is dropped after loading.
Y_data_max = pd.read_csv('simulation_results_5_max.txt', sep=' ', header=None)
Y_data_max = Y_data_max.drop(columns=Y_data_max.columns[-1:])

In [2]:
# Space-separated file without a header row; the last column is dropped after loading.
Y_data_extra = pd.read_csv('../simulation_results_Extra.txt', sep=' ', header=None)
Y_data_extra = Y_data_extra.drop(columns=Y_data_extra.columns[-1:])

In [3]:
# Comma-separated file without a header row; all columns are kept.
RealCase = pd.read_csv('RealCase.csv', sep=',', header=None)

In [4]:
# Remove the trailing 18 columns, then the column at position 17 of what remains.
# Caution: this overwrites Y_data_train, so re-running the cell trims it again.
df_new = Y_data_train.drop(columns=Y_data_train.columns[-18:])
Y_data_train = df_new.drop(columns=df_new.columns[17])

In [5]:
# Remove the trailing 18 columns, then the column at position 17 of what remains.
# Caution: this overwrites Y_data_test, so re-running the cell trims it again.
df_new = Y_data_test.drop(columns=Y_data_test.columns[-18:])
Y_data_test = df_new.drop(columns=df_new.columns[17])

In [11]:
# Remove the trailing 18 columns, then the column at position 17 of what remains.
# Caution: this overwrites Y_data_max, so re-running the cell trims it again.
df_new = Y_data_max.drop(columns=Y_data_max.columns[-18:])
Y_data_max = df_new.drop(columns=df_new.columns[17])

In [3]:
# Remove the trailing 18 columns, then the column at position 17 of what remains.
# Caution: this overwrites Y_data_extra, so re-running the cell trims it again.
df_new = Y_data_extra.drop(columns=Y_data_extra.columns[-18:])
Y_data_extra = df_new.drop(columns=df_new.columns[17])

In [None]:
# Show the (rows, columns) shape of the training responses.
Y_data_train.shape

In [None]:
# Display the frame loaded from RealCase.csv.
RealCase

## Save

In [4]:
# Write the trimmed extra responses as comma-separated text with 8 decimals.
np.savetxt(
    fname="RealCase_Y_extra.csv",
    X=Y_data_extra,
    fmt="%.8f",
    delimiter=",",
)

## Check

In [9]:
# Per-column minimum and maximum of the training responses (NaNs ignored).
col_min = np.nanmin(Y_data_train.values, axis=0)
col_max = np.nanmax(Y_data_train.values, axis=0)

# For each RealCase row, count the columns whose value lies outside the range
# spanned by the training data. The earlier condition
# `(x <= col_min) & (x >= col_max)` can only be true when col_min == col_max,
# so it reported 0 no matter what the data looked like; a value is out of
# range when it is below the minimum OR above the maximum.
((RealCase.values < col_min) | (RealCase.values > col_max)).sum(axis=1)

array([0])

In [21]:
# Per row, count the columns where |RealCase| is at least |Y_data_max|.
(np.abs(RealCase.values) >= np.abs(Y_data_max.values)).sum(axis=1)

array([33])

# Standardization

In [22]:
# Fit the scaler on the training responses only, then apply the same
# transformation to the training, test and RealCase data. Each result is a
# plain NumPy array.
scaler = StandardScaler().fit(Y_data_train)

Y_data_train_standardized = np.asarray(scaler.transform(Y_data_train))
Y_data_test_standardized = np.asarray(scaler.transform(Y_data_test))
RealCase_standardized = np.asarray(scaler.transform(RealCase))

In [23]:
# Persist the standardized matrices as comma-separated text with 8 decimals.
for out_path, matrix in (
    ("RealCase_Y_train_std.csv", Y_data_train_standardized),
    ("RealCase_Y_test_std.csv", Y_data_test_standardized),
    ("RealCase_Y_std.csv", RealCase_standardized),
):
    np.savetxt(out_path, matrix, delimiter=",", fmt="%.8f")

# Mapping back

In [24]:
# Raw training inputs: comma-separated, no header row, taken as a NumPy array.
X_train = pd.read_csv('../Data/Data20251215/X_train.csv', sep=',', header=None).values

m = X_train.shape[0]

# Recovered column index -> raw column indices that are averaged to rebuild it.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33],
}

# One recovered column per mapping entry, filled with the row-wise mean of
# its raw source columns.
X_all_recovered = np.zeros((m, len(mapping)))
for target_col, source_cols in mapping.items():
    X_all_recovered[:, target_col] = np.mean(X_train[:, source_cols], axis=1)

# Keep four decimals; X_train now refers to the recovered matrix.
X_train = X_all_recovered.round(4)

In [25]:
# Raw test inputs: space-separated, no header row, taken as a NumPy array.
X_test = pd.read_csv('../Data/X_test.txt', sep=' ', header=None).values

m = X_test.shape[0]

# Recovered column index -> raw column indices that are averaged to rebuild it.
mapping = {
    0: [4, 16],
    1: [5, 17],
    2: [10, 22],
    3: [11, 23],
    4: [8, 6, 20, 18],
    5: [9, 7, 21, 19],
    6: [2, 0, 14, 12],
    7: [3, 1, 15, 13],
    8: [24, 26, 28, 30, 32],
    9: [25, 27, 29, 31, 33],
}

# One recovered column per mapping entry, filled with the row-wise mean of
# its raw source columns.
X_all_recovered = np.zeros((m, len(mapping)))
for target_col, source_cols in mapping.items():
    X_all_recovered[:, target_col] = np.mean(X_test[:, source_cols], axis=1)

# Keep four decimals; X_test now refers to the recovered matrix.
X_test = X_all_recovered.round(4)

In [26]:
# Persist the recovered input matrices as comma-separated text with 4 decimals.
for out_path, matrix in (
    ("RealCase_X_train.csv", X_train),
    ("RealCase_X_test.csv", X_test),
):
    np.savetxt(out_path, matrix, delimiter=",", fmt="%.4f")

# PCA

In [27]:
def _sign_flip_scores(components, scores):

    comps = components.copy()
    Z = scores.copy()
    for i in range(comps.shape[0]):
        j = np.argmax(np.abs(comps[i]))  # 该成分绝对值最大的载荷索引
        if comps[i, j] < 0:              # 若为负则整体翻转
            comps[i] *= -1
            Z[:, i] *= -1
    return comps, Z

def split_and_apply_pca(train_data, test_data, variance_threshold=0.999,
                        svd_solver='full', random_state=0):
    """Keep the first column unchanged and PCA-compress the remaining columns.

    The PCA is fitted on the training data only (columns 1 onward) and the
    same projection is then applied to the test data. The number of retained
    components is the smallest count whose cumulative explained-variance
    ratio reaches ``variance_threshold``. Score signs are fixed with
    ``_sign_flip_scores`` so that results are reproducible across runs.

    Parameters
    ----------
    train_data, test_data : ndarray
        2-D arrays with the same number of columns.
    variance_threshold : float, default 0.999
        Cumulative explained-variance ratio that must be reached.
    svd_solver : str, default 'full'
        Passed to ``PCA``.
    random_state : int, default 0
        Passed to ``PCA``.

    Returns
    -------
    train_final : ndarray
        First training column followed by the sign-fixed training scores.
    test_final : ndarray
        First test column followed by the sign-fixed test scores.
    n_components : int
        Number of principal components kept.
    """
    # 1) Split off the first column; it bypasses the PCA.
    train_first_col = train_data[:, 0].reshape(-1, 1)
    test_first_col = test_data[:, 0].reshape(-1, 1)

    train_remaining = train_data[:, 1:]
    test_remaining = test_data[:, 1:]

    # 2) Fit a full PCA first to obtain the cumulative explained variance
    #    (same solver and random_state as the final fit).
    pca_full = PCA(svd_solver=svd_solver, random_state=random_state)
    pca_full.fit(train_remaining)
    cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)

    # 3) Choose the number of components.
    #    np.argmax on an all-False mask returns 0, which would silently keep a
    #    single component when the threshold is never reached (e.g. a
    #    threshold of 1.0 missed through floating-point rounding, or a
    #    threshold above 1). In that case keep every available component.
    reached = cumulative_variance >= variance_threshold
    if reached.any():
        n_components = int(np.argmax(reached)) + 1
    else:
        n_components = int(cumulative_variance.size)

    # 4) Refit with exactly n_components and project both data sets.
    pca = PCA(n_components=n_components, svd_solver=svd_solver, random_state=random_state)
    train_scores = pca.fit_transform(train_remaining)
    test_scores = pca.transform(test_remaining)

    # 5) Fix the signs. Both calls receive the same components, so train and
    #    test scores are flipped consistently. The flipped components are
    #    discarded because the fitted PCA object is not returned.
    _, train_scores_fixed = _sign_flip_scores(pca.components_, train_scores)
    _, test_scores_fixed = _sign_flip_scores(pca.components_, test_scores)

    # 6) Put the untouched first column back in front of the scores.
    train_final = np.hstack((train_first_col, train_scores_fixed))
    test_final = np.hstack((test_first_col, test_scores_fixed))

    return train_final, test_final, n_components


In [35]:
# Compress the standardized responses; the PCA is fitted on the training set
# and reused for the test set.
Y_train_final, Y_test_final, n_components = split_and_apply_pca(
    train_data=Y_data_train_standardized,
    test_data=Y_data_test_standardized,
    variance_threshold=0.999,
)

In [36]:
# Number of principal components returned by split_and_apply_pca.
n_components

18

In [31]:
# Per-column minimum and maximum of the training matrix (NaNs ignored).
col_min = np.nanmin(Y_train_final, axis=0)
col_max = np.nanmax(Y_train_final, axis=0)

# For each test row, count the columns whose value lies outside the range
# spanned by the training data. The earlier condition
# `(x <= col_min) & (x >= col_max)` can only be true when col_min == col_max,
# so it reported 0 no matter what the data looked like; a value is out of
# range when it is below the minimum OR above the maximum.
((Y_test_final < col_min) | (Y_test_final > col_max)).sum(axis=1)

array([0])

In [37]:
# Persist the PCA outputs as comma-separated text with 8 decimals.
for out_path, matrix in (
    ("RealCase_Y_test_pca.csv", Y_test_final),
    ("RealCase_Y_train_pca.csv", Y_train_final),
):
    np.savetxt(out_path, matrix, delimiter=",", fmt="%.8f")

In [38]:
# Display the training matrix returned by split_and_apply_pca.
Y_train_final

array([[ 1.06751074, -2.8366313 , -3.11117071, ...,  0.11966001,
         0.08812023, -0.04493808],
       [ 0.89304904, -4.86966909,  7.71401473, ..., -0.1392635 ,
        -0.07512051, -0.15587736],
       [-0.9405249 ,  3.56794833, -0.02990135, ...,  0.01995991,
         0.03363875,  0.0546165 ],
       ...,
       [ 1.09733481, -5.91654583,  1.89982143, ...,  0.07262656,
        -0.08531967,  0.30402169],
       [-1.24449218,  4.44007582,  0.25624733, ..., -0.0749383 ,
        -0.01741271,  0.01621   ],
       [-0.04388389,  1.18593112,  0.44339915, ...,  0.06201217,
         0.03118662, -0.02441286]], shape=(4558, 19))

In [34]:
# Display Y_train_final again (same expression as the previous cell).
Y_train_final

array([[ 1.06751074, -2.8366313 , -3.11117071, ...,  0.11966001,
         0.08812023, -0.04493808],
       [ 0.89304904, -4.86966909,  7.71401473, ..., -0.1392635 ,
        -0.07512051, -0.15587736],
       [-0.9405249 ,  3.56794833, -0.02990135, ...,  0.01995991,
         0.03363875,  0.0546165 ],
       ...,
       [ 1.09733481, -5.91654583,  1.89982143, ...,  0.07262656,
        -0.08531967,  0.30402169],
       [-1.24449218,  4.44007582,  0.25624733, ..., -0.0749383 ,
        -0.01741271,  0.01621   ],
       [-0.04388389,  1.18593112,  0.44339915, ...,  0.06201217,
         0.03118662, -0.02441286]], shape=(4558, 19))