# Set up

In [None]:
import pandas as pd
import numpy as np
import tqdm as tqdm

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
# Fit a full-rank PCA on the standardized training targets so every
# component's share of the variance is available.
pca = PCA().fit(Y_data_train_standardized)

# Running total of the explained-variance ratios, one entry per component.
cumulative_variance = pca.explained_variance_ratio_.cumsum()

# Plot cumulative explained variance against the number of components kept
# (1-based on the x axis).
plt.figure(figsize=(8, 5))
plt.plot(
    np.arange(1, cumulative_variance.size + 1),
    cumulative_variance,
    marker='o',
    linestyle='--',
)

plt.grid()
plt.show()

In [None]:
def split_and_apply_pca(train_data, test_data, variance_threshold=0.999):
    """Reduce every column except the first with PCA fitted on the training set.

    The first column of each array is passed through unchanged; the remaining
    columns are projected onto the smallest number of principal components
    whose cumulative explained-variance ratio (measured on ``train_data``)
    reaches ``variance_threshold``.

    Args:
        train_data: 2-D array; indexed as ``[:, 0]`` and ``[:, 1:]``.
        test_data: 2-D array with the same column layout as ``train_data``.
        variance_threshold: Minimum cumulative explained-variance ratio the
            retained components must reach.

    Returns:
        A tuple ``(train_final, test_final, n_components)`` where the two
        arrays hold the untouched first column followed by the PCA-reduced
        columns, and ``n_components`` is the number of components kept.
    """
    # Split off the first column (kept as a column vector for hstack below).
    train_first_col = train_data[:, 0].reshape(-1, 1)
    test_first_col = test_data[:, 0].reshape(-1, 1)

    train_remaining = train_data[:, 1:]
    test_remaining = test_data[:, 1:]

    # Fit a full PCA on the remaining training columns only, so the test
    # data never influences the choice of components.
    pca = PCA()
    pca.fit(train_remaining)

    # Cumulative explained variance, one entry per component.
    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Pick the first component count that reaches the threshold.
    # np.argmax on an all-False mask returns 0, which would silently select
    # a single component; this happens when the threshold is never reached
    # (e.g. threshold of 1.0 with the cumulative sum rounding to just below
    # 1, or a threshold above 1). In that case keep every component.
    reached = cumulative_variance >= variance_threshold
    if reached.any():
        n_components = int(np.argmax(reached)) + 1
    else:
        n_components = len(cumulative_variance)

    # Refit PCA with the chosen number of components and project both sets.
    pca = PCA(n_components=n_components)
    train_reduced = pca.fit_transform(train_remaining)
    test_reduced = pca.transform(test_remaining)

    # Re-attach the untouched first column in front of the reduced features.
    train_final = np.hstack((train_first_col, train_reduced))
    test_final = np.hstack((test_first_col, test_reduced))

    return train_final, test_final, n_components