In [None]:
import numpy as np
import pandas as pd
from pandas import read_csv
from sklearn.preprocessing import MinMaxScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, ConstantKernel
import pywt

def preprocess(dataset):
    """Min-max scale ``dataset`` into the range [0, 1].

    Args:
        dataset: 2-D array-like accepted by ``MinMaxScaler.fit_transform``.

    Returns:
        A ``(scaled, scaler)`` tuple: the transformed data and the fitted
        ``MinMaxScaler`` (kept so callers can invert the scaling later).
    """
    unit_scaler = MinMaxScaler(feature_range=(0, 1))
    return unit_scaler.fit_transform(dataset), unit_scaler

def apply_wavelet_denoising(data, wavelet='db1', level=1, threshold_ratio=0.5):
    """Denoise a signal by soft-thresholding its wavelet detail coefficients.

    Args:
        data: Array-like signal. NaN values are replaced with 0 first.
        wavelet: Wavelet name passed to ``pywt.wavedec`` / ``pywt.waverec``.
        level: Decomposition level passed to ``pywt.wavedec``.
        threshold_ratio: Fraction of the largest absolute detail coefficient
            used as the soft threshold.

    Returns:
        The denoised signal with the same length (last axis) as the input.
        Signals with fewer than 16 entries are returned unfiltered (only the
        NaN replacement is applied).
    """
    data = np.nan_to_num(data)  # replace NaN with 0 before the wavelet transform
    if len(data) < 16:  # too short to be worth decomposing
        return data

    coeffs = pywt.wavedec(data, wavelet, level=level)
    details = coeffs[1:]

    # Threshold on magnitude: taking the signed maximum would give a negative
    # threshold whenever every detail coefficient is negative.
    peak = max(
        (float(np.max(np.abs(band))) for band in details if len(band) > 0),
        default=0.0,
    )
    threshold = threshold_ratio * peak

    # A zero threshold removes nothing, so skip the call; this presumably also
    # avoids a 0/0 division inside the soft-threshold routine for all-zero bands.
    if threshold > 0:
        coeffs[1:] = [pywt.threshold(band, value=threshold, mode='soft') for band in details]

    reconstructed = pywt.waverec(coeffs, wavelet)
    # Reconstruction of an odd-length signal can come back one sample longer;
    # trim so the output always lines up with the input.
    return reconstructed[..., :data.shape[-1]]

def apply_gaussian_process(data, random_state=None):
    """Smooth a series by fitting a Gaussian process over its sample indices.

    The regressor is fitted on ``(index, value)`` pairs and then evaluated at
    the same indices, so the result has one prediction per input entry.

    Args:
        data: Array-like series. NaN values are replaced with 0 first.
        random_state: Optional seed forwarded to ``GaussianProcessRegressor``
            so the optimizer restarts are reproducible. ``None`` keeps the
            previous unseeded behavior.

    Returns:
        The Gaussian-process mean prediction at every input index. An empty
        input is returned as an empty array without fitting.
    """
    data = np.nan_to_num(data)  # replace NaN values
    if len(data) == 0:
        # Nothing to fit; the regressor would reject an empty training set.
        return data

    positions = np.arange(len(data)).reshape(-1, 1)
    kernel = (
        ConstantKernel(1.0, (1e-10, 1e5))
        * RBF(length_scale=1.0, length_scale_bounds=(1e-2, 1e3))
        + WhiteKernel(noise_level=0.1, noise_level_bounds=(1e-10, 1e1))
    )
    gp = GaussianProcessRegressor(
        kernel=kernel,
        n_restarts_optimizer=20,
        alpha=1e-6,
        random_state=random_state,
    )
    gp.fit(positions, data)
    # Only the mean is needed, so the standard deviation is not requested.
    return gp.predict(positions)

def _downsample_cycle(values, sample):
    """Average consecutive groups of ``sample`` readings, dropping the incomplete tail."""
    remainder = len(values) % sample
    if remainder:
        values = values[:-remainder]
    return values.reshape(len(values) // sample, sample).mean(axis=1)


def _scale_and_smooth(rows):
    """Min-max scale ``rows``, flatten, then wavelet-denoise and GP-smooth the series."""
    scaled, scaler = preprocess(pd.DataFrame(rows).values)
    smoothed = apply_gaussian_process(apply_wavelet_denoising(scaled.flatten()))
    return smoothed, scaler


# Extract voltage, current, temperature and capacity sequences from the datasets.
def extract_VIT_capacity(x_datasets, y_datasets, seq_len, hop, sample, v=False, II=False, t=False, c=False):
    """Build sliding-window inputs and capacity targets from paired CSV datasets.

    Each ``(x_data, y_data)`` pair is processed independently: its series are
    scaled and smoothed on their own, and windows are cut only from that
    dataset's series.

    Args:
        x_datasets: Iterable of CSV sources readable by ``read_csv`` with the
            columns ``cycle``, ``voltage_battery``, ``current_battery`` and
            ``temp_battery``. Rows with ``cycle == 0`` are discarded.
        y_datasets: Iterable of CSV sources with a ``capacity`` column. Row
            ``i`` is paired by position with the ``i``-th distinct cycle of
            the matching x dataset.
        seq_len: Number of consecutive entries in every input window.
        hop: Step between the starts of successive windows.
        sample: Number of raw readings averaged into one downsampled value.
        v: Include voltage windows in ``x``.
        II: Include current windows in ``x``.
        t: Include temperature windows in ``x``.
        c: Include capacity windows in ``x``.

    Returns:
        ``(x, y, scaler_C)`` where ``x`` is an array of the selected windows
        (appended per window position in the order voltage, current,
        temperature, capacity), ``y`` is the array of smoothed, scaled
        capacity values that follow each window, and ``scaler_C`` is the
        capacity scaler fitted on the last non-empty dataset (``None`` when
        no dataset contained any capacity rows).

    Raises:
        IndexError: If an x dataset has fewer distinct cycles than the
            matching y dataset has rows.
    """
    # Flag/column pairs in the order their windows are appended to ``x``.
    selected_columns = [
        column
        for enabled, column in (
            (v, 'voltage_battery'),
            (II, 'current_battery'),
            (t, 'temp_battery'),
        )
        if enabled
    ]

    x, y = [], []
    scaler_C = None

    for x_data, y_data in zip(x_datasets, y_datasets):
        x_df = read_csv(x_data).dropna()
        x_df = x_df[['cycle', 'voltage_battery', 'current_battery', 'temp_battery']]
        x_df = x_df[x_df['cycle'] != 0].reset_index(drop=True)

        y_df = read_csv(y_data).dropna()
        capacity = y_df[['capacity']].values.astype('float32')
        y_len = len(capacity)
        if y_len == 0:
            # No capacity rows: nothing to scale and no windows to emit.
            continue

        # Computed once per dataset instead of once per cycle.
        cycle_ids = x_df['cycle'].unique()
        if len(cycle_ids) < y_len:
            raise IndexError(
                f"x dataset has {len(cycle_ids)} cycles but y dataset has {y_len} rows"
            )

        data_len = (y_len - seq_len - 1) // hop + 1

        # Per-dataset buffers, so windows never reach into an earlier dataset.
        per_cycle = {column: [] for column in selected_columns}
        if selected_columns:
            for cy in cycle_ids[:y_len]:
                cycle_df = x_df.loc[x_df['cycle'] == cy]
                for column in selected_columns:
                    per_cycle[column].append(
                        _downsample_cycle(cycle_df[column].to_numpy(), sample)
                    )

        # Scaling and smoothing run once on the complete series; doing it after
        # every appended cycle only repeated work whose result was overwritten.
        scaled_C, scaler_C = _scale_and_smooth(capacity)
        # NOTE(review): each channel matrix (cycles x samples) is flattened
        # before windowing, so a window spans ``seq_len`` consecutive values of
        # that flattened series rather than ``seq_len`` cycles - confirm intent.
        smoothed = {
            column: _scale_and_smooth(rows)[0] for column, rows in per_cycle.items()
        }

        for i in range(data_len):
            start = hop * i
            stop = start + seq_len
            for column in selected_columns:
                x.append(smoothed[column][start:stop])
            if c:
                x.append(scaled_C[start:stop])
            # The target is the capacity value immediately after the window.
            y.append(scaled_C[stop])

    return np.array(x), np.array(y), scaler_C