In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.decomposition import KernelPCA
import matplotlib.pyplot as plt
import os
from sklearn.metrics import mean_squared_error
import ipywidgets as widgets
from IPython.display import display, clear_output
import time


### Set the random seeds so that results are reproducible

np.random.seed(42)
torch.manual_seed(42)



### 标准

class Autoencoder(nn.Module):
    """Fully connected autoencoder with a mirrored 256 -> 128 hidden stack.

    Args:
        input_dim: Number of features in each input row.
        encoding_dim: Width of the bottleneck produced by ``encoder``.

    Attributes:
        encoder: ``input_dim -> 256 -> 128 -> encoding_dim`` with ReLU between
            the linear layers and no activation on the bottleneck.
        decoder: ``encoding_dim -> 128 -> 256 -> input_dim`` with ReLU between
            the linear layers and a linear output.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()

        compress = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, encoding_dim),
        ]
        self.encoder = nn.Sequential(*compress)

        expand = [
            nn.Linear(encoding_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
        ]
        self.decoder = nn.Sequential(*expand)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and return its reconstruction."""
        return self.decoder(self.encoder(x))

    
    
### 变分

class VariationalAutoencoder(nn.Module):
    """Variational autoencoder with a Gaussian latent and a sigmoid output.

    Args:
        input_dim: Number of features in each input row.
        encoding_dim: Dimensionality of the latent mean / log-variance.

    Attributes:
        encoder: ``input_dim -> 256 -> 128`` feature extractor (ReLU after
            every linear layer); it does NOT output the latent itself.
        fc_mu: Linear head mapping the 128-d features to the latent mean.
        fc_logvar: Linear head mapping the 128-d features to the latent
            log-variance.
        decoder: ``encoding_dim -> 128 -> 256 -> input_dim`` followed by a
            sigmoid, so reconstructions lie in ``[0, 1]``.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        feature_dim = 128

        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.Linear(256, feature_dim),
            nn.ReLU(),
        )
        self.fc_mu = nn.Linear(feature_dim, encoding_dim)
        self.fc_logvar = nn.Linear(feature_dim, encoding_dim)

        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, feature_dim),
            nn.ReLU(),
            nn.Linear(feature_dim, 256),
            nn.ReLU(),
            nn.Linear(256, input_dim),
            nn.Sigmoid(),
        )

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * sigma`` with ``eps ~ N(0, I)``."""
        sigma = (0.5 * logvar).exp()
        noise = torch.randn_like(sigma)
        return mu + noise * sigma

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the batch ``x``."""
        features = self.encoder(x)
        mu, logvar = self.fc_mu(features), self.fc_logvar(features)
        latent = self.reparameterize(mu, logvar)
        return self.decoder(latent), mu, logvar


    
### 循环

class RecurrentAutoencoder(nn.Module):
    """LSTM autoencoder that treats each flat feature row as a sequence.

    A row of ``input_dim`` features is split into ``seq_length`` steps of
    ``input_dim // seq_length`` features each. The encoder LSTM's final hidden
    state is the code; the decoder LSTM receives that code repeated at every
    step and its outputs are flattened back to ``input_dim`` values.

    Args:
        input_dim: Number of features in each input row. Must be a positive
            multiple of ``seq_length``.
        encoding_dim: Hidden size of the encoder LSTM (the code width).
        seq_length: Number of time steps each row is split into.

    Raises:
        ValueError: If ``seq_length`` is not positive or ``input_dim`` is not
            a positive multiple of it.
    """

    def __init__(self, input_dim, encoding_dim, seq_length=20):
        super().__init__()
        # Fail early with a clear message instead of an opaque reshape error
        # on the first forward pass.
        if seq_length <= 0:
            raise ValueError(f"seq_length must be positive, got {seq_length}")
        if input_dim < seq_length or input_dim % seq_length != 0:
            raise ValueError(
                f"input_dim ({input_dim}) must be a positive multiple of "
                f"seq_length ({seq_length})"
            )

        self.seq_length = seq_length
        step_dim = input_dim // seq_length
        self.encoder = nn.LSTM(input_size=step_dim,
                               hidden_size=encoding_dim,
                               num_layers=1,
                               batch_first=True)
        self.decoder = nn.LSTM(input_size=encoding_dim,
                               hidden_size=step_dim,
                               num_layers=1,
                               batch_first=True)

    def forward(self, x):
        """Reconstruct ``x`` (``batch x input_dim``) through the LSTM pair."""
        batch_size = x.size(0)
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x = x.reshape(batch_size, self.seq_length, -1)

        _, (h_n, _) = self.encoder(x)
        # h_n is (num_layers, batch, hidden); take the last (only) layer.
        encoded = h_n[-1]

        # Feed the same code to the decoder at every time step.
        decoded_input = encoded.unsqueeze(1).expand(-1, self.seq_length, -1)
        decoded_output, _ = self.decoder(decoded_input)
        return decoded_output.reshape(batch_size, -1)

    
    
### 对抗
    
class AdversarialAutoencoder(nn.Module):
    """Encoder/decoder pair used as the generator side of an adversarial AE.

    Args:
        input_dim: Number of features in each input row.
        encoding_dim: Width of the latent code produced by ``encoder``.

    Attributes:
        encoder: ``input_dim -> 256 -> 128 -> encoding_dim`` MLP.
        decoder: ``encoding_dim -> 128 -> 256 -> input_dim`` MLP.
    """

    @staticmethod
    def _mlp(widths):
        """Chain Linear layers over consecutive widths with ReLU in between."""
        layers = []
        last = len(widths) - 2
        for idx, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:])):
            layers.append(nn.Linear(fan_in, fan_out))
            # No activation after the final linear layer.
            if idx != last:
                layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        widths = (input_dim, 256, 128, encoding_dim)
        self.encoder = self._mlp(widths)
        self.decoder = self._mlp(widths[::-1])

    def forward(self, x):
        """Return the reconstruction of ``x``."""
        latent = self.encoder(x)
        return self.decoder(latent)

        
    
### 对抗的判别器

class Discriminator(nn.Module):
    """MLP that scores latent codes with a probability in ``(0, 1)``.

    Args:
        encoding_dim: Width of the latent codes fed to ``forward``.

    Attributes:
        model: ``encoding_dim -> 128 -> 64 -> 1``; the two hidden layers use
            LeakyReLU(0.2) followed by Dropout(0.3), the output is a sigmoid.
    """

    def __init__(self, encoding_dim):
        super().__init__()
        stages = []
        for fan_in, fan_out in ((encoding_dim, 128), (128, 64)):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.LeakyReLU(0.2),
                nn.Dropout(0.3),
            ]
        stages += [nn.Linear(64, 1), nn.Sigmoid()]
        self.model = nn.Sequential(*stages)

    def forward(self, z):
        """Return the per-sample validity score for latent batch ``z``."""
        return self.model(z)

### 训练标准
    
def train_standard_autoencoder(X, encoding_dim=50, epochs=50, batch_size=64, learning_rate=0.001):
    """Fit an ``Autoencoder`` on ``X`` with MSE loss and encode every row.

    Args:
        X: 2-D array of features (rows are samples); converted to a float
            tensor.
        encoding_dim: Width of the bottleneck.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size for the shuffled loader.
        learning_rate: Adam learning rate.

    Returns:
        ``(encoded_features, model)``: the NumPy array of bottleneck
        activations for all rows of ``X`` and the trained model.
    """
    X_tensor = torch.FloatTensor(X)
    dataloader = DataLoader(TensorDataset(X_tensor), batch_size=batch_size, shuffle=True)

    model = Autoencoder(X.shape[1], encoding_dim)
    mse = nn.MSELoss()
    adam = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        loss_sum = 0.0
        for (features,) in dataloader:
            adam.zero_grad()
            batch_loss = mse(model(features), features)
            batch_loss.backward()
            adam.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            loss_sum += batch_loss.item() * features.size(0)

        epoch_loss = loss_sum / len(dataloader.dataset)
        if (epoch + 1) % 10 == 0:
            print(f'Standard AE Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

    with torch.no_grad():
        codes = model.encoder(X_tensor)
    encoded_features = codes.numpy()

    return encoded_features, model


### 训练变分

def train_variational_autoencoder(X, encoding_dim=50, epochs=50, batch_size=64, learning_rate=0.001):
    """Fit a ``VariationalAutoencoder`` on ``X`` and encode every row.

    Args:
        X: 2-D array of features (rows are samples); converted to a float
            tensor.
        encoding_dim: Dimensionality of the latent mean / log-variance.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size for the shuffled loader.
        learning_rate: Adam learning rate.

    Returns:
        ``(encoded_features, model)``: the ``(n_samples, encoding_dim)`` NumPy
        array of latent means and the trained model.
    """
    X_tensor = torch.FloatTensor(X)
    dataset = TensorDataset(X_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    model = VariationalAutoencoder(X.shape[1], encoding_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    def loss_function(recon_x, x, mu, logvar):
        """Summed reconstruction BCE plus KL divergence to N(0, I)."""
        # NOTE(review): binary cross-entropy assumes targets in [0, 1]; the
        # caller in this file passes standardized features, which fall outside
        # that range - confirm whether an MSE reconstruction term (and a
        # linear decoder output) is intended instead.
        BCE = nn.functional.binary_cross_entropy(recon_x, x, reduction='sum')
        KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
        return BCE + KLD

    for epoch in range(epochs):
        running_loss = 0.0
        for batch in dataloader:
            data = batch[0]
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(data)
            loss = loss_function(recon_batch, data, mu, logvar)
            loss.backward()
            optimizer.step()
            # The loss is already summed over the batch.
            running_loss += loss.item()

        epoch_loss = running_loss / len(dataloader.dataset)
        if (epoch+1) % 10 == 0:
            print(f'VAE Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

    with torch.no_grad():
        # Use the latent mean as the encoding. ``model.encoder`` only yields
        # the 128-d hidden features, so it has to be passed through ``fc_mu``;
        # indexing its output with [0] returned a single sample's hidden
        # vector instead of one code per row.
        encoded_features = model.fc_mu(model.encoder(X_tensor)).numpy()

    return encoded_features, model




### 训练循环

def train_recurrent_autoencoder(X, encoding_dim=50, epochs=50, batch_size=64, learning_rate=0.001, seq_length=20):
    """Fit a ``RecurrentAutoencoder`` on ``X`` with MSE loss and encode every row.

    Args:
        X: 2-D array of features (rows are samples); converted to a float
            tensor.
        encoding_dim: Hidden size of the encoder LSTM.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size, used both for training and for the
            chunked encoding pass at the end.
        learning_rate: Adam learning rate.
        seq_length: Number of time steps each row is split into.

    Returns:
        ``(encoded_features, model)``: the stacked NumPy array of final
        encoder hidden states and the trained model.
    """
    X_tensor = torch.FloatTensor(X)
    dataloader = DataLoader(TensorDataset(X_tensor), batch_size=batch_size, shuffle=True)

    model = RecurrentAutoencoder(X.shape[1], encoding_dim, seq_length)
    mse = nn.MSELoss()
    adam = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        loss_sum = 0.0
        for (features,) in dataloader:
            adam.zero_grad()
            batch_loss = mse(model(features), features)
            batch_loss.backward()
            adam.step()
            # Weight by batch size so the epoch figure is a per-sample mean.
            loss_sum += batch_loss.item() * features.size(0)

        epoch_loss = loss_sum / len(dataloader.dataset)
        if (epoch + 1) % 10 == 0:
            print(f'Recurrent AE Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

    with torch.no_grad():
        # Encode in chunks; the code is the LSTM's final hidden state.
        chunks = []
        for start in range(0, len(X), batch_size):
            window = X_tensor[start:start + batch_size]
            sequences = window.view(window.size(0), model.seq_length, -1)
            _, (h_n, _) = model.encoder(sequences)
            chunks.append(h_n.squeeze(0).numpy())
        encoded_features = np.vstack(chunks)

    return encoded_features, model



###训练对抗

def train_adversarial_autoencoder(X, encoding_dim=50, epochs=50, batch_size=64, learning_rate=0.001):
    """Fit an ``AdversarialAutoencoder`` against a ``Discriminator``.

    Each mini-batch runs two steps: the discriminator learns to tell samples
    drawn from a standard normal ("real") from encoder outputs ("fake"); the
    autoencoder then minimises reconstruction MSE plus the BCE of fooling the
    discriminator.

    Args:
        X: 2-D array of features (rows are samples); converted to a float
            tensor.
        encoding_dim: Width of the latent code.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size for the shuffled loader.
        learning_rate: Adam learning rate for both optimizers.

    Returns:
        ``(encoded_features, ae)``: the NumPy array of latent codes for all
        rows of ``X`` and the trained autoencoder (the discriminator is
        discarded).
    """
    X_tensor = torch.FloatTensor(X)
    dataset = TensorDataset(X_tensor)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    ae = AdversarialAutoencoder(X.shape[1], encoding_dim)
    discriminator = Discriminator(encoding_dim)

    # Optimizers
    optimizer_ae = optim.Adam(ae.parameters(), lr=learning_rate)
    optimizer_d = optim.Adam(discriminator.parameters(), lr=learning_rate)

    # Loss functions
    criterion_ae = nn.MSELoss()
    criterion_d = nn.BCELoss()

    for epoch in range(epochs):
        running_ae_loss = 0.0
        running_d_loss = 0.0
        for batch in dataloader:
            data = batch[0]
            # The last batch may be smaller; use a separate name so the
            # ``batch_size`` argument is not overwritten.
            n_samples = data.size(0)
            real_labels = torch.ones(n_samples, 1)
            fake_labels = torch.zeros(n_samples, 1)

            # --- Discriminator step ---
            optimizer_d.zero_grad()

            # "Real" latents are drawn from the standard normal prior.
            z_real = torch.randn(n_samples, encoding_dim)
            d_real_loss = criterion_d(discriminator(z_real), real_labels)

            # "Fake" latents come from the encoder; no encoder gradient is
            # needed for this step, so skip building its graph.
            with torch.no_grad():
                z_fake = ae.encoder(data)
            d_fake_loss = criterion_d(discriminator(z_fake), fake_labels)

            d_loss = d_real_loss + d_fake_loss
            d_loss.backward()
            optimizer_d.step()
            running_d_loss += d_loss.item()

            # --- Autoencoder step ---
            optimizer_ae.zero_grad()

            # Encode once and reuse the code for both loss terms.
            z_fake = ae.encoder(data)
            reconstructed = ae.decoder(z_fake)
            recon_loss = criterion_ae(reconstructed, data)

            # Adversarial term: the encoder wants its codes labelled "real".
            g_loss = criterion_d(discriminator(z_fake), real_labels)

            ae_loss = recon_loss + g_loss
            ae_loss.backward()
            optimizer_ae.step()
            running_ae_loss += ae_loss.item()

        # Both running losses are batch means, so average over batch count.
        epoch_ae_loss = running_ae_loss / len(dataloader)
        epoch_d_loss = running_d_loss / len(dataloader)
        if (epoch+1) % 10 == 0:
            print(f'Adversarial AE Epoch [{epoch+1}/{epochs}], AE Loss: {epoch_ae_loss:.4f}, D Loss: {epoch_d_loss:.4f}')

    with torch.no_grad():
        encoded_features = ae.encoder(X_tensor).numpy()

    return encoded_features, ae




def evaluate_reconstruction(X, reconstructed):
    """Return the mean squared error between ``X`` and ``reconstructed``."""
    error = mean_squared_error(X, reconstructed)
    return error



def _reconstruction_mse(model, X_scaled):
    """Run ``model`` over ``X_scaled`` and return the reconstruction MSE."""
    with torch.no_grad():
        output = model(torch.FloatTensor(X_scaled))
    # The VAE forward returns (reconstruction, mu, logvar); keep the first.
    if isinstance(output, tuple):
        output = output[0]
    return evaluate_reconstruction(X_scaled, output.numpy())


def run_autoencoder_comparison(X, y, output_dir='autoencoder_results',
                              encoding_dim=50, epochs=50, batch_size=64,
                              use_standard_ae=True, use_denoising_ae=True,
                              use_sparse_ae=True, use_vae=True,
                              use_conv_ae=True, use_recurrent_ae=True,
                              use_adversarial_ae=True, use_kernel_pca=True,
                              evaluate=True):
    """Train the selected autoencoders on standardized ``X`` and save results.

    The features are standardized, saved together with the labels, and each
    enabled method is trained in a fixed order (standard, variational,
    recurrent, adversarial). Encoded features are written to
    ``<output_dir>/<method>_features.npy``. When ``evaluate`` is true, each
    model's reconstruction MSE is computed once, printed, and collected into
    ``reconstruction_report.csv``.

    Args:
        X: 2-D feature array (rows are samples).
        y: Labels; only saved to ``labels.npy``.
        output_dir: Directory for all artefacts; created if missing.
        encoding_dim: Code width passed to every trainer.
        epochs: Number of training epochs passed to every trainer.
        batch_size: Mini-batch size passed to every trainer.
        use_standard_ae: Train the standard autoencoder.
        use_vae: Train the variational autoencoder.
        use_recurrent_ae: Train the recurrent autoencoder (sequence length 20).
        use_adversarial_ae: Train the adversarial autoencoder.
        use_denoising_ae, use_sparse_ae, use_conv_ae, use_kernel_pca:
            Accepted for interface compatibility; not used by this function.
        evaluate: Whether to compute reconstruction MSEs and write the report.

    Returns:
        ``(results, models, report_df)`` where ``results`` maps method key to
        encoded features, ``models`` maps method key to the trained model, and
        ``report_df`` is the MSE table (``None`` when ``evaluate`` is false).
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Standardize the data
    print("标准化数据...")
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Save the standardized features and the labels
    np.save(os.path.join(output_dir, 'original_features.npy'), X_scaled)
    np.save(os.path.join(output_dir, 'labels.npy'), y)

    # (enabled, result key, display label, trainer, extra trainer kwargs)
    methods = [
        (use_standard_ae, 'standard_ae', '标准自编码器',
         train_standard_autoencoder, {}),
        (use_vae, 'vae', '变分自编码器',
         train_variational_autoencoder, {}),
        (use_recurrent_ae, 'recurrent_ae', '循环自编码器',
         train_recurrent_autoencoder, {'seq_length': 20}),
        (use_adversarial_ae, 'adversarial_ae', '对抗自编码器',
         train_adversarial_autoencoder, {}),
    ]

    results = {}
    models = {}
    report = {}

    for enabled, key, label, trainer, extra_kwargs in methods:
        if not enabled:
            continue

        print(f"\n训练{label}...")
        encoded, model = trainer(
            X_scaled,
            encoding_dim=encoding_dim,
            epochs=epochs,
            batch_size=batch_size,
            **extra_kwargs
        )
        results[key] = encoded
        models[key] = model
        np.save(os.path.join(output_dir, f'{key}_features.npy'), encoded)

        if evaluate:
            # Evaluate once and reuse the value for the report, so the printed
            # and reported MSE agree (the VAE forward pass is stochastic).
            mse = _reconstruction_mse(model, X_scaled)
            report[key] = mse
            print(f"{label}重构MSE: {mse:.6f}")

    report_df = None
    if evaluate:
        print("\n生成评估报告...")
        report_df = pd.DataFrame(list(report.items()), columns=['方法', '重构MSE'])
        report_df.to_csv(os.path.join(output_dir, 'reconstruction_report.csv'), index=False)
        print("重构评估报告已保存。")

    print("\n所有降维方法已完成！")
    print(f"结果已保存至: {output_dir}")
    return results, models, report_df
   


    

def create_interactive_ui(file_path='RB99_1m_Train_10877.csv'):   ##### training-set data
    """Build and display an ipywidgets form that runs the autoencoder comparison.

    The form exposes the CSV path, encoding dimension, epoch count, batch
    size and one checkbox per method. Clicking the run button reads the CSV
    (first column as labels, remaining columns as features), calls
    ``run_autoencoder_comparison`` and shows the sorted reconstruction report
    in the output area.

    Args:
        file_path: Initial value of the CSV path text box.
    """
    ### Basic parameters
    # BoundedIntText enforces min/max; plain IntText has no such traits and
    # would ignore the bounds.
    encoding_dim = widgets.BoundedIntText(value=50, description='降维维度:', min=10, max=200)
    epochs = widgets.BoundedIntText(value=50, description='训练轮数:', min=10, max=200)
    batch_size = widgets.BoundedIntText(value=64, description='批次大小:', min=32, max=1024)

    file_path_input = widgets.Text(
        value=file_path,
        description='文件路径:',
        placeholder='输入CSV文件路径'
    )

    method_options = {
        '标准自编码器': widgets.Checkbox(value=True, description='标准自编码器'),
        '变分自编码器': widgets.Checkbox(value=True, description='变分自编码器'),
        '循环自编码器': widgets.Checkbox(value=True, description='循环自编码器'),
        '对抗自编码器': widgets.Checkbox(value=True, description='对抗自编码器')
    }

    run_button = widgets.Button(
        description='运行降维',
        button_style='success',
        tooltip='点击开始运行自编码器对比分析',
        icon='play'
    )

    output = widgets.Output()

    params_box = widgets.VBox([
        file_path_input,
        widgets.HBox([encoding_dim, epochs, batch_size]),
        widgets.VBox(list(method_options.values())),
        run_button
    ])

    def on_run_clicked(b):
        """Button callback: load the CSV and run the selected methods."""
        with output:
            clear_output(wait=True)

            csv_path = file_path_input.value

            if not os.path.exists(csv_path):
                print(f"错误: 文件 '{csv_path}' 不存在！")
                return

            try:
                print(f"读取数据: {csv_path}")
                df = pd.read_csv(csv_path)
                print(f"数据形状: {df.shape}")

                ### Split features and labels: column 0 is the label
                X = df.iloc[:, 1:].values
                y = df.iloc[:, 0].values

                methods = {
                    'use_standard_ae': method_options['标准自编码器'].value,
                    'use_vae': method_options['变分自编码器'].value,
                    'use_recurrent_ae': method_options['循环自编码器'].value,
                    'use_adversarial_ae': method_options['对抗自编码器'].value
                }

                start_time = time.time()
                results, models, report = run_autoencoder_comparison(
                    X, y,
                    output_dir='autoencoder_results',
                    encoding_dim=encoding_dim.value,
                    epochs=epochs.value,
                    batch_size=batch_size.value,
                    **methods
                )

                end_time = time.time()
                print(f"\n全部完成！耗时: {end_time - start_time:.2f}秒")

                if report is not None:
                    print("\n重构误差对比:")
                    display(report.sort_values('重构MSE'))

            # UI boundary: report any failure in the output area instead of
            # letting the callback raise.
            except Exception as e:
                print(f"运行过程中发生错误: {str(e)}")

    run_button.on_click(on_run_clicked)

    display(params_box, output)

# Build and display the interactive UI using the default CSV path.
create_interactive_ui()

VBox(children=(Text(value='RB99_1m_Train_10877.csv', description='文件路径:', placeholder='输入CSV文件路径'), HBox(child…

Output()