In [None]:
import os

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

In [None]:
# 1. Load the transaction data
# ===============================

# Default location of the training CSV. Set the TRAIN_TRANSACTION_CSV
# environment variable to run the notebook against a copy stored elsewhere
# without editing this cell.
DATA_PATH = os.environ.get(
    "TRAIN_TRANSACTION_CSV",
    r"C:\Users\93539\train_transaction.csv",
)

# The whole file is read (no `nrows`, no sampling), so the fraud rate printed
# below is the rate over the full dataset. Any sub-sampling has to happen in a
# later step.
df = pd.read_csv(DATA_PATH)
overall_fraud_rate = df["isFraud"].mean()
print(f"Overall fraud rate (full dataset): {overall_fraud_rate:.4f}")

In [None]:
# 2. Target vector and feature matrix
y = df["isFraud"]

# Keep numeric columns only (categorical variables such as card type /
# card network are ignored), then remove the target from the inputs.
# NOTE(review): every numeric column is kept, including any identifier-like
# ones the CSV may contain - confirm that is intended.
X = df.select_dtypes(include="number").drop(columns="isFraud")

print("Original feature dimension:", len(X.columns))


In [None]:
# 3. Preprocessing: median imputation followed by standardization.
# Both transformers are fitted on X and kept as globals because they are
# persisted together with the encoder at the end of the notebook.
imputer = SimpleImputer(strategy="median").fit(X)
X_imp = imputer.transform(X)

scaler = StandardScaler().fit(X_imp)
X_scaled = scaler.transform(X_imp)

# Convert to a float32 torch tensor (astype makes an independent copy, so the
# tensor does not alias X_scaled).
X_tensor = torch.from_numpy(X_scaled.astype(np.float32))


In [None]:
# 4. Denoising autoencoder model
class DAE(nn.Module):
    """Fully connected autoencoder with a symmetric encoder/decoder.

    The encoder maps ``input_dim -> 128 -> 64 -> latent_dim`` and the decoder
    mirrors it back to ``input_dim``; ReLU follows every linear layer except
    the last one of each half. The module itself adds no noise - the caller
    is expected to corrupt the input before calling ``forward``.

    Args:
        input_dim: number of input features (and of reconstructed outputs).
        latent_dim: width of the bottleneck produced by ``encoder``.
    """

    def __init__(self, input_dim, latent_dim):
        super().__init__()
        hidden = (128, 64)
        # Encoder is built first so parameters are created in the same order
        # as before (matters for seeded initialisation).
        self.encoder = self._mlp([input_dim, *hidden, latent_dim])
        self.decoder = self._mlp([latent_dim, *reversed(hidden), input_dim])

    @staticmethod
    def _mlp(sizes):
        """Stack Linear layers for consecutive ``sizes``, ReLU in between."""
        layers = []
        for fan_in, fan_out in zip(sizes[:-1], sizes[1:]):
            layers.append(nn.Linear(fan_in, fan_out))
            layers.append(nn.ReLU())
        # The final projection is linear: drop the trailing activation.
        return nn.Sequential(*layers[:-1])

    def forward(self, x):
        """Encode ``x`` to the latent space and return its reconstruction."""
        return self.decoder(self.encoder(x))


In [None]:
# 5. Train the DAE with input denoising
#
# Noise:  additive Gaussian noise on the (standardized) inputs
# Level:  noise_std = 0.1, a deliberately conservative value
# Loss:   MSE between the reconstruction and the *clean* batch
# No validation set is used here (this is unsupervised pretraining).


# Hyperparameters
SEED = 42                # fixes weight init, batch shuffling and noise draws
input_dim = X_tensor.shape[1]
latent_dim = 32          # width of the bottleneck / saved latent features
batch_size = 1024
num_epochs = 20
learning_rate = 1e-3
noise_std = 0.1

# Seed before the model and loader are created so that re-running this cell
# repeats the same initial weights, shuffle order and noise. (Some CUDA
# kernels can still be non-deterministic; see the PyTorch reproducibility
# notes.)
torch.manual_seed(SEED)

# Data loading
dataset = TensorDataset(X_tensor)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DAE(input_dim, latent_dim).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for (x_batch,) in loader:
        x_batch = x_batch.to(device)

        # Corrupt the input; the target stays the clean batch.
        noise = noise_std * torch.randn_like(x_batch)
        x_noisy = x_batch + noise

        # forward
        x_recon = model(x_noisy)
        loss = criterion(x_recon, x_batch)

        # backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight by batch size so the epoch average is per-sample even when
        # the last batch is smaller.
        total_loss += loss.item() * x_batch.size(0)

    avg_loss = total_loss / len(dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}] - Recon Loss: {avg_loss:.6f}")


In [None]:
# 6. Extract the latent representation
#
# The data is encoded in chunks of `batch_size` rows instead of moving the
# whole tensor to the device and encoding it in one pass, which keeps peak
# device memory bounded the same way the training loop does.
model.eval()
latent_chunks = []
with torch.no_grad():
    for x_chunk in X_tensor.split(batch_size):
        latent_chunks.append(model.encoder(x_chunk.to(device)).cpu())

# Row order is preserved: chunks are consecutive slices of X_tensor.
Z = torch.cat(latent_chunks).numpy()

print("Latent representation shape:", Z.shape)


In [None]:
#7、保存有用数据

In [None]:
# Save the artifacts and confirm they were written.
#
# The latent-size tag in the filenames is taken from the actual width of Z
# rather than hard-coded, so the name always matches the contents and runs
# with different latent sizes do not overwrite each other's outputs.
latent_tag = f"D{Z.shape[1]}"
latent_path = f"Z_latent_{latent_tag}.npy"
encoder_path = f"dae_encoder_{latent_tag}.pt"

np.save(latent_path, Z)
np.save("y_labels.npy", y.values)
# Only the encoder weights are stored; the decoder is not saved.
torch.save(model.encoder.state_dict(), encoder_path)
# The fitted preprocessing objects are saved alongside the encoder.
joblib.dump(imputer, "imputer.joblib")
joblib.dump(scaler, "scaler.joblib")

files = [
    latent_path,
    "y_labels.npy",
    encoder_path,
    "imputer.joblib",
    "scaler.joblib",
]

for f in files:
    print(f, "exists:", os.path.exists(f))
