In [1]:
import pandas as pd
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 加载 CSV 文件
data = pd.read_csv("../data/water/整理好的csv/杭州202201-202203/浙江杭州202201-202203.csv", encoding="utf-8")

In [3]:
# 处理缺失值
data.fillna(method='ffill', inplace=True)

# 删除重复行
data.drop_duplicates(inplace=True)

  data.fillna(method='ffill', inplace=True)


In [4]:
# 选择数值列（水温, pH, 溶解氧, 高锰酸钾, 氨氮, 总磷, 总氮, 电导率, 浊度）
from sklearn.discriminant_analysis import StandardScaler

numeric_columns = ['水温', 'pH', '溶解氧', '高锰酸钾', '氨氮', '总磷', '总氮', '电导率', '浊度']
data_numeric = data[numeric_columns]

# 数据标准化处理
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# 转换为Pandas DataFrame格式
data_scaled_df = pd.DataFrame(data_scaled, columns=numeric_columns)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import shap
import pandas as pd

# 定义VAE模型
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        # 编码器
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc2_logvar = nn.Linear(hidden_dim, latent_dim)
        # 解码器
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        h1 = torch.relu(self.fc1(x))
        return self.fc2_mu(h1), self.fc2_logvar(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h3 = torch.relu(self.fc3(z))
        return self.fc4(h3)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

# 定义损失函数
def loss_function(recon_x, x, mu, logvar):
    # 使用MSE作为重构损失
    MSE = nn.functional.mse_loss(recon_x, x, reduction='sum')
    # KL散度损失
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return MSE + KLD

# 训练VAE模型
def train_vae(data, input_dim, hidden_dim=64, latent_dim=16, epochs=100, batch_size=32, learning_rate=1e-3):
    model = VAE(input_dim, hidden_dim, latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    data_tensor = torch.FloatTensor(data)

    dataset = torch.utils.data.TensorDataset(data_tensor)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch_idx, (inputs,) in enumerate(dataloader):
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(inputs)
            loss = loss_function(recon_batch, inputs, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()

        print(f'Epoch {epoch+1}, Loss: {train_loss / len(dataloader.dataset)}')

    return model

# 使用VAE模型进行异常检测
def detect_anomalies(model, data, threshold=3):
    data_tensor = torch.FloatTensor(data)
    model.eval()
    with torch.no_grad():
        recon_data, _, _ = model(data_tensor)
        reconstruction_error = torch.mean((recon_data - data_tensor) ** 2, dim=1)
        anomalies = reconstruction_error > threshold
    return anomalies.numpy(), reconstruction_error.numpy()

# 使用SHAP解释每个特征对异常的贡献
def explain_anomalies(model, data, sample_data):
    # 定义解释器函数
    def model_predict(inputs):
        model.eval()
        with torch.no_grad():
            recon_data, _, _ = model(torch.FloatTensor(inputs))
            reconstruction_error = torch.mean((recon_data - torch.FloatTensor(inputs)) ** 2, dim=1)
        return reconstruction_error.numpy()

    # 创建一个KernelExplainer来计算SHAP值
    explainer = shap.KernelExplainer(model_predict, data)
    
    # 计算SHAP值
    shap_values = explainer.shap_values(sample_data)

    return shap_values

# 设置参数并训练VAE模型
input_dim = data_scaled_df.shape[1]
vae_model = train_vae(data_scaled_df.values, input_dim, epochs=100)

# 检测异常
anomalies, recon_error = detect_anomalies(vae_model, data_scaled_df.values)

# 将检测结果显示给用户
anomalies_df = data.copy()
anomalies_df['Reconstruction Error'] = recon_error
anomalies_df['Anomaly'] = anomalies

# 筛选出异常数据点
anomalous_data = data_scaled_df.values[anomalies]

# 解释异常值的SHAP值
shap_values = explain_anomalies(vae_model, data_scaled_df.values, anomalous_data)

# 创建一个DataFrame来保存SHAP值
shap_df = pd.DataFrame(shap_values, columns=data_scaled_df.columns)

# 将SHAP值添加到异常数据中
anomalies_df = anomalies_df[anomalies_df['Anomaly'] == True]
for column in shap_df.columns:
    anomalies_df[f'SHAP_{column}'] = shap_df[column].values

anomalies_df

Epoch 1, Loss: 8.713138850728248
Epoch 2, Loss: 7.581966686800036
Epoch 3, Loss: 7.084331886145276
Epoch 4, Loss: 6.671225108816009
Epoch 5, Loss: 6.395419762277443
Epoch 6, Loss: 6.101898807285403
Epoch 7, Loss: 6.000782617581155
Epoch 8, Loss: 5.838146760973627
Epoch 9, Loss: 5.726672522739541
Epoch 10, Loss: 5.587839495058585
Epoch 11, Loss: 5.622183730981616
Epoch 12, Loss: 5.504643705139996
Epoch 13, Loss: 5.463601212462358
Epoch 14, Loss: 5.408292945047116
Epoch 15, Loss: 5.329031166422501
Epoch 16, Loss: 5.328179536128356
Epoch 17, Loss: 5.30793609944092
Epoch 18, Loss: 5.233413977139911
Epoch 19, Loss: 5.2386975336263415
Epoch 20, Loss: 5.177080805482276
Epoch 21, Loss: 5.096973567158392
Epoch 22, Loss: 5.176800333550537
Epoch 23, Loss: 5.0758872286033805
Epoch 24, Loss: 5.03474337488168
Epoch 25, Loss: 5.045329528845731
Epoch 26, Loss: 5.012654420268329
Epoch 27, Loss: 4.996744602111686
Epoch 28, Loss: 4.9101567710827725
Epoch 29, Loss: 4.92783563333621
Epoch 30, Loss: 4.88480

Using 6574 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Epoch 99, Loss: 4.461009418706891
Epoch 100, Loss: 4.466000521839445


100%|██████████| 3/3 [00:22<00:00,  7.33s/it]


Unnamed: 0,省份,城市,河流,流域,断面名称,监测时间,水质类别,水温,pH,溶解氧,...,Anomaly,SHAP_水温,SHAP_pH,SHAP_溶解氧,SHAP_高锰酸钾,SHAP_氨氮,SHAP_总磷,SHAP_总氮,SHAP_电导率,SHAP_浊度
172,浙江省,杭州市,新安江,浙闽片河流,洋溪渡,2022/1/3 8:00,Ⅰ,13.93,8.39,12.25,...,True,-0.13526,0.253369,-0.418984,-0.12759,-0.132431,-0.090362,1.851704,-0.140916,-0.104388
3031,浙江省,杭州市,千岛湖,浙闽片河流,大坝前,2022/2/11 4:00,劣Ⅴ,12.76,8.196,6.737,...,True,-0.366051,0.203201,-0.933894,-1.242201,0.057764,4.408408,-0.412648,0.193712,-0.075205
3081,浙江省,杭州市,千岛湖,浙闽片河流,大坝前,2022/2/11 20:00,劣Ⅴ,12.79,8.238,6.942,...,True,-0.303502,0.320025,-0.872712,-1.178534,0.121315,4.548388,-0.336562,0.258405,-0.009368
