In [1]:
import pandas as pd
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 加载 CSV 文件
data = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/东苕溪202101-202112.csv")

In [3]:
# 处理缺失值
data.fillna(method='ffill', inplace=True)

# 删除重复行
data.drop_duplicates(inplace=True)

  data.fillna(method='ffill', inplace=True)


In [4]:
# 选择数值列（水温, pH, 溶解氧, 高锰酸钾, 氨氮, 总磷, 总氮, 电导率, 浊度）
from sklearn.discriminant_analysis import StandardScaler

numeric_columns = ['水温', 'pH', '溶解氧', '高锰酸钾', '氨氮', '总磷', '总氮', '电导率', '浊度']
data_numeric = data[numeric_columns]

# 数据标准化处理
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# 转换为Pandas DataFrame格式
data_scaled_df = pd.DataFrame(data_scaled, columns=numeric_columns)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import shap
import pandas as pd

# 定义VAE模型
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        # 编码器
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc2_logvar = nn.Linear(hidden_dim, latent_dim)
        # 解码器
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        h1 = torch.relu(self.fc1(x))
        return self.fc2_mu(h1), self.fc2_logvar(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h3 = torch.relu(self.fc3(z))
        return self.fc4(h3)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

# 定义损失函数
def loss_function(recon_x, x, mu, logvar):
    # 使用MSE作为重构损失
    MSE = nn.functional.mse_loss(recon_x, x, reduction='sum')
    # KL散度损失
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return MSE + KLD

# 训练VAE模型
def train_vae(data, input_dim, hidden_dim=64, latent_dim=16, epochs=100, batch_size=32, learning_rate=1e-3):
    model = VAE(input_dim, hidden_dim, latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    data_tensor = torch.FloatTensor(data)

    dataset = torch.utils.data.TensorDataset(data_tensor)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch_idx, (inputs,) in enumerate(dataloader):
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(inputs)
            loss = loss_function(recon_batch, inputs, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()

        print(f'Epoch {epoch+1}, Loss: {train_loss / len(dataloader.dataset)}')

    return model

# 使用VAE模型进行异常检测
def detect_anomalies(model, data, threshold=3):
    data_tensor = torch.FloatTensor(data)
    model.eval()
    with torch.no_grad():
        recon_data, _, _ = model(data_tensor)
        reconstruction_error = torch.mean((recon_data - data_tensor) ** 2, dim=1)
        anomalies = reconstruction_error > threshold
    return anomalies.numpy(), reconstruction_error.numpy()

# 使用SHAP解释每个特征对异常的贡献
def explain_anomalies(model, data, sample_data):
    # 定义解释器函数
    def model_predict(inputs):
        model.eval()
        with torch.no_grad():
            recon_data, _, _ = model(torch.FloatTensor(inputs))
            reconstruction_error = torch.mean((recon_data - torch.FloatTensor(inputs)) ** 2, dim=1)
        return reconstruction_error.numpy()

    # 创建一个KernelExplainer来计算SHAP值
    explainer = shap.KernelExplainer(model_predict, data)
    
    # 计算SHAP值
    shap_values = explainer.shap_values(sample_data)

    return shap_values

# 设置参数并训练VAE模型
input_dim = data_scaled_df.shape[1]
vae_model = train_vae(data_scaled_df.values, input_dim, epochs=100)

# 检测异常
anomalies, recon_error = detect_anomalies(vae_model, data_scaled_df.values)

# 将检测结果显示给用户
anomalies_df = data.copy()
anomalies_df['Reconstruction Error'] = recon_error
anomalies_df['Anomaly'] = anomalies

# 筛选出异常数据点
anomalous_data = data_scaled_df.values[anomalies]

# 解释异常值的SHAP值
shap_values = explain_anomalies(vae_model, data_scaled_df.values, anomalous_data)

# 创建一个DataFrame来保存SHAP值
shap_df = pd.DataFrame(shap_values, columns=data_scaled_df.columns)

# 将SHAP值添加到异常数据中
anomalies_df = anomalies_df[anomalies_df['Anomaly'] == True]
for column in shap_df.columns:
    anomalies_df[f'SHAP_{column}'] = shap_df[column].values

anomalies_df

Epoch 1, Loss: 9.15174330410401
Epoch 2, Loss: 7.82560385253715
Epoch 3, Loss: 7.1231628812283585
Epoch 4, Loss: 6.7628811303768295
Epoch 5, Loss: 6.559116306767953
Epoch 6, Loss: 6.514475739552999
Epoch 7, Loss: 6.416576657336302
Epoch 8, Loss: 6.358292516172166
Epoch 9, Loss: 6.211587988032712
Epoch 10, Loss: 6.129690112037031
Epoch 11, Loss: 6.075085430100455
Epoch 12, Loss: 5.972291950891983
Epoch 13, Loss: 5.836226843443343
Epoch 14, Loss: 5.836253877907128
Epoch 15, Loss: 5.7642737965893724
Epoch 16, Loss: 5.724190057024138
Epoch 17, Loss: 5.692211509591075
Epoch 18, Loss: 5.669039960008351
Epoch 19, Loss: 5.636902757613288
Epoch 20, Loss: 5.606406412744485
Epoch 21, Loss: 5.515956911521679
Epoch 22, Loss: 5.559553637762226
Epoch 23, Loss: 5.54487691713481
Epoch 24, Loss: 5.558673957221286
Epoch 25, Loss: 5.432712790923093
Epoch 26, Loss: 5.440015892919004
Epoch 27, Loss: 5.38345547514029
Epoch 28, Loss: 5.437566782593634
Epoch 29, Loss: 5.448428937393702
Epoch 30, Loss: 5.399895

Using 2554 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Epoch 96, Loss: 4.948775077108863
Epoch 97, Loss: 4.982486957857734
Epoch 98, Loss: 4.9987052224442285
Epoch 99, Loss: 4.957690069427132
Epoch 100, Loss: 4.977964626781252


100%|██████████| 2/2 [00:05<00:00,  2.68s/it]


Unnamed: 0,省份,城市,河流,流域,断面名称,监测时间,水质类别,水温,pH,溶解氧,...,Anomaly,SHAP_水温,SHAP_pH,SHAP_溶解氧,SHAP_高锰酸钾,SHAP_氨氮,SHAP_总磷,SHAP_总氮,SHAP_电导率,SHAP_浊度
251,浙江省,杭州市,东苕溪,太湖流域,汪家埠,2021/2/14 12:00,劣Ⅴ,13.13,7.47,8.58,...,True,0.291926,-0.152321,0.09101,0.17747,0.186357,4.12664,-0.177288,0.410969,0.045609
865,浙江省,杭州市,东苕溪,太湖流域,汪家埠,2021/5/21 8:00,Ⅲ,21.04,7.04,8.501,...,True,0.076651,-0.141296,0.282273,0.114435,1.206885,0.058631,0.136769,1.300863,0.233195


In [10]:
# 加载 CSV 文件
data = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/京杭运河202101-202112.csv")
# 处理缺失值
data.fillna(method='ffill', inplace=True)

# 删除重复行
data.drop_duplicates(inplace=True)
# 选择数值列（水温, pH, 溶解氧, 高锰酸钾, 氨氮, 总磷, 总氮, 电导率, 浊度）
numeric_columns = ['水温', 'pH', '溶解氧', '高锰酸钾', '氨氮', '总磷', '总氮', '电导率', '浊度']
data_numeric = data[numeric_columns]

# 数据标准化处理
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# 转换为Pandas DataFrame格式
data_scaled_df = pd.DataFrame(data_scaled, columns=numeric_columns)

  data.fillna(method='ffill', inplace=True)


In [12]:
import torch
import pandas as pd
import shap

# 定义一个函数用于加载新数据并进行异常检测
def detect_anomalies_on_new_data(model, new_data, threshold=10):
    # 将新数据转化为Tensor
    new_data_tensor = torch.FloatTensor(new_data)
    
    # 使用训练好的模型进行推理
    model.eval()
    with torch.no_grad():
        recon_data, _, _ = model(new_data_tensor)
        reconstruction_error = torch.mean((recon_data - new_data_tensor) ** 2, dim=1)
        anomalies = reconstruction_error > threshold
    
    # 返回检测结果和重构误差
    return anomalies.numpy(), reconstruction_error.numpy()

# 使用SHAP解释每个特征对新数据异常的贡献
def explain_anomalies_on_new_data(model, new_data, sample_data):
    # 定义解释器函数
    def model_predict(inputs):
        model.eval()
        with torch.no_grad():
            recon_data, _, _ = model(torch.FloatTensor(inputs))
            reconstruction_error = torch.mean((recon_data - torch.FloatTensor(inputs)) ** 2, dim=1)
        return reconstruction_error.numpy()

    # 创建一个KernelExplainer来计算SHAP值
    explainer = shap.KernelExplainer(model_predict, new_data)
    
    # 计算SHAP值
    shap_values = explainer.shap_values(sample_data)

    return shap_values

# 加载新的数据集（确保已经预处理，且维度与原始数据集相同）
new_data_scaled_df = data_scaled_df  # 你需要加载和预处理新的数据

# 使用训练好的模型检测新数据集中的异常
anomalies_new, recon_error_new = detect_anomalies_on_new_data(vae_model, new_data_scaled_df.values)

# 将结果显示给用户
new_anomalies_df = new_data_scaled_df.copy()
new_anomalies_df['Reconstruction Error'] = recon_error_new
new_anomalies_df['Anomaly'] = anomalies_new

# 显示异常的行
anomalous_new_data = new_data_scaled_df[anomalies_new]

# 解释新数据中异常点的SHAP值
shap_values_new = explain_anomalies_on_new_data(vae_model, new_data_scaled_df.values, anomalous_new_data.values)

# 创建一个DataFrame来保存新数据的SHAP值
shap_new_df = pd.DataFrame(shap_values_new, columns=new_data_scaled_df.columns)

# 将SHAP值添加到新数据的异常检测结果中
anomalous_new_df = new_anomalies_df[new_anomalies_df['Anomaly'] == True].copy()
for column in shap_new_df.columns:
    anomalous_new_df[f'SHAP_{column}'] = shap_new_df[column].values
print(anomalous_new_df.columns)

Using 4005 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
100%|██████████| 5/5 [00:22<00:00,  4.41s/it]

Index(['水温', 'pH', '溶解氧', '高锰酸钾', '氨氮', '总磷', '总氮', '电导率', '浊度',
       'Reconstruction Error', 'Anomaly', 'SHAP_水温', 'SHAP_pH', 'SHAP_溶解氧',
       'SHAP_高锰酸钾', 'SHAP_氨氮', 'SHAP_总磷', 'SHAP_总氮', 'SHAP_电导率', 'SHAP_浊度'],
      dtype='object')
已生成异常数据的 CSV 文件：anomalous_data_with_shap.csv





In [16]:
anomalous_data = data[new_anomalies_df['Anomaly'] == True]

# 提取 SHAP 值
shap_columns = [f'SHAP_{col}' for col in anomalous_new_df.columns if f'SHAP_{col}' in anomalous_new_df.columns]
shap_data = anomalous_new_df[shap_columns]

# 使用 pd.concat 将原始异常数据和 SHAP 值拼接在一起
anomalous_with_shap = pd.concat([anomalous_data, shap_data], axis=1)

# 保存异常数据和SHAP值到CSV文件
anomalous_with_shap.to_csv('anomalous_data_with_shap.csv', index=False)

print("已生成异常数据的 CSV 文件：anomalous_data_with_shap.csv")

已生成异常数据的 CSV 文件：anomalous_data_with_shap.csv
