In [38]:
import pandas as pd
import shap

In [39]:
# 加载 CSV 文件
data = pd.read_csv("../data/water/整理好的csv/杭州202201-202203/浙江杭州202201-202203.csv", encoding="utf-8")

In [40]:
# 处理缺失值
data.fillna(method='ffill', inplace=True)

# 删除重复行
data.drop_duplicates(inplace=True)

  data.fillna(method='ffill', inplace=True)


In [41]:
# 选择数值列（水温, pH, 溶解氧, 高锰酸钾, 氨氮, 总磷, 总氮, 电导率, 浊度）
from sklearn.discriminant_analysis import StandardScaler

numeric_columns = ['水温', 'pH', '溶解氧', '高锰酸钾', '氨氮', '总磷', '总氮', '电导率', '浊度']
data_numeric = data[numeric_columns]

# 数据标准化处理
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# 转换为Pandas DataFrame格式
data_scaled_df = pd.DataFrame(data_scaled, columns=numeric_columns)

In [42]:
import torch
import torch.nn as nn
import torch.optim as optim
import shap
import pandas as pd

# 定义VAE模型
class VAE(nn.Module):
    def __init__(self, input_dim, hidden_dim, latent_dim):
        super(VAE, self).__init__()
        # 编码器
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc2_logvar = nn.Linear(hidden_dim, latent_dim)
        # 解码器
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        h1 = torch.relu(self.fc1(x))
        return self.fc2_mu(h1), self.fc2_logvar(h1)

    def reparameterize(self, mu, logvar):
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        h3 = torch.relu(self.fc3(z))
        return self.fc4(h3)

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

# 定义损失函数
def loss_function(recon_x, x, mu, logvar):
    # 使用MSE作为重构损失
    MSE = nn.functional.mse_loss(recon_x, x, reduction='sum')
    # KL散度损失
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return MSE + KLD

# 训练VAE模型
def train_vae(data, input_dim, hidden_dim=64, latent_dim=16, epochs=100, batch_size=32, learning_rate=1e-3):
    model = VAE(input_dim, hidden_dim, latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    data_tensor = torch.FloatTensor(data)

    dataset = torch.utils.data.TensorDataset(data_tensor)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for batch_idx, (inputs,) in enumerate(dataloader):
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(inputs)
            loss = loss_function(recon_batch, inputs, mu, logvar)
            loss.backward()
            train_loss += loss.item()
            optimizer.step()

        print(f'Epoch {epoch+1}, Loss: {train_loss / len(dataloader.dataset)}')

    return model

# 使用VAE模型进行异常检测
def detect_anomalies(model, data, threshold=3):
    data_tensor = torch.FloatTensor(data)
    model.eval()
    with torch.no_grad():
        recon_data, _, _ = model(data_tensor)
        reconstruction_error = torch.mean((recon_data - data_tensor) ** 2, dim=1)
        anomalies = reconstruction_error > threshold
    return anomalies.numpy(), reconstruction_error.numpy()

# 使用SHAP解释每个特征对异常的贡献
def explain_anomalies(model, data, sample_data):
    # 定义解释器函数
    def model_predict(inputs):
        model.eval()
        with torch.no_grad():
            recon_data, _, _ = model(torch.FloatTensor(inputs))
            reconstruction_error = torch.mean((recon_data - torch.FloatTensor(inputs)) ** 2, dim=1)
        return reconstruction_error.numpy()

    # 创建一个KernelExplainer来计算SHAP值
    explainer = shap.KernelExplainer(model_predict, data)
    
    # 计算SHAP值
    shap_values = explainer.shap_values(sample_data)

    return shap_values

# 设置参数并训练VAE模型
input_dim = data_scaled_df.shape[1]
vae_model = train_vae(data_scaled_df.values, input_dim, epochs=100)

# 检测异常
anomalies, recon_error = detect_anomalies(vae_model, data_scaled_df.values)

# 将检测结果显示给用户
anomalies_df = data.copy()
anomalies_df['Reconstruction Error'] = recon_error
anomalies_df['Anomaly'] = anomalies

# 筛选出异常数据点
anomalous_data = data_scaled_df.values[anomalies]

# 解释异常值的SHAP值
shap_values = explain_anomalies(vae_model, data_scaled_df.values, anomalous_data)

# 创建一个DataFrame来保存SHAP值
shap_df = pd.DataFrame(shap_values, columns=data_scaled_df.columns)

# 将SHAP值添加到异常数据中
anomalies_df = anomalies_df[anomalies_df['Anomaly'] == True]
for column in shap_df.columns:
    anomalies_df[f'SHAP_{column}'] = shap_df[column].values

anomalies_df

Epoch 1, Loss: 8.629597479643559
Epoch 2, Loss: 7.566554941955511
Epoch 3, Loss: 7.095424664865557
Epoch 4, Loss: 6.674123392084793
Epoch 5, Loss: 6.387226111699572
Epoch 6, Loss: 6.170701568018603
Epoch 7, Loss: 6.029540011757298
Epoch 8, Loss: 5.875883852706552
Epoch 9, Loss: 5.744726200179195
Epoch 10, Loss: 5.6150898273353125
Epoch 11, Loss: 5.590857738776885
Epoch 12, Loss: 5.490608874519146
Epoch 13, Loss: 5.457860604876038
Epoch 14, Loss: 5.412650336673784
Epoch 15, Loss: 5.342475762293264
Epoch 16, Loss: 5.369501474239487
Epoch 17, Loss: 5.31686490612767
Epoch 18, Loss: 5.268339495402105
Epoch 19, Loss: 5.200737314018247
Epoch 20, Loss: 5.176333576848814
Epoch 21, Loss: 5.1386406019098825
Epoch 22, Loss: 5.122383519222862
Epoch 23, Loss: 5.0727451952315405
Epoch 24, Loss: 5.075326247022331
Epoch 25, Loss: 5.0453538688657344
Epoch 26, Loss: 5.033187728106594
Epoch 27, Loss: 4.992294321968304
Epoch 28, Loss: 5.006977278590746
Epoch 29, Loss: 4.9599159653875144
Epoch 30, Loss: 4.9

Using 6574 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Epoch 100, Loss: 4.503721100238815


  0%|          | 0/5 [00:00<?, ?it/s]

Unnamed: 0,省份,城市,河流,流域,断面名称,监测时间,水质类别,水温,pH,溶解氧,...,Anomaly,SHAP_水温,SHAP_pH,SHAP_溶解氧,SHAP_高锰酸钾,SHAP_氨氮,SHAP_总磷,SHAP_总氮,SHAP_电导率,SHAP_浊度
1203,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2022/1/17 12:00,Ⅳ,10.9,7.593,8.938,...,True,-0.043664,-0.227196,-0.357946,-0.155526,2.314813,0.343526,-0.356256,-0.26343,-0.091001
3572,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2022/2/18 8:00,Ⅱ,8.32,7.394,9.603,...,True,0.251215,-0.390945,-0.264927,-0.370473,-0.088375,-0.164426,-0.070055,-0.048286,9.605182
5851,浙江省,杭州市,分水江,浙闽片河流,桐君山,2022/3/22 8:00,Ⅳ,12.89,7.49,13.558,...,True,-0.042997,-0.107752,0.580942,-0.494681,-0.036308,0.195486,-0.019557,0.168412,1.163298
5864,浙江省,杭州市,分水江,浙闽片河流,桐君山,2022/3/22 12:00,Ⅲ,12.74,7.47,13.437,...,True,-0.121477,-0.156427,0.471541,-0.020355,-0.116698,-0.001628,-0.13057,0.032345,0.983658
6568,浙江省,杭州市,富春江,浙闽片河流,桐庐,2022/3/31 20:00,Ⅱ,17.0,7.38,9.751,...,True,-0.033814,-0.221161,-0.049369,0.200878,0.175088,-0.05141,0.141914,1.565025,11.912047


In [61]:
import torch
import pandas as pd
import shap
from sklearn.preprocessing import StandardScaler
# 定义一个函数用于加载新数据并进行异常检测
def detect_anomalies_on_new_data(model, new_data, threshold=10):
    # 将新数据转化为Tensor
    new_data_tensor = torch.FloatTensor(new_data)
    # 使用训练好的模型进行推理
    model.eval()
    with torch.no_grad():
        recon_data, _, _ = model(new_data_tensor)
        reconstruction_error = torch.mean((recon_data - new_data_tensor) ** 2, dim=1)
        anomalies = reconstruction_error > threshold
    # 返回检测结果和重构误差
    return anomalies.numpy(), reconstruction_error.numpy()

# 使用SHAP解释每个特征对新数据异常的贡献
def explain_anomalies_on_new_data(model, new_data, sample_data):
    # 定义解释器函数
    def model_predict(inputs):
        model.eval()
        with torch.no_grad():
            recon_data, _, _ = model(torch.FloatTensor(inputs))
            reconstruction_error = torch.mean((recon_data - torch.FloatTensor(inputs)) ** 2, dim=1)
        # print(f"Reconstruction error shape: {reconstruction_error.shape}")
        return reconstruction_error.numpy()
    # 限制背景数据大小，避免计算慢
    new_data_sampled = shap.sample(new_data, 100)  # 限制背景样本大小
    # 创建一个KernelExplainer来计算SHAP值
    explainer = shap.KernelExplainer(model_predict, new_data_sampled)
    # explainer = shap.KernelExplainer(model_predict, new_data)
    # 计算SHAP值
    shap_values = explainer.shap_values(sample_data)
    return shap_values


data = {}
# 加载 CSV 文件
# 'gb2312', 'gbk', 'gb18030', 'utf-8'
# data[0] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/东苕溪202101-202112.csv")
# data[1] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/分水江202101-202112.csv")
# data[2] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/富春江202101-202112.csv")
# data[3] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/京杭运河202101-202112.csv")
# data[4] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/南苕溪202101-202112.csv")
# data[5] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/浦阳江202101-202112.csv")
# data[6] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/千岛湖202101-202112.csv")
# data[7] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/钱塘江202101-202112.csv")
# data[8] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/新安江202101-202112.csv")

# data[0] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/东苕溪202101-202112.csv", encoding="utf-8")
# data[1] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/分水江202101-202112.csv", encoding="utf-8")
# data[2] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/富春江202101-202112.csv", encoding="utf-8")
# data[3] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/京杭运河202101-202112.csv", encoding="utf-8")
# data[4] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/南苕溪202101-202112.csv", encoding="utf-8")
# data[5] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/浦阳江202101-202112.csv", encoding="utf-8")
# data[6] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/千岛湖202101-202112.csv", encoding="utf-8")
# data[7] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/钱塘江202101-202112.csv", encoding="utf-8")
# data[8] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/新安江202101-202112.csv", encoding="utf-8")

# data[0] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/东苕溪202101-202112.csv", encoding="gb2312")
# data[1] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/分水江202101-202112.csv", encoding="gb2312")
# data[2] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/富春江202101-202112.csv", encoding="gb2312")
# data[3] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/京杭运河202101-202112.csv", encoding="gb2312")
# data[4] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/南苕溪202101-202112.csv", encoding="gb2312")
# data[5] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/浦阳江202101-202112.csv", encoding="gb2312")
# data[6] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/千岛湖202101-202112.csv", encoding="gb2312")
# data[7] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/钱塘江202101-202112.csv", encoding="gb2312")
# data[8] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/新安江202101-202112.csv", encoding="gb2312")

# data[0] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/东苕溪202101-202112.csv", encoding="gbk")
# data[1] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/分水江202101-202112.csv", encoding="gbk")
# data[2] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/富春江202101-202112.csv", encoding="gbk")
# data[3] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/京杭运河202101-202112.csv", encoding="gbk")
# data[4] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/南苕溪202101-202112.csv", encoding="gbk")
# data[5] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/浦阳江202101-202112.csv", encoding="gbk")
# data[6] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/千岛湖202101-202112.csv", encoding="gbk")
# data[7] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/钱塘江202101-202112.csv", encoding="gbk")
# data[8] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/新安江202101-202112.csv", encoding="gbk")

data[0] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/东苕溪202101-202112.csv", encoding="gb2312")
data[1] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/分水江202101-202112.csv", encoding="utf-8")
data[2] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/富春江202101-202112.csv", encoding="utf-8")
data[3] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/京杭运河202101-202112.csv", encoding="gb2312")
data[4] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/南苕溪202101-202112.csv", encoding="utf-8")
data[5] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/浦阳江202101-202112.csv", encoding="utf-8")
data[6] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/千岛湖202101-202112.csv", encoding="utf-8")
data[7] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/钱塘江202101-202112.csv", encoding="utf-8")
data[8] = pd.read_csv("../data/water/整理好的csv/杭州202101-202112/新安江202101-202112.csv", encoding="utf-8")
# 初始化存储所有异常数据的 DataFrame
# all_anomalous_with_shap = pd.DataFrame()
# 循环处理每个数据集
for water in range (0, 9):
    # 处理缺失值
    data[water].fillna(method='ffill', inplace=True)
    # 删除重复行
    data[water].drop_duplicates(inplace=True)
    # 选择数值列（水温, pH, 溶解氧, 高锰酸钾, 氨氮, 总磷, 总氮, 电导率, 浊度）
    numeric_columns = ['水温', 'pH', '溶解氧', '高锰酸钾', '氨氮', '总磷', '总氮', '电导率', '浊度']
    data_numeric = data[water][numeric_columns]
    # 数据标准化处理
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data_numeric)
    # 转换为Pandas DataFrame格式
    new_data_scaled_df = pd.DataFrame(data_scaled, columns=numeric_columns)
    # 使用训练好的模型检测新数据集中的异常
    anomalies_new, recon_error_new = detect_anomalies_on_new_data(vae_model, new_data_scaled_df.values)
    # 将结果显示给用户
    new_anomalies_df = new_data_scaled_df.copy()
    new_anomalies_df['Reconstruction Error'] = recon_error_new
    new_anomalies_df['Anomaly'] = anomalies_new
    # 显示异常的行
    anomalous_new_data = new_data_scaled_df[anomalies_new]
    if anomalous_new_data.shape[0] == 0:
        print("No anomalies detected in the data.")
        continue
    # 解释新数据中异常点的SHAP值
    shap_values_new = explain_anomalies_on_new_data(vae_model, new_data_scaled_df.values, anomalous_new_data.values)
    # 创建一个DataFrame来保存新数据的SHAP值
    shap_new_df = pd.DataFrame(shap_values_new, columns=new_data_scaled_df.columns)
    # shap_new_df = pd.DataFrame(shap_values_new, columns=new_anomalies_df.columns[:-2])  # 除去错误和异常列
    # 将SHAP值添加到新数据的异常检测结果中
    anomalous_new_df = new_anomalies_df[new_anomalies_df['Anomaly'] == True].copy()
    for column in shap_new_df.columns:
        anomalous_new_df[f'SHAP_{column}'] = shap_new_df[column].values
    # print(anomalous_new_df.columns)
    ### anomalous_data = data[new_anomalies_df['Anomaly'] == True]
    # 提取 SHAP 值
    ### shap_columns = [f'SHAP_{col}' for col in anomalous_new_df.columns if f'SHAP_{col}' in anomalous_new_df.columns]
    ### shap_data = anomalous_new_df[shap_columns]
    # 使用 pd.concat 将原始异常数据和 SHAP 值拼接在一起
    # anomalous_with_shap = pd.concat([anomalous_data, shap_data], axis=1) #列拼接
    ############### anomalous_with_shap = pd.concat([anomalous_new_df, shap_new_df], axis=1)    #列拼接
    # anomalous_with_shap = pd.concat([data[water][anomalies_new], shap_new_df], axis=1)  # 列拼接
    # anomalous_with_shap = pd.concat([data[new_anomalies_df['Anomaly'] == True], shap_new_df], axis=1)  # 列拼接
    # all_anomalous_with_shap=pd.concat([all_anomalous_with_shap, anomalous_with_shap]) #拼接数据行
    # 保存异常数据和SHAP值到CSV文件
    # all_anomalous_with_shap.to_csv('anomalous_data_with_shap.csv', index=False)
    ############### anomalous_with_shap.to_csv(f"anomalous_data_with_shap_{water}.csv", index=False)
    anomalous_new_df.to_csv(f"anomalous_data_with_shap_{water}.csv", index=False)
    print(f"已生成异常数据的 CSV 文件：anomalous_data_with_shap{water}.csv")

  data[water].fillna(method='ffill', inplace=True)


  0%|          | 0/1 [00:00<?, ?it/s]

已生成异常数据的 CSV 文件：anomalous_data_with_shap0.csv
No anomalies detected in the data.
No anomalies detected in the data.


  data[water].fillna(method='ffill', inplace=True)
  data[water].fillna(method='ffill', inplace=True)
  data[water].fillna(method='ffill', inplace=True)


  0%|          | 0/11 [00:00<?, ?it/s]

已生成异常数据的 CSV 文件：anomalous_data_with_shap3.csv
No anomalies detected in the data.


  data[water].fillna(method='ffill', inplace=True)
  data[water].fillna(method='ffill', inplace=True)


  0%|          | 0/1 [00:00<?, ?it/s]

已生成异常数据的 CSV 文件：anomalous_data_with_shap5.csv


  data[water].fillna(method='ffill', inplace=True)


  0%|          | 0/24 [00:00<?, ?it/s]

已生成异常数据的 CSV 文件：anomalous_data_with_shap6.csv


  data[water].fillna(method='ffill', inplace=True)


  0%|          | 0/1 [00:00<?, ?it/s]

已生成异常数据的 CSV 文件：anomalous_data_with_shap7.csv


  data[water].fillna(method='ffill', inplace=True)


  0%|          | 0/1 [00:00<?, ?it/s]

已生成异常数据的 CSV 文件：anomalous_data_with_shap8.csv
