In [1]:
import pandas as pd
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the CSV file (the path is relative to the notebook's working directory).
data = pd.read_csv(
    filepath_or_buffer="../data/water/整理好的csv/杭州202101-202112/东苕溪202101-202112.csv"
)

In [3]:
# Fill missing values by carrying the previous row's value forward, then
# drop exact duplicate rows.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated in
# recent pandas releases and emits a FutureWarning.
# The result is rebound instead of mutated in place so re-running the cell
# stays predictable; the original row labels are preserved (not reset).
data = data.ffill().drop_duplicates()

  data.fillna(method='ffill', inplace=True)


In [4]:
# Select the numeric feature columns:
# water temperature, pH, dissolved oxygen, permanganate index, ammonia
# nitrogen, total phosphorus, total nitrogen, conductivity, turbidity.
# StandardScaler lives in sklearn.preprocessing; importing it from
# sklearn.discriminant_analysis only works because that module happens to
# import it internally.
from sklearn.preprocessing import StandardScaler

numeric_columns = ['水温', 'pH', '溶解氧', '高锰酸钾', '氨氮', '总磷', '总氮', '电导率', '浊度']
data_numeric = data[numeric_columns]

# Standardize every feature to zero mean and unit variance.
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# Wrap the scaled array in a DataFrame. The original row labels are kept so
# the scaled frame stays aligned with `data` even when drop_duplicates left
# gaps in the index.
data_scaled_df = pd.DataFrame(data_scaled, columns=numeric_columns, index=data_numeric.index)

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import shap
import pandas as pd

# Variational autoencoder model definition.
class VAE(nn.Module):
    """Variational autoencoder with one hidden layer in the encoder and decoder.

    The encoder maps an input of size ``input_dim`` to the mean and
    log-variance of a ``latent_dim``-dimensional Gaussian; the decoder maps a
    latent sample back to ``input_dim`` values with a linear output layer.
    """

    def __init__(self, input_dim, hidden_dim, latent_dim):
        super().__init__()
        # Encoder: shared hidden layer followed by two parallel heads.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2_mu = nn.Linear(hidden_dim, latent_dim)
        self.fc2_logvar = nn.Linear(hidden_dim, latent_dim)
        # Decoder: hidden layer followed by the reconstruction layer.
        self.fc3 = nn.Linear(latent_dim, hidden_dim)
        self.fc4 = nn.Linear(hidden_dim, input_dim)

    def encode(self, x):
        """Return ``(mu, logvar)`` of the latent distribution for ``x``."""
        hidden = self.fc1(x).relu()
        mu = self.fc2_mu(hidden)
        logvar = self.fc2_logvar(hidden)
        return mu, logvar

    def reparameterize(self, mu, logvar):
        """Draw a latent sample as ``mu + eps * std`` with ``eps ~ N(0, 1)``."""
        # logvar is the log of the variance, so std = exp(logvar / 2).
        std = (0.5 * logvar).exp()
        noise = torch.randn_like(std)
        return mu + noise * std

    def decode(self, z):
        """Map a latent sample ``z`` back to the input space."""
        return self.fc4(self.fc3(z).relu())

    def forward(self, x):
        """Encode, sample and decode; return ``(reconstruction, mu, logvar)``.

        A fresh latent sample is drawn on every call regardless of
        train/eval mode, so the reconstruction is stochastic.
        """
        mu, logvar = self.encode(x)
        latent = self.reparameterize(mu, logvar)
        return self.decode(latent), mu, logvar

# VAE training loss.
def loss_function(recon_x, x, mu, logvar):
    """Return the summed reconstruction error plus the KL divergence term.

    Args:
        recon_x: Reconstructed batch produced by the decoder.
        x: Original input batch.
        mu: Latent means from the encoder.
        logvar: Latent log-variances from the encoder.

    Returns:
        Scalar tensor: sum-reduced MSE between ``recon_x`` and ``x`` plus
        ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))``.
    """
    # Reconstruction term: squared error summed over every element.
    reconstruction = nn.functional.mse_loss(recon_x, x, reduction='sum')
    # KL divergence between N(mu, exp(logvar)) and the standard normal.
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    kl_divergence = -0.5 * kl_terms.sum()
    return reconstruction + kl_divergence

# Train a VAE on the given data.
def train_vae(data, input_dim, hidden_dim=64, latent_dim=16, epochs=100, batch_size=32, learning_rate=1e-3):
    """Build a ``VAE`` and fit it to ``data`` with Adam.

    Args:
        data: Array-like of training rows; converted with ``torch.FloatTensor``.
        input_dim: Number of features per row.
        hidden_dim: Width of the hidden layers.
        latent_dim: Size of the latent space.
        epochs: Number of passes over the data.
        batch_size: Mini-batch size for the shuffled data loader.
        learning_rate: Adam learning rate.

    Returns:
        The trained model. The per-sample average loss is printed after
        every epoch.
    """
    model = VAE(input_dim, hidden_dim, latent_dim)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    samples = torch.FloatTensor(data)
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(samples),
        batch_size=batch_size,
        shuffle=True,
    )
    n_samples = len(loader.dataset)

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        # Each batch from a TensorDataset is a one-element sequence.
        for (inputs,) in loader:
            optimizer.zero_grad()
            recon_batch, mu, logvar = model(inputs)
            batch_loss = loss_function(recon_batch, inputs, mu, logvar)
            batch_loss.backward()
            optimizer.step()
            epoch_loss += batch_loss.item()

        # The loss is sum-reduced, so divide by the sample count.
        print(f'Epoch {epoch + 1}, Loss: {epoch_loss / n_samples}')

    return model

# Anomaly detection based on the model's reconstruction error.
def detect_anomalies(model, data, threshold=3):
    """Flag rows whose mean squared reconstruction error exceeds ``threshold``.

    Args:
        model: Module whose call returns a 3-tuple with the reconstruction
            as its first element.
        data: Array-like of rows; converted with ``torch.FloatTensor``.
        threshold: Rows with an error strictly greater than this are flagged.

    Returns:
        ``(flags, errors)`` as NumPy arrays: a boolean mask and the per-row
        mean squared error.
    """
    inputs = torch.FloatTensor(data)
    model.eval()
    with torch.no_grad():
        reconstructed, _mu, _logvar = model(inputs)
        # Average the squared difference across the feature axis.
        row_error = (reconstructed - inputs).pow(2).mean(dim=1)
    flagged = row_error > threshold
    return flagged.numpy(), row_error.numpy()

# Use SHAP to attribute the reconstruction error to individual features.
def explain_anomalies(model, data, sample_data):
    """Compute KernelExplainer SHAP values of the reconstruction error.

    Args:
        model: Module whose call returns a 3-tuple with the reconstruction
            as its first element.
        data: Background dataset handed to ``shap.KernelExplainer``.
        sample_data: Rows to explain.

    Returns:
        Whatever ``explainer.shap_values(sample_data)`` returns.
    """
    def reconstruction_error_fn(inputs):
        # Function being explained: per-row mean squared reconstruction error.
        batch = torch.FloatTensor(inputs)
        model.eval()
        with torch.no_grad():
            recon_data, _, _ = model(batch)
            per_row_error = ((recon_data - batch) ** 2).mean(dim=1)
        return per_row_error.numpy()

    explainer = shap.KernelExplainer(reconstruction_error_fn, data)
    return explainer.shap_values(sample_data)

# Train the VAE on the standardized features.
input_dim = data_scaled_df.shape[1]
vae_model = train_vae(data_scaled_df.values, input_dim, epochs=100)

# Flag rows whose reconstruction error exceeds the default threshold.
anomalies, recon_error = detect_anomalies(vae_model, data_scaled_df.values)

# Attach the detection results to a copy of the cleaned frame. The arrays are
# assigned positionally, one value per row.
anomalies_df = data.copy()
anomalies_df['Reconstruction Error'] = recon_error
anomalies_df['Anomaly'] = anomalies

# Standardized feature rows of the flagged samples.
anomalous_data = data_scaled_df.values[anomalies]

# SHAP values for the flagged rows, using the full dataset as background.
shap_values = explain_anomalies(vae_model, data_scaled_df.values, anomalous_data)

# One SHAP column per feature, one row per flagged sample.
shap_df = pd.DataFrame(shap_values, columns=data_scaled_df.columns)

# Keep only the flagged rows. .copy() makes the result an independent frame
# so the column assignments below do not write into a slice of the full
# frame (which raises SettingWithCopyWarning).
anomalies_df = anomalies_df[anomalies_df['Anomaly']].copy()
for column in shap_df.columns:
    # .values assigns positionally; shap_df has its own 0..n-1 index.
    anomalies_df[f'SHAP_{column}'] = shap_df[column].values

anomalies_df

Epoch 1, Loss: 8.995365910541292
Epoch 2, Loss: 7.493920292626533
Epoch 3, Loss: 6.954843740679832
Epoch 4, Loss: 6.6906494582735565
Epoch 5, Loss: 6.517691024505448
Epoch 6, Loss: 6.464109631376334
Epoch 7, Loss: 6.304992502521703
Epoch 8, Loss: 6.295664453469129
Epoch 9, Loss: 6.213053345586886
Epoch 10, Loss: 6.192025497544064
Epoch 11, Loss: 5.9571065103624985
Epoch 12, Loss: 5.987423680961926
Epoch 13, Loss: 5.83794362151072
Epoch 14, Loss: 5.799936299036514
Epoch 15, Loss: 5.675452403379961
Epoch 16, Loss: 5.646823013771672
Epoch 17, Loss: 5.613846470437763
Epoch 18, Loss: 5.594630451247201
Epoch 19, Loss: 5.57845654409195
Epoch 20, Loss: 5.554716960097442
Epoch 21, Loss: 5.560510917669549
Epoch 22, Loss: 5.435388378361378
Epoch 23, Loss: 5.480636489139279
Epoch 24, Loss: 5.445733275745843
Epoch 25, Loss: 5.430715592279038
Epoch 26, Loss: 5.4004970661064755
Epoch 27, Loss: 5.418583080557612
Epoch 28, Loss: 5.3955162424731276
Epoch 29, Loss: 5.39752864613604
Epoch 30, Loss: 5.3524

Using 2554 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.


Epoch 98, Loss: 4.940258085867718
Epoch 99, Loss: 4.986977609322233
Epoch 100, Loss: 4.943641250359424


100%|██████████| 5/5 [00:15<00:00,  3.01s/it]


Unnamed: 0,省份,城市,河流,流域,断面名称,监测时间,水质类别,水温,pH,溶解氧,...,Anomaly,SHAP_水温,SHAP_pH,SHAP_溶解氧,SHAP_高锰酸钾,SHAP_氨氮,SHAP_总磷,SHAP_总氮,SHAP_电导率,SHAP_浊度
251,浙江省,杭州市,东苕溪,太湖流域,汪家埠,2021/2/14 12:00,劣Ⅴ,13.13,7.47,8.58,...,True,0.299327,-0.127139,0.068356,0.209077,0.165542,4.667963,-0.123944,0.404199,0.034007
851,浙江省,杭州市,东苕溪,太湖流域,汪家埠,2021/5/17 20:00,Ⅲ,21.93,7.48,6.768,...,True,-0.213979,-0.069384,-0.159537,-0.394059,0.050253,-0.913643,-1.57501,-0.090543,4.31855
1103,浙江省,杭州市,东苕溪,太湖流域,奉口,2021/6/11 4:00,劣Ⅴ,27.41,7.45,1.085,...,True,-0.246855,-0.146798,-0.088174,-0.825255,-0.019494,5.439523,0.118696,-0.058191,-0.110991
1638,浙江省,杭州市,东苕溪,太湖流域,汪家埠,2021/7/31 0:00,Ⅳ,24.53,7.85,6.992,...,True,-0.32817,-0.179866,-0.128884,-1.376971,0.61814,-2.688265,-6.700019,-0.089443,14.836305
1640,浙江省,杭州市,东苕溪,太湖流域,汪家埠,2021/7/31 4:00,Ⅱ,24.35,7.47,7.048,...,True,-0.229106,-0.089546,-0.154972,-0.124735,-0.176073,-0.189056,-0.610553,-0.136351,1.764921


In [6]:
# Load the second CSV file, forward-fill missing values and drop exact
# duplicate rows. DataFrame.ffill() replaces the deprecated
# fillna(method='ffill') call, which emits a FutureWarning.
data = (
    pd.read_csv("../data/water/整理好的csv/杭州202101-202112/京杭运河202101-202112.csv")
    .ffill()
    .drop_duplicates()
)

# Select the numeric feature columns:
# water temperature, pH, dissolved oxygen, permanganate index, ammonia
# nitrogen, total phosphorus, total nitrogen, conductivity, turbidity.
numeric_columns = ['水温', 'pH', '溶解氧', '高锰酸钾', '氨氮', '总磷', '总氮', '电导率', '浊度']
data_numeric = data[numeric_columns]

# Standardize the features.
# NOTE(review): this fits a new scaler on this file, while the model scored
# later was trained on features standardized with a different file's
# statistics. Confirm that per-file standardization is intended; otherwise
# reuse the originally fitted scaler via transform().
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data_numeric)

# Wrap the scaled array in a DataFrame. The original row labels are kept so
# boolean masks derived from this frame align with `data` even when
# drop_duplicates left gaps in the index.
data_scaled_df = pd.DataFrame(data_scaled, columns=numeric_columns, index=data_numeric.index)

  data.fillna(method='ffill', inplace=True)


In [10]:
# Score a dataset with an already-trained model and flag anomalous rows.
def detect_anomalies_on_new_data(model, new_data, threshold=10):
    """Flag rows whose mean squared reconstruction error exceeds ``threshold``.

    Args:
        model: Module whose call returns a 3-tuple with the reconstruction
            as its first element.
        new_data: Array-like of rows; converted with ``torch.FloatTensor``.
        threshold: Rows with an error strictly greater than this are flagged.

    Returns:
        ``(flags, errors)`` as NumPy arrays: a boolean mask and the per-row
        mean squared error.
    """
    batch = torch.FloatTensor(new_data)

    # Inference only: eval mode and no gradient tracking.
    model.eval()
    with torch.no_grad():
        reconstruction, _mu, _logvar = model(batch)
        squared_diff = (reconstruction - batch) ** 2
        row_error = squared_diff.mean(dim=1)

    is_anomaly = row_error > threshold
    return is_anomaly.numpy(), row_error.numpy()

# The new dataset must already be preprocessed and have the same feature
# dimension as the training data; here it is the current data_scaled_df.
new_data_scaled_df = data_scaled_df

# Score the new dataset with the trained model.
anomalies_new, recon_error_new = detect_anomalies_on_new_data(vae_model, new_data_scaled_df.values)

# Attach the results to a copy of the scaled features.
new_anomalies_df = new_data_scaled_df.copy()
new_anomalies_df['Reconstruction Error'] = recon_error_new
new_anomalies_df['Anomaly'] = anomalies_new

# Show the flagged rows of the unscaled frame. The positional boolean array
# is used instead of the 'Anomaly' Series: `data` and the scaled frame can
# carry different index labels (drop_duplicates keeps the original labels),
# and a label-aligned boolean Series would then be reindexed or rejected.
data[anomalies_new]

Unnamed: 0,省份,城市,河流,流域,断面名称,监测时间,水质类别,水温,pH,溶解氧,高锰酸钾,氨氮,总磷,总氮,电导率,浊度
497,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/2/17 0:00,Ⅱ,13.08,7.704,7.919,0.48,0.1324,0.0921,1.3141,451.834,2130.447
2440,浙江省,杭州市,京杭运河,太湖流域,五杭运河大桥,2021/8/10 16:00,Ⅲ,31.12,7.55,5.391,4.23,0.2464,0.1476,14.43,417.3,132.1
2797,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/9/9 20:00,Ⅲ,27.47,7.676,5.909,0.5,0.0261,0.1033,0.9688,204.095,2288.384
3302,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/10/26 8:00,Ⅱ,20.49,8.065,6.946,1.11,0.2312,0.0733,1.49,228.614,2218.954
3703,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/12/1 8:00,Ⅱ,14.81,7.773,9.378,1.82,0.1035,0.0519,2.4041,223.526,2149.161
3973,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/12/28 16:00,Ⅱ,10.39,7.734,10.558,3.25,0.2161,0.0579,2.3889,278.24,2071.373


In [9]:
import torch
import shap
import pandas as pd

# Run a trained model over new data and flag rows by reconstruction error.
def detect_anomalies_on_new_data(model, new_data, threshold=10):
    """Return an anomaly mask and per-row reconstruction errors for ``new_data``.

    Args:
        model: Module whose call returns a 3-tuple with the reconstruction
            as its first element.
        new_data: Array-like of rows; converted with ``torch.FloatTensor``.
        threshold: Rows with an error strictly greater than this are flagged.

    Returns:
        ``(flags, errors)`` as NumPy arrays.
    """
    features = torch.FloatTensor(new_data)

    model.eval()
    with torch.no_grad():
        decoded, _unused_mu, _unused_logvar = model(features)
        # Mean of the squared residuals across the feature axis.
        mse_per_row = torch.mean(torch.square(decoded - features), dim=1)
        over_threshold = mse_per_row > threshold

    return over_threshold.numpy(), mse_per_row.numpy()

# Use SHAP to attribute the reconstruction error on new data to each feature.
def explain_anomalies_on_new_data(model, new_data, sample_data):
    """Compute KernelExplainer SHAP values of the reconstruction error.

    Args:
        model: Module whose call returns a 3-tuple with the reconstruction
            as its first element.
        new_data: Background dataset handed to ``shap.KernelExplainer``.
        sample_data: Rows to explain.

    Returns:
        Whatever ``explainer.shap_values(sample_data)`` returns.
    """
    def error_of(inputs):
        # Function being explained: per-row mean squared reconstruction error.
        batch = torch.FloatTensor(inputs)
        model.eval()
        with torch.no_grad():
            recon_data, _, _ = model(batch)
            row_error = (recon_data - batch).pow(2).mean(dim=1)
        return row_error.numpy()

    explainer = shap.KernelExplainer(error_of, new_data)
    return explainer.shap_values(sample_data)

# The new dataset is assumed to be loaded and preprocessed already, with the
# same feature dimension as the training data.
new_data_scaled_df = data_scaled_df

# Score the new dataset with the trained model.
anomalies_new, recon_error_new = detect_anomalies_on_new_data(
    vae_model, new_data_scaled_df.values
)

# Scaled features plus the detection results, built on a copy.
new_anomalies_df = new_data_scaled_df.assign(
    **{'Reconstruction Error': recon_error_new, 'Anomaly': anomalies_new}
)

# Scaled feature rows of the flagged samples.
anomalous_new_data = new_data_scaled_df[anomalies_new]

# SHAP values for the flagged rows, with the whole new dataset as background.
shap_values_new = explain_anomalies_on_new_data(
    vae_model, new_data_scaled_df.values, anomalous_new_data.values
)

# One SHAP column per feature.
shap_new_df = pd.DataFrame(shap_values_new, columns=new_data_scaled_df.columns)

# Append the SHAP columns to the flagged rows (positional assignment).
anomalous_new_df = new_anomalies_df.loc[new_anomalies_df['Anomaly'] == True].copy()
for column, contributions in shap_new_df.items():
    anomalous_new_df[f'SHAP_{column}'] = contributions.values

anomalous_new_df

Using 4005 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
100%|██████████| 6/6 [00:27<00:00,  4.60s/it]


Unnamed: 0,水温,pH,溶解氧,高锰酸钾,氨氮,总磷,总氮,电导率,浊度,Reconstruction Error,Anomaly,SHAP_水温,SHAP_pH,SHAP_溶解氧,SHAP_高锰酸钾,SHAP_氨氮,SHAP_总磷,SHAP_总氮,SHAP_电导率,SHAP_浊度
497,-1.124435,0.302556,0.25527,-1.797709,-0.661679,-0.404049,-1.308395,0.15911,15.98376,26.259632,True,0.310811,-0.4571,-0.297837,1.507428,-0.440824,0.310846,3.911124,-0.293439,17.78459
2440,1.49153,-0.193827,-0.703598,1.23526,-0.353349,0.771516,14.646503,0.055613,0.146948,12.253405,True,0.120527,0.001433,0.173318,0.13494,0.053487,-0.018162,12.11743,0.00488,-0.042826
2797,0.962247,0.212304,-0.507121,-1.781533,-0.949183,-0.166818,-1.728437,-0.583358,17.235404,29.121778,True,-0.654604,-0.406306,-0.295279,1.724073,-0.581228,-0.053368,5.388359,-0.378167,19.607671
3302,-0.049917,1.466154,-0.113788,-1.28817,-0.394459,-0.802259,-1.094421,-0.509875,16.685174,18.516367,True,-0.107549,-1.207847,-0.134018,1.246078,-0.181789,1.150278,3.250179,-0.122771,16.778925
3703,-0.873569,0.524961,0.808668,-0.713928,-0.739843,-1.25554,0.01754,-0.525124,16.132067,15.724348,True,0.600673,-0.058156,0.34676,0.827448,0.064549,2.031725,-0.089912,0.220465,14.04889
3973,-1.51451,0.399253,1.256241,0.442644,-0.4353,-1.128451,-0.00095,-0.361147,15.515601,12.343762,True,0.836752,-0.193056,0.312433,-0.283932,-0.070443,1.42478,-0.281775,-0.05224,12.782494


In [12]:
# Show the flagged rows of the unscaled frame. The mask is converted to a
# positional NumPy array because `data` and new_anomalies_df can carry
# different index labels (drop_duplicates keeps the original labels), and a
# label-aligned boolean Series would then be reindexed or rejected.
data[new_anomalies_df['Anomaly'].to_numpy()]

Unnamed: 0,省份,城市,河流,流域,断面名称,监测时间,水质类别,水温,pH,溶解氧,高锰酸钾,氨氮,总磷,总氮,电导率,浊度
497,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/2/17 0:00,Ⅱ,13.08,7.704,7.919,0.48,0.1324,0.0921,1.3141,451.834,2130.447
2440,浙江省,杭州市,京杭运河,太湖流域,五杭运河大桥,2021/8/10 16:00,Ⅲ,31.12,7.55,5.391,4.23,0.2464,0.1476,14.43,417.3,132.1
2797,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/9/9 20:00,Ⅲ,27.47,7.676,5.909,0.5,0.0261,0.1033,0.9688,204.095,2288.384
3302,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/10/26 8:00,Ⅱ,20.49,8.065,6.946,1.11,0.2312,0.0733,1.49,228.614,2218.954
3703,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/12/1 8:00,Ⅱ,14.81,7.773,9.378,1.82,0.1035,0.0519,2.4041,223.526,2149.161
3973,浙江省,杭州市,京杭运河,太湖流域,顾家桥,2021/12/28 16:00,Ⅱ,10.39,7.734,10.558,3.25,0.2161,0.0579,2.3889,278.24,2071.373
