
* 目前思路：GNN等方法可作为规则挖掘方法的补充验证，另外可调研GNN可解释性算法（近期研究），用于后续补充解释判别原因

* 点的属性定义
  * initial_deposit(初始账户金额)、prior_sar_count(用户违约标签)
  * 根据边属性添加的属性：
    * 账户转入总金额
    * 账户转出总金额
    * 账户总转入次数
    * 账户总转出次数
    * 大额交易次数 （目前定义为总交易数据的前10%）
* 后续需要扩充的属性：时间维度（涉及到时序GNN的处理以及动态图的规则挖掘）
  * 需要讨论的问题
    * 是否需要将交易图按照交易时间窗口动态划分多个子图进行挖掘？（或者直接将交易时间作为边属性挖掘整个图）
    * 边标签与点标签的交互问题（是否直接将边标签作为边的一个属性特征：带边属性的GNN学习）


#### 输入属性的归一化

In [None]:
from sklearn.preprocessing import MinMaxScaler
import pandas as pd

# Dataset index; selects which data/transactions_<index>.csv file is loaded
# when the graph is built.
index = 3

ac_df = pd.read_csv('data/updated_accounts.csv')

# Columns that are min-max scaled before being fed to the models.
features_to_normalize = ['initial_deposit', 'total_outgoing_amount', 'total_incoming_amount',
                         'outgoing_transactions', 'incoming_transactions',
                         'high_value_outgoing_transactions', 'high_value_incoming_transactions']

# Columns kept for modelling: account id, label, then the scaled features.
# Derived from the list above so the two lists cannot drift apart.
features_to_input = ['acct_id', 'prior_sar_count', *features_to_normalize]

# Raw (unscaled) copy of the columns that are about to be normalised.
ac_df_subset = ac_df[features_to_normalize]

scaler = MinMaxScaler()

# Scale every column independently. The result carries ac_df's index so the
# assignment below lines up row for row.
ac_df_normalized = pd.DataFrame(
    scaler.fit_transform(ac_df_subset),
    columns=features_to_normalize,
    index=ac_df.index,
)

# Replace the raw columns with their scaled versions. Plain column assignment
# is used instead of DataFrame.update(): update() only copies non-NA values
# and writes them into the existing columns in place, which forces an in-place
# dtype change whenever a source column is not already float
# (NOTE(review): the *_transactions columns look like integer counts -- confirm).
ac_df[features_to_normalize] = ac_df_normalized

# Cast the label column to integers.
# NOTE(review): the original comment assumed this column is boolean; if it can
# hold counts above 1, the two-class models below need revisiting -- confirm.
ac_df['prior_sar_count'] = ac_df['prior_sar_count'].astype(int)

ac_df_subset1 = ac_df[features_to_input]
# To persist the model inputs, e.g.:
#   ac_df_subset1.to_csv('/mnt/data/normalized_accounts.csv', index=False)

# Show the first rows of the model inputs as a sanity check.
ac_df_subset1.head(20)


#### 构建geometric数据集

In [None]:
import torch
from torch_geometric.data import InMemoryDataset
from torch_geometric.data import Data
from tqdm import tqdm #进度条

# Node feature matrix x: every model-input column except the id and the label.
features = ac_df_subset1.drop(['acct_id', 'prior_sar_count'], axis=1)

x = torch.tensor(features.values, dtype=torch.float)

# Node labels y, taken from the 'prior_sar_count' column.
y = torch.tensor(ac_df_subset1['prior_sar_count'].values, dtype=torch.long)

print("First 10 x:", x[:10])
print("First 10 y:", y[:10])

trans_df = pd.read_csv('data/transactions_'+str(index)+'.csv')
source_nodes = trans_df.orig_acct
target_nodes = trans_df.bene_acct

# Build the (2, num_edges) edge list from NumPy arrays. Passing a list of
# pandas Series straight to torch.tensor makes PyTorch walk them element by
# element in Python, which is needlessly slow for large transaction tables.
# NOTE(review): the account ids are used directly as row positions into x,
# i.e. this assumes acct_id equals the row position in the accounts table
# -- TODO confirm, otherwise edges point at the wrong nodes.
edge_index = torch.stack([
    torch.tensor(source_nodes.to_numpy(), dtype=torch.long),
    torch.tensor(target_nodes.to_numpy(), dtype=torch.long),
])
print(edge_index)

data = Data(x=x, edge_index=edge_index, y=y)
print('data:',data)


#### 对比试验：使用传统的全连接网络（多层感知机，Multi-Layer Perceptron, MLP）
##### 即点和点之间相互独立

In [None]:
import torch
from torch.nn import Linear
import torch.nn.functional as F


class MLP(torch.nn.Module):
    """Two-layer perceptron baseline that classifies each node independently.

    The network is Linear -> ReLU -> Dropout(p=0.5) -> Linear and returns raw
    (unnormalised) class scores.

    Args:
        hidden_channels: Width of the hidden layer.
        in_channels: Number of input features per node. Defaults to 7, the
            value that used to be hard-coded.
        out_channels: Number of output classes. Defaults to 2, the value that
            used to be hard-coded.
    """

    def __init__(self, hidden_channels, in_channels=7, out_channels=2):
        super().__init__()
        # Re-seed the global RNG so every instance starts from the same
        # initial weights (this also resets torch's global random state).
        torch.manual_seed(12345)
        self.lin1 = Linear(in_channels, hidden_channels)
        self.lin2 = Linear(hidden_channels, out_channels)

    def forward(self, x):
        """Return class scores of shape (num_rows, out_channels) for ``x``."""
        x = self.lin1(x)
        x = x.relu()
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=0.5, training=self.training)
        return self.lin2(x)

# Instantiate the MLP baseline with 16 hidden units and print its layer summary.
model = MLP(hidden_channels=16)
print(model)

In [None]:
from collections import Counter
import torch
from sklearn.metrics import f1_score, precision_score, recall_score
import matplotlib.pyplot as plt
import numpy as np

# Fresh, untrained baseline for this experiment.
model = MLP(hidden_channels=16)

# Count how many nodes carry each label and print the tally.
labels = data.y
class_counts = Counter(labels.numpy())
print(class_counts)

# Inverse-frequency class weights, laid out in class-id order (0, 1, ...),
# so that the rarer class contributes more to the loss.
class_weights = {class_id: 1.0 / count for class_id, count in class_counts.items()}
weights = torch.tensor(
    [class_weights[class_id] for class_id in range(len(class_counts))],
    dtype=torch.float,
)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss(weight=weights)

# Positional split: the first 90% of the nodes are used for training,
# the remaining nodes for testing.
num_nodes = labels.size(0)
train_split = int(num_nodes * 0.9)

# Boolean masks over the nodes; each node is in exactly one of the two sets.
train_mask = torch.arange(num_nodes) < train_split
test_mask = torch.logical_not(train_mask)

def predict_with_threshold(out, threshold):
    """Turn per-class scores into 0/1 predictions using a cut-off on class 1.

    ``out`` is indexed as ``out[:, 1]``, so column 1 must hold the score of
    the positive class. A row is labelled 1 when that score is greater than
    or equal to ``threshold`` and 0 otherwise. Returns an int32 tensor.
    """
    positive_scores = out[:, 1]
    is_positive = positive_scores >= threshold
    return is_positive.to(torch.int32)

def train():
    """Run one full-batch optimisation step of the MLP on the training nodes.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``, ``data``
    and ``train_mask``. Returns the loss tensor of this step.
    """
    optimizer.zero_grad()  # drop gradients left over from the previous step
    model.train()  # training mode, so dropout is active

    train_logits = model(data.x[train_mask])
    train_targets = data.y[train_mask]
    loss = criterion(train_logits, train_targets)  # loss on training nodes only

    loss.backward()
    optimizer.step()
    return loss

def test(threshold=0.4544):
    """Evaluate the MLP on the held-out test nodes.

    Uses the notebook-level ``model``, ``data`` and ``test_mask``. Prints the
    raw scores, the thresholded predictions, the number of argmax-positive
    nodes, and precision / recall / F1 for class 1.

    Args:
        threshold: Minimum class-1 probability for a node to be predicted as
            class 1. Defaults to 0.4544, the value that used to be hard-coded.

    Returns:
        A tuple ``(f1, probs)``: the class-1 F1 score and a NumPy array with
        the class-1 probability of every test node.
    """
    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        out = model(data.x[test_mask])
    print(out)

    class_probs = torch.softmax(out, dim=1)
    probs = class_probs[:, 1]  # probability of class 1
    pred = predict_with_threshold(class_probs, threshold)
    print(pred)

    # Diagnostic: how many nodes plain argmax would have labelled as class 1.
    count = torch.sum(out.argmax(dim=1) == 1).item()
    print(count)

    # Metrics for the positive class. With average='binary' scikit-learn
    # selects the reported class through pos_label; the `labels` argument is
    # not used in that mode.
    target_class = 1
    y_true = data.y[test_mask].cpu()
    y_pred = pred.cpu()
    f1 = f1_score(y_true, y_pred, pos_label=target_class, average='binary')
    precision = precision_score(y_true, y_pred, pos_label=target_class, average='binary')
    recall = recall_score(y_true, y_pred, pos_label=target_class, average='binary')
    print(f'Precision (Class {target_class}): {precision:.4f}')
    print(f'Recall (Class {target_class}): {recall:.4f}')
    print(f'F1 Score (Class {target_class}): {f1:.4f}')
    return f1, probs.detach().cpu().numpy()

# Train the MLP for 200 full-batch epochs, printing the loss after each one.
for epoch in range(1, 201):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:
all_probs = []  # collects every class-1 probability returned by test()

# test() returns the class-1 F1 score, not an accuracy.
test_f1, probs = test()
test_acc = test_f1  # legacy name, kept in case later cells still read it

all_probs.extend(probs)
print(f'Test F1 Score: {test_f1:.4f}')

# Convert the collected probabilities to a NumPy array.
all_probs_np = np.array(all_probs)

# 90th percentile of the class-1 probabilities
# (roughly 6% of the original data is labelled anomalous).
percentile_90 = np.percentile(all_probs_np, 90)
print(f'90th Percentile Probability: {percentile_90:.4f}')

# Histogram of the class-1 probabilities with the percentile marked.
plt.hist(all_probs_np, bins=30, alpha=0.75, color='blue')
plt.axvline(percentile_90, color='red', linestyle='dashed', linewidth=2, label='90th Percentile')
plt.title('Probability Distribution of Class 1')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.legend()
plt.show()

## Graph Neural Network (GNN)

将全连接层替换成GCN层
* 目前使用的GNN未考虑边的属性值（下一步加入边属性更新的GNN）
* 样本不平衡解决方法：在交叉熵损失中按各类别样本数的倒数设置类别权重

In [None]:
from torch_geometric.nn import GCNConv


class GCN(torch.nn.Module):
    """Two-layer graph convolutional network for node classification.

    The network is GCNConv -> ReLU -> Dropout(p=0.5) -> GCNConv and returns
    raw (unnormalised) class scores per node. Edge attributes are not used.

    Args:
        hidden_channels: Width of the hidden layer.
        in_channels: Number of input features per node. Defaults to 7, the
            value that used to be hard-coded.
        out_channels: Number of output classes. Defaults to 2, the value that
            used to be hard-coded.
    """

    def __init__(self, hidden_channels, in_channels=7, out_channels=2):
        super().__init__()
        # Re-seed the global RNG so every instance starts from the same
        # initial weights (this also resets torch's global random state).
        torch.manual_seed(1234567)
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        """Return per-node class scores; both layers use the same ``edge_index``."""
        x = self.conv1(x, edge_index)
        x = x.relu()
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, p=0.5, training=self.training)
        return self.conv2(x, edge_index)

# Instantiate the GCN with 16 hidden units and print its layer summary.
model = GCN(hidden_channels=16)
print(model)

In [None]:

# Fresh GCN plus its optimiser and loss; from here on `model`, `optimizer`
# and `criterion` refer to the GCN experiment.
model = GCN(hidden_channels=16)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss(weight=weights) # class-weighted loss; `weights` must already be defined


def count_substrings_in_file(file_path, substring):
    """Count non-overlapping occurrences of ``substring`` in a text file.

    Returns the count as an int on success. Failures are reported through the
    return value rather than raised: the string "File not found." when the
    file does not exist, or the exception's message for any other error.
    """
    try:
        with open(file_path, 'r') as handle:
            return handle.read().count(substring)
    except FileNotFoundError:
        return "File not found."
    except Exception as err:
        # Best effort by design: hand the error text back to the caller.
        return str(err)


def save_model_txt(model, scale_factor, filename):
    """Write all parameters of a PyTorch model to a text file as scaled integers.

    Every tensor in ``model.state_dict()`` is flattened (in state-dict order),
    multiplied by ``scale_factor`` and rounded to the nearest integer. The
    output file has two lines: the total number of values, followed by the
    values themselves separated by single spaces.

    Args:
        model (torch.nn.Module): Model whose state dict is exported.
        scale_factor (float): Multiplier applied to every value before rounding.
        filename (str): Output path; an existing file is overwritten.
    """
    all_params = []
    for value in model.state_dict().values():
        # detach/cpu/reshape keep this working for tensors that live on
        # another device or are not contiguous (view(-1).numpy() would fail).
        flat = value.detach().cpu().reshape(-1).numpy()
        # Scale in float64 so large factors such as 1e10 do not lose digits
        # to float32 rounding, and round into an explicit 64-bit integer:
        # a bare `int` dtype is only 32 bits on some platforms and would
        # overflow. Products beyond the int64 range are still not representable.
        scaled = np.round(flat.astype(np.float64) * scale_factor).astype(np.int64)
        all_params.extend(scaled.tolist())

    with open(filename, 'w') as f:
        f.write(str(len(all_params)) + '\n')
        f.write(' '.join(map(str, all_params)) + '\n')

def train(mask=None, save_params=True):
    """Run one full-batch optimisation step of the GCN.

    Uses the notebook-level ``model``, ``optimizer``, ``criterion``, ``data``
    and ``index``.

    Args:
        mask: Optional boolean/index tensor selecting the nodes that contribute
            to the loss. ``None`` (default) keeps the previous behaviour of
            computing the loss over every node.
            NOTE(review): the MLP baseline trains on ``train_mask`` only;
            pass the same mask here to hold out the test nodes.
        save_params: When True (default, previous behaviour) the scaled
            parameters are written to ``GCN_params_<index>.txt`` after the
            step. Pass False to skip the file write, e.g. on intermediate
            epochs.

    Returns:
        The loss tensor of this step.
    """
    model.train()
    optimizer.zero_grad()
    # The forward pass always runs on the whole graph; the mask only
    # restricts which node outputs enter the loss.
    out = model(data.x, data.edge_index)
    if mask is None:
        loss = criterion(out, data.y)
    else:
        loss = criterion(out[mask], data.y[mask])
    loss.backward()
    optimizer.step()
    if save_params:
        save_model_txt(model, scale_factor=1e10, filename='GCN_params'+'_'+str(index)+'.txt')
    return loss

def test(threshold=0.62, mask=None):
    """Evaluate the GCN and print precision / recall / F1 for class 1.

    Uses the notebook-level ``model`` and ``data``.

    Args:
        threshold: Minimum class-1 probability for a node to be predicted as
            class 1. Defaults to 0.62, the value that used to be hard-coded.
        mask: Optional boolean/index tensor selecting the nodes to evaluate.
            ``None`` (default) keeps the previous behaviour of scoring every
            node.
            NOTE(review): with the default, nodes that contributed to the
            training loss are also evaluated; pass ``test_mask`` for a
            held-out score comparable to the MLP baseline.

    Returns:
        A tuple ``(f1, probs)``: the class-1 F1 score and a NumPy array with
        the class-1 probability of every evaluated node.
    """
    model.eval()
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        out = model(data.x, data.edge_index)

    y_true = data.y
    if mask is not None:
        out = out[mask]
        y_true = y_true[mask]

    class_probs = torch.softmax(out, dim=1)
    probs = class_probs[:, 1]  # probability of class 1
    pred = predict_with_threshold(class_probs, threshold)

    # Metrics for the positive class. With average='binary' scikit-learn
    # selects the reported class through pos_label; the `labels` argument is
    # not used in that mode.
    target_class = 1
    y_true = y_true.cpu()
    y_pred = pred.cpu()
    f1 = f1_score(y_true, y_pred, pos_label=target_class, average='binary')
    precision = precision_score(y_true, y_pred, pos_label=target_class, average='binary')
    recall = recall_score(y_true, y_pred, pos_label=target_class, average='binary')
    print(f'Precision (Class {target_class}): {precision:.4f}')
    print(f'Recall (Class {target_class}): {recall:.4f}')
    print(f'F1 Score (Class {target_class}): {f1:.4f}')
    return f1, probs.detach().cpu().numpy()


# Train the GCN for 1000 full-batch epochs, printing the loss after each one.
for epoch in range(1, 1001):
    loss = train()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

In [None]:

all_probs = []  # collects every class-1 probability returned by test()

# test() returns the class-1 F1 score, not an accuracy.
test_f1, probs = test()
test_acc = test_f1  # legacy name, kept in case later cells still read it

all_probs.extend(probs)
print(f'Test F1 Score: {test_f1:.4f}')

# Convert the collected probabilities to a NumPy array.
all_probs_np = np.array(all_probs)

# 90th percentile of the class-1 probabilities
# (roughly 6% of the original data is labelled anomalous).
percentile_90 = np.percentile(all_probs_np, 90)
print(f'90th Percentile Probability: {percentile_90:.4f}')

# Histogram of the class-1 probabilities with the percentile marked.
plt.hist(all_probs_np, bins=30, alpha=0.75, color='blue')
plt.axvline(percentile_90, color='red', linestyle='dashed', linewidth=2, label='90th Percentile')
plt.title('Probability Distribution of Class 1')
plt.xlabel('Probability')
plt.ylabel('Frequency')
plt.legend()
plt.show()

#### 存在的问题：样本不均衡是否影响了模型的判别结果？