In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import Node2Vec
from torch_geometric.data import Data
from torch_geometric.nn import SAGEConv
from torch_geometric.nn import GCNConv
from torch_geometric.loader import NeighborLoader
from torch.utils.data import Dataset, DataLoader

from sklearn.svm import SVC
from sklearn.metrics import precision_score, accuracy_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
path_dataset = 'data/final_data/dataset_torch/'
path_model_out = 'data/final_data/baseline/'

## visualization

In [38]:
def output_metrics(X_test, y_test, model, model_name):
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix
    
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    
    print(f"Accuracy: {accuracy*100:.1f}")
    print(f"Precision: {precision*100:.1f}")
    print(f"Recall: {recall*100:.1f}")
    print(f"F1 Score: {f1*100:.1f}")
    
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

    # result = {}
    # result['model'] = model_name
    # result['accuracy'] = accuracy
    # result['precision'] = precision
    # result['recall'] =recall
    # result['f1'] = f1
    # return result
    metrics_df = pd.DataFrame({
            'Dataset': [model_name],
            'Accuracy': [accuracy],
            'Precision': [precision],
            'Recall': [recall],
            'F1 Score': [f1]
        })
    
    return metrics_df

def plot_confusion_matrix(X_test, y_test, model):
    from sklearn.metrics import confusion_matrix
    from matplotlib.colors import Normalize

    # 预测并输出混淆矩阵 
    y_pred = model.predict(X_test)
    cm = confusion_matrix(y_test, y_pred)
    row_sums = cm.sum(axis=1)[:, np.newaxis]
    cm_percentage = cm / row_sums * 100
    
    # 创建图像和轴
    fig, ax = plt.subplots(figsize=(8, 10))
    # 控制cbar
    norm = Normalize(vmin=0, vmax=100)
    im = ax.imshow(cm_percentage, interpolation='nearest', cmap=plt.cm.Blues, norm=norm)
    # 添加颜色条
    cbar = ax.figure.colorbar(im, ax=ax, fraction=0.046, pad=0.04)
    cbar.ax.set_ylabel('Percentage', rotation=-90, va="bottom")
    # 设置轴的标签和其他属性
    ax.set(
        xticks=np.arange(cm.shape[1]),
        yticks=np.arange(cm.shape[0]),
        xticklabels=np.unique(y_test), 
        yticklabels=np.unique(y_test),
        ylabel='True label',
        xlabel='Predicted label'
    )
    # 旋转x轴标签，使其可读
    plt.setp(ax.get_xticklabels(), ha="right", rotation_mode="anchor")
    # 在每个单元格中写入百分比值
    fmt = '.1f'  # 保留一位小数点的格式
    thresh = cm_percentage.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            text = '{:.1f}%'.format(cm_percentage[i, j])
            ax.text(j, i, text,
                    ha="center", va="center",
                    color="white" if cm_percentage[i, j] > thresh else "black")
            
    fig.tight_layout()
    plt.show()

## Data conversion

In [11]:
class GraphToMLPDataset(Dataset):
    def __init__(self, data, train_mask=None, val_mask=None):
        self.x = data.x
        self.y = data.y
        self.train_mask = train_mask
        self.val_mask = val_mask

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        if self.train_mask is not None and self.val_mask is not None:
            # 如果有mask，则根据mask返回训练或验证集的数据
            if self.train_mask[idx]:
                return self.x[idx], self.y[idx]
            elif self.val_mask[idx]:
                return self.x[idx], self.y[idx]
        else:
            # 如果没有mask，则返回所有数据
            return self.x[idx], self.y[idx]

def get_dataset(in_dataset, model_name=None):
    if model_name == 'node2vec':
        out_datdaset = Data(
            edge_index=in_dataset.edge_index,  
            edge_attr=in_dataset.edge_attr
        )
        return out_datdaset
    elif model_name == 'mlp':
        out_datdaset = Data(
            x = dataset.x,
            y = dataset.y, 
            train_mask = dataset.train_mask,
            val_mask = dataset.val_mask,
            num_classes = dataset.num_classes
        )
        return out_datdaset

## Node2vec

### Model

In [5]:
# 自定义Node2Vec以考虑边的权重
class WeightedNode2Vec(Node2Vec):
    def __init__(self, edge_index, edge_weight, *args, **kwargs):
        super().__init__(edge_index, *args, **kwargs)
        self.edge_weight = edge_weight

    def random_walk(self, batch, walk_length):
        row, col = self.adj_t.storage._row, self.adj_t.storage._col
        rowptr, col, weight = self.adj_t.csr()
        walk = torch.empty((batch.size(0), walk_length), dtype=torch.long, device=batch.device)
        walk[:, 0] = batch

        for i in range(1, walk_length):
            neighbors = rowptr[walk[:, i-1].repeat_interleave(rowptr[walk[:, i-1]+1] - rowptr[walk[:, i-1]])] + col[rowptr[walk[:, i-1]]]
            prob = self.edge_weight[rowptr[walk[:, i-1]].repeat_interleave(rowptr[walk[:, i-1]+1] - rowptr[walk[:, i-1]])]
            prob = prob / prob.sum(dim=-1, keepdim=True)
            walk[:, i] = neighbors[torch.multinomial(prob, 1).view(-1)]
        
        return walk

In [6]:
def train_node2vec(model, loader, epochs):
    for epoch in range(epochs+1):
        model.train()
        total_loss = 0
        for pos_rw, neg_rw in loader:
            optimizer.zero_grad()
            loss = model.loss(pos_rw.to(device), neg_rw.to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
    return total_loss / len(loader)

### Train

In [21]:
epochs = 50

for i,dataset_ in enumerate([dataset for dataset in os.listdir(path_dataset) if dataset[-5:] != '1.pth' and dataset.endswith('.pth')]):
    print(dataset_)
    dataset = torch.load(path_dataset+dataset_)
    dataset_node2vec = get_dataset(dataset, 'node2vec')

    node2vec = WeightedNode2Vec(
        dataset_node2vec.edge_index,
        dataset_node2vec.edge_attr,
        embedding_dim=128,
        walks_per_node=10,
        walk_length=80,
        context_size=10,
        p=1.0,
        q=1.0,
        num_negative_samples=1
    ).to(device)

    optimizer = torch.optim.Adam(node2vec.parameters(), lr=0.01)
    loader = node2vec.loader(batch_size=128, shuffle=True, num_workers=4)

    print('Train Node2vec...')
    train_node2vec(node2vec, loader, epochs)
    # torch.save(node2vec, path_model_out + dataset_.split('.')[0] + '_node2vec.pth')
    
    
    embeddings = node2vec()
    pd.DataFrame(embeddings.cpu().detach().numpy()).to_csv(path_model_out + dataset_.split('.')[0] + '_node2vec.csv', index=False)
    
    # train_embs = embeddings[dataset.train_mask].detach().cpu().numpy()
    # train_y = dataset.y[dataset.train_mask].cpu().numpy()
    
    # val_embs = embeddings[dataset.val_mask].detach().cpu().numpy()
    # val_y = dataset.y[dataset.val_mask].cpu().numpy()

shanghai_2018.pth
Train Node2vec...
shanghai_2019.pth
Train Node2vec...
suzhou_2018.pth
Train Node2vec...


### SVM

In [59]:
svc = SVC(kernel='rbf', C=0.05, gamma='scale').fit(train_embs, train_y)

In [22]:
csvs = [
    'data/final_data/baseline/shanghai_2018_node2vec.csv',
    'data/final_data/baseline/shanghai_2019_node2vec.csv',
    'data/final_data/baseline/suzhou_2018_node2vec.csv'
]
datasets = [
    'data/final_data/dataset_torch/shanghai_2018.pth',
    'data/final_data/dataset_torch/shanghai_2019.pth',
    'data/final_data/dataset_torch/suzhou_2018.pth'
]

In [58]:
dataset = torch.load(datasets[2])
df = pd.read_csv(csvs[2]).values

train_embs = df[dataset.train_mask]
train_y = dataset.y[dataset.train_mask].cpu().numpy()
val_embs = df[dataset.val_mask]
val_y = dataset.y[dataset.val_mask].cpu().numpy()

In [67]:
from sklearn.svm import SVC

svc = SVC(kernel='rbf', C=1, gamma='scale', decision_function_shape='ovo').fit(train_embs, train_y)
df = output_metrics(val_embs, val_y, svc, csvs[2].split('/')[-1][:-4])

Accuracy: 15.5
Precision: 14.7
Recall: 17.0
F1 Score: 13.2

Classification Report:
              precision    recall  f1-score   support

           0       0.20      0.43      0.28       368
           1       0.16      0.10      0.12       368
           2       0.07      0.23      0.10        88
           3       0.14      0.15      0.14       190
           4       0.37      0.12      0.18       773
           5       0.10      0.09      0.10       193
           6       0.10      0.04      0.06       379
           7       0.11      0.05      0.07       274
           8       0.08      0.31      0.12       130

    accuracy                           0.15      2763
   macro avg       0.15      0.17      0.13      2763
weighted avg       0.20      0.15      0.15      2763

Confusion Matrix:
 [[159  33  48  13  20  20  17  21  37]
 [132  38  43  21  27  18  16  33  40]
 [ 25   9  20   4   7   3   4   3  13]
 [ 41  14   9  29  16  19  16   3  43]
 [185  71  63  56  94  60  38  37 169

## MLP

### Model

In [17]:
class MLP(torch.nn.Module):
    def __init__(self, dim_in, dim_h, dim_out): 
        super().__init__() 
        self.linear1 = nn.Linear(dim_in, dim_h) 
        self.linear2 = Linear(dim_h, dim_h) 
        self.linear3 = Linear(dim_h, dim_out)

    def forward(self, x): 
        x = self.linear1(x) 
        x = F.relu(x) 
        x = self.linear2(x)
        x = F.relu(x) 
        x = self.linear3(x)
        return x

def train_mlp(model, data, criterion, optimizer, epochs, device):
    losses = []
    model.train()
    for epoch in range(epochs+1):
        data = data.to(device)
        optimizer.zero_grad()
        _ = model(data.x)

        out = F.log_softmax(_, dim=1)

        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        losses.append(loss.item())

        loss.backward()
        optimizer.step()
        
    return losses

def test_mlp(model, data, dataset_name):
    _ = model(data.x)
    out = F.log_softmax(_, dim=1)
    
    y_pred = out.argmax(dim=1)[data.val_mask].cpu()
    y_true = data.y[data.val_mask].cpu()
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    metrics_df = pd.DataFrame({
            'Dataset': [dataset_name],
            'Accuracy': [accuracy],
            'Precision': [precision],
            'Recall': [recall],
            'F1 Score': [f1]
        })
    
    return metrics_df

### Train

In [18]:
df_all = pd.DataFrame()
epochs = 2000

for i,dataset_ in enumerate([dataset for dataset in os.listdir(path_dataset) if dataset[-5:] != '1.pth' and dataset.endswith('.pth')]):
    print(dataset_)
    dataset = torch.load(path_dataset+dataset_)
    dataset_mlp = get_dataset(dataset, 'mlp')
    
    mlp = MLP(dataset_mlp.num_node_features, 256, dataset_mlp.num_classes).to(device)
    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=1e-3, weight_decay=5e-4)

    print('Train MLP...')
    loss = train_mlp(mlp, dataset_mlp, criterion, optimizer, epochs, device)
    df = test_mlp(mlp, dataset_mlp, dataset_.split('.')[0] + '_mlp')
    df_all = pd.concat([df_all, df])
    torch.save(mlp, path_model_out + dataset_.split('.')[0] + '_mlp.pth')

shanghai_2018.pth
Train MLP...
shanghai_2019.pth
Train MLP...
suzhou_2018.pth
Train MLP...


In [62]:
df_all.to_csv('data/final_data/other/evaluation_mlp.csv' ,index=False)

## GCN

### Model

In [3]:
class GCN(nn.Module):
    def __init__(self, dim_in, dim_h, dim_out, num_layers=5):
        super(GCN, self).__init__()
        # 网络层数
        self.num_layers = num_layers

        self.convs = nn.ModuleList()

        # 输入层
        self.convs.append(GCNConv(dim_in, dim_h))
        
        # 中间层
        for _ in range(num_layers - 2):
            self.convs.append(GCNConv(dim_h, dim_h))
        
        # 输出层
        self.convs.append(GCNConv(dim_h, dim_out))

        self.apply(self.weights_init)

    def weights_init(self, m):
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight.data)
            if m.bias is not None:
                m.bias.data.fill_(0.0)

    def forward(self, x, edge_index, edge_weight=None):
        for i in range(self.num_layers):
            if i == 0:
                h = self.convs[i](x, edge_index, edge_weight)
                h = F.relu(h)
                h = F.dropout(h, p=0.6, training=self.training)
                
            elif i != (self.num_layers - 1):
                h = self.convs[i](h, edge_index, edge_weight)
                h = F.relu(h)
                h = F.dropout(h, p=0.6, training=self.training)
                
            else:
                h = self.convs[i](h, edge_index, edge_weight)

        return h

### Train

In [8]:
def train(model, data, optimizer, criterion, save_model, epochs=200):
    train_loss = []
    val_loss = []
    val_accs = []
    max_acc = 0

    early_stopping_counter = 0
    min_val_loss = float('inf')

    model.train()
    for epoch in range(epochs+1):
        data = data.to(device)
        optimizer.zero_grad()
        _ = model(data.x, data.edge_index, data.edge_attr)
        out = F.log_softmax(_, dim=1)

        loss = criterion(out[data.train_mask], data.y[data.train_mask])
        train_loss.append(loss.item())

        loss.backward()
        optimizer.step()

        preds = out.argmax(dim=1)[data.train_mask].cpu()
        acc = accuracy_score(data.y[data.train_mask].cpu(), preds)

        f1 = f1_score(data.y[data.train_mask].cpu(), preds, average='macro')

        model.eval()
        with torch.no_grad():
            val_loss_ = criterion(out[data.val_mask], data.y[data.val_mask])
            val_loss.append(val_loss_.item())
            
            _ = model(data.x, data.edge_index, data.edge_attr)
            out = F.log_softmax(_, dim=1)
            
            val_acc = accuracy_score(data.y[data.val_mask].cpu(), out.argmax(dim=1)[data.val_mask].cpu())
            val_accs.append(val_acc)

            if val_loss_ > min_val_loss:
                early_stopping_counter += 1
                if early_stopping_counter >= 10:
                    print("早停机制")
                    break

        if epoch % 10 == 0:
            print(f'Epoch [{epoch:03d}/{epochs}], Train_Loss: {loss.item():0.3f}, Val_Loss: {val_loss[-1]:0.3f}, Val_acc: {max(val_accs):.3f}')
            val_accs = []

In [5]:
def test_gcn(model, data, dataset_name):
    model.eval()
    _ = model(data.x, data.edge_index, data.edge_attr)
    out = F.log_softmax(_, dim=1)
    
    y_pred = out.argmax(dim=1)[data.val_mask].cpu()
    y_true = data.y[data.val_mask].cpu()
    
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='macro')
    recall = recall_score(y_true, y_pred, average='macro')
    f1 = f1_score(y_true, y_pred, average='macro')

    metrics_df = pd.DataFrame({
            'Dataset': [dataset_name],
            'Accuracy': [accuracy],
            'Precision': [precision],
            'Recall': [recall],
            'F1 Score': [f1]
        })
    
    return metrics_df

In [9]:
layers = [3,3,3]
dataset_name = [dataset for dataset in os.listdir(path_dataset) if dataset[-8:] != 'bert.pth' and dataset.endswith('.pth')]

i = 0
dataset_ = dataset_name[i]
 
print(dataset_)
dataset_gcn = torch.load(path_dataset+dataset_)
gcn = GCN(dataset_gcn.num_node_features, dataset_gcn.num_node_features, dataset_gcn.num_classes, layers[i]).to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(gcn.parameters(), lr=1e-3, weight_decay=5e-4)
train(gcn, dataset, optimizer, criterion, path_model_out+dataset_, epochs=250)

suzhou_2018.pth
Epoch [000/250], Train_Loss: 2.197, Val_Loss: 2.196, Val_acc: 0.256
Epoch [010/250], Train_Loss: 2.053, Val_Loss: 2.090, Val_acc: 0.339
Epoch [020/250], Train_Loss: 1.734, Val_Loss: 1.817, Val_acc: 0.450
Epoch [030/250], Train_Loss: 1.471, Val_Loss: 1.559, Val_acc: 0.529
Epoch [040/250], Train_Loss: 1.331, Val_Loss: 1.408, Val_acc: 0.574
Epoch [050/250], Train_Loss: 1.237, Val_Loss: 1.310, Val_acc: 0.607
Epoch [060/250], Train_Loss: 1.163, Val_Loss: 1.274, Val_acc: 0.638
Epoch [070/250], Train_Loss: 1.110, Val_Loss: 1.223, Val_acc: 0.650
Epoch [080/250], Train_Loss: 1.070, Val_Loss: 1.187, Val_acc: 0.674
Epoch [090/250], Train_Loss: 1.034, Val_Loss: 1.182, Val_acc: 0.676


In [None]:
# df_all = pd.DataFrame()

In [None]:
df = test_gcn(gcn, dataset_gcn, dataset_.split('.')[0] + '_mlp')
df_all = pd.concat([df_all,df])

In [None]:
df_all.to_csv('data/final_data/other/evaluation_gcn.csv' ,index=False)