In [None]:
# This file is part of MARTRIX.
#
# MARTRIX is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# MARTRIX is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with MARTRIX. If not, see <https://www.gnu.org/licenses/>.
import pandas as pd
import numpy as np
import tqdm
import csv

import torch
from torch.nn import Linear
from torch_geometric.data import Data
from torch_geometric.nn import GCNConv, GATConv
from torch_geometric.explain import Explainer, GNNExplainer

import torch.nn.functional as F

from typing import Dict, List, Optional, Union
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt


In [None]:
feature_names = [
                "b","c","d", "Gender", "BLACK_AFRICAN_AMERICAN",
                                  "OTHER", "ASIAN", "pp_degree","pp_cls_coef",
                                  "pp_exposure_numerator","hh_degree"
                                  ,"education_center" , "assisted_living" ,
                                  "healthcare" ,   "charities_homeless" ,  "detention_center" ,
                                  "commercial_offices" ,  "government","hh_exposure_numerator" ,"hhnum_edu" ,"ppdeg_homeless","ppdeg_edu"]

print(len(feature_names))

22


In [None]:
folder = "../data/"
att_name = "att_pp_main_interaction.csv"
edge_list_name = ["edge_pp_main.csv", "edge_hh_main.csv","edge_pv_main_uniq.csv", "edge_pv_main_1.csv", "edge_pv_main_2.csv", "edge_pv_main_3.csv"
, "edge_pv_main_4.csv", "edge_pv_main_5.csv", "edge_pv_main_6.csv", "edge_pv_main_7.csv"]


In [None]:
att = pd.read_csv(folder + att_name)
att.columns=['NodeId', 'EventID', 'Age', 'Gender', 'pp_degree', 'pp_page_rank',
       'pp_cls_coef', 'pp_net_exp', 'pp_eigen', 'labels',
       'BLACK_AFRICAN_AMERICAN', 'OTHER', 'WHITE', 'ASIAN',
       'pv_binalized_degree', 'a', 'b', 'c', 'd', 'education_center',
       'assisted_living', 'healthcare', 'charities_homeless',
       'detention_center', 'commercial_offices', 'government', 'sum',
       'pp_exposure_numerator', 'hh_exposure_numerator', 'hh_degree',
       'hh_net_exp',"hhnum_edu" ,"ppdeg_homeless","ppdeg_edu"]
att['features'] = att.apply(lambda row: np.array([ row['Gender'], row['pp_degree'],  row['pp_cls_coef']
                                                  , row['BLACK_AFRICAN_AMERICAN'], row['OTHER']
                                                  , row['ASIAN'], row['b'], row['c'], row['d']
                                                  , row['education_center'], row['assisted_living'], row['healthcare']
                                                  , row['charities_homeless'], row['detention_center'], row['commercial_offices'], row['government']
                                                   , row['hhnum_edu'], row['ppdeg_homeless'], row['ppdeg_edu']
                                                  , row['pp_exposure_numerator'], row['hh_exposure_numerator'], row['hh_degree']]), axis=1)

edge_list = []
for name in edge_list_name:
      edge_list.append(pd.read_csv(folder + name))
for edge in edge_list:
  row_idx = []
  col_idx = []
  for i in range(edge.shape[0]):
        id_from = edge.iloc[i,0]
        id_to = edge.iloc[i,1]
        row_id = att.loc[:,"NodeId"][att["EventID"] == id_from]
        row_idx.append(row_id)
        col_id = att.loc[:,"NodeId"][att["EventID"] == id_to]
        col_idx.append(col_id)

In [None]:
positive_ids = att['NodeId'].loc[att['labels'] == 1]

positive_ids = positive_ids.values
type(positive_ids[0])

numpy.int64

In [None]:
len(att['features'][0])

22

In [None]:
source_node = np.concatenate((col_idx, row_idx))
target_node = np.concatenate((row_idx, col_idx))

In [None]:
train_share = 0.70
n_papers=2264
cut_off = int(n_papers * train_share)
print("train count:", cut_off)
train_mask = n_papers * [False]
train_mask[:cut_off] = cut_off * [True]
test_mask = [not e for e in train_mask]

print("test count:", test_mask.count(1))

train count: 1584
test count: 680


In [None]:
x = torch.tensor(
    att['features'].values.tolist(), dtype=torch.float
    )

y = torch.tensor(
    att['labels'].values.tolist(), dtype=torch.long
)

edge_index = torch.tensor(np.array([source_node, target_node]), dtype=torch.int64)

graph_object = Data(x=x, edge_index=edge_index, y=y)
graph_object.train_mask = torch.tensor(train_mask)
graph_object.test_mask = torch.tensor(test_mask)

  x = torch.tensor(


In [None]:
print(graph_object)
print("==============================================================")

print(f"Number of nodes: {graph_object.num_nodes}")
print(f"Number of edges: {int(graph_object.num_edges/2)}")
print(f"Average node degree: {(graph_object.num_edges) / graph_object.num_nodes:.2f}")
print(f"Number of training nodes: {graph_object.train_mask.sum()}")
print(f"Number of test nodes: {graph_object.test_mask.sum()}")
print(f"Training node label rate: {int(graph_object.train_mask.sum()) / graph_object.num_nodes:.2f}")
print(f"Test node label rate: {int(graph_object.test_mask.sum()) / graph_object.num_nodes:.2f}")
print(f"Contains isolated nodes: {graph_object.has_isolated_nodes()}")
print(f"Contains self-loops: {graph_object.has_self_loops()}")
print(f"Is undirected: {graph_object.is_undirected()}")

Data(x=[2264, 22], edge_index=[2, 296, 1], y=[2264], train_mask=[2264], test_mask=[2264])
Number of nodes: 2264
Number of edges: 0
Average node degree: 0.00
Number of training nodes: 1584
Number of test nodes: 680
Training node label rate: 0.70
Test node label rate: 0.30
Contains isolated nodes: True
Contains self-loops: False
Is undirected: False


In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

X_train = X_train.numpy()
X_test = X_test.numpy()
y_train = y_train.numpy()

### GAT Training

In [None]:
# Define accuracy
def accuracy(pred_y, y):
    return (pred_y == y).sum() / len(y)

In [None]:
#GCN

device = 'cpu'

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.gcn = GCNConv(graph_object.num_node_features, 16)
        self.out = Linear(16, len(graph_object.y.unique()))

    def forward(self, x, edge_index):
        h = self.gcn(x, edge_index).relu()
        z = self.out(h)
        return h, z

model = GCN()
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


GCN(
  (gcn): GCNConv(22, 16)
  (out): Linear(in_features=16, out_features=2, bias=True)
)


In [None]:
## For GCN Training

criterion = torch.nn.CrossEntropyLoss()

# Data for animations
embeddings = []
losses = []
accuracies = []
outputs = []

# Training loop
for epoch in range(2400):
    # Clear gradients
    optimizer.zero_grad()

    # Forward pass
    h,z = model(graph_object.x, graph_object.edge_index)

    # Calculate loss function
    loss = criterion(z[graph_object.train_mask], graph_object.y[graph_object.train_mask])

    # Calculate accuracy
    acc = accuracy(z.argmax(dim=1), graph_object.y)

    # Compute gradients
    loss.backward()

    # Tune parameters
    optimizer.step()

    # Store data for animations
    embeddings.append(h)
    losses.append(loss)
    accuracies.append(acc)
    outputs.append(z.argmax(dim=1))

    # Print metrics every 10 epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch:>3} | Loss: {loss:.2f} | Acc: {acc*100:.2f}%')

Epoch   0 | Loss: 0.70 | Acc: 64.80%
Epoch  10 | Loss: 0.68 | Acc: 64.80%
Epoch  20 | Loss: 0.65 | Acc: 64.80%
Epoch  30 | Loss: 0.64 | Acc: 64.80%
Epoch  40 | Loss: 0.62 | Acc: 64.71%
Epoch  50 | Loss: 0.61 | Acc: 65.64%
Epoch  60 | Loss: 0.60 | Acc: 66.25%
Epoch  70 | Loss: 0.59 | Acc: 66.74%
Epoch  80 | Loss: 0.58 | Acc: 67.93%
Epoch  90 | Loss: 0.57 | Acc: 69.30%
Epoch 100 | Loss: 0.56 | Acc: 69.74%
Epoch 110 | Loss: 0.55 | Acc: 70.14%
Epoch 120 | Loss: 0.55 | Acc: 70.76%
Epoch 130 | Loss: 0.54 | Acc: 71.60%
Epoch 140 | Loss: 0.53 | Acc: 72.22%
Epoch 150 | Loss: 0.53 | Acc: 72.39%
Epoch 160 | Loss: 0.52 | Acc: 72.92%
Epoch 170 | Loss: 0.52 | Acc: 73.01%
Epoch 180 | Loss: 0.52 | Acc: 73.81%
Epoch 190 | Loss: 0.51 | Acc: 74.29%
Epoch 200 | Loss: 0.51 | Acc: 74.20%
Epoch 210 | Loss: 0.51 | Acc: 74.29%
Epoch 220 | Loss: 0.51 | Acc: 74.38%
Epoch 230 | Loss: 0.51 | Acc: 74.38%
Epoch 240 | Loss: 0.51 | Acc: 74.60%
Epoch 250 | Loss: 0.50 | Acc: 74.69%
Epoch 260 | Loss: 0.50 | Acc: 74.78%
E

In [None]:
## GAT

device = 'cpu'

class GAT(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, heads):
        super().__init__()
        self.conv1 = GATConv(in_channels, hidden_channels, heads, dropout=0.1)
        # On the Pubmed dataset, use `heads` output heads in `conv2`.
        # self.conv2 = GATConv(hidden_channels * heads, out_channels, heads=1,
        #                      concat=False, dropout=0.6)
        #self.out = Linear(hidden_channels, len(graph_object.y.unique()))

    def forward(self, x, edge_index):
        x = F.dropout(x, p=0.1, training=self.training)
        x = F.relu(self.conv1(x, edge_index))

        return x

model = GAT(graph_object.num_features, 64, len(graph_object.y.unique()),1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005, weight_decay=5e-5)

print(model)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

GAT(
  (conv1): GATConv(22, 64, heads=1)
)


In [None]:
## GAT test

#criterion = torch.nn.CrossEntropyLoss()

# Data for animations
embeddings = []
losses = []
accuracies = []
outputs = []


for epoch in range(2400):
    optimizer.zero_grad()

    # Forward pass
    out = model(graph_object.x, graph_object.edge_index)

    loss = F.cross_entropy(out[graph_object.train_mask], graph_object.y[graph_object.train_mask])

    # Calculate accuracy
    acc = accuracy(out.argmax(dim=1), graph_object.y)


    loss.backward()
    optimizer.step()

    losses.append(loss)
    accuracies.append(acc)


    if (epoch+1) % 10 == 0:
        print('Epoch: {}, Loss: {:.4f}, Training Acc: {:.4f}'.format(epoch+1, loss.item(), acc))



Epoch: 10, Loss: 4.0673, Training Acc: 0.1285
Epoch: 20, Loss: 3.6870, Training Acc: 0.4448
Epoch: 30, Loss: 3.3100, Training Acc: 0.5093
Epoch: 40, Loss: 2.9796, Training Acc: 0.4973
Epoch: 50, Loss: 2.7081, Training Acc: 0.4947
Epoch: 60, Loss: 2.4818, Training Acc: 0.4912
Epoch: 70, Loss: 2.2962, Training Acc: 0.5256
Epoch: 80, Loss: 2.1726, Training Acc: 0.5406
Epoch: 90, Loss: 2.0576, Training Acc: 0.5804
Epoch: 100, Loss: 1.9220, Training Acc: 0.5963
Epoch: 110, Loss: 1.8639, Training Acc: 0.6378
Epoch: 120, Loss: 1.7315, Training Acc: 0.6590
Epoch: 130, Loss: 1.6854, Training Acc: 0.6656
Epoch: 140, Loss: 1.5802, Training Acc: 0.6837
Epoch: 150, Loss: 1.5180, Training Acc: 0.6815
Epoch: 160, Loss: 1.4670, Training Acc: 0.6992
Epoch: 170, Loss: 1.3971, Training Acc: 0.7023
Epoch: 180, Loss: 1.3964, Training Acc: 0.7058
Epoch: 190, Loss: 1.3465, Training Acc: 0.7080
Epoch: 200, Loss: 1.2998, Training Acc: 0.7054
Epoch: 210, Loss: 1.2828, Training Acc: 0.7111
Epoch: 220, Loss: 1.27

In [None]:
# GNN sklearn Metrics
from sklearn.metrics import f1_score, accuracy_score, recall_score, average_precision_score, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn import metrics
print('Accuracy: ', accuracy_score(graph_object.y[graph_object.test_mask], z.argmax(dim=1)[graph_object.test_mask]))

print('F1:', f1_score(graph_object.y[graph_object.test_mask], z.argmax(dim=1)[graph_object.test_mask]))

print('Recall:', recall_score(graph_object.y[graph_object.test_mask], z.argmax(dim=1)[graph_object.test_mask]))

print('AUPRC: ', average_precision_score(graph_object.y[graph_object.test_mask], z.argmax(dim=1)[graph_object.test_mask]))

print('AUC: ', roc_auc_score(graph_object.y[graph_object.test_mask], z.argmax(dim=1)[graph_object.test_mask]))
precisionn, recalll, _ = precision_recall_curve(graph_object.y[graph_object.test_mask], z.argmax(dim=1)[graph_object.test_mask])
pr_auc = metrics.auc(recalll, precisionn)
print('PRAUC: ', pr_auc)

false_alarm_rate = np.mean(z.argmax(dim=1)[graph_object.test_mask].numpy() == 1)
print('false_alarm_rate: ', false_alarm_rate)

Accuracy:  0.6779411764705883
F1: 0.574757281553398
Recall: 0.4713375796178344
AUPRC:  0.5911721832948409
AUC:  0.6632644182242177
PRAUC:  0.7258868173184284
false_alarm_rate:  0.29558823529411765


In [None]:
# GAT sklearn Metrics
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score, roc_auc_score

print('Accuracy: ', accuracy_score(graph_object.y[graph_object.test_mask], out.argmax(dim=1)[graph_object.test_mask]))

print('F1:', f1_score(graph_object.y[graph_object.test_mask], out.argmax(dim=1)[graph_object.test_mask]))

print('Recall:', recall_score(graph_object.y[graph_object.test_mask], out.argmax(dim=1)[graph_object.test_mask]))

print('AUPRC: ', average_precision_score(graph_object.y[graph_object.test_mask], out.argmax(dim=1)[graph_object.test_mask]))

print('AUC: ', roc_auc_score(graph_object.y[graph_object.test_mask], out.argmax(dim=1)[graph_object.test_mask]))
precisionn, recalll, _ = precision_recall_curve(graph_object.y[graph_object.test_mask], out.argmax(dim=1)[graph_object.test_mask])
pr_auc = metrics.auc(recalll, precisionn)
print('PRAUC: ', pr_auc)

false_alarm_rate = np.mean(out.argmax(dim=1)[graph_object.test_mask].numpy() == 1)
print('false_alarm_rate: ', false_alarm_rate)

Accuracy:  0.6558823529411765
F1: 0.4890829694323144
Recall: 0.35668789808917195
AUPRC:  0.5744827442654344
AUC:  0.6346281020500504
PRAUC:  0.7157622496981808
false_alarm_rate:  0.21176470588235294
