In [1]:
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib
import torch
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
from encoder import *
from evaluate import *

In [2]:
class Dataset():
    def __init__(self, dataset, dtype):
        self.dataset = dataset
        self.dtype = dtype
        ef1 = np.loadtxt("dataset/dataset_"+str(self.dataset) + "/" + self.dtype + "/edge_features.csv.gz", dtype=float, delimiter=',')
        e1 = np.loadtxt("dataset/dataset_"+str(self.dataset) + "/" + self.dtype + "/edges.csv.gz", dtype=int, delimiter=',')
        gl1 = np.loadtxt("dataset/dataset_"+str(self.dataset) + "/" + self.dtype + "/graph_labels.csv.gz", dtype=float, delimiter=',')
        nf1 = np.loadtxt("dataset/dataset_"+str(self.dataset) + "/" + self.dtype + "/node_features.csv.gz", dtype=float, delimiter=',')
        num_nodes = np.loadtxt("dataset/dataset_"+str(self.dataset) + "/" + self.dtype + "/num_nodes.csv.gz", dtype=int, delimiter=',')
        num_edges = np.loadtxt("dataset/dataset_"+str(self.dataset) + "/" + self.dtype + "/num_edges.csv.gz", dtype=int, delimiter=',')

        num_graphs = num_nodes.size
        # start = 125
        self.graphs = []
        self.data = []
        cnt = 0
        cnt0 = 0
        cnt1 = 0
        for i in range(0, num_graphs):
            num_node = num_nodes[i]
            num_edge = num_edges[i]
            edges = e1[np.sum(num_edges[:i]) : np.sum(num_edges[:i+1]), :]
            edge_features = torch.FloatTensor(ef1[np.sum(num_edges[:i]) : np.sum(num_edges[:i+1]), :])
            node_features = torch.FloatTensor(nf1[np.sum(num_nodes[:i]) : np.sum(num_nodes[:i+1]), :])
            node_data = dict((j, node_features[j, :]) for j in range(num_node))
            # print(node_data[0])
            
            if(np.isnan(gl1[i])):
                cnt+=1
                continue
            cnt0+=(gl1[i] == 0)
            cnt1+=(gl1[i] == 1)
            label = float(gl1[i])
            G = nx.Graph(y=label)
            label = torch.tensor(label)
            G.add_nodes_from([i for i in range(num_node)])
            for j, e in enumerate(edges):
                G.add_edge(e[0], e[1], edge_attr=edge_features[j])
            nx.set_node_attributes(G, node_data, name="X")
            edges = torch.tensor(list(edges))
            d = Data(edge_index=edges.T, x=node_features, edge_attr=edge_features, y = label)
            self.data.append(d)
            self.graphs.append(G)
            pos = nx.spring_layout(G)
            nx.draw_networkx(G, pos=pos, with_labels=True)
            nx.draw_networkx_edge_labels(G, pos=pos, font_size=5, clip_on=False)
            print(label)
            plt.show()
            break
        print("Number of nan: ", cnt)
        print("Number of 1: ", cnt1)
        print("Number of 0: ", cnt0)
    def get_data(self):
        return self.data
    def get_graphs(self):
        return self.graphs
dataset = Dataset(2, "train")

## Classification

In [454]:
# Visualize graph
# ---------- Code for drawing graph -----------
    # pos = nx.spring_layout(G)
    # nx.draw_networkx(G, pos=pos, with_labels=True)
    # nx.draw_networkx_edge_labels(G, pos=pos, font_size=5, clip_on=False)
    # print(label)
    # plt.show()
    # break

# print(gl1)

In [485]:
# Create a dataset
# batch size for mini-batch optimization
X_train = Dataset(2,"train")
X_val = Dataset(2,"valid")
X_train_dat = X_train.get_data()
X_val_dat = X_val.get_data()
batch_size = 128

train_loader = DataLoader(X_train_dat, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(X_val_dat,batch_size=batch_size, shuffle=True)
# print(loader)


Number of nan:  508
Number of 1:  1209
Number of 0:  5143
Number of nan:  70
Number of 1:  252
Number of 0:  536


In [431]:
print(len(val_loader))

1


In [475]:
from torch_geometric.nn import GCNConv, GATConv, GINConv, SAGEConv
from torch_geometric.nn.pool import global_mean_pool
import torch.nn.functional as F
import torch.nn.init as init
from torch.optim import lr_scheduler

In [496]:
# Basic GCN implementation

if torch.cuda.is_available():
    device = torch.device('cuda')
elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')


class GCN(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, emb_dim):
        super().__init__()
        self.conv1 = GATConv(emb_dim, hidden_channels, heads = 2)
        self.conv2 = GATConv(2*hidden_channels, hidden_channels, heads = 1)

        #NOTE: We cannot pass edge attr to GraphSage and GIN conv layers
        # self.conv1 = SAGEConv(emb_dim, hidden_channels, aggr='mean')  # 'mean' aggregation
        # self.conv2 = SAGEConv(hidden_channels, hidden_channels, aggr='mean')

        # self.conv1 = GINConv(torch.nn.Sequential(
        #     torch.nn.Linear(emb_dim, hidden_channels),
        #     torch.nn.ReLU(),
        #     torch.nn.Linear(hidden_channels, hidden_channels)
        # ))

        # self.conv2 = GINConv(torch.nn.Sequential(
        #     torch.nn.Linear(hidden_channels, hidden_channels),
        #     torch.nn.ReLU(),
        #     torch.nn.Linear(hidden_channels, hidden_channels)
        # ))

        self.edge_linear = torch.nn.Linear(emb_dim, 1)
        self.dropout = torch.nn.Dropout(p = 0.5)

        self.fc1 = torch.nn.Linear(hidden_channels, 64)
        self.bn1 = torch.nn.BatchNorm1d(64)

        self.fc2 = torch.nn.Linear(64, 32)
        self.bn2 = torch.nn.BatchNorm1d(32)

        self.fc3 = torch.nn.Linear(32, 16)
        self.bn3 = torch.nn.BatchNorm1d(16)

        self.fc4 = torch.nn.Linear(16, out_channels)
        self.node_encoder = NodeEncoder(emb_dim)
        self.edge_encoder = EdgeEncoder(emb_dim)

        #Initializations

        init.kaiming_uniform_(self.fc1.weight, mode='fan_in', nonlinearity='relu')
        init.kaiming_uniform_(self.fc2.weight, mode='fan_in', nonlinearity='relu')
        init.kaiming_uniform_(self.fc3.weight, mode='fan_in', nonlinearity='relu')
        init.kaiming_uniform_(self.fc4.weight, mode='fan_in', nonlinearity='relu')
        torch.nn.init.xavier_uniform_(self.edge_linear.weight)

    def forward(self, x, edge_index, edge_attr, batch, size):
        # print(x.shape)
        x = self.node_encoder(x.to(torch.long))
        # print(x.shape)
        edge_attr = self.edge_encoder(edge_attr.to(torch.long))
        temp = self.edge_linear(edge_attr)
        temp = torch.relu(temp)
        # x = self.conv1(x, edge_index).relu()
        # x = self.conv2(x, edge_index).relu()
        x = self.conv1(x, edge_index, temp).relu()
        x = self.conv2(x, edge_index, temp).relu()
        x = global_mean_pool(x, batch)
        # print(x.shape)
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)
        x = self.dropout(x)
        


        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)
        x = self.dropout(x)
        

        x = self.fc3(x)
        x = self.bn3(x)
        x = torch.relu(x)
        x = self.dropout(x)

        x = self.fc4(x)
        
        # x = torch.relu(x)
        
        # print(x.shape)
        x = F.sigmoid(x)
        # print(x)
        # x = x[root_mask, :]
        return x

In [497]:
def calculate_accuracy(outputs, targets):
    predictions = (outputs > 0.5).float()  # Convert probabilities to binary predictions (0 or 1)
    correct_predictions = (predictions == targets).float()
    accuracy = correct_predictions.sum().item() / targets.shape[1]
    return accuracy


In [498]:
model = GCN(64, 1, 128)
# model, data = model.to(device), data.to(device)
# model = model.to(device)

# optimizer = torch.optim.Adam([
#     dict(params=model.conv1.parameters(), weight_decay=0),
#     dict(params=model.conv2.parameters(), weight_decay=0),
#     dict(params=model.edge_linear.parameters(), weight_decay=0)
# ],lr=0.001)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001, weight_decay=1e-3)
# scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

In [499]:
optimizer.zero_grad()
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    model.to(device)
    total_loss = 0.0
    total_correct_train = 0
    total_samples_train = 0
    for i, batch in enumerate(train_loader):
        # Move batch to device
        batch.to(device)
        weights = torch.ones_like(batch.y.reshape(1,-1))
        weights[batch.y.reshape(1,-1) == 1] = 2.0
        # print(weights)
        # break
        loss_fun = torch.nn.BCELoss(weight= weights)
        # Forward pass
        # print(len(batch))
        # print(batch.x.shape)
        # print(batch.edge_index.shape)
        # if(batch.edge_index.shape[0] == 0):
        #     continue
        # print(batch.batch)
        out = model(batch.x, batch.edge_index.to(torch.long), batch.edge_attr, batch.batch, batch.y.shape[0])
        # Calculate and print loss
        # print(batch.y.shape[0])
        optimizer.zero_grad()
        loss = loss_fun(out.reshape(1, -1), batch.y.reshape(1, -1))
        # print(float(loss))

        # Backward pass and optimization
        loss.mean().backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss+=loss.item()
        total_correct_train += calculate_accuracy(out.reshape(1, -1), batch.y.reshape(1, -1)) * batch.y.shape[0]
        total_samples_train += batch.y.shape[0]
    # scheduler.step()
    average_loss = total_loss / len(train_loader)
    accuracy_train = total_correct_train / total_samples_train

     # Validation phase
    model.eval()
    with torch.no_grad():
        total_bce = 0.0
        total_correct_val = 0
        total_samples_val = 0

        for i, batch in enumerate(val_loader):
             # Move batch to device
            batch.to(device)
            # Forward pass
            loss_fun = torch.nn.BCELoss()
            if(batch.edge_index.shape[0] == 0):
                continue
            out = model(batch.x, batch.edge_index.to(torch.long), batch.edge_attr, batch.batch, batch.y.shape[0])
            # print(out.shape)
            loss = loss_fun(out.reshape(1, -1), batch.y.reshape(1, -1))
            loss = loss.mean()

            total_bce += loss.item()
            total_correct_val += calculate_accuracy(out.reshape(1, -1), batch.y.reshape(1, -1)) * batch.y.shape[0]
            total_samples_val += batch.y.shape[0]

        average_bce_val = total_bce / len(val_loader)
        accuracy_val = total_correct_val / total_samples_val
    # Print logs
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {average_loss:.4f}, Train Accuracy: {accuracy_train:.4f}, Val BCE: {average_bce_val:.4f}, Val Accuracy: {accuracy_val:.4f}")
# Make sure to clear the computation graph after the loop
torch.cuda.empty_cache()


torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size([128, 64])
torch.Size

KeyboardInterrupt: 

### Evaluation of Model

In [494]:
# Load data from csv files
evaluator = Evaluator('dataset-2')
# benchmark = DataLoader(X_train_dat, batch_size= len(X_train_dat), shuffle = True)
val_loader = DataLoader(X_val_dat, batch_size= len(X_val_dat), shuffle = True)
for i, batch in enumerate(val_loader):
    model.eval()
    # Move batch to device
    batch.to(device)
    # Forward pass
    y_pred = model(batch.x, batch.edge_index.to(torch.long), batch.edge_attr, batch.batch, batch.y.shape[0])
    y_true = batch.y
    y_true = y_true.unsqueeze(1)
    # print(y_pred.shape)
    # print(y_true.shape)
    input_dict = {'y_true': y_true, 'y_pred': y_pred}
    result = evaluator.eval(input_dict)
    print(result)



{'rocauc': 0.7466240227434258}


## Regression

In [263]:
# Create a dataset
# batch size for mini-batch optimization
X_train = Dataset(1,"train")
X_val = Dataset(1,"valid")
X_train_dat = X_train.get_data()
X_val_dat = X_val.get_data()
batch_size = 128

train_loader = DataLoader(X_train_dat, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(X_val_dat,batch_size=len(X_val_dat), shuffle=True)
# print(loader)

2688
672


In [266]:
class GCN_Reg(torch.nn.Module):
    def __init__(self, hidden_channels, out_channels, emb_dim):
        super().__init__()
        self.conv1 = GATConv(emb_dim, hidden_channels, heads = 2)
        self.conv2 = GATConv(2*hidden_channels, hidden_channels, heads = 1)
        self.edge_linear = torch.nn.Linear(emb_dim, 1)

        self.fc1 = torch.nn.Linear(hidden_channels, 64)
        self.bn1 = torch.nn.BatchNorm1d(64)

        self.fc2 = torch.nn.Linear(64, 32)
        self.bn2 = torch.nn.BatchNorm1d(32)

        self.fc3 = torch.nn.Linear(32, 16)
        self.bn3 = torch.nn.BatchNorm1d(16)

        self.fc4 = torch.nn.Linear(16, out_channels)
        self.node_encoder = NodeEncoder(emb_dim)
        self.edge_encoder = EdgeEncoder(emb_dim)

        #Initializations

        init.kaiming_uniform_(self.fc1.weight, mode='fan_in', nonlinearity='relu')
        init.kaiming_uniform_(self.fc2.weight, mode='fan_in', nonlinearity='relu')
        init.kaiming_uniform_(self.fc3.weight, mode='fan_in', nonlinearity='relu')
        init.kaiming_uniform_(self.fc4.weight, mode='fan_in', nonlinearity='relu')
        torch.nn.init.xavier_uniform_(self.edge_linear.weight)

    def forward(self, x, edge_index, edge_attr, batch, size):
        x = self.node_encoder(x.to(torch.long))
        edge_attr = self.edge_encoder(edge_attr.to(torch.long))
        temp = self.edge_linear(edge_attr)
        temp = torch.relu(temp)
        x = self.conv1(x, edge_index, temp).relu()
        x = self.conv2(x, edge_index, temp).relu()
        x = global_mean_pool(x, batch)
        # print(x)
        x = self.fc1(x)
        x = self.bn1(x)
        x = torch.relu(x)

        x = self.fc2(x)
        x = self.bn2(x)
        x = torch.relu(x)

        x = self.fc3(x)
        x = self.bn3(x)
        x = torch.relu(x)

        x = self.fc4(x)
        # x = torch.relu(x)
        
        # print(x)
        # x = F.sigmoid(x)
        # print(x)
        # x = x[root_mask, :]
        return x

In [267]:
model = GCN_Reg(32, 1, 64)
optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
# scheduler = lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.5)

In [268]:
loss_fun = torch.nn.MSELoss()
optimizer.zero_grad()
num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    model.to(device)
    total_loss = 0.0
    for i, batch in enumerate(train_loader):
        # Move batch to device
        batch.to(device)
        # Forward pass
        out = model(batch.x, batch.edge_index.to(torch.long), batch.edge_attr, batch.batch, batch.y.shape[0])
        # Calculate and print loss
        # print(batch.y.shape[0])
        optimizer.zero_grad()
        loss = loss_fun(out.reshape(1, -1), batch.y.reshape(1, -1))
        # print(float(loss))

        # Backward pass and optimization
        loss.mean().backward(retain_graph=True)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        total_loss+=loss.item()
    # scheduler.step()
    average_loss = total_loss / len(train_loader)

     # Validation phase
    model.eval()
    with torch.no_grad():
        total_bce = 0.0
        
        for i, batch in enumerate(val_loader):
             # Move batch to device
            batch.to(device)
            # Forward pass
            out = model(batch.x, batch.edge_index.to(torch.long), batch.edge_attr, batch.batch, batch.y.shape[0])
            loss = loss_fun(out.reshape(1, -1), batch.y.reshape(1, -1))
            loss = loss.mean()

            total_bce += loss.item()

        average_bce_val = total_bce / len(val_loader)

    # Print logs
    print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {average_loss:.4f} , Val MSE: {average_bce_val:.4f}")
# Make sure to clear the computation graph after the loop
torch.cuda.empty_cache()


Epoch 1/100, Train Loss: 4.6300 , Val MSE: 5.3591
Epoch 2/100, Train Loss: 2.9222 , Val MSE: 3.3516
Epoch 3/100, Train Loss: 2.0877 , Val MSE: 2.6498
Epoch 4/100, Train Loss: 1.6307 , Val MSE: 1.6991
Epoch 5/100, Train Loss: 1.3570 , Val MSE: 1.2526
Epoch 6/100, Train Loss: 1.1775 , Val MSE: 1.7201
Epoch 7/100, Train Loss: 1.0411 , Val MSE: 1.3102
Epoch 8/100, Train Loss: 0.9479 , Val MSE: 0.9673
Epoch 9/100, Train Loss: 0.8849 , Val MSE: 1.1387
Epoch 10/100, Train Loss: 0.8453 , Val MSE: 0.9408
Epoch 11/100, Train Loss: 0.8155 , Val MSE: 1.1325
Epoch 12/100, Train Loss: 0.7694 , Val MSE: 0.9483
Epoch 13/100, Train Loss: 0.7648 , Val MSE: 1.0358
Epoch 14/100, Train Loss: 0.7171 , Val MSE: 0.8852
Epoch 15/100, Train Loss: 0.7095 , Val MSE: 1.8739
Epoch 16/100, Train Loss: 0.6777 , Val MSE: 0.8664
Epoch 17/100, Train Loss: 0.6515 , Val MSE: 0.7740
Epoch 18/100, Train Loss: 0.6379 , Val MSE: 0.9181
Epoch 19/100, Train Loss: 0.6362 , Val MSE: 1.4369
Epoch 20/100, Train Loss: 0.6411 , Val M

### Evaluation of Model

In [269]:
evaluator = Evaluator('dataset-1')
for i, batch in enumerate(val_loader):
    model.eval()
    # Move batch to device
    batch.to(device)
    # Forward pass
    y_pred = model(batch.x, batch.edge_index.to(torch.long), batch.edge_attr, batch.batch, batch.y.shape[0])
    y_true = batch.y
    y_true = y_true.unsqueeze(1)
    # print(y_pred.shape)
    # print(y_true.shape)
    input_dict = {'y_true': y_true, 'y_pred': y_pred}
    result = evaluator.eval(input_dict)
    print(result)



{'rmse': 0.8347033262252808}
