In [None]:
import torch
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
from torch_geometric.data import Dataset, Data
from torch_geometric.nn import GCNConv
import tables
import os
from tqdm import tqdm
from torch.nn import Linear
import copy
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, cohen_kappa_score

In [None]:
class SOGraphDataset(Dataset):
    r"""
    Single-graph dataset built from '.h5' node-feature and edge files.

    Node order in the graph is: training rows, validation rows, test rows, tag rows.
    Labels are 0/1 for the first three groups and -1 for tag nodes.
    """

    # Kept at class level so that it shadows any read-only ``num_classes`` property
    # declared on the base ``Dataset`` (a per-instance assignment in ``__init__``
    # fails when the base class exposes the name without a setter).
    num_classes = 2

    def __init__(self, root, save_file, transform=None, pre_transform=None):
        r"""
        Initializing the dataset class
        
        Args:
            root (string): the path to 'raw/' folder and where the 'processed/' folder would be created
            save_file (string): the file name for storing the graph data object
            transform (callable, optional): forwarded unchanged to the base class
            pre_transform (callable, optional): forwarded unchanged to the base class
        """
        # Number of label-0 samples; label 1 has 10x as many (see get_tvs_split).
        self.class_size = 259258
        # Number of tag nodes appended after the labelled nodes (labelled -1 in process).
        self.total_tags = 48374
        self.save_file = save_file
        # NOTE(review): the attributes above are set before the base constructor runs,
        # presumably because it may call process()/processed_file_names -- keep this order.
        super(SOGraphDataset, self).__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        r"""
        Listing down the paths where the node embeddings and edges are stored
        """
        return [
            'training/features.h5',
            'val/features.h5',
            'test/features.h5',
            'tags/all_tags.h5',
            'edges.h5',
        ]

    @property
    def processed_file_names(self):
        r"""
        Returning the processed file name where the graph data object is stored
        """
        return self.save_file

    def download(self):
        r"""
        Needed in case graph data has to be downloaded
        """
        pass
    
    def get_tvs_split(self):
        r"""
        Splitting the labels into training, validation, and testing sets. Stratified split is used.

        The label vector holds ``class_size`` zeros followed by ``10 * class_size`` ones.
        20% is held out for testing, then 25% of the remainder for validation
        (i.e. a 60/20/20 split), both with ``random_state=42``.

        Returns:
            tuple: (train_y, val_y, test_y) label arrays
        """
        # Built directly in numpy rather than from a multi-million-element Python list.
        y = np.repeat([0, 1], [self.class_size, 10 * self.class_size])
        trainval_y, test_y = train_test_split(y, test_size=0.2, random_state=42, stratify=y)
        train_y, val_y = train_test_split(trainval_y, test_size=0.25, random_state=42, stratify=trainval_y)
        return train_y, val_y, test_y
    
    def get_masks(self):
        r"""
        Making the training, validation, and testing masks for use during model training and evaluation

        Each mask covers every node (labelled nodes plus tag nodes) and marks one
        contiguous range: training nodes first, then validation, then test.
        Tag nodes are never selected by any mask.

        Returns:
            tuple: (train_mask, val_mask, test_mask) boolean tensors
        """
        train_y, val_y, test_y = self.get_tvs_split()
        total_nodes = self.class_size * 11 + self.total_tags
        masks = []
        start = 0
        for size in (len(train_y), len(val_y), len(test_y)):
            mask = torch.zeros(total_nodes, dtype=torch.bool)
            mask[start:start + size] = True
            masks.append(mask)
            start += size
        return tuple(masks)
    
    def get_features(self, filename, top_n=None):
        r"""
        Reading '.h5' files
        
        Args:
            filename (string): the '.h5' filename to be read
            top_n (integer): the number of rows to be loaded from the top, by default all rows are loaded

        Returns:
            the contents of the ``data`` node at the root of the file
        """
        # The context manager closes the file even if the read raises.
        with tables.open_file(filename, mode='r') as f:
            # A stop of None selects every row, so one slice covers both cases.
            return f.root.data[:top_n]
    
    def process(self):
        r"""
        Making the graph data object and saving it in the 'save_file' path specified while initialization
        """
        # Stack in the same order the masks and labels assume: train, val, test, tags.
        node_features = np.vstack([self.get_features(p) for p in self.raw_paths[:4]])
        print("Node features loaded")
        
        # Transposed before use as edge_index.
        # NOTE(review): assumes the file stores one edge per row -- confirm.
        edges = np.transpose(self.get_features(self.raw_paths[4]))
        print("Edges loaded")
        
        # Tag nodes carry no class label; -1 marks them as unlabelled.
        tag_labels = np.full(self.total_tags, -1)
        train_y, val_y, test_y = self.get_tvs_split()
        labels = np.hstack((train_y, val_y, test_y, tag_labels))
        
        train_mask, val_mask, test_mask = self.get_masks() 
        
        data = Data(x = torch.tensor(node_features),
                    edge_index = torch.LongTensor(edges),
                    y = torch.tensor(labels))
        data.train_mask = train_mask
        data.val_mask = val_mask
        data.test_mask = test_mask
        
        print("Saving graph data object")
        torch.save(data, os.path.join(self.processed_dir, self.save_file))
    
    def len(self):
        r"""
        Returning the number of graphs that are made (here we make a single graph so we return 1)
        """
        return 1

    def get(self, idx):
        r"""
        Loading the graph data object that was stored in 'save_file' path specified while initialization

        Args:
            idx (integer): ignored, there is only one graph
        """
        # NOTE(review): torch.load unpickles the file; only load files this class produced.
        # Newer PyTorch releases may default to weights-only loading, which can reject
        # pickled graph objects -- confirm against the installed version.
        data = torch.load(os.path.join(self.processed_dir, self.save_file))
        return data

In [None]:
"""Initializing the path containing the 'raw/' folder, and where experiment results would be saved"""

# Root directory: must contain the 'raw/' folder; models and logs are written under it.
# Keep the trailing slash -- a later cell builds the log file name as f"{path}logs.txt".
path = "./"

In [None]:
"""Making the Stackoverflow graph dataset object"""

# File name under which the graph object is stored in the 'processed/' folder.
graph_file = "graph_data.pt"
dataset = SOGraphDataset(root=path, save_file=graph_file)

In [None]:
"""Loading the save graph data object and printing some graph details"""

# The dataset holds a single graph; fetch it and show its repr.
data = dataset[0]
print(data)

# Collect the summary lines first, then emit them in one call (same text as before).
summary_lines = [
    f"Number of graphs: {len(dataset)}",
    f"Number of features: {dataset.num_node_features}",
    f"Number of nodes: {data.num_nodes}",
    f"Number of edges: {data.num_edges}",
    f"Is undirected: {data.is_undirected()}",
]
print("\n".join(summary_lines))

In [None]:
class SOGCN(torch.nn.Module):
    r"""
    Two GCN convolution layers followed by a linear classification layer.
    """

    def __init__(self, hidden_channels1, hidden_channels2, num_classes, in_channels=None):
        r"""
        Initializing the graph neural network
        
        Args:
            hidden_channels1 (integer): the number of output channels after the first convolution layer
            hidden_channels2 (integer): the number of output channels after the second convolution layer
            num_classes (integer): the output dimension of the classification layer
            in_channels (integer, optional): the number of input node features; when omitted it is
                read from the notebook-level ``dataset`` object, as before
        """
        super(SOGCN, self).__init__()
        # Fixed seed so that layer initialization is repeatable across runs.
        torch.manual_seed(42)
        if in_channels is None:
            in_channels = dataset.num_features
        self.conv1 = GCNConv(in_channels, hidden_channels1, improved=True)
        self.conv2 = GCNConv(hidden_channels1, hidden_channels2, improved=True)
        self.out = Linear(hidden_channels2, num_classes, bias=True)

    def forward(self, x, edge_index):
        r""" 
        Forward function to run on the input graph node features and edges
        
        Args:
            x (torch.Tensor): the node features of the graph data object
            edge_index (torch.Tensor): the edge index of the graph data object

        Returns:
            torch.Tensor: one row of unnormalized class scores (logits) per node
        """
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        x = self.conv2(x, edge_index)
        x = x.relu()
        x = F.dropout(x, p=0.5, training=self.training)

        # Return raw logits: CrossEntropyLoss applies log-softmax itself, so a softmax
        # here would normalize twice and flatten the gradients. argmax is unaffected;
        # call F.softmax(logits, dim=1) on the result when probabilities are needed.
        return self.out(x)

In [None]:
"""Initializing model hyperparameters"""

# Output widths of the first and second convolution layers.
hc1, hc2 = 64, 64
# Optimizer step size and weight decay (0 disables weight decay).
learning_rate = 0.01
decay = 0

In [None]:
"""Initializing the graph neural network object and displaying the number of model parameters"""

model = SOGCN(hidden_channels1 = hc1, hidden_channels2 = hc2, num_classes = dataset.num_classes)
# Count trainable parameters only.
params = sum(p.numel() for p in model.parameters() if p.requires_grad)

# The model inputs are placed on the GPU in a later cell, so the model has to live
# there too; fall back to the CPU when no CUDA device is present.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.float().to(device)
print(model)
print(f"Number of model parameters: {params}")

In [None]:
"""Initializing optimizer and loss function"""

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=decay)

# Per-class loss weights, inversely proportional to class frequency in the training
# labels: n_samples / (2 * count_per_class). The factor 2 presumably is the number
# of classes -- confirm if the label set ever changes.
train_y, _, _ = dataset.get_tvs_split()
class_counts = np.bincount(train_y)
weights = torch.tensor(len(train_y) / (2 * class_counts)).float()
criterion = torch.nn.CrossEntropyLoss(weight=weights)

In [None]:
"""Initializing the input to the model"""

# Put the inputs on the same device as the model's parameters instead of calling
# .cuda() unconditionally: that fails on CPU-only machines and, when the model sits
# on the CPU, causes a device mismatch in the forward pass.
device = next(model.parameters()).device
data_x = data.x.float().to(device)
data_edge = data.edge_index.to(device)

In [None]:
"""Initializing some more variables needed while training"""

train_losses = []
# Seeded with +inf so that the first validation loss always counts as the best so far.
val_losses = [np.inf]
models_path = os.path.join(path, "models")
# exist_ok: re-running this cell must not fail because the folder is already there.
os.makedirs(models_path, exist_ok=True)
# Joined rather than concatenated so the path is right with or without a trailing
# separator on `path`. The handle stays open across cells and is closed by the test cell.
logs = open(os.path.join(path, "logs.txt"), "w")
epochs_done = 0
epochs = 500
# Validate every `eval_step` epochs; write a checkpoint every `model_step` epochs.
eval_step = 10
model_step = 100
best_model = None

In [None]:
"""Training and evaluating the model"""

for epoch in tqdm(range(epochs_done, epochs_done+epochs)):
    # ---- one full-batch training step ----
    model.train()
    optimizer.zero_grad()
    out = model(x=data_x, edge_index=data_edge).cpu().float()
    loss = criterion(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()

    # Keep a plain float: storing the tensor would retain its autograd graph for
    # every epoch and would write a tensor repr into the log.
    train_loss = loss.item()
    logs.write(f"Epoch: {epoch}, Train Loss: {train_loss}\n")
    train_losses.append(train_loss)
    del out, loss
    
    # ---- periodic validation ----
    if epoch % eval_step == eval_step-1:
        model.eval()
        with torch.no_grad():
            # Fresh forward pass in eval mode: the training output above was produced
            # with dropout active and before the optimizer step.
            val_out = model(x=data_x, edge_index=data_edge).cpu().float()
            pred = val_out.argmax(dim=1)
            val_loss = criterion(val_out[data.val_mask], data.y[data.val_mask]).item()
        logs.write("Validation\n")
        logs.write(f"Val Loss: {val_loss}\n")
        logs.write(f"{classification_report(data.y[data.val_mask], pred[data.val_mask], digits=7)}\n\n")
        del val_out
        
        # Keep a copy and a checkpoint whenever validation loss matches or beats the best seen.
        if val_loss <= min(val_losses):
            best_model = copy.deepcopy(model)
            torch.save(model.state_dict(), f"{models_path}/model_{epoch+1}.pt")
        val_losses.append(val_loss)
    
    # ---- periodic checkpoint, independent of validation ----
    if epoch % model_step == model_step-1:
        torch.save(model.state_dict(), f"{models_path}/model_{epoch+1}.pt")

    gc.collect()

In [None]:
"""Testing the trained model"""

try:
    logs.write("\nTesting\n")
    # NOTE(review): this evaluates the final-epoch `model`; the training loop also keeps
    # `best_model` (lowest validation loss) -- confirm which one should be reported.
    model.eval()
    with torch.no_grad():
        out = model(data_x, data_edge).cpu().float()
        test_pred = out.argmax(dim=1)[data.test_mask]
        test_loss = criterion(out[data.test_mask], data.y[data.test_mask])
    logs.write(f"Test Loss: {test_loss.item()}\n")
    # Report on the test predictions computed above, not on the `pred` left over from
    # the last validation step of the training loop.
    logs.write(f"{classification_report(data.y[data.test_mask], test_pred, digits=7)}\n")
    test_y = dataset.get_tvs_split()[2]
    logs.write(f"cohen kappa: {cohen_kappa_score(test_y, test_pred)}\n")
finally:
    # Close the log even if evaluation fails, so buffered text reaches disk.
    logs.close()