In [1]:
# import torch
# torch.__version__
# torch.version.cuda

In [2]:
# ! pip install torch==1.7.0+cu110 torchvision==0.8.1+cu110 torchaudio===0.7.0 -f https://download.pytorch.org/whl/torch_stable.html
# ! pip install --upgrade --force-reinstall torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu110.html
# ! pip install --upgrade --force-reinstall torch-sparse -f https://pytorch-geometric.com/whl/torch-1.7.0+cu110.html
# ! pip install --upgrade --force-reinstall torch-cluster -f https://pytorch-geometric.com/whl/torch-1.7.0+cu110.html
# ! pip install --upgrade --force-reinstall torch-spline-conv -f https://pytorch-geometric.com/whl/torch-1.7.0+cu110.html
# ! pip install torch-geometric
# ! pip install numpy==1.18.0

In [3]:
import os
import time
import pickle
import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
from torch.nn import Sequential as Seq, Linear, ReLU

from tqdm.notebook import tqdm
from torch_geometric.data import Data

from sklearn.metrics import classification_report, f1_score

In [4]:
from torch_geometric.nn import MessagePassing
from torch_geometric.nn import GraphConv, TopKPooling, GatedGraphConv
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp

from torch_geometric.data import DataLoader
from torch_geometric.data import InMemoryDataset

from torch_geometric.utils import remove_self_loops, add_self_loops

In [5]:
os.environ["CUDA_VISIBLE_DEVICES"]="2,3"

# Dataset

In [9]:
conll_data = pickle.load(open('conll_graph_all.pickle', 'rb'))
vocabulary = pickle.load(open('vocabulary_all.pickle', 'rb'))
voc2id = {key:{l: i for i, l in enumerate(vocabulary[key])} for key in vocabulary}
labels = pickle.load(open('labels.pickle', 'rb'))

In [8]:
"""class CoNLLDataset(Dataset):
    def __init__(self, data, split, voc2id, labels, window_size=3, root='.', transform=None, pre_transform=None):
        self.dataset = data[split][10:]
        self.voc2id = voc2id
        self.labels = labels
        self.label2id = {l: float(i) for i, l in enumerate(labels)}
        
        self.window_size = window_size

        super(CoNLLDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

        
    @property
    def raw_file_names(self):
        return []
    
    @property
    def processed_file_names(self):
        return ['/data/graphner_embeddings/preprocessed_graph_dataset/test1.dataset']

    def download(self):
        pass
    
    def process(self):
        data_list = []        
        print(self.labels)
        for doc_i, doc in enumerate(tqdm(self.dataset)):
            features = ['word', 'pos', 'chunk', 'classes', 'extra'] # to keep the same order
            one_hot  = {key:np.zeros(len(self.voc2id[key])) for key in features}
            for key in one_hot:
                for v in doc[key]:
                    one_hot[key][voc2id[key][v]] = 1.

            node = np.concatenate([one_hot[key] for key in features])

            nodes = [node]
            edges = []

            window_size = max(len(doc['left_context']), len(doc['right_context'])) if self.window_size == 'all' \
                        else self.window_size

            for window  in [doc['left_context'][-window_size:], doc['right_context'][:window_size]]:
                for i, w in enumerate(window):
                    neighbor = np.zeros(node.shape)
                    neighbor[self.voc2id['word'][w]] = 1.
                    edges.append((len(nodes), 0 if i == 0 else (len(nodes) - 1)))
                    nodes.append(neighbor)

            x = torch.FloatTensor(nodes)
            y = torch.FloatTensor(self.label2id[doc['label']])
            edge_index = torch.tensor(list(zip(*edges)))
            
            data = Data(x=x, y=y, edge_index=edge_index)
            data_list.append(data)
        
            if doc_i == 99:
                break"""
print('')




In [11]:
class CoNLLDataset(InMemoryDataset):
    def __init__(self, data, split, voc2id, labels, window_size=3, root='.', transform=None, pre_transform=None):
        self.dataset = data[split][10:]
        self.voc2id = voc2id
        self.labels = labels
        self.label2id = {l: i for i, l in enumerate(labels)}
        
        self.window_size = window_size

        super(CoNLLDataset, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_paths[0])

        
    @property
    def raw_file_names(self):
        return []
    
    @property
    def processed_file_names(self):
        return ['/data/graphner_embeddings/preprocessed_graph_dataset/test1.dataset']

    def download(self):
        pass
    
    def process(self):
        data_list = []        
    
        for doc_i, doc in enumerate(tqdm(self.dataset)):
            features = ['word', 'pos', 'chunk', 'classes', 'extra'] # to keep the same order
            one_hot  = {key:np.zeros(len(self.voc2id[key])) for key in features}
            for key in one_hot:
                for v in doc[key]:
                    one_hot[key][voc2id[key][v]] = 1.

            node = np.concatenate([one_hot[key] for key in features])

            nodes = [node]
            edges = []

            window_size = max(len(doc['left_context']), len(doc['right_context'])) if self.window_size == 'all' \
                        else self.window_size

            for window  in [doc['left_context'][-window_size:], doc['right_context'][:window_size]]:
                for i, w in enumerate(window):
                    neighbor = np.zeros(node.shape)
                    neighbor[self.voc2id['word'][w]] = 1.
                    edges.append((len(nodes), 0 if i == 0 else (len(nodes) - 1)))
                    nodes.append(neighbor)

            x = torch.FloatTensor(nodes)
            y = torch.FloatTensor(self.label2id[doc['label']])
            
            if doc_i == 0:
                print(y)
            edge_index = torch.tensor(list(zip(*edges)))
            
            data = Data(x=x, y=y, edge_index=edge_index)
            data_list.append(data)
        
            if doc_i == 99:
                break
        
        data, slices = self.collate(data_list)
        torch.save((data, slices), self.processed_paths[0])

In [12]:
train_dataset = CoNLLDataset(data=conll_data, split='train', voc2id=voc2id, labels=labels, window_size=3)
val_dataset = CoNLLDataset(data=conll_data, split='validation', voc2id=voc2id, labels=labels, window_size=3)
test_dataset = CoNLLDataset(data=conll_data, split='test', voc2id=voc2id, labels=labels, window_size=3)

In [13]:
train_loader = DataLoader(train_dataset, batch_size=10)
val_loader = DataLoader(val_dataset, batch_size=10)
test_loader = DataLoader(test_dataset, batch_size=10)

In [14]:
for x in train_loader:
    print(x)
    break

Batch(batch=[48], edge_index=[2, 38], x=[48, 19046], y=[23])


In [15]:
x.y

tensor([ 8.3112e+20,  3.3064e-09,  1.6917e-04,  3.9661e+30,  4.5703e-41,
         7.7140e+31,  6.1952e-04,  1.1434e+27,  2.8666e+32,  4.5437e+30,
         3.9666e+30,  4.5703e-41,         nan,  5.1026e+36,  2.3180e-10,
         7.3475e-40,         nan,         nan,         nan, -1.9563e-09,
         1.6338e-09,  2.2582e+30,  4.5703e-41])

# Model

In [71]:
class SAGEConv(MessagePassing):
    def __init__(self, in_channels, out_channels):
        super(SAGEConv, self).__init__(aggr='max') #  "Max" aggregation.
        self.lin = torch.nn.Linear(in_channels, out_channels)
        self.act = torch.nn.ReLU()
        self.update_lin = torch.nn.Linear(in_channels + out_channels, in_channels, bias=False)
        self.update_act = torch.nn.ReLU()
        
    def forward(self, x, edge_index):
        # x has shape [N, in_channels]
        # edge_index has shape [2, E]
        
        
        edge_index, _ = remove_self_loops(edge_index)
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        
        
        return self.propagate(edge_index, size=(x.size(0), x.size(0)), x=x)

    def message(self, x_j):
        # x_j has shape [E, in_channels]

        x_j = self.lin(x_j)
        x_j = self.act(x_j)
        
        return x_j

    def update(self, aggr_out, x):
        # aggr_out has shape [N, out_channels]

        new_embedding = torch.cat([aggr_out, x], dim=1)
        
        new_embedding = self.update_lin(new_embedding)
        new_embedding = self.update_act(new_embedding)
        
        return new_embedding

In [72]:
embed_dim = 128
dim_input = 19046

class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()

        self.conv1 = SAGEConv(embed_dim, 128)
        self.pool1 = TopKPooling(128, ratio=0.8)
        self.conv2 = SAGEConv(128, 128)
        self.pool2 = TopKPooling(128, ratio=0.8)
        self.conv3 = SAGEConv(128, 128)
        self.pool3 = TopKPooling(128, ratio=0.8)
        # self.item_embedding = torch.nn.Embedding(num_embeddings=dim_input, embedding_dim=embed_dim)
        self.lin0 = torch.nn.Linear(dim_input, embed_dim)
        self.lin1 = torch.nn.Linear(256, 128)
        self.lin2 = torch.nn.Linear(128, 64)
        self.lin3 = torch.nn.Linear(64, 5)
        self.bn1 = torch.nn.BatchNorm1d(128)
        self.bn2 = torch.nn.BatchNorm1d(64)
        self.act1 = torch.nn.ReLU()
        self.act2 = torch.nn.ReLU()        
  
    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = self.lin0(x.float())
        # x = x.squeeze(1)        

        x = F.relu(self.conv1(x, edge_index))
        
        z = self.pool1(x, edge_index, None, batch)
        x, edge_index, _, batch, _, _ = z
        x1 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv2(x, edge_index))
     
        x, edge_index, _, batch, _, _ = self.pool2(x, edge_index, None, batch)
        x2 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = F.relu(self.conv3(x, edge_index))

        x, edge_index, _, batch, _, _ = self.pool3(x, edge_index, None, batch)
        x3 = torch.cat([gmp(x, batch), gap(x, batch)], dim=1)

        x = x1 + x2 + x3

        x = self.lin1(x)
        x = self.act1(x)
        x = self.lin2(x)
        x = self.act2(x)      
        x = F.dropout(x, p=0.5, training=self.training)

        x = torch.sigmoid(self.lin3(x)).squeeze(1)
        print(x)
        
        return x

In [73]:
device = torch.device('cuda')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)
loss_fn = torch.nn.BCELoss()

In [74]:
torch.cuda.device_count()

1

In [75]:
def train():
    model.train()

    loss_all = 0
    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data)
        label = data.y.to(device)
        loss = loss_fn(output, label)
        loss.backward()
        loss_all += data.num_graphs * loss.item()
        optimizer.step()
    return loss_all / len(train_dataset)

In [76]:
def evaluate(loader, split=''):
    model.eval()

    predictions = []
    labels = []

    with torch.no_grad():
        for data in loader:

            data = data.to(device)
            pred = model(data).detach().cpu().numpy()

            label = data.y.detach().cpu().numpy()
            predictions.append(pred)
            labels.append(label)

    predictions = np.hstack(predictions)
    labels = np.hstack(labels)
    
    print('==== Evaluation on the', split.upper(), ' set ====')
    print(classification_report(predictions, labels, digits=4))
    
    micro_F1 = metrics.f1_score(all_gt, all_out, average='micro')
    macro_F1 = metrics.f1_score(all_gt, all_out, average='macro')
    weighted_F1 = metrics.f1_score(all_gt, all_out, average='weighted')

        
    return (micro_F1, macro_F1, weighted_F1)

In [77]:
for epoch in range(1):
    loss = train()
    train_acc = evaluate(train_loader, 'train')
    val_acc = evaluate(val_loader, 'val')    
    test_acc = evaluate(test_loader, 'test')
    print(f'Epoch: {epoch:03d}, Loss: {loss:.5f}')

tensor([[0.4743, 0.4915, 0.5227, 0.5125, 0.4760]], device='cuda:0',
       grad_fn=<SqueezeBackward1>)


ValueError: Using a target size (torch.Size([3])) that is different to the input size (torch.Size([1, 5])) is deprecated. Please ensure they have the same size.