In [41]:
import pandas as pd
from torch.utils.data import Dataset
import torch
import os
import random
import numpy as np
from torch import nn
from typing import Dict, Optional, Tuple, List
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report

import dgl
from dgl.data import MiniGCDataset

In [42]:
import time
import math
import matplotlib
matplotlib.rcParams.update({'figure.figsize': (16, 12), 'font.size': 14})
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

In [43]:
# One seed shared by every random number generator touched in this notebook.
SEED = 42

# Python-level sources of randomness (hash seed, `random`, NumPy).
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# PyTorch generators: CPU first, then the CUDA ones.
torch.manual_seed(SEED)
torch.random.manual_seed(SEED)
torch.cuda.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)

# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [44]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [45]:
# Locations of the TSV splits, relative to the notebook's working directory.
# Only `test_path` is read in the visible part of this notebook.
train_dev_path = "../data/tsv/train.tsv"
test_path = "../data/tsv/test.tsv"

# Tab-separated test split; one row per (question, candidate answer) sample.
test_df = pd.read_csv(test_path, sep='\t')

In [46]:
class BertSimpleClassifier(nn.Module):
    """Binary text classifier: a transformer encoder followed by an MLP head.

    The hidden state of the first token of every sequence is fed through a
    two-layer feed-forward head that emits one raw score per sequence. The
    head has no final sigmoid, so the output is an unnormalised logit (the
    notebook pairs it with ``nn.BCEWithLogitsLoss``).

    Args:
        bert_text_encoder: encoder module. It must expose
            ``config.hidden_size`` and, when called with ``return_dict=True``,
            return a mapping that contains ``'last_hidden_state'``.
        dropout: dropout probability used inside the classification head.
    """

    def __init__(self, bert_text_encoder, dropout=0.1):
        super().__init__()

        self.bert_text_encoder = bert_text_encoder
        # Not used by forward(); kept so the module's attributes are unchanged.
        self.dropout = nn.Dropout(p=dropout)
        bert_hidden_dim = bert_text_encoder.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(bert_hidden_dim, bert_hidden_dim),
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(bert_hidden_dim, 1),
        )

    def forward(self, inputs, attention_mask,):
        """Score a batch of tokenised sequences.

        Args:
            inputs: token ids, forwarded to the encoder as its first argument.
            attention_mask: attention mask forwarded to the encoder.

        Returns:
            Tensor of shape ``(batch, 1)`` holding one logit per sequence.
        """
        last_hidden_states = self.bert_text_encoder(inputs, attention_mask=attention_mask,
                                                    return_dict=True)['last_hidden_state']
        # Take the first-token vector of every sequence with a single slice
        # instead of a Python loop + torch.stack; unlike torch.stack on an
        # empty list, this also works when the batch is empty.
        text_cls_embeddings = last_hidden_states[:, 0, :]
        return self.classifier(text_cls_embeddings)

In [47]:
# Checkpoint used for both the tokenizer and the encoder.
# model_name="roberta-base"
model_name="sentence-transformers/all-mpnet-base-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

# Freezing configuration: the embedding block and the first
# `freeze_layer_count` encoder layers are excluded from gradient updates.
freeze_embeddings = True
freeze_layer_count = 5

if freeze_embeddings:
    for param in bert_model.embeddings.parameters():
        param.requires_grad_(False)

if freeze_layer_count > 0:
    for layer in bert_model.encoder.layer[:freeze_layer_count]:
        for param in layer.parameters():
            param.requires_grad_(False)

# Report how many encoder parameters remain trainable after freezing.
trainable_param_count = sum(p.numel() for p in bert_model.parameters() if p.requires_grad)
print("# Trainable params: ", trainable_param_count)



# Trainable params:  50206080


In [48]:
# Hyper-parameters of the text-only baseline.
N_EPOCHS = 5
DROPOUT = 0.2

# Classifier wrapped around the (partly frozen) encoder, placed on `device`.
bert_simple_clf = BertSimpleClassifier(bert_model, dropout=DROPOUT)
bert_simple_clf = bert_simple_clf.to(device)

# Optimiser and loss of the baseline; the loss expects raw logits.
optimizer = optim.Adam(params=bert_simple_clf.parameters(), lr=3e-5)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [49]:
# Rebuild the classifier and restore the fine-tuned weights.
# NOTE: this wrapper shares the `bert_model` instance with `bert_simple_clf`,
# so loading the state dict also overwrites the encoder weights seen by that
# other wrapper.
bert_simple_clf_trained = BertSimpleClassifier(bert_model, dropout=DROPOUT).to(device)
# map_location remaps the stored tensors onto the current device, so a
# checkpoint saved from a GPU run still loads on a CPU-only machine.
bert_simple_clf_trained.load_state_dict(
    torch.load('./best-val-text_only_baseline.pt', map_location=device)
)

<All keys matched successfully>

In [50]:
# Preview the first rows of the raw test split.
test_df.head()

Unnamed: 0,sample_id,question,questionEntity,answerEntity,questionEntityId,answerEntityId,graph
0,0,"After publishing A Time to Kill, which book di...",A Time to Kill,A Clash of Kings,Q1213715,Q300370,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."
1,1,"After publishing A Time to Kill, which book di...",A Time to Kill,A Feast for Crows,Q1213715,Q1764445,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."
2,2,"After publishing A Time to Kill, which book di...",A Time to Kill,Fear and Loathing in Las Vegas,Q1213715,Q772435,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."
3,3,"After publishing A Time to Kill, which book di...",A Time to Kill,In Cold Blood,Q1213715,Q1142887,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."
4,4,"After publishing A Time to Kill, which book di...",A Time to Kill,Into the Woods,Q1213715,Q1118244,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."


In [51]:
# test_df["label"] = np.zeros(shape=test_df.shape[0], dtype=np.float32)
# Turn the stringified `graph` column into Python objects.
# NOTE(review): eval() executes whatever expression the TSV contains, so this
# is only safe for trusted files; ast.literal_eval may be a drop-in replacement
# if the strings are plain literals - confirm against the data.
# NOTE: not idempotent - re-running the cell passes the already-parsed values
# back into eval().
test_df["graph"] = test_df["graph"].apply(eval)
# Copy of the frame that the following cells extend with extra columns.
test_df_filtered_emb = test_df.copy()

In [52]:
# Preview the working copy after the graph column has been parsed.
test_df_filtered_emb.head()

Unnamed: 0,sample_id,question,questionEntity,answerEntity,questionEntityId,answerEntityId,graph
0,0,"After publishing A Time to Kill, which book di...",A Time to Kill,A Clash of Kings,Q1213715,Q300370,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."
1,1,"After publishing A Time to Kill, which book di...",A Time to Kill,A Feast for Crows,Q1213715,Q1764445,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."
2,2,"After publishing A Time to Kill, which book di...",A Time to Kill,Fear and Loathing in Las Vegas,Q1213715,Q772435,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."
3,3,"After publishing A Time to Kill, which book di...",A Time to Kill,In Cold Blood,Q1213715,Q1142887,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."
4,4,"After publishing A Time to Kill, which book di...",A Time to Kill,Into the Woods,Q1213715,Q1118244,"{'nodes': [{'type': 'INTERNAL', 'name_': 'Q30'..."


In [53]:
# Number of test samples (one graph per row).
len(test_df_filtered_emb)

10961

In [54]:
# Replace missing (None) node/edge labels with the '[MASK]' placeholder.
def none2mask(row):
    """Fill in missing labels of a graph dict, in place.

    Args:
        row: graph dict with a ``'nodes'`` list and a ``'links'`` list; every
            item of both lists is a dict with a ``'label'`` key.

    Returns:
        The same ``row`` object, with every ``None`` label set to ``'[MASK]'``.
    """
    node_items, link_items = row['nodes'], row['links']
    for items in (node_items, link_items):
        for item in items:
            if item['label'] is None:
                item['label'] = '[MASK]'
    return row

# Apply the placeholder substitution to every graph; none2mask edits the dicts
# in place and returns them, so the column keeps the same objects.
test_df_filtered_emb['graph'] = test_df_filtered_emb['graph'].apply(none2mask)

# Collect the labels of a graph's nodes or links ('[EMPTY]' stands in for none).
def graph_labels(row, edges=False):
    """Return the labels of the nodes (default) or links of a graph dict.

    Args:
        row: graph dict with ``'nodes'`` and ``'links'`` lists of dicts that
            each carry a ``'label'`` key.
        edges: when true, read ``row['links']`` instead of ``row['nodes']``.

    Returns:
        List of labels in their original order, or ``['[EMPTY]']`` when the
        selected list has no items.
    """
    key = 'links' if edges else 'nodes'
    labels = [item['label'] for item in row[key]]
    return labels if labels else ['[EMPTY]']


# One list of label strings per graph: first for the nodes, then for the links.
test_df_filtered_emb['node_labels'] = test_df_filtered_emb['graph'].apply(graph_labels)
test_df_filtered_emb['edge_labels'] = test_df_filtered_emb['graph'].apply(graph_labels, edges=True)     

In [55]:
# helper: embed a batch of strings with the text encoder of a classifier
def create_embs(model, tokenizer, data):
    """Embed texts by averaging the encoder's last hidden states.

    Args:
        model: object exposing ``eval()`` and a ``bert_text_encoder`` attribute.
        tokenizer: tokenizer called on ``data``; its result must provide
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        data: texts to embed, passed to the tokenizer unchanged.

    Returns:
        Tensor with one pooled vector per input text.

    Note:
        Inputs are padded/truncated to 128 tokens and the mean is taken over
        dim 1 without applying the attention mask, so padding positions
        contribute to the pooled vector.
    """
    encoded = tokenizer(data, max_length=128, padding="max_length", truncation="only_first", return_tensors="pt")

    # Inference only: switch off dropout and gradient tracking.
    model.eval()
    with torch.no_grad():
        encoder_out = model.bert_text_encoder(
            input_ids=encoded["input_ids"].to(device),
            attention_mask=encoded["attention_mask"].to(device),
        )
        pooled = encoder_out.last_hidden_state.mean(dim=1)
    return pooled


# Serialise label embeddings as comma-separated strings, the format stored in
# the 'feat' column of the CSV files read by dgl.data.CSVDataset.
def label_embs(row):
    """Embed a list of labels and render each vector as ``"v0,v1,..."``.

    Args:
        row: list of label strings belonging to one graph.

    Returns:
        List with one comma-joined string per label, in input order.
    """
    vectors = create_embs(bert_simple_clf_trained, tokenizer, row).cpu().detach().numpy()
    return [",".join(str(value) for value in vector) for vector in vectors]

# Embed every graph's node labels and edge labels (one encoder call per row
# and per column).
test_df_filtered_emb['node_embs'] = test_df_filtered_emb['node_labels'].apply(label_embs)
test_df_filtered_emb['edge_embs'] = test_df_filtered_emb['edge_labels'].apply(label_embs)


In [56]:
# Sanity check: the row count is unchanged after adding the embedding columns.
len(test_df_filtered_emb['sample_id'])

10961

In [57]:
# test_df_filtered_emb.iloc[-1,-3]
# Convert one graph into CSV-ready records (node rows or edge rows).


def graph2node_csv(graph, id_graph, node_embs, edge_embs, edges=False):
    """Build the node or edge records of a single graph.

    Args:
        graph: graph dict with ``'nodes'`` (items carrying ``'id'``) and
            ``'links'`` (items carrying ``'source'`` and ``'target'``).
        id_graph: value written to the ``'graph_id'`` field of every record.
        node_embs: per-node features, indexed in the order of ``graph['nodes']``.
        edge_embs: per-edge features, indexed in the order of ``graph['links']``.
        edges: when true return edge records, otherwise node records.

    Returns:
        List of dicts. Node records have keys ``graph_id, node_id, feat``;
        edge records have keys ``graph_id, src_id, dst_id, feat``. A graph
        without links yields a single placeholder edge record whose endpoints
        are ``None`` and whose ``feat`` is ``' '``.
    """
    if not edges:
        return [
            {'graph_id': id_graph, 'node_id': node['id'], 'feat': node_embs[idx]}
            for idx, node in enumerate(graph['nodes'])
        ]

    links = graph['links']
    records = []
    # Graphs with a single node have no links; emit a placeholder row so the
    # graph still shows up in the edge table when the dgl dataset is formed.
    if len(links) == 0:
        records.append({'graph_id': id_graph, 'src_id': None, 'dst_id': None, 'feat': ' '})
    records.extend(
        {
            'graph_id': id_graph,
            'src_id': link['source'],
            'dst_id': link['target'],
            'feat': edge_embs[idx],
        }
        for idx, link in enumerate(links)
    )
    return records


# Per-row lists of node records and edge records; `sample_id` becomes the
# graph_id of every record.
test_df_filtered_emb['node_csv'] = test_df_filtered_emb.apply(lambda x: graph2node_csv(x['graph'], x['sample_id'], x['node_embs'], x['edge_embs'], edges=False), axis=1)
test_df_filtered_emb['edge_csv'] = test_df_filtered_emb.apply(lambda x: graph2node_csv(x['graph'], x['sample_id'], x['node_embs'], x['edge_embs'], edges=True), axis=1)

In [58]:
# Flatten the per-graph node records into one table with a row per node.
lst_of_nodes = test_df_filtered_emb['node_csv'].tolist()
flattened_list_nodes = [
    record
    for graph_records in lst_of_nodes
    for record in graph_records
]
node_df = pd.DataFrame.from_records(flattened_list_nodes)

# Same for the edge records: one row per edge (or placeholder edge).
lst_of_edges = test_df_filtered_emb['edge_csv'].tolist()
flattened_list_edges = [
    record
    for graph_records in lst_of_edges
    for record in graph_records
]
edge_df = pd.DataFrame.from_records(flattened_list_edges)

In [60]:
# Single-node graphs were meant to be dropped here; the dropna below is
# disabled, so their placeholder edge rows stay in edge_df.
# edge_df.dropna(how='any', axis=0, inplace=True)
# Keep only nodes whose graph_id also occurs in edge_df.
node_df = node_df[node_df['graph_id'].isin(edge_df.graph_id.tolist())]
# Graph table: a single `graph_id` column taken from `sample_id`.
graph_df = test_df_filtered_emb.rename(columns={'sample_id':'graph_id'}).loc[:, ['graph_id']]
# graph_df = graph_df[graph_df['graph_id'].isin(edge_df.graph_id.tolist())]

In [61]:
# Write the edge and node tables of the test dataset. na_rep=' ' renders the
# missing src_id/dst_id of placeholder edges as a single space.
edge_df.to_csv('../data/dataset_test/edges.csv', index=False, na_rep=' ')
node_df.to_csv('../data/dataset_test/nodes.csv', index=False)

# train_df_filtered_emb["label"] = train_df_filtered_emb["correct"].astype(np.float32)
# train_df_filtered_emb.rename(columns={'sample_id':'graph_id'}).loc[:, ['graph_id', 'label']].to_csv('../data/dataset/graphs.csv', index=False)

# Graph table for the test split: graph ids only, no label column.
graph_df.to_csv('../data/dataset_test/graphs.csv', index=False)

In [62]:
# Preview the edge table that was written to edges.csv.
edge_df.head()

Unnamed: 0,graph_id,src_id,dst_id,feat
0,0,0.0,0.0,"0.02993089,0.005517466,-0.108303435,0.16291112..."
1,0,1.0,0.0,"0.00097008544,-0.020878252,-0.071897194,-0.001..."
2,0,2.0,0.0,"0.00097008544,-0.020878252,-0.071897194,-0.001..."
3,1,0.0,0.0,"0.02993089,0.005517466,-0.108303435,0.16291112..."
4,1,1.0,0.0,"0.00097008544,-0.020878252,-0.071897194,-0.001..."


In [63]:
# Load the graph dataset from the directory written above.
# NOTE(review): the recorded output of this cell says the data came from
# cached files, so the CSVs regenerated earlier were presumably not re-parsed
# on that run. Consider force_reload=True after confirming that the later
# cells still get (graph, labels) pairs from a dataset whose graphs.csv has no
# label column.
test_dataset = dgl.data.CSVDataset('../data/dataset_test/')

Done loading data from cached files.


In [64]:
import dgl
from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset
from dgl import AddSelfLoop
from dgl.dataloading import GraphDataLoader
from torch.utils.data import DataLoader
from dgl.nn import EdgeGATConv, GraphConv
import torch.nn as nn
import torch
from tqdm import tqdm
from sklearn.metrics import f1_score

In [65]:
# Batches of 32 graphs in dataset order (shuffle=False), so predictions can be
# matched back to samples by position.
test_data = GraphDataLoader(test_dataset, batch_size=32, shuffle=False)

In [66]:
# Same device selection as the earlier `device` cell, re-evaluated here.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [67]:
# First element of the first batch: the batched graph with its feature schemes.
next(iter(test_data))[0]

Graph(num_nodes=86, num_edges=88,
      ndata_schemes={'feat': Scheme(shape=(768,), dtype=torch.float32)}
      edata_schemes={'feat': Scheme(shape=(768,), dtype=torch.float32)})

In [68]:
# Split the first batched graph back into its member graphs; the count should
# equal the loader's batch_size.
graphs = dgl.unbatch(next(iter(test_data))[0])
len(graphs)

32

In [69]:
def collate(samples):
    """Merge ``(graph, label)`` samples into one batched graph and a label tensor.

    Args:
        samples: list of ``(graph, label)`` pairs.

    Returns:
        Tuple ``(batched_graph, labels)`` where ``labels`` is built with
        ``torch.tensor`` from the labels in sample order.
    """
    # Transpose the list of pairs into a list of graphs and a list of labels.
    graph_list, label_list = (list(column) for column in zip(*samples))
    return dgl.batch(graph_list), torch.tensor(label_list)

# NOTE(review): this wraps `test_data`, which is already a GraphDataLoader,
# rather than a dataset, and shuffles even though this is the test split.
# `dataloader` is not referenced by any other visible cell - likely a leftover
# from the training notebook; confirm before relying on it.
dataloader = DataLoader(test_data, batch_size=32, shuffle=True, collate_fn=collate)

In [70]:
# Scratch cell: push one batch through freshly constructed (untrained) layers
# to check the tensor shapes at every step of the planned architecture.
import torch.nn.functional as F
g = next(iter(test_data))[0]

# g = dgl.add_self_loop(g)
# print(dgl.unbatch(g))
print(g.ndata['feat'].shape)
# Positional arguments follow the Classifier cell's usage of EdgeGATConv:
# (in_feats, edge_feats, out_feats, num_heads).
layer1 = EdgeGATConv(768, 768, 15, 2, allow_zero_in_degree=True)
# The second layer consumes the flattened heads of the first one: 15 * 2 = 30.
layer2 = EdgeGATConv(15*2, 768, 15, 2, allow_zero_in_degree=True)
# layer1 = GraphConv(768, 15, allow_zero_in_degree=True)
# layer2 = GraphConv(15, 2, allow_zero_in_degree=True)
lin_layer = nn.Linear(15 * 2, 1)
edge_feat = g.edata['feat']
node_feat = g.ndata['feat']
print(edge_feat.shape)
out1 = layer1(g, node_feat, edge_feat, get_attention=False)
print(out1.shape)
# Flatten (heads, out_feats) into one axis of size 30 per node.
print(out1.view(g.ndata['feat'].shape[0], 30).shape)
out2 = F.relu(layer2(g, out1.view(g.ndata['feat'].shape[0], 30), edge_feat))
print(out2.shape)
out2 = out2.view(g.ndata['feat'].shape[0], 30)
print(out2.shape)
# with g.local_scope():
#     g.ndata['feat'] = out2
#     hg1 = dgl.readout_nodes(g, 'feat')
#     print(hg1.shape)
# NOTE: overwrites the 'feat' node data of this batched graph `g` (no
# local_scope is used here).
g.ndata['feat'] = out2
# Mean readout: one vector per graph in the batch.
hg = dgl.mean_nodes(g, 'feat')
print(hg.shape)
# he = dgl.mean_edges(g, 'feat')
# print(he.shape)
out3 = lin_layer(hg)
print(out3.shape)

# out1 = F.relu(layer1(g, g.ndata['feat']))
# print(out1.shape)
# out2 = F.relu(layer2(g, out1))
# print(out2.shape)
# g.ndata['h'] = out2
# # dgl.unbatch(g)
# dgl.mean_nodes(g, 'h').shape

torch.Size([86, 768])
torch.Size([88, 768])
torch.Size([86, 2, 15])
torch.Size([86, 30])
torch.Size([86, 2, 15])
torch.Size([86, 30])
torch.Size([32, 30])
torch.Size([32, 1])


In [71]:
import dgl.nn.pytorch as dglnn
import torch.nn as nn
import torch.nn.functional as F

class Classifier(nn.Module):
    """Graph-level binary classifier: EdgeGATConv, mean readout, linear head.

    Args:
        in_feats: size of the input node features.
        edge_feats: size of the edge features.
        out_feats: output size of the attention layer per head.
        num_heads: number of attention heads.
    """

    def __init__(self, in_feats, edge_feats, out_feats, num_heads):
        super().__init__()
        self.in_feats, self.edge_feats = in_feats, edge_feats
        self.out_feats, self.num_heads = out_feats, num_heads
        self.conv1 = dglnn.EdgeGATConv(
            in_feats, edge_feats, out_feats, num_heads, allow_zero_in_degree=True
        )
        # self.conv2 = dglnn.EdgeGATConv(out_feats * num_heads, edge_feats, out_feats, num_heads, allow_zero_in_degree=True)
        self.classify = nn.Linear(out_feats * num_heads, 1)

    def forward(self, g, node_feat, edge_feat):
        """Return one sigmoid-activated score per graph in the batch.

        Args:
            g: (batched) graph.
            node_feat: node feature tensor passed to the attention layer.
            edge_feat: edge feature tensor passed to the attention layer.

        Returns:
            Tensor with a single column of probabilities (sigmoid is applied
            here, so the values are not raw logits).
        """
        flat_dim = self.out_feats * self.num_heads
        h = F.relu(self.conv1(g, node_feat, edge_feat))
        # Merge the head axis and the per-head feature axis.
        h = h.view(h.shape[0], flat_dim)
        # h = F.relu(self.conv2(g, h, edge_feat))
        # h = h.view(h.shape[0], self.out_feats * self.num_heads)
        # local_scope keeps the temporary 'feat' assignment from leaking into g.
        with g.local_scope():
            g.ndata['feat'] = h
            # Graph representation = average of its node representations.
            hg = dgl.mean_nodes(g, 'feat')
            # he = dgl.mean_edges(g, 'feat')
            # hg = torch.cat([hg, he], dim=-1)
            return torch.sigmoid(self.classify(hg))
        

def f1_metric(logits, labels):
    """F1 score of predictions obtained by thresholding at 0.5.

    Args:
        logits: model outputs; they are compared with 0.5 directly, i.e. they
            are treated as probabilities despite the parameter name.
        labels: ground-truth labels as a tensor.

    Returns:
        The value returned by ``sklearn.metrics.f1_score``.
    """
    hard_preds = (logits > 0.5).float().cpu().numpy()
    return f1_score(labels.cpu().numpy(), hard_preds)

In [72]:
# Architecture of the saved GAT classifier; these must match the values used
# when the checkpoint was written, otherwise the state-dict keys/shapes differ.
in_feats_dim = 768
edge_feats_dim = 768
out_feats = 15
num_heads = 1
model = Classifier(in_feats_dim, edge_feats_dim, out_feats, num_heads)
# The model lives on the CPU at this point; map_location='cpu' lets a
# checkpoint saved from a GPU run load on a machine without CUDA.
model.load_state_dict(torch.load('GAT_model_10epochs_new.pth', map_location='cpu'))

<All keys matched successfully>

In [73]:
def predict(model, data_loader, device='cpu', threshold=0.5):
    """Run the model over a loader and return hard 0/1 predictions.

    Args:
        model: module called as ``model(graph, node_feat, edge_feat)`` that
            returns one probability per graph in a single column (the
            ``Classifier`` in this notebook already applies a sigmoid).
        data_loader: iterable of batches. A batch is either a batched graph
            or a list/tuple whose first element is the batched graph (e.g.
            ``(graph, labels)``); anything after the graph is ignored.
        device: device the model and every batch are moved to.
        threshold: probabilities greater than or equal to this value are
            mapped to 1, all others to 0.

    Returns:
        List of Python ints (0 or 1), one per graph, in loader order.
    """
    model.to(device)
    model.eval()

    pred_labels = []

    with torch.no_grad():
        for batch in data_loader:
            # Labelled loaders yield (graph, labels); unlabelled ones may
            # yield the graph alone. Only the graph is needed for inference.
            batched_graph = batch[0] if isinstance(batch, (list, tuple)) else batch

            # Move the whole graph first so its features live on `device`.
            batched_graph = batched_graph.to(device)
            node_feat = batched_graph.ndata['feat']
            edge_feat = batched_graph.edata['feat']

            # Shape (batch, 1) -> (batch,)
            pred_probas = model(batched_graph, node_feat, edge_feat).squeeze(1)

            # Threshold into 0/1 and collect as plain Python ints.
            pred_labels.extend((pred_probas >= threshold).long().cpu().tolist())

    return pred_labels


In [74]:
import pandas as pd

def predict_and_write_tsv(model, data_loader, device, output_path, sample_ids=None):
    """Predict labels for every graph in a loader and save them as a TSV file.

    Args:
        model: model passed on to ``predict``.
        data_loader: loader passed on to ``predict``.
        device: device passed on to ``predict``.
        output_path: destination of the tab-separated file with the columns
            ``sample_id`` and ``prediction``.
        sample_ids: optional identifiers, one per prediction and in loader
            order. When omitted, positions ``0..N-1`` are used, which is only
            correct if the loader is unshuffled and ids equal row positions.

    Raises:
        ValueError: if ``sample_ids`` is given but its length differs from the
            number of predictions.
    """
    pred_labels = predict(model, data_loader, device)

    if sample_ids is None:
        # Fallback: identify samples by their position in the loader.
        sample_ids = range(len(pred_labels))
    else:
        sample_ids = list(sample_ids)
        if len(sample_ids) != len(pred_labels):
            raise ValueError(
                f"got {len(sample_ids)} sample ids for {len(pred_labels)} predictions"
            )

    results = pd.DataFrame({
        "sample_id": sample_ids,
        "prediction": pred_labels
    })

    results.to_csv(output_path, sep='\t', index=False)

# Write the test predictions. Inference runs on the CPU here regardless of the
# `device` variable; sample ids are the positions in the unshuffled loader.
output_path = "GAT_predictions_10epochs3.tsv"
predict_and_write_tsv(model, test_data, 'cpu', output_path)
