In [19]:
import pandas as pd
from torch.utils.data import Dataset
import torch
import os
import random
import numpy as np
from torch import nn
from typing import Dict, Optional, Tuple, List
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, random_split
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import precision_score, f1_score, recall_score, classification_report

import dgl
from dgl.data import MiniGCDataset

In [20]:
# Single seed shared by every random number generator used in this notebook.
SEED = 42

# Python-level sources of randomness.
# NOTE(review): PYTHONHASHSEED is presumably only honoured at interpreter start-up,
# so setting it here may affect child processes only — confirm if hash order matters.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)

# PyTorch default generator (both spellings from the original cell are kept).
torch.manual_seed(SEED)
torch.random.manual_seed(SEED)

# PyTorch CUDA generators: current device first, then every device.
torch.cuda.random.manual_seed(SEED)
torch.cuda.random.manual_seed_all(SEED)

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [21]:
# Run on the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

In [22]:
# Locations of the tab-separated train/dev and test splits.
train_dev_path = "../data/tsv/train.tsv"
test_path = "../data/tsv/test.tsv"

# Read both splits with the same parser settings.
train_dev_df, test_df = (
    pd.read_csv(path, sep='\t') for path in (train_dev_path, test_path)
)

In [23]:
# Distinct questions of each split.
test_questions = list(test_df["question"].unique())
train_dev_questions = list(train_dev_df["question"].unique())
num_train_dev_questions = len(train_dev_questions)

# In-place shuffle driven by the global state of the `random` module,
# so the resulting order depends on the seed set earlier in the notebook.
random.shuffle(train_dev_questions)

In [24]:
import time
import math
import matplotlib
# Default figure size and font size for every plot drawn in this notebook.
matplotlib.rcParams['figure.figsize'] = (16, 12)
matplotlib.rcParams['font.size'] = 14
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.display import clear_output

In [25]:
class BertSimpleClassifier(nn.Module):
    """Binary classification head on top of a BERT-style text encoder.

    The embedding of the first token of each sequence is passed through a small
    MLP that ends in a single output unit. The output is a raw score: no sigmoid
    is applied inside the module.

    Args:
        bert_text_encoder: encoder module exposing ``config.hidden_size`` and
            returning a mapping with a ``'last_hidden_state'`` entry when called
            with ``return_dict=True``.
        dropout: dropout probability used inside the classification head.
    """

    def __init__(self, bert_text_encoder, dropout=0.1):
        super().__init__()

        self.bert_text_encoder = bert_text_encoder
        # Not used in forward(); kept so the attribute stays available to existing code.
        self.dropout = nn.Dropout(p=dropout)
        bert_hidden_dim = bert_text_encoder.config.hidden_size

        # The layer order is left exactly as it was, so the positions of the two
        # Linear layers inside the Sequential do not change.
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(bert_hidden_dim, bert_hidden_dim),
            nn.Dropout(p=dropout),
            nn.ReLU(),
            nn.Linear(bert_hidden_dim, 1),
        )

    def forward(self, inputs, attention_mask,):
        """Score a batch of tokenised texts.

        Args:
            inputs: token ids passed positionally to the encoder.
            attention_mask: attention mask forwarded to the encoder.

        Returns:
            Tensor of raw scores whose last dimension has size 1
            (one score per sequence in the batch).
        """
        encoder_output = self.bert_text_encoder(inputs, attention_mask=attention_mask,
                                                return_dict=True)
        last_hidden_states = encoder_output['last_hidden_state']
        # First-token embedding of every sequence. A single slice replaces the
        # per-sample Python loop + torch.stack and also works for an empty batch.
        text_cls_embeddings = last_hidden_states[:, 0, :]
        logits = self.classifier(text_cls_embeddings)
        return logits

In [26]:
# Pretrained checkpoint used for both the tokenizer and the encoder.
# Alternative that was tried: model_name="roberta-base"
model_name = "sentence-transformers/all-mpnet-base-v2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
bert_model = AutoModel.from_pretrained(model_name)

# Freezing configuration: the embedding block and the first N encoder layers.
freeze_embeddings = True
freeze_layer_count = 5

# Collect the sub-modules to freeze, then switch off gradients in one pass.
modules_to_freeze = []
if freeze_embeddings:
    modules_to_freeze.append(bert_model.embeddings)
if freeze_layer_count > 0:
    modules_to_freeze.extend(bert_model.encoder.layer[:freeze_layer_count])

for module in modules_to_freeze:
    for param in module.parameters():
        param.requires_grad = False

print(
    "# Trainable params: ",
    sum(p.numel() for p in bert_model.parameters() if p.requires_grad),
)

# Trainable params:  50206080


In [27]:
# Training hyper-parameters.
N_EPOCHS, DROPOUT = 5, 0.2

# Classifier wrapping the (partially frozen) encoder, moved to the target device.
bert_simple_clf = BertSimpleClassifier(bert_model, dropout=DROPOUT)
bert_simple_clf = bert_simple_clf.to(device)

# Adam over all classifier parameters; the loss expects raw scores (logits).
optimizer = optim.Adam(params=bert_simple_clf.parameters(), lr=3e-5)
criterion = nn.BCEWithLogitsLoss()
criterion = criterion.to(device)

In [28]:
# Rebuild the classifier and restore the fine-tuned weights from disk.
# NOTE: this instance wraps the same `bert_model` object as `bert_simple_clf`, so
# loading the checkpoint also overwrites the encoder weights seen by that model.
bert_simple_clf_trained = BertSimpleClassifier(bert_model, dropout=DROPOUT).to(device)
# map_location lets a checkpoint that was saved on a GPU be loaded on a CPU-only machine.
bert_simple_clf_trained.load_state_dict(
    torch.load('best-val-text_only_baseline.pt', map_location=device)
)

<All keys matched successfully>

#### Getting embeddings for the node and edge entities

In [29]:
# Numeric (float32) copy of the `correct` column, used as the target label.
train_dev_df["label"] = train_dev_df["correct"].astype(np.float32)

# Parse the serialized graphs only while they are still strings, so that re-running
# this cell does not call eval() on objects that were already parsed.
# SECURITY: eval() executes arbitrary code — only use it on trusted TSV files.
# NOTE(review): if the column holds plain Python literals, ast.literal_eval would be
# a safer replacement — confirm against the data before switching.
train_dev_df["graph"] = train_dev_df["graph"].apply(
    lambda graph: eval(graph) if isinstance(graph, str) else graph
)

# Keep the first row for every (question, answerEntity) pair in an independent frame.
train_df_filtered_emb = train_dev_df.drop_duplicates(
    subset=["question", "answerEntity"], keep="first"
).copy()

In [None]:
# Replace missing (None) node and edge labels with the '[MASK]' token.
def none2mask(row):
    """Fill missing labels of a graph dict in place and return the same dict.

    `row` must provide a 'nodes' and a 'links' entry; every item in either
    collection whose 'label' is None gets the label '[MASK]'.
    """
    # Look both collections up first so a missing key fails before any mutation.
    groups = (row['nodes'], row['links'])
    for items in groups:
        for item in items:
            if item['label'] is None:
                item['label'] = '[MASK]'
    return row

# Apply the mask substitution to every graph; none2mask edits each dict in place and returns it.
# NOTE(review): the dicts are presumably shared with train_dev_df['graph'] (DataFrame.copy
# does not clone nested Python objects), so that column would see the change too — confirm.
train_df_filtered_emb['graph'] = train_df_filtered_emb['graph'].apply(none2mask)

# Collect the labels of a graph's nodes (default) or links; an empty result becomes ['[EMPTY]'].
def graph_labels(row, edges=False):
    """Return the list of 'label' values of the graph's nodes or links.

    Args:
        row: graph dict with 'nodes' and 'links' entries.
        edges: when truthy read the 'links' entry, otherwise the 'nodes' entry.

    Returns:
        The labels in their original order, or ['[EMPTY]'] when there are none.
    """
    key = 'links' if edges else 'nodes'
    labels = [item['label'] for item in row[key]]
    return labels or ['[EMPTY]']


# Per-graph label lists: one column for the node labels, one for the edge (link) labels.
train_df_filtered_emb['node_labels'] = train_df_filtered_emb['graph'].apply(graph_labels)
train_df_filtered_emb['edge_labels'] = train_df_filtered_emb['graph'].apply(graph_labels, edges=True)

In [None]:
# Helper: encode a list of texts into one mean-pooled embedding per text.
def create_embs(model, tokenizer, data):
    """Embed `data` with the encoder stored in `model.bert_text_encoder`.

    The texts are tokenised (padded/truncated to 128 tokens), run through the
    encoder without gradients, and the token embeddings are averaged using the
    attention mask, so padding positions do not contribute to the result.

    Args:
        model: module exposing a `bert_text_encoder` attribute; it is switched
            to eval mode as a side effect.
        tokenizer: callable returning "input_ids" and "attention_mask" tensors.
        data: texts to embed, passed to the tokenizer as-is.

    Returns:
        Tensor with one embedding row per input text, on the model's device.
    """
    data_token = tokenizer(data, max_length=128, padding="max_length", truncation="only_first", return_tensors="pt")

    # Use the device the model actually lives on, not a notebook-level global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    with torch.no_grad():
        input_ids = data_token["input_ids"].to(model_device)
        attention_mask = data_token["attention_mask"].to(model_device)
        outputs = model.bert_text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        hidden = outputs.last_hidden_state

        # Masked mean: a plain mean over dim=1 would also average the padding
        # positions, which dominate short labels padded to 128 tokens.
        mask = attention_mask.unsqueeze(-1).to(hidden.dtype)
        summed = (hidden * mask).sum(dim=1)
        token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
        pooled = summed / token_counts
    return pooled


# Encode a list of labels and serialise every vector in the comma-separated
# text form used for the `feat` column consumed by dgl.data.CSVDataset.
def label_embs(row):
    """Return one comma-joined string of embedding values per label in `row`.

    `row` is the list of labels of a single graph; the embeddings come from
    `create_embs` using the trained classifier and the notebook's tokenizer.
    """
    vectors = create_embs(bert_simple_clf_trained, tokenizer, row).cpu().detach().numpy()
    return [",".join(str(value) for value in vector) for vector in vectors]

# Embed the node labels and the edge labels of every graph.
train_df_filtered_emb['node_embs'] = train_df_filtered_emb['node_labels'].apply(label_embs)
train_df_filtered_emb['edge_embs'] = train_df_filtered_emb['edge_labels'].apply(label_embs)


In [None]:
# Convert one graph into the row dicts that make up the node / edge CSV files.
def graph2node_csv(graph, id_graph, node_embs, edge_embs, edges=False):
    """Build CSV row dicts for the nodes or the links of a single graph.

    Args:
        graph: graph dict with 'nodes' (items carrying 'id') and 'links'
            (items carrying 'source' and 'target').
        id_graph: value written to the 'graph_id' field of every row.
        node_embs: per-node features, indexed in the order of graph['nodes'].
        edge_embs: per-link features, indexed in the order of graph['links'].
        edges: when truthy produce edge rows, otherwise node rows.

    Returns:
        A list of dicts. Node rows have 'graph_id', 'node_id', 'feat';
        edge rows have 'graph_id', 'src_id', 'dst_id', 'feat'. A graph without
        links yields a single placeholder edge row with src_id/dst_id set to
        None and feat ' '.
    """
    if not edges:
        return [
            {'graph_id': id_graph, 'node_id': node['id'], 'feat': node_embs[i]}
            for i, node in enumerate(graph['nodes'])
        ]

    links = graph['links']
    if len(links) == 0:
        # Single-node graph: emit a placeholder so the graph still appears in the
        # edge table and can be recognised (and filtered) later.
        return [{'graph_id': id_graph, 'src_id': None, 'dst_id': None, 'feat': ' '}]

    return [
        {
            'graph_id': id_graph,
            'src_id': link['source'],
            'dst_id': link['target'],
            'feat': edge_embs[i],
        }
        for i, link in enumerate(links)
    ]


# Build the per-graph row lists for both tables; `sample_id` is used as the graph id.
train_df_filtered_emb['node_csv'] = train_df_filtered_emb.apply(lambda x: graph2node_csv(x['graph'], x['sample_id'], x['node_embs'], x['edge_embs'], edges=False), axis=1)
train_df_filtered_emb['edge_csv'] = train_df_filtered_emb.apply(lambda x: graph2node_csv(x['graph'], x['sample_id'], x['node_embs'], x['edge_embs'], edges=True), axis=1)

In [None]:
# Node table: concatenate the per-graph row lists into one list of records.
lst_of_nodes = list(train_df_filtered_emb['node_csv'])
flattened_list_nodes = [record for graph_records in lst_of_nodes for record in graph_records]
node_df = pd.DataFrame.from_records(data=flattened_list_nodes)

# Edge table: same procedure on the edge rows.
lst_of_edges = list(train_df_filtered_emb['edge_csv'])
flattened_list_edges = [record for graph_records in lst_of_edges for record in graph_records]
edge_df = pd.DataFrame.from_records(data=flattened_list_edges)

In [None]:
# Drop one-node graphs for now: their placeholder edge rows carry no src/dst ids.
# A new frame is assigned, so the edge table is not mutated in place.
edge_df = edge_df.dropna(how='any', axis=0)

# Ids of the graphs that still have at least one edge; computed once and reused.
graph_ids_with_edges = edge_df['graph_id'].unique()

# Keep only the nodes and graph-level labels that belong to those graphs.
node_df = node_df[node_df['graph_id'].isin(graph_ids_with_edges)]
graph_df = train_df_filtered_emb.rename(columns={'sample_id': 'graph_id'}).loc[:, ['graph_id', 'label']]
graph_df = graph_df[graph_df['graph_id'].isin(graph_ids_with_edges)]

In [None]:
# Make sure the output directory exists; to_csv does not create missing folders.
os.makedirs('../data/dataset/', exist_ok=True)

# Write the three tables of the graph dataset.
# NOTE(review): dgl.data.CSVDataset presumably also needs a meta.yaml in this
# directory, which this notebook does not write — confirm it is present.
edge_df.to_csv('../data/dataset/edges.csv', index=False, na_rep=' ')
node_df.to_csv('../data/dataset/nodes.csv', index=False)
graph_df.to_csv('../data/dataset/graphs.csv', index=False)

In [47]:
# Create the DGL dataset from the CSV files in the dataset directory.
# NOTE(review): the recorded output below shows the data being loaded from cached
# files, so freshly rewritten CSVs may not be picked up — consider passing
# force_reload=True after regenerating them (confirm against the DGL docs).

dataset = dgl.data.CSVDataset('../data/dataset/')

Done loading data from cached files.


In [48]:
# Size of the loaded dataset.
len(dataset)

35183