In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch_geometric.nn import GATConv
from transformers import RobertaModel, RobertaTokenizer

In [35]:
class TextGraphDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one text per item and returns it with its edge entry and label."""

    def __init__(self, texts, edge_indices, labels, tokenizer_name='roberta-base', max_length=512):
        """
        texts: list of sentences
        edge_indices: list of relations, indexed per item like `texts`
        labels: list of labels, indexed per item like `texts`
        tokenizer_name: name of the pretrained tokenizer to load
        max_length: length every item is padded/truncated to (defaults to the
            previously hard-coded 512)
        """
        self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
        self.texts = texts
        self.edge_indices = edge_indices
        self.labels = labels
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return `(input_ids, attention_mask, edge_index, label)` for item `idx`."""
        text = self.texts[idx]
        edge_index = self.edge_indices[idx]
        label = self.labels[idx]

        inputs = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        # The tokenizer returns tensors with a leading batch dimension of 1; drop it.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return input_ids, attention_mask, edge_index, label
    
    
class GATConvWithAttention(GATConv):
    """`GATConv` variant whose `forward` always returns an `(out, attention_weights)` pair."""

    def forward(self, x, edge_index, edge_attr=None, size=None, return_attention_weights=True):
        """Run the parent graph-attention layer and return `(out, attention_weights)`.

        `attention_weights` is whatever the parent layer returns as the second
        element when it reports attention, and `None` when the parent returns
        only the output tensor (e.g. a caller passed
        `return_attention_weights=None`). Previously that case was unpacked as
        if it were a pair.
        """
        result = super().forward(
            x,
            edge_index,
            edge_attr=edge_attr,
            size=size,
            return_attention_weights=return_attention_weights,
        )
        # Inspect the actual return value instead of the flag so the pair
        # contract holds however the parent interprets the flag.
        if isinstance(result, tuple):
            out, attention_weights = result
            return out, attention_weights
        return result, None


class RobertaGAT(nn.Module):
    """RoBERTa encoder followed by one graph-attention layer used as the classifier head."""

    def __init__(self, roberta_model_name, num_classes):
        """Load the pretrained encoder and build a GAT layer mapping hidden size to `num_classes`."""
        super().__init__()
        encoder = RobertaModel.from_pretrained(roberta_model_name)
        self.roberta = encoder
        self.gat = GATConvWithAttention(encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, edge_index):
        """Return `(log_probs, attention_weights)` for the given token batch and edges."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at position 0 of every sequence as its embedding.
        first_token_states = encoded.last_hidden_state[:, 0, :]

        scores, attention_weights = self.gat(first_token_states, edge_index)
        log_probs = F.log_softmax(scores, dim=1)
        return log_probs, attention_weights

# Initialize the model
model = RobertaGAT(roberta_model_name="roberta-base", num_classes=5)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Graph relations (edge list construction)
# Training set (71251, 4519)
import pandas as pd
def get_edge_index(path, num, type):
    """Build an edge list from the ``label`` column of a CSV/TSV file.

    For every row ``i`` an edge ``[i, num]`` is emitted. Unless row ``i`` closes
    a group or is the last row, an edge ``[i, i + 1]`` to the following row is
    emitted as well. A group is closed by a row labelled ``'conclusions'`` that
    is immediately followed by a row labelled ``'background'``; ``num`` is
    incremented at that point so the next rows attach to the next group id.

    Sizes noted by the original author: train (71251, 4519), test (15250, 965),
    validation (16073, 1028) -- presumably (rows, groups); TODO confirm.

    Args:
        path: Path of the file to read; it must contain a ``label`` column.
        num: Id used as the target of the ``[i, num]`` edges of the first group.
        type: File format, either ``'csv'`` or ``'tsv'``.

    Returns:
        An int64 tensor of shape ``(num_edges, 2)`` with one ``[source, target]``
        pair per row. Layers that expect a ``(2, num_edges)`` layout need the
        transpose of this tensor.

    Raises:
        ValueError: If ``type`` is neither ``'csv'`` nor ``'tsv'``.
    """
    if type == 'csv':
        df = pd.read_csv(path)
    elif type == 'tsv':
        df = pd.read_csv(path, sep='\t')
    else:
        raise ValueError(f"unsupported file type {type!r}; expected 'csv' or 'tsv'")

    # Materialise the column once instead of doing a Series lookup per row.
    labels = df['label'].tolist()
    last = len(labels) - 1

    relationship = []
    for i, label in enumerate(labels):
        relationship.append([i, num])
        if i == last:
            # There is no following row, so no [i, i + 1] edge may be emitted.
            break
        if label == 'conclusions' and labels[i + 1] == 'background':
            # Group boundary: later rows attach to the next group id.
            num += 1
            continue
        relationship.append([i, i + 1])

    # reshape keeps the (num_edges, 2) layout even when there are no rows.
    return torch.tensor(relationship, dtype=torch.long).reshape(-1, 2)

In [33]:
from datasets import load_dataset, DatasetDict

# Load the datasets.
# `split='train'` makes `load_dataset` return a plain `Dataset`; without it the
# result is a `DatasetDict` keyed by 'train', which would end up nested inside
# the `DatasetDict` assembled below.
dataset_train = load_dataset('csv', data_files='data/train.tsv', delimiter='\t', split='train')
dataset_test = load_dataset('csv', data_files='data/test.tsv', delimiter='\t', split='train')
dataset_valid = load_dataset('csv', data_files='data/validation.csv', split='train')
dataset = DatasetDict({'train': dataset_train, 'test': dataset_test, 'validation': dataset_valid})

# Build the edge lists. `num` is the row count of each file, taken from the
# loaded split instead of being hard-coded (previously 71251 / 15250 / 16073).
train_rel = get_edge_index(path='data/train.tsv', num=len(dataset_train), type='tsv')
test_rel = get_edge_index(path='data/test.tsv', num=len(dataset_test), type='tsv')
text_rel = test_rel  # original (misspelled) name kept for cells that still use it
valid_rel = get_edge_index(path='data/validation.csv', num=len(dataset_valid), type='csv')

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [55]:
from transformers import RobertaTokenizer

# `RobertaTokenizer` is a class: calling it with texts invokes its constructor.
# Tokenization needs an instance loaded from the pretrained vocabulary.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def encode_batch(batch):
    """Tokenize the ``'text'`` column of a batch of examples.

    Every example is padded/truncated to 512 tokens so that rows produced by
    different `map` batches share one width and can be combined into a single
    tensor afterwards.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)


dataset = {split: dataset[split].map(encode_batch, batched=True) for split in dataset.keys()}

data_split = dataset['train']
# If the split is itself a dict of splits (as produced by `load_dataset`
# without `split=`), use its inner 'train' split.
if isinstance(data_split, dict):
    data_split = data_split['train']

# Read both token columns back as tensors of shape (rows, 512).
token_columns = data_split.with_format('torch', columns=['input_ids', 'attention_mask'])[:]
input_ids = token_columns['input_ids']
attention_mask = token_columns['attention_mask']

# `get_edge_index` yields one [source, target] pair per row, i.e. (num_edges, 2);
# the graph layer expects the transposed (2, num_edges) layout.
# NOTE(review): the edge list also targets ids starting at the row count, for
# which `input_ids` holds no rows -- confirm how those nodes get features.
edge_index = train_rel.t().contiguous()


Map:   0%|          | 0/71251 [00:00<?, ? examples/s]

Map:   0%|          | 0/15250 [00:00<?, ? examples/s]

Map:   0%|          | 0/16073 [00:00<?, ? examples/s]

In [59]:
# Placeholder labels.
# NOTE(review): the loss needs one class id per row of the model output; these
# two hard-coded values are a stand-in and must be replaced by the real labels.
labels = torch.tensor([0, 1], dtype=torch.long)

# Loss function and optimizer. NLLLoss matches the log-probabilities
# (log_softmax output) returned by the model.
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10

# Training loop
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    # The model returns (log_probs, attention_weights); only the
    # log-probabilities are passed to the loss.
    output, attention_weights = model(input_ids, attention_mask, edge_index)
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

