In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel
from torch_geometric.nn import GATConv
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# class TextGraphDataset(torch.utils.data.Dataset):
#     def __init__(self, texts, edge_indices, labels, tokenizer_name='roberta-base'):
#         """
#         texts: 句子列表
#         edge_indices: 关系列表
#         labels: 标签列表
#         tokenizer_name: 使用的预训练tokenizer名称
#         """
#         self.tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
#         self.texts = texts
#         self.edge_indices = edge_indices
#         self.labels = labels
# 
#     def __len__(self):
#         return len(self.texts)
# 
#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         edge_index = self.edge_indices[idx]
#         label = self.labels[idx]
# 
#         inputs = self.tokenizer(text, padding='max_length', truncation=True, max_length=512, return_tensors="pt")
#         input_ids = inputs['input_ids'].squeeze(0)  
#         attention_mask = inputs['attention_mask'].squeeze(0)
# 
#         return input_ids, attention_mask, edge_index, label
    
    
class GATConvWithAttention(GATConv):
    """GATConv whose ``forward`` always returns ``(out, attention_weights)``."""

    def forward(self, x, edge_index, edge_attr=None, size=None, return_attention_weights=True):
        """Run the graph-attention layer and return its attention weights too.

        Args:
            x: Node features, forwarded unchanged to ``GATConv.forward``.
            edge_index: Graph connectivity, forwarded unchanged.
            edge_attr: Optional edge features, forwarded unchanged.
            size: Optional size hint, forwarded unchanged.
            return_attention_weights: Kept only for signature compatibility.
                The attention weights are always requested, because this
                method unconditionally unpacks a 2-tuple.

        Returns:
            Tuple ``(out, attention_weights)`` exactly as produced by
            ``GATConv.forward`` when attention weights are requested.
        """
        # GATConv returns a bare tensor when the flag is None, which would make
        # the tuple unpacking below fail (or silently split the tensor along
        # dim 0). Force the flag on and pass everything by keyword so the call
        # does not depend on the parent's positional parameter order.
        out, attention_weights = super().forward(
            x,
            edge_index,
            edge_attr=edge_attr,
            size=size,
            return_attention_weights=True,
        )
        return out, attention_weights


class RobertaGAT(nn.Module):
    """RoBERTa encoder followed by one graph-attention layer.

    Each input sequence is reduced to the hidden state at token position 0,
    and those vectors are used as node features for the GAT layer, whose
    output width is ``num_classes``.
    """

    def __init__(self, roberta_model_name, num_classes):
        """Load the pretrained encoder and size the GAT layer to match it.

        Args:
            roberta_model_name: Name or path passed to
                ``RobertaModel.from_pretrained``.
            num_classes: Number of output channels of the GAT layer.
        """
        super().__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        hidden_size = self.roberta.config.hidden_size
        self.gat = GATConvWithAttention(hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, edge_index):
        """Encode the sequences, run the GAT layer and return log-probabilities.

        Args:
            input_ids: Token ids passed to the RoBERTa encoder.
            attention_mask: Attention mask passed to the RoBERTa encoder.
            edge_index: Graph connectivity passed to the GAT layer.

        Returns:
            Tuple ``(log_probs, attention_weights)``: the log-softmax over
            dim 1 of the GAT output, and the attention weights reported by
            the GAT layer.
        """
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Keep only the hidden state at position 0 of every sequence.
        first_token_states = encoded.last_hidden_state[:, 0, :]

        node_scores, attention_weights = self.gat(first_token_states, edge_index)
        log_probs = F.log_softmax(node_scores, dim=1)
        return log_probs, attention_weights
    

# Instantiate the model: roberta-base encoder, 5 output classes.
model = RobertaGAT(roberta_model_name="roberta-base", num_classes=5)

In [None]:
import itertools
# 图关系
# 训练集(71251,4519)
import pandas as pd
from llama_cpp import Llama
# Local GGUF checkpoint, loaded in embedding mode so that
# llama.create_embedding() can be called on it.
LLAMA_MODEL_PATH = './llama-2-7b.Q4_K_M.gguf'
llama = Llama(model_path=LLAMA_MODEL_PATH, embedding=True)

def get_edge_index(path, num, similarity_threshold=0.8, abstract_csv="abstract.csv"):
    """Build sentence-level and abstract-level graph edges from a CSV corpus.

    The CSV must have ``label`` and ``text`` columns with one sentence per row.
    Rows are scanned in order:

    * rows labelled ``'titile'`` are skipped;
    * every other row ``i`` is linked to the current abstract node (``num``)
      and, unless it closes an abstract, to the following row ``i + 1``;
    * an abstract is closed by a ``'conclusions'`` row that is followed by a
      ``'background'`` row or is the last row of the file. Its concatenated
      text is embedded with the module-level ``llama`` model and ``num`` is
      advanced by one.

    Abstract pairs whose embedding cosine similarity is at least
    ``similarity_threshold`` become abstract-level edges.

    Args:
        path: CSV file to read.
        num: Node id given to the first abstract; later abstracts get
            ``num + 1``, ``num + 2``, ...
        similarity_threshold: Minimum cosine similarity for an abstract edge.
        abstract_csv: File the abstract edges are written to.

    Returns:
        Tuple ``(edge_index, abstract_edge_index)`` of tensors built from
        lists of ``[source, target]`` pairs, i.e. shape ``(num_edges, 2)``
        (shape ``(0,)`` when there are no edges).
        NOTE(review): torch_geometric layers expect ``(2, num_edges)``, so
        transpose before passing these to a GAT layer.
        NOTE(review): ``abstract_edge_index`` holds 0-based abstract positions
        in order of appearance, not ids offset by ``num`` -- confirm that is
        what downstream code expects.
    """
    df = pd.read_csv(path)
    labels = df['label'].tolist()
    texts = df['text'].tolist()
    n_rows = len(labels)

    relationship = []       # [source, target] node-id pairs
    abstract_vectors = []   # one embedding per completed abstract
    pending = []            # sentences of the abstract being assembled

    for i, (label, text) in enumerate(zip(labels, texts)):
        # The label is spelled exactly as in the original code.
        # NOTE(review): confirm 'titile' matches the spelling used in the data.
        if label == 'titile':
            # Title rows take no part in the graph. The original embedded each
            # title with the LLM but never used the result, so that call is gone.
            continue

        pending.append(text)
        relationship.append([i, num])

        is_last = i + 1 == n_rows
        # Treat end-of-file as a boundary: looking at labels[i + 1] on the last
        # row used to raise, and the final abstract was never embedded.
        if label == 'conclusions' and (is_last or labels[i + 1] == 'background'):
            response = llama.create_embedding(input="".join(pending))
            abstract_vectors.append(response.get('data')[0].get('embedding'))
            pending = []
            num += 1
            continue

        # Link to the next row; the last row has no successor to link to.
        if not is_last:
            relationship.append([i, i + 1])

    abstract_rel = []
    if len(abstract_vectors) >= 2:
        # One matrix call instead of one sklearn call per abstract pair.
        similarity = cosine_similarity(abstract_vectors)
        abstract_rel = [
            [a, b]
            for a, b in itertools.combinations(range(len(abstract_vectors)), 2)
            if similarity[a][b] >= similarity_threshold
        ]

    pd.DataFrame(abstract_rel).to_csv(abstract_csv)

    edge_index = torch.tensor(relationship)
    abstract_edge_index = torch.tensor(abstract_rel)
    return edge_index, abstract_edge_index

In [None]:
from datasets import load_dataset, DatasetDict

# Load each CSV as a single Dataset. Without ``split``, load_dataset returns a
# DatasetDict keyed by 'train', and wrapping those in another DatasetDict would
# nest them one level too deep.
dataset_train = load_dataset('csv', data_files='data/train.csv', split='train')
dataset_test = load_dataset('csv', data_files='data/test.csv', split='train')
dataset_valid = load_dataset('csv', data_files='data/validation.csv', split='train')
dataset = DatasetDict({'train': dataset_train, 'test': dataset_test, 'validation': dataset_valid})
# Build the graph relations (only the test split is enabled here).
# train_rel = get_edge_index(path='data/train.csv', num=71251)
text_rel = get_edge_index(path='data/test.csv', num=15250)
# valid_rel = get_edge_index(path='data/validation.csv', num=16073)


llama_print_timings:        load time = 20052.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 38668.01 ms /   490 tokens (   78.91 ms per token,    12.67 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 38677.88 ms

llama_print_timings:        load time = 20052.54 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 47703.40 ms /   591 tokens (   80.72 ms per token,    12.39 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 47714.65 ms

llama_print_timings:        load time = 20052.54 ms
llama_print_timings:   

In [None]:
from transformers import RobertaTokenizer

# RobertaTokenizer is a class; tokenizing needs an instance loaded with a
# vocabulary. Use the same checkpoint as the model.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def encode_batch(batch):
    """Tokenize the ``text`` column of a batch for ``Dataset.map``.

    Every sequence is padded or truncated to exactly 512 tokens. Padding to
    the longest item of each map batch would give rows of different lengths
    across batches, which cannot be combined into one tensor afterwards.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)


dataset = {split: dataset[split].map(encode_batch, batched=True) for split in dataset.keys()}

data_split = dataset['train']
if isinstance(data_split, dict):
    # Tolerate a nested DatasetDict: load_dataset without ``split`` returns one
    # whose single split is named 'train'.
    data_split = data_split['train']

# The columns hold plain Python lists, so build tensors from them directly;
# torch.stack only accepts tensors.
input_ids = torch.tensor(list(data_split['input_ids']))
attention_mask = torch.tensor(list(data_split['attention_mask']))

# edge_index = train_rel


In [None]:
# Placeholder labels.
# NOTE(review): NLLLoss needs one label per row of the model output, so this
# must be replaced with the real labels for the rows in ``input_ids``.
labels = torch.tensor([0, 1], dtype=torch.long)

# Loss function and optimizer
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
num_epochs = 10
# Training loop
# NOTE(review): ``edge_index`` is not assigned in any visible cell (its
# assignment is commented out); it must be defined before this cell runs.
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    # The model returns (log_probs, attention_weights); only the
    # log-probabilities go to the loss. Passing the tuple raises in NLLLoss.
    log_probs, attention_weights = model(input_ids, attention_mask, edge_index)
    loss = criterion(log_probs, labels)
    loss.backward()
    optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")
