In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel
from torch_geometric.nn import GATConv
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
import itertools
import pandas as pd
import numpy as np
import os
from llama_cpp import Llama
import csv

In [None]:
class GATConvWithAttention(GATConv):
    """GATConv subclass whose forward pass also hands back the attention weights."""

    def forward(self, x, edge_index, edge_attr=None, size=None, return_attention_weights=True):
        """Run the parent convolution and return ``(output, attention_weights)``.

        All arguments are forwarded positionally to ``GATConv.forward``;
        ``return_attention_weights`` defaults to ``True`` here so the parent
        is asked for the attention information on every call.
        """
        conv_result = super().forward(x, edge_index, edge_attr, size, return_attention_weights)
        # The parent result is expected to be a two-element pair; unpacking it
        # here fails early if it is not.
        node_output, attn = conv_result
        return node_output, attn

class RobertaGAT(nn.Module):
    """RoBERTa encoder followed by a single attention-returning GAT layer."""

    def __init__(self, roberta_model_name, num_classes):
        """Load the pretrained encoder and size the GAT layer from its config.

        :param roberta_model_name: name or path given to ``RobertaModel.from_pretrained``.
        :param num_classes: output width of the GAT layer.
        """
        super().__init__()
        encoder = RobertaModel.from_pretrained(roberta_model_name)
        self.roberta = encoder
        self.gat = GATConvWithAttention(encoder.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, edge_index):
        """Encode the inputs, run the GAT layer and return ``(log_probs, attention_weights)``."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 serves as the per-input embedding.
        first_token_states = encoded.last_hidden_state[:, 0, :]

        scores, attn = self.gat(first_token_states, edge_index)
        log_probs = F.log_softmax(scores, dim=1)
        return log_probs, attn
    
class CustomDataset(torch.utils.data.Dataset):
    """Dataset pairing encoded columns with one shared edge index.

    ``encoded_dataset`` is read column-wise in ``__getitem__`` (via
    ``.items()`` and ``column[idx]``), so it is expected to be a mapping of
    column name -> indexable sequence of per-sample values.
    """

    def __init__(self, encoded_dataset, edge_index):
        """
        :param encoded_dataset: mapping of column name -> per-sample values.
        :param edge_index: graph edges; the same object is attached to every item.
        """
        self.encoded_dataset = encoded_dataset
        self.edge_index = edge_index

    def __len__(self):
        """Return the number of samples (rows), not the number of columns."""
        items = getattr(self.encoded_dataset, 'items', None)
        if callable(items):
            # len() of a column mapping counts its keys, so measure the first
            # column instead; an empty mapping holds no samples.
            for _, column in items():
                return len(column)
            return 0
        # Objects without .items() keep the previous behaviour.
        return len(self.encoded_dataset)

    def __getitem__(self, idx):
        """Return a dict with one tensor per column plus the shared ``edge_index``."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encoded_dataset.items()}
        item['edge_index'] = self.edge_index
        return item
    

In [None]:
# Module-level Llama handle, loaded from a local GGUF file with embedding
# mode enabled and a 4096-token context window. get_abstract_embedding()
# reads this global to embed abstracts.
llama = Llama(model_path='./llama-2-7b.Q4_K_M.gguf', embedding=True, n_ctx=4096)

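# Graph relations
# Each pair below is presumably (sentence count, paper count) — TODO confirm.
# Training set:   (71251, 4519)
# Test set:       (15250, 965)
# Validation set: (16073, 1028)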
def get_sentence_rel(path, num):
    """
    Build per-article relations (abstract sentence -> title).

    Rows of the CSV are scanned in order. Every row whose label is not 5
    contributes an edge ``[row_index, num]``. ``num`` is incremented each
    time a row labelled 4 is immediately followed by a row labelled 0,
    which is treated as the boundary between two articles.

    :param path: CSV file with a ``label`` column.
    :param num: id assigned to the first article; later articles count up from it.
    :return: list of ``[row_index, article_id]`` pairs.
    """
    df = pd.read_csv(path)
    labels = df['label'].tolist()
    last_row = len(labels) - 1
    relationship = []
    for i, label in enumerate(labels):
        if label != 5:
            relationship.append([i, num])
        # Only look ahead when a next row exists; the original lookup failed
        # when the file ended on a label-4 row.
        if label == 4 and i < last_row and labels[i + 1] == 0:
            num += 1
    return relationship

def get_abstract_embedding(path, start):
    """
    Encode abstracts with Llama and collect their embeddings.

    Row texts are concatenated from ``start`` onwards. An abstract is
    considered complete at a row labelled 4 that is either followed by a row
    labelled 0 or is the last row of the file. Each completed abstract is
    embedded with the module-level ``llama`` model, saved to
    ``./temp/abstract_embedding{i}.npy`` and logged as a row in
    ``./data/abstract_embedding_test.csv``.

    Afterwards every ``.npy`` file found in ``./temp`` (ordered by
    ``os.path.getctime``) is stacked and written to
    ``./data/abstract_embedding_test.npy``.

    NOTE(review): the output file names contain ``test`` regardless of
    ``path``, and ``./temp`` is shared between calls, so files left by a
    previous call are included in the stacked result.

    :param path: CSV file with ``label`` and ``text`` columns.
    :param start: index of the first row to process.
    :return: array of the stacked embeddings, of the form ``[[...], [...]]``.
    """
    temp_dir = "./temp"
    # Create the output directories up front so the first save cannot fail.
    os.makedirs(temp_dir, exist_ok=True)
    os.makedirs("./data", exist_ok=True)

    df = pd.read_csv(path)
    labels = df['label'].tolist()
    texts = df['text'].tolist()
    last_row = len(labels) - 1

    abstract = ''
    for i in range(start, len(labels)):
        abstract += texts[i]
        # End of file counts as a boundary: the original looked up row i + 1
        # unconditionally, which failed on (and never embedded) the final abstract.
        at_boundary = labels[i] == 4 and (i == last_row or labels[i + 1] == 0)
        if at_boundary:
            abstract_embedding = llama.create_embedding(input=abstract).get('data')[0].get('embedding')
            np.save(f"./temp/abstract_embedding{i}.npy", abstract_embedding)
            with open('./data/abstract_embedding_test.csv', 'a', newline='', encoding='utf-8') as csvfile:
                writer = csv.writer(csvfile)
                writer.writerow([abstract, f'abstract_embedding{i}.npy'])
            abstract = ''

    # Gather the per-abstract files in creation-time order.
    npy_paths = [
        os.path.join(temp_dir, name)
        for name in os.listdir(temp_dir)
        if name.endswith('.npy')
    ]
    npy_paths.sort(key=os.path.getctime)
    # allow_pickle is kept from the original; these files are produced by this
    # function itself, do not point it at untrusted data.
    embeddings = np.array([np.load(npy_path, allow_pickle=True) for npy_path in npy_paths])
    np.save('./data/abstract_embedding_test.npy', embeddings)
    # Callers use the result directly, so return it instead of None.
    return embeddings


def cos_sim(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``."""
    # cosine_similarity on the two stacked rows gives a 2x2 matrix; the
    # off-diagonal entry is the similarity between a and b.
    pairwise = cosine_similarity([a, b])
    return pairwise[0][1]

def get_paper_rel(array, threshold=0.95):
    """
    Build the relations between articles (title - title).

    Two articles ``i < j`` are linked when the cosine similarity of their
    embedding vectors is at least ``threshold``. Pairs are returned in the
    same order as ``itertools.combinations(range(n), 2)``.

    :param array: sequence of equally sized embedding vectors, one per article.
    :param threshold: minimum cosine similarity for an edge (default 0.95).
    :return: list of ``[i, j]`` index pairs.
    :raises ValueError: if ``array`` is not a 2-D collection of vectors.
    """
    if len(array) < 2:
        return []
    vectors = np.asarray(array, dtype=float)
    if vectors.ndim != 2:
        raise ValueError("array must be a 2-D collection of embedding vectors")

    # Normalise once so each similarity is a plain dot product. Zero vectors
    # keep norm 1 and therefore get similarity 0 with everything.
    norms = np.linalg.norm(vectors, axis=1, keepdims=True)
    norms[norms == 0] = 1.0
    unit = vectors / norms

    rel = []
    # One matrix-vector product per row instead of one library call per pair.
    for i in range(len(unit) - 1):
        sims = unit[i + 1:] @ unit[i]
        for offset in np.flatnonzero(sims >= threshold):
            rel.append([i, i + 1 + int(offset)])
    return rel

def get_edge_index(sen_rel, abs_rel, transpose=False):
    """
    Build the graph edge tensor from the two relation lists.

    :param sen_rel: list of ``[source, target]`` pairs.
    :param abs_rel: list of ``[source, target]`` pairs, appended after ``sen_rel``.
    :param transpose: when True, return the edges as a contiguous ``[2, E]``
        tensor (the layout PyTorch Geometric layers document for
        ``edge_index``) instead of the default ``[E, 2]``.
    :return: ``torch.long`` tensor of shape ``[E, 2]`` (or ``[2, E]``).
    """
    edges = torch.tensor(list(sen_rel) + list(abs_rel), dtype=torch.long)
    if edges.numel() == 0:
        # An empty list would otherwise give shape (0,); keep the pair axis.
        edges = edges.reshape(0, 2)
    if transpose:
        edges = edges.t().contiguous()
    return edges

In [None]:
# Embed the abstracts of the test CSV, beginning at row 183 (presumably to
# resume an earlier, interrupted run — confirm). The validation and train
# calls below are left disabled.
get_abstract_embedding(path='data/test.csv', start=183)
# get_abstract_embedding(path='data/validation.csv', start=0)
# get_abstract_embedding(path='data/train.csv', start=0)


llama_print_timings:        load time = 12309.34 ms
llama_print_timings:      sample time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings: prompt eval time = 55581.14 ms /   704 tokens (   78.95 ms per token,    12.67 tokens per second)
llama_print_timings:        eval time =     0.00 ms /     1 runs   (    0.00 ms per token,      inf tokens per second)
llama_print_timings:       total time = 55587.81 ms


In [26]:
from datasets import load_dataset, DatasetDict

# Load the datasets.
# load_dataset() without `split` returns a DatasetDict holding a single
# 'train' split, which would make `dataset` a DatasetDict of DatasetDicts.
# Asking for split='train' yields the plain Dataset for each CSV.
dataset_train = load_dataset('csv', data_files='data/train.csv', split='train')
dataset_test = load_dataset('csv', data_files='data/test.csv', split='train')
dataset_valid = load_dataset('csv', data_files='data/validation.csv', split='train')
dataset = DatasetDict({'train': dataset_train, 'test': dataset_test, 'validation': dataset_valid})


def _build_split_relations(csv_path):
    """Return ``(sen_rel, abs_rel, paper_rel, edge_index)`` for one CSV split.

    NOTE(review): get_abstract_embedding() re-embeds the whole split with
    Llama and aggregates everything found in ./temp, so this is slow and the
    temp directory should be emptied between splits.
    """
    sen_rel = get_sentence_rel(path=csv_path, num=0)
    abs_rel = get_abstract_embedding(path=csv_path, start=0)
    if abs_rel is None:
        # A get_abstract_embedding() that only writes its result to disk
        # returns None; read the stacked embeddings back from that file.
        abs_rel = np.load('./data/abstract_embedding_test.npy', allow_pickle=True)
    paper_rel = get_paper_rel(abs_rel)
    return sen_rel, abs_rel, paper_rel, get_edge_index(sen_rel, paper_rel)


# Build the edge relations.
# Training-set relations
train_sen_rel, train_abs_rel, train_paper_rel, train_rel = _build_split_relations('data/train.csv')
# Test-set relations
test_sen_rel, test_abs_rel, test_paper_rel, test_rel = _build_split_relations('data/test.csv')
# Validation-set relations
valid_sen_rel, valid_abs_rel, valid_paper_rel, valid_rel = _build_split_relations('data/validation.csv')

KeyboardInterrupt: 

In [None]:
from transformers import RobertaTokenizer

# A tokenizer must be loaded from a checkpoint before it can be called;
# calling the RobertaTokenizer class itself runs its constructor instead of
# tokenising text. 'roberta-base' matches the encoder used for the model.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')


def encode_batch(batch):
    """Tokenise the ``text`` column of a batch of rows.

    Pads to the fixed ``max_length`` so every row has the same length
    regardless of which map batch it was in; per-batch padding would leave
    rows of different lengths that cannot be stacked into one tensor.
    ``return_tensors`` is omitted because ``map`` stores the result as
    ordinary dataset columns.
    """
    return tokenizer(batch['text'], padding='max_length', truncation=True, max_length=512)


dataset = {split: dataset[split].map(encode_batch, batched=True) for split in dataset.keys()}

data_split = dataset['train']

train_dataset = CustomDataset(dataset['train'], train_rel)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaGAT("roberta-base", num_classes=5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# The model already returns log-probabilities (log_softmax), so the matching
# loss is NLLLoss; CrossEntropyLoss would apply log_softmax a second time.
criterion = nn.NLLLoss()
num_epochs = 10

# NOTE(review): the DataLoader attaches the full edge_index to every sample
# and collates them per batch, while the edges index rows of the whole CSV —
# confirm that the batch-level graph is what the GAT layer should see.
model.train()
for epoch in range(num_epochs):
    total_loss = 0.0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        edge_index = batch['edge_index'].to(device)
        # The CSV column is named 'label'; accept either spelling.
        labels = (batch['labels'] if 'labels' in batch else batch['label']).to(device)

        optimizer.zero_grad()
        # The model returns (log_probs, attention_weights); only the
        # log-probabilities go into the loss.
        log_probs, _attention = model(input_ids, attention_mask, edge_index)
        loss = criterion(log_probs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Guard against an empty loader when averaging.
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / max(len(train_loader), 1)}")
