In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel
from torch_geometric.nn import GATConv
from torch.utils.data import DataLoader
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
from llama_cpp import Llama
import csv
from torch.utils.tensorboard import SummaryWriter
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
class GATConvWithAttention(GATConv):
    def forward(self, x, edge_index, edge_attr=None, size=None, return_attention_weights=True):
        out, attention_weights = super().forward(x, edge_index, edge_attr, size, return_attention_weights)
        return out, attention_weights

class RobertaGAT(nn.Module):
    def __init__(self, roberta_model_name, num_classes):
        super(RobertaGAT, self).__init__()
        self.roberta = RobertaModel.from_pretrained(roberta_model_name)
        self.gat = GATConvWithAttention(self.roberta.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask, edge_index):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sentence_embeddings = outputs.last_hidden_state[:, 0, :]
        print(sentence_embeddings, edge_index, len(sentence_embeddings), len(edge_index))
    
        gat_output, attention_weights = self.gat(sentence_embeddings, edge_index)
        return F.log_softmax(gat_output, dim=1), attention_weights
    
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encoded_dataset, edge_index):
        self.encoded_dataset = encoded_dataset
        self.edge_index = edge_index

    def __len__(self):
        return len(self.encoded_dataset)

    def __getitem__(self, idx):
        label = torch.tensor(self.encoded_dataset[idx]['label'], dtype=torch.long)
        input_ids = torch.tensor(self.encoded_dataset[idx]['input_ids'], dtype=torch.long)
        attention_mask = torch.tensor(self.encoded_dataset[idx]['attention_mask'], dtype=torch.long)

        edge_index = self.edge_index[idx]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': label,  
            'edge_index': edge_index
        }

In [4]:
llama = Llama(model_path='./llama-2-7b.Q4_K_M.gguf', embedding=True, n_ctx=2048, n_gpu_layers=30)

# 图关系
# 训练集(71251,4519)
# 测试集合(15250,965)
# 验证集 (16073,1028)
def get_sentence_rel(path, num):
    """
    以文章为单位，构建关系（abs_sentence-title）
    :param path: 
    :param num: 
    :return: 
    """
    df = pd.read_csv(path, encoding="GB2312")
    relationship = []
    for i in range(0, len(df['label'])):
        if df['label'][i] == 4 and (df['label'][i + 1] == 0 or df['label'][i + 1] == 5):
            relationship.append([i, i+1])
            num += 1
            continue
        if df["label"][i] != 5:
            relationship.append([i, num])
            relationship.append([i, i+1])

    return relationship

def get_abstract_embedding(path, start, type):
    """
    Llama编码获取摘要embedding。处理结果为[[][]]
    :param type: 
    :param path: 
    :param start: 
    """
    df = pd.read_csv(path, encoding='GB2312')
    abstract = ''
    for i in range(start, len(df['label'])):
        try:
            abstract += df['text'][i]
            if df['label'][i] == 4 and df['label'][i + 1] == 0:
                abstract_embedding = llama.create_embedding(input=abstract).get('data')[0].get('embedding')
                np.save(f"./temp/abstract_embedding{i}.npy", abstract_embedding)
                with open(f'./data/abstract_embedding_{type}.csv', 'a', newline='', encoding='utf-8') as csvfile:
                    writer = csv.writer(csvfile)
                    writer.writerow([abstract, f'abstract_embedding{i}.npy'])
                abstract = ''
        finally:
            continue
    tmp = []   
    files = os.listdir("./temp", )
    # 获取每个文件的完整路径
    full_paths = [os.path.join("./temp", file) for file in files]
    # 按创建时间对文件进行排序
    sorted_files = sorted(full_paths, key=os.path.getctime)
    for file in sorted_files:
        if file.endswith('.npy'):
            tmp.append(np.load(f'{file}', allow_pickle=True))
    np.array(tmp)
    np.save(f'./data/abstract_embedding_{type}.npy', tmp)


def cos_sim(a, b):
    return cosine_similarity([a, b])[0][1]

def get_paper_rel(array, num):
    """
    获取文章直接的关系（title-title）
    :param array: 
    :return: 
    """
    rels = []
    for i in range(0, len(array)):
        for j in range(i+1, len(array)):
                cos = cos_sim(array[i], array[j])
                if cos >= 0.93:
                   rels.append([num + i, num + j])
    return rels 

def get_edge_index(sen_rel, abs_rel):
    """
    按节点，构建图关系
    """
    df = pd.read_csv('data/test.csv')
    rels = []
    for i in range(len(df['label'])):
        rel = []
        for j in (sen_rel + abs_rel):
            if i in j:
                rel.append(j)            
        rels.append(rel)
    return rels

AVX = 1 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 0 | VSX = 0 | 


In [None]:
# get_abstract_embedding(path='data/test.csv', start=0, type='test')
# get_abstract_embedding(path='data/validation.csv', start=0, type='valid')
# get_abstract_embedding(path='data/train.csv', start=0, type='train')

In [5]:
from datasets import load_dataset, DatasetDict

# 加载数据集
dataset_train = load_dataset('csv', data_files='data/train.csv', encoding='utf-8')
dataset_test = load_dataset('csv', data_files='data/test.csv', encoding='utf=8')
dataset_valid = load_dataset('csv', data_files='data/validation.csv', encoding='utf-8')
dataset = DatasetDict({'train': dataset_train, 'test': dataset_test, 'validation': dataset_valid})

In [135]:
# 获取边关系
# 训练集合边关系
train_sen_rel = get_sentence_rel(path='data/train.csv', num=71251)
train_abs_rel = np.load('data/abstract_embedding_train.npy')
train_paper_rel = get_paper_rel(train_abs_rel, num=71251)
train_rel = get_edge_index(train_sen_rel, train_paper_rel)

In [6]:
# 测试集合边关系
test_sen_rel = get_sentence_rel(path='data/test.csv', num=15250)
test_abs_rel = np.load('data/abstract_embedding_test.npy')
test_paper_rel = get_paper_rel(test_abs_rel, num=15250)
test_rel = get_edge_index(test_sen_rel, test_paper_rel)

In [None]:
# 验证集合边关系
valid_sen_rel = get_sentence_rel(path='data/validation.csv', num=16073)
valid_abs_rel = np.load('data/abstract_embedding_validation.npy')
valid_paper_rel = get_paper_rel(valid_abs_rel, num=16073)
valid_rel = get_edge_index(valid_sen_rel, valid_paper_rel)

In [7]:
from transformers import RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
def encode_batch(batch):
    return tokenizer(batch['text'], padding=True, truncation=True, max_length=2048, return_tensors="pt")

dataset = {split: dataset[split].map(encode_batch, batched=True) for split in dataset.keys()}

Map:   0%|          | 0/16214 [00:00<?, ? examples/s]

Map:   0%|          | 0/17100 [00:00<?, ? examples/s]

In [8]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence([item['attention_mask'] for item in batch], batch_first=True, padding_value=0)
    labels = torch.stack([item['labels'] for item in batch])
    
    cum_num_nodes = 0  # 累积节点数
    edge_index = []
    for item in batch:
        # 如果您确定每个样本的edge_index总是非空，可以直接处理
        edge_index_tensor = torch.tensor(item['edge_index'], dtype=torch.long) if isinstance(item['edge_index'], list) else item['edge_index']

        # 调整edge_index的节点索引
        edge_index_adjusted = edge_index_tensor + cum_num_nodes
        edge_index.append(edge_index_adjusted.t())  # 转置以确保两行

        # 更新累积节点数
        num_nodes = item['input_ids'].size(0)  # 假设input_ids的长度等于节点数
        cum_num_nodes += num_nodes

    # 合并edge_index
    edge_index = torch.cat(edge_index, dim=1)
    print(edge_index)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'edge_index': edge_index
    }

In [9]:
# train_data_split = dataset['train'] 
test_data_split = dataset['test']
# valid_data_split = dataset['validation']
# train_dataset = CustomDataset(train_data_split['train'], train_rel)
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = CustomDataset(test_data_split['train'], test_rel)
# print(len(test_rel))
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
# print(len(test_loader))
# valid_dataset = CustomDataset(valid_data_split['train'], train_rel)
# valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=True)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RobertaGAT("roberta-base", num_classes=5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
num_epochs = 10

writer = SummaryWriter('log/robert-gat')

def validate(model, valid_loader, criterion, device):
    model.eval() 
    total_loss = 0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():  
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            edge_index = batch['edge_index'].to(device)
            labels = batch['labels'].to(device)

            output, _ = model(input_ids, attention_mask, edge_index)
            loss = criterion(output, labels)
            total_loss += loss.item()

            _, predicted = torch.max(output, 1)
            total_correct += (predicted == labels).sum().item()
            total_samples += labels.size(0)

    avg_loss = total_loss / len(valid_loader)
    accuracy = total_correct / total_samples
    return avg_loss, accuracy

In [12]:
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        edge_index = batch['edge_index'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        output = model(input_ids, attention_mask, edge_index)
        loss = criterion(output, labels)
        print(loss)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(test_loader)
    writer.add_scalar('Training loss', avg_loss, epoch)
    # avg_val_loss, val_accuracy = validate(model, valid_loader, criterion, device)
    # writer.add_scalar('Validation Loss', avg_val_loss, epoch)
    # writer.add_scalar('Validation Accuracy', val_accuracy, epoch)

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(test_loader)}")

tensor([[10304, 10305, 10305,  7439,  7440,  7440, 13841, 13842, 13842,  5588,
          5589,  5590,  5591,  5592,  5593,  5594,  5595,  5596,  5597,  5598,
          5599,  5600,  5601,  5602,  5603,  5604,  5711,  5712,  5712, 11310,
         11311, 11311, 10214, 10215, 10215,  2726,  2727,  2727,  3340,  3341,
          3341, 14335, 14336, 14336,  7171,  7172,  7172, 14528, 14529, 14529,
          8325,  8326,  8326, 10706, 10707, 10707,  3254,  3255,  8571,  8572,
          8572, 11403, 11404, 11404, 10246, 10247, 10247,  5762,  5763,  5763,
         17848, 17849, 17849, 12417, 12418,  9393,  9394,  9394, 10705, 10706,
         10707, 10708, 10709, 10710, 10711, 10712, 10713, 10714, 10715, 10716,
         10717, 10718, 10719, 10720, 10721, 14214, 14215, 14216, 14217, 14218,
         14219, 14220, 14221, 14222, 14223, 14224, 14225, 14226, 14227, 14228,
          4816,  4817, 15567, 15568, 15568,  8192,  8193,  8193, 17717, 17718,
         17718,  8119,  8120,  8120, 10105, 10106, 1

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
