In [None]:
from torch_geometric.datasets import KarateClub
import torch
from torch_geometric.nn import GAE, GCNConv
from torch.nn import Parameter
import torch.nn.functional as F
from torch_geometric.transforms import RandomLinkSplit

device = 'cuda' if torch.cuda.is_available() else 'cpu'
dataset = KarateClub()
data = dataset[0].to(device)
transform = RandomLinkSplit(is_undirected=True, split_labels=True)
train_data, val_data, test_data = transform(data)
train_data

Data(x=[34, 34], edge_index=[2, 112], y=[34], train_mask=[34], pos_edge_label=[56], pos_edge_label_index=[2, 56], neg_edge_label=[56], neg_edge_label_index=[2, 56])

In [None]:
class GCNEncoder(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = GCNConv(data.num_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.reset_parameters()

    def reset_parameters(self):
        self.conv1.reset_parameters()
        self.conv2.reset_parameters()

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu_()
        x = F.dropout(x, p=0.2, training=self.training)
        x = self.conv2(x, edge_index)
        return x

model = GCNEncoder(hidden_channels=16).to(device)
output = model(data.x, data.edge_index)
output.size()

torch.Size([34, 16])

In [None]:
class DistMultDecoder(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, z, edge_index, sigmoid: bool = True):
        z_src, z_dst = z[edge_index[0].long()], z[edge_index[1].long()]
        z_x = z_src * z_dst
        out = torch.sum(z_x, dim=1)
        return torch.sigmoid(out) if sigmoid else out
    
model = GAE(GCNEncoder(hidden_channels=16), DistMultDecoder()).to(device)
model.reset_parameters()
model(data.x, data.edge_index)

In [80]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)


def train():
    model.train()
    optimizer.zero_grad()
    z = model.encode(train_data.x, train_data.edge_index)
    train_loss = model.recon_loss(z, train_data.pos_edge_label_index, train_data.neg_edge_label_index)
    train_loss.backward()
    optimizer.step()
    return train_loss

@torch.no_grad()
def test():
    model.eval()
    z = model.encode(test_data.x, test_data.edge_index)
    train_acc = model.test(z, test_data.pos_edge_label_index, test_data.neg_edge_label_index)
    return train_acc

In [82]:
for epoch in range(1, 3):
    train_loss = train()
    print(f'Epoch: {epoch:02d}, Loss: {train_loss:.4f}')

    test_acc = test()
    print(f'Test Acc: {test_acc[0]:.4f}, Test AUC: {test_acc[1]:.4f}')

Epoch: 01, Loss: 1.3509
Test Acc: 0.6978, Test AUC: 0.7034
Epoch: 02, Loss: 1.3379
Test Acc: 0.6311, Test AUC: 0.6578


In [3]:
import os
import json
import numpy as np
import torch
from torch_geometric.data import Data
from sklearn.metrics.pairwise import cosine_similarity

def load_json(filename):
    """Đọc dữ liệu từ file JSON."""
    with open(filename, 'r', encoding='utf-8') as f:
        return json.load(f)

def calc_sentence_sim(s1, s2):
    """Tính độ tương đồng giữa hai câu dựa trên từ chung."""
    s1 = s1.split()
    s2 = s2.split()
    if len(s1) == 0 or len(s2) == 0:
        return 0  # Tránh chia cho 0
    return len(set(s1) & set(s2)) / (np.log(len(s1) + 1) + np.log(len(s2) + 1))

def prepare_data(data_dir):
    graphs = []
    labels = []
    
    in_files = sorted([f for f in os.listdir(data_dir) if f.endswith('.in')])
    total_files = len(in_files)
    
    for idx, in_file in enumerate(in_files):
        print(f"Processing file {idx+1}/{total_files}")  # In ra tiến độ xử lý
        
        current_positive_samples = 0
        current_negative_samples = 0
        
        base_name = in_file.replace('.in', '')
        label_file = os.path.join(data_dir, f"{base_name}.label")
        
        if not os.path.exists(label_file):
            print(f"Warning: Missing label file for {in_file}, skipping...")
            continue
        
        in_data = load_json(os.path.join(data_dir, in_file))
        label_data = load_json(label_file)
        
        correct_citations = set(label_data["correct_citation"])
        citation_ids = in_data["citation_candidates"]
        
        try:
            embedding_sentences = load_json(f'./specter_embeddings_task1/{base_name}.json')["sentences"]
            text_sentences = load_json(f'./tachcautask1/tachcautask1/{base_name}.sen')["sentences"]
        except Exception as e:
            print(f"Warning: Error loading embeddings or text sentences for {base_name}: {e}")
            continue
        
        for cid in citation_ids:
            if current_positive_samples == len(correct_citations) and current_negative_samples >= 3:
                break
            
            label = 1 if cid in correct_citations else 0
            labels.append(label)
            if label == 1:
                current_positive_samples += 1
            else:
                current_negative_samples += 1
            
            try:
                candidate_vec = load_json(f"./candidates_storage_vec/{cid}.candidate")
                candidate_text = load_json(f"./candidates_storage/{cid}.candidate")
            except FileNotFoundError:
                print(f"Warning: Missing file {cid}.candidate, skipping...")
                continue
            
            title_embedding = np.array(candidate_vec['title'])
            abstract_embeddings = np.array(candidate_vec['abstract'])
            
            abstract_texts = candidate_text['abstract']
            title_text = candidate_text['title']
            
            nodes = np.vstack([embedding_sentences, title_embedding, abstract_embeddings])
            num_nodes = nodes.shape[0]
            
            similarity_matrix = cosine_similarity(nodes)
            
            edges = []
            edge_weights = []
            
            num_source_sentences = len(embedding_sentences)
            num_abstract_sentences = len(abstract_embeddings)
            title_idx = num_source_sentences
            offset = num_source_sentences + 1
            
            for i in range(num_nodes):
                for j in range(num_nodes):
                    if i != j:
                        edges.append([i, j])
                        
                        window_flag = 0
                        if (0 <= i < num_source_sentences and j == i + 1) or \
                           (offset <= i < offset + num_abstract_sentences and j == i + 1):
                            window_flag = 1
                        
                        if i < num_source_sentences:
                            text_i = text_sentences[i] if i < len(text_sentences) else ""
                        elif i == title_idx:
                            text_i = title_text
                        else:
                            text_i = abstract_texts[i - offset]
                        
                        if j < num_source_sentences:
                            text_j = text_sentences[j]
                        elif j == title_idx:
                            text_j = title_text
                        else:
                            text_j = abstract_texts[j - offset]
                        
                        text_sim = calc_sentence_sim(text_i, text_j)
                        
                        weight = [
                            similarity_matrix[i, j],
                            text_sim,
                            window_flag
                        ]
                        edge_weights.append(weight)
            
            graph_dict = {
                "nodes": nodes.astype(np.float32),
                "edges": np.array(edges, dtype=np.int64).T,
                "weights": np.array(edge_weights, dtype=np.float32)
            }
            graphs.append(graph_dict)
    
    return graphs, labels

def convert_to_pyg_format(graphs, labels):
    """Chuyển đổi danh sách graphs sang định dạng của PyTorch Geometric."""
    data_list = []
    for graph, label in zip(graphs, labels):
        x = torch.tensor(graph['nodes'], dtype=torch.float)          # Node features
        edge_index = torch.tensor(graph['edges'], dtype=torch.long)    # Edge connectivity
        edge_attr = torch.tensor(graph['weights'], dtype=torch.float)  # Edge weights
        
        data = Data(
            x=x,
            edge_index=edge_index,
            edge_attr=edge_attr,
            y=torch.tensor([label], dtype=torch.long)
        )
        data_list.append(data)
    return data_list

def process_and_save_data(data_dir, processed_file):
    """Xử lý dữ liệu thô và lưu dataset đã xử lý (dưới dạng list of dictionaries) vào file."""
    graphs, labels = prepare_data(data_dir)
    dataset = convert_to_pyg_format(graphs, labels)
    
    # Chuyển đổi các Data object thành dictionary để tránh lỗi unpickle
    dataset_dict = []
    for data in dataset:
        dataset_dict.append({
            "x": data.x,
            "edge_index": data.edge_index,
            "edge_attr": data.edge_attr,
            "y": data.y,
        })
    
    torch.save(dataset_dict, processed_file)
    print(f"Processed dataset saved to {processed_file}")
    return dataset_dict

In [18]:
import pickle

dataset = [{
    'x': torch.randn(3, 768),
    'edge_index': torch.tensor([[0, 1, 2], [1, 2, 0]], dtype=torch.long),
    'edge_attr': torch.randn(3, 3),
    'y': torch.tensor([0])
}, {
    'x': torch.randn(4, 768),
    'edge_index': torch.tensor([[0, 1, 1, 2], [1, 0, 2, 3]], dtype=torch.long),
    'edge_attr': torch.randn(4, 3),
    'y': torch.tensor([1])
}]

for data in dataset:
    data_json = {
        "x": data['x'].tolist(),
        "edge_index": data['edge_index'].tolist(),
        "edge_attr": data['edge_attr'].tolist(),
        "y": data['y'].tolist(),
    }

    with open(f"data.json", 'a') as f:
        f.write(json.dumps(data_json) + '\n')

In [None]:
from torch_geometric.data import Dataset

class MyOwnDataset(Dataset):
    def __init__(self, root=None, transform=None, pre_transform=None, pre_filter=None, dataset=None):
        super().__init__(root, transform, pre_transform, pre_filter)
        self.dataset = dataset

    def len(self):
        return len(self.dataset)

    def get(self, idx):
        data = self.dataset[idx]
        return Data(
            x=torch.tensor(data['x'], dtype=torch.float),
            edge_index=torch.tensor(data['edge_index'], dtype=torch.long),
            edge_attr=torch.tensor(data['edge_attr'], dtype=torch.float),
            y=torch.tensor(data['y'], dtype=torch.long)
        )


In [50]:
from datasets import load_dataset
from torch_geometric.loader import DataLoader

dataset = load_dataset("json", data_files="data.json", split="train")
dataset = dataset.train_test_split(train_size=0.8)
train_dataset = dataset['train']
test_dataset = dataset['test']
graph_dataset = MyOwnDataset(dataset=train_dataset)
test_data = MyOwnDataset(dataset=test_dataset)
loader = DataLoader(graph_dataset, batch_size=2, shuffle=True)
for batch in loader:
    print(batch)

Dataset({
    features: ['x', 'edge_index', 'edge_attr', 'y'],
    num_rows: 1
})
DataBatch(x=[3, 768], edge_index=[2, 3], edge_attr=[3, 3], y=[1], batch=[3], ptr=[2])
